# ramen_DB.csv preprocessing

In [None]:
import pandas as pd

# Load the raw ramen table, keep only the first row seen for each distinct
# value of the '식품명' (food name) column, and renumber the surviving rows
# from 0 so the index has no gaps left by the removed duplicates.
df = (
    pd.read_csv('ramen_DB.csv')
    .drop_duplicates(subset=['식품명'], keep='first')
    .reset_index(drop=True)
)

# Write the de-duplicated table back over the same CSV file.
# NOTE: this overwrites the input file in place.
df.to_csv('ramen_DB.csv', encoding='utf-8-sig', index=False)

# Create Embeddings and Save them to a File

In [None]:
import csv
import json

import openai
import pandas as pd

import config

# Load the ramen records. The input is a JSON file, so it must be parsed with
# the json module (the csv module has no `loads` function).
with open('ramen_DB_final.json', encoding='UTF-8-sig') as f:
    temp = json.load(f)
df = pd.DataFrame(temp)

# The API key is read from the local `config` module rather than hard-coded.
openai.api_key = config.api_key
openai_model = 'text-embedding-ada-002'

# Request one embedding per row, built from that row's 'soup' text.
# Iterating the column directly keeps the embeddings in row order.
overview_emb = []
for text in df['soup']:
    result = openai.Embedding.create(model=openai_model, input=text)
    result_emb = result['data'][0]['embedding']
    overview_emb.append(result_emb)

# Attach the embeddings to the frame before saving. Without this column the
# exported CSV would not contain the 'openai_emb' data that the analysis
# cell reads back from 'ramen_DB_final_embedded.csv'.
df['openai_emb'] = overview_emb

df.to_csv('ramen_DB_final_embedded.csv', encoding='utf-8-sig', index=False)

# Analyze the Embeddings and Create Result JSON Files

In [None]:
import pandas as pd
import torch
from sentence_transformers import util

df = pd.read_csv('ramen_DB_final_embedded.csv', encoding='utf-8-sig')

# Each 'openai_emb' cell is read back from the CSV as a string. It is expected
# to look like "[0.1, 0.2, ...]": drop the first and last character (the
# brackets), split on ', ', and convert every piece to float.
df["openai_emb"] = df["openai_emb"].apply(
    lambda raw: [float(piece) for piece in raw[1:-1].split(', ')]
)

In [None]:
def _candidate_mask(frame, row_label, pref):
    """Return a boolean mask over `frame` selecting rows allowed by `pref` for one reference row."""
    sodium_code, calories_code = pref.split('_')
    mask = pd.Series(True, index=frame.index)
    for column, code in (('sodium', sodium_code), ('calories', calories_code)):
        reference = frame.loc[row_label, column]
        if code == '2':    # candidate must be strictly higher than the reference
            mask &= frame[column] > reference
        elif code == '3':  # candidate must be strictly lower than the reference
            mask &= frame[column] < reference
        # code '1' adds no constraint for this column
    return mask


def make_result(pref, top_k=10, max_results=5):
    """Recommend similar ramens for every row of the global ``df``.

    For each ramen, candidates are first restricted by ``pref`` and then
    ranked by cosine similarity between the ramen's ``openai_emb`` vector and
    each candidate's ``openai_emb`` vector.

    Parameters
    ----------
    pref : str
        ``'<sodium>_<calories>'`` where each part is ``'1'`` (no constraint),
        ``'2'`` (candidate value strictly higher than the reference ramen) or
        ``'3'`` (strictly lower). Valid values are ``'1_1'`` ... ``'3_3'``.
    top_k : int, optional
        How many of the most similar candidates to consider before name-based
        filtering. Defaults to 10.
    max_results : int, optional
        Maximum number of recommendations kept per ramen. Defaults to 5.

    Returns
    -------
    list
        One entry per row of ``df``, in row order. Each entry is a list of
        ``{'name', 'sodium', 'calories'}`` dicts ordered from most to least
        similar, or an empty dict ``{}`` when no candidate satisfies ``pref``
        or the similarity could not be computed.

    Raises
    ------
    ValueError
        If ``pref`` is not one of the nine supported codes.
    """
    valid_prefs = (
        '1_1', '1_2', '1_3',
        '2_1', '2_2', '2_3',
        '3_1', '3_2', '3_3',
    )
    if pref not in valid_prefs:
        raise ValueError('Invalid pref')

    results = []

    for i in df.index:
        ramen_name = df.loc[i, 'name']
        ramen_emb = df.loc[i, 'openai_emb']

        # Positional 0..n-1 index so that topk positions map directly to rows.
        candidates = df[_candidate_mask(df, i, pref)].reset_index(drop=True)

        if candidates.empty:
            # Nothing satisfies the preference for this ramen.
            results.append({})
            continue

        try:
            query = torch.tensor(ramen_emb, dtype=torch.float32)
            corpus = torch.tensor(candidates['openai_emb'].tolist(), dtype=torch.float32)
            cos_scores = util.pytorch_cos_sim(query, corpus)[0]
        except (RuntimeError, TypeError, ValueError):
            # Malformed or mismatched embeddings: keep the best-effort
            # behavior of recording "no result" for this ramen.
            results.append({})
            continue

        # Never ask for more results than there are candidates.
        k = min(top_k, cos_scores.shape[0])
        top_positions = torch.topk(cos_scores, k=k).indices.tolist()

        # iloc keeps the rows in topk order (most similar first). Selecting
        # with index.isin() would return them in index order instead, which
        # misaligns rows with their similarity rank.
        ranked = candidates.iloc[top_positions]

        # Drop the ramen itself and variants whose name contains, or is
        # contained in, the reference ramen's name.
        keep = [
            label
            for label, other in ranked['name'].items()
            if not (ramen_name in other or other in ramen_name)
        ]
        ranked = ranked.loc[keep]

        # Among the remaining rows, drop any row whose name contains another
        # remaining row's name (near-duplicate product variants).
        names = ranked['name'].to_dict()
        redundant = {
            k_label
            for j_label, name_j in names.items()
            for k_label, name_k in names.items()
            if j_label != k_label and name_j in name_k
        }
        ranked = ranked.drop(index=list(redundant))

        ranked = ranked.head(max_results)

        results.append(ranked[['name', 'sodium', 'calories']].to_dict('records'))

    return results

In [None]:
# Preference 1_1 -> sodium: no preference, calories: no preference.
# Keep the name and nutrition columns, attach the per-ramen list returned by
# make_result, and export everything as a JSON array of records.

df_1_1 = df.copy()[['name', 'sodium', 'calories']]
df_1_1['result'] = make_result('1_1')
df_1_1.to_json(
    "./json/idc+idc.json",
    orient='records',
    force_ascii=False,
)
    

In [None]:
# Preference 1_2 -> sodium: no preference, calories: higher.
# Keep the name and nutrition columns, attach the per-ramen list returned by
# make_result, and export everything as a JSON array of records.

df_1_2 = df.copy()[['name', 'sodium', 'calories']]
df_1_2['result'] = make_result('1_2')
df_1_2.to_json(
    "./json/idc+higher.json",
    orient='records',
    force_ascii=False,
)

In [None]:
# Preference 1_3 -> sodium: no preference, calories: lower.
# Keep the name and nutrition columns, attach the per-ramen list returned by
# make_result, and export everything as a JSON array of records.

df_1_3 = df.copy()[['name', 'sodium', 'calories']]
df_1_3['result'] = make_result('1_3')
df_1_3.to_json(
    "./json/idc+lower.json",
    orient='records',
    force_ascii=False,
)

In [None]:
# Preference 2_1 -> sodium: higher, calories: no preference.
# Keep the name and nutrition columns, attach the per-ramen list returned by
# make_result, and export everything as a JSON array of records.

df_2_1 = df.copy()[['name', 'sodium', 'calories']]
df_2_1['result'] = make_result('2_1')
df_2_1.to_json(
    "./json/higher+idc.json",
    orient='records',
    force_ascii=False,
)

In [None]:
# Preference 2_2 -> sodium: higher, calories: higher.
# Keep the name and nutrition columns, attach the per-ramen list returned by
# make_result, and export everything as a JSON array of records.

df_2_2 = df.copy()[['name', 'sodium', 'calories']]
df_2_2['result'] = make_result('2_2')
df_2_2.to_json(
    "./json/higher+higher.json",
    orient='records',
    force_ascii=False,
)

In [None]:
# Preference 2_3 -> sodium: higher, calories: lower.
# Keep the name and nutrition columns, attach the per-ramen list returned by
# make_result, and export everything as a JSON array of records.

df_2_3 = df.copy()[['name', 'sodium', 'calories']]
df_2_3['result'] = make_result('2_3')
df_2_3.to_json(
    "./json/higher+lower.json",
    orient='records',
    force_ascii=False,
)

In [None]:
# Preference 3_1 -> sodium: lower, calories: no preference.
# Keep the name and nutrition columns, attach the per-ramen list returned by
# make_result, and export everything as a JSON array of records.

df_3_1 = df.copy()[['name', 'sodium', 'calories']]
df_3_1['result'] = make_result('3_1')
df_3_1.to_json(
    "./json/lower+idc.json",
    orient='records',
    force_ascii=False,
)

In [None]:
# Preference 3_2 -> sodium: lower, calories: higher.
# Keep the name and nutrition columns, attach the per-ramen list returned by
# make_result, and export everything as a JSON array of records.

df_3_2 = df.copy()[['name', 'sodium', 'calories']]
df_3_2['result'] = make_result('3_2')
df_3_2.to_json(
    "./json/lower+higher.json",
    orient='records',
    force_ascii=False,
)

In [None]:
# Preference 3_3 -> sodium: lower, calories: lower.
# Keep the name and nutrition columns, attach the per-ramen list returned by
# make_result, and export everything as a JSON array of records.

df_3_3 = df.copy()[['name', 'sodium', 'calories']]
df_3_3['result'] = make_result('3_3')
df_3_3.to_json(
    "./json/lower+lower.json",
    orient='records',
    force_ascii=False,
)