In [5]:
import json
import pandas as pd

# Build a small, reproducible sample of the diets dataset and export it as
# both a CSV file and a JSON list of documents (one dict per row).

# --- Configuration: everything a reader might want to tune -----------------
RAW_CSV_PATH = '../data/All_Diets.csv'
SAMPLE_CSV_PATH = '../data/data.csv'
DOCUMENTS_JSON_PATH = '../data/documents.json'
SAMPLE_SIZE = 200      # maximum number of rows kept in the sample
RANDOM_SEED = 1        # fixed seed so the same rows are drawn on every run
UNUSED_COLUMNS = ['extraction_day', 'extraction_time']

# --- Load -------------------------------------------------------------------
diets_raw = pd.read_csv(RAW_CSV_PATH, sep=',')

# --- Clean ------------------------------------------------------------------
# NOTE(review): duplicates are detected while the extraction columns are still
# present, so rows that differ only in those columns are all kept. Confirm
# this is intended before changing the order of these steps.
diets_clean = (
    diets_raw
    .drop_duplicates()
    # Normalise column names: lowercase, spaces replaced with underscores.
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(columns=UNUSED_COLUMNS)
)

# --- Sample -----------------------------------------------------------------
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the available rows.
sample_size = min(SAMPLE_SIZE, len(diets_clean))
diets_sample = diets_clean.sample(n=sample_size, random_state=RANDOM_SEED)

# --- Order and number the rows ------------------------------------------------
df = diets_sample.sort_values(by='diet_type').reset_index(drop=True)

# Sequential id following the sorted order: 0 .. len(df) - 1.
df['id'] = range(len(df))

# Put 'id' first, keeping every other column in its existing order.
cols = ['id'] + [col for col in df if col != 'id']
df = df[cols]

# Cheap invariants on the result before anything is written to disk.
assert len(df) == sample_size, 'unexpected number of sampled rows'
assert df['id'].is_unique, 'ids must be unique'

# Convert all values to strings so every exported field is text.
# NOTE(review): astype(str) turns missing values into the literal 'nan';
# confirm downstream consumers expect that.
df = df.astype(str)

# --- Export -----------------------------------------------------------------
df.to_csv(SAMPLE_CSV_PATH, index=False)

# One dictionary per row, keyed by column name.
documents = df.to_dict(orient='records')

with open(DOCUMENTS_JSON_PATH, 'w') as json_file:
    json.dump(documents, json_file, indent=4)

# Display the DataFrame
df

Unnamed: 0,id,diet_type,recipe_name,cuisine_type,protein(g),carbs(g),fat(g)
0,0,dash,Potato Latkes Made Simple: A Twitter Recipe,kosher,31.55,110.84,118.28
1,1,dash,Avocado Dressing,american,4.18,18.97,118.42
2,2,dash,Bread Salad,american,44.09,153.84,86.03
3,3,dash,Ultimat Sparkler,world,0.32,16.32,0.08
4,4,dash,Yogurt Pops,american,9.07,34.05,7.9
...,...,...,...,...,...,...,...
195,195,vegan,Mushroom Beet Black Bean Burger,american,60.14,138.46,27.25
196,196,vegan,Vegan Creamy Coleslaw,american,6.76,40.99,68.13
197,197,vegan,Smoked Cauliflower Soup,american,49.78,134.47,17.29
198,198,vegan,Baby Artichokes with Lemon and Olive Oil,mediterranean,90.34,307.15,220.77
