In [1]:
%matplotlib inline

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import zipfile
import pathlib

# Dummy File

In [11]:
# Create a Pandas DataFrame with dummy data
df = pd.DataFrame({
    'id': range(1, 11),
    'name': ['John', 'Jane', 'Bob', 'Alice', 'Mark', 'Julia', 'Adam', 'Eve', 'Sam', 'Sara'],
    'age': np.random.randint(20, 50, 10),
    'gender': ['M', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'F'],
    'salary': np.random.randint(30000, 80000, 10)
})
df

Unnamed: 0,id,name,age,gender,salary
0,1,John,22,M,53511
1,2,Jane,24,F,39480
2,3,Bob,49,M,49047
3,4,Alice,42,F,60648
4,5,Mark,24,M,73021
5,6,Julia,23,F,57079
6,7,Adam,21,M,39469
7,8,Eve,49,F,53834
8,9,Sam,48,M,57367
9,10,Sara,47,F,61350


In [12]:
def replace_values(df, col):
    df[col] = df[col].str.replace('M', 'S')

In [14]:
replace_values(df, 'gender')

In [18]:
df['age'].name

'age'

In [19]:
df.loc[:, 'age']

0    22
1    24
2    49
3    42
4    24
5    23
6    21
7    49
8    48
9    47
Name: age, dtype: int32

In [20]:
df.age

0    22
1    24
2    49
3    42
4    24
5    23
6    21
7    49
8    48
9    47
Name: age, dtype: int32

In [None]:
import pandas as pd
import json


In [None]:
def remove_advertisement(df, cols):
    """Removes the AD word"""
    result = pd.DataFrame()
    for col in cols:
        result[col] = df.loc[:, col].str.replace('ADVERTISEMENT', '')
    
    return result

def combine_json_to_dataframe(zip_file_path, num_words_cutoff = 20) -> pd.DataFrame:
    """
    Combines data from three JSON files containing recipes, loads them into a Pandas DataFrame, and 
    pre-processes the data for further analysis. Returns the resulting DataFrame.
    
    Args:
        zip_file_path (str): The file path of the zip file containing the three JSON files with recipe data.
        num_words_cutoff (int): The minimum number of words in a recipe to be included in the resulting DataFrame.

    Returns:
        pandas.DataFrame: A DataFrame containing the combined data from the three JSON files with the 
        following columns: 'title', 'ingredients', 'instructions', 'full_text', and 'num_words'. All 
        recipes with fewer than 'num_words_cutoff' words are excluded.
    """
    # Open the zip file and load the JSON files
    with zipfile.ZipFile(zip_file_path) as z:
        with z.open('recipes_raw_nosource_fn.json') as f:
            fn_data = json.load(f)
        with z.open('recipes_raw_nosource_epi.json') as f:
            epi_data = json.load(f)
        with z.open('recipes_raw_nosource_ar.json') as f:
            ar_data = json.load(f)

    # Combine the data from the three JSON files
    data = {**fn_data, **epi_data, **ar_data}

    # Convert the data to a dataframe
    df = pd.DataFrame.from_dict(data, orient='index')

    # Add a new column with the concatenated text
    df['full_text'] = ('Recipe title: ' + 
                       df['title'] + 
                       '. Ingredients: ' + 
                       df['ingredients'].apply(lambda x: '; '.join(x)) + 
                       '. Instructions: ' + 
                       df['instructions'])
    
    df = (df.
          # remove adds
          pipe(remove_advertisement).
          # drop the picture_link column
          drop(['picture_link'], axis = 1).
          # give a num_words estimation
          assign(num_words = lambda d: d['full_text'].str.split().str.len()).
          # drop short recipes
          loc[lambda d: d['num_words'] > num_words_cutoff]
    )

    return df

if __name__ == "__main__":
    full_data = combine_json_to_dataframe("data/recipes_raw.zip")
    print(full_data.info())

In [26]:
path_to_zip = pathlib.Path().home() / 'Desktop/Others.zip'
path_to_zip.exists()

True

In [32]:
with zipfile.ZipFile(path_to_zip) as z:
    z.namelist

[<ZipInfo filename='moving_average.md' compress_type=deflate external_attr=0x20 file_size=2325 compress_size=718>, <ZipInfo filename='regex.md' compress_type=deflate external_attr=0x20 file_size=940 compress_size=433>]
C:\Users\a1056968\Desktop\Others.zip
[<ZipInfo filename='moving_average.md' compress_type=deflate external_attr=0x20 file_size=2325 compress_size=718>, <ZipInfo filename='regex.md' compress_type=deflate external_attr=0x20 file_size=940 compress_size=433>]
['moving_average.md', 'regex.md']
