In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import json

In [3]:
#import a csv file in a pandas dataframe
df = pd.read_csv('../Dataset/full_dataset.csv').head(40000)
df.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [4]:
# get the datatypes of all the columns
print(df.dtypes)

Unnamed: 0      int64
title          object
ingredients    object
directions     object
link           object
source         object
NER            object
dtype: object


In [5]:
df.shape

(40000, 7)

In [6]:
# what is the datatype of title column?
print(df['title'].dtype)

# convert the title column to a string
df['title'] = df['title'].astype(str)

# e.g. How many unique job titles are there?
print(df['title'].nunique())



object
23484


In [7]:
# filters out rows with very short titles
# filters out rows with very short ingredients
# filters out rows with very short directions
# filters out rows with directions that have a step or mix all in them because that would not be a good instruction to give to the user.
remove1 = df.loc[df.title.map(lambda x: len(x)<4 )]
remove2 = df.loc[df.ingredients.map(lambda x: len(x)<2)]
remove3 = df.loc[df.directions.map(lambda x: len(x) < 2 or len(''.join(x)) < 30)]
remove4 = df.loc[df.directions.map(lambda x: re.search('(step|mix all)', ''.join(str(x)), re.IGNORECASE)!=None)]

In [8]:
len(remove1)+len(remove2)+len(remove3)+len(remove4)

3049

In [9]:
df.drop(remove1.index)
df.drop(remove2.index)
df.drop(remove3.index)
df.drop(remove4.index)

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."
...,...,...,...,...,...,...,...
39995,39995,Easter Basket Cake,"[""1 pkg. Duncan Hines moist deluxe cake mix (a...","[""Preheat oven to 350\u00b0."", ""Grease and flo...",www.cookbooks.com/Recipe-Details.aspx?id=153212,Gathered,"[""Duncan"", ""chocolate layer cake frosting"", ""c..."
39996,39996,Rouladen(A Beautiful Make Ahead For A Buffet; ...,"[""4 to 5 lb. \""eye of round\"" roast"", ""2 large...","[""Have meat cutter cut roast (across grain) in...",www.cookbooks.com/Recipe-Details.aspx?id=752316,Gathered,"[""roast"", ""onions"", ""thyme"", ""black pepper"", ""..."
39997,39997,Seven Layer Cookies,"[""1/2 lb. melted butter"", ""2 c. graham cracker...","[""Layer in an 8 x 8-inch pan."", ""Do not stir.""...",www.cookbooks.com/Recipe-Details.aspx?id=728609,Gathered,"[""butter"", ""graham crackers"", ""coconut"", ""butt..."
39998,39998,Cavatini,"[""1 lb. hamburger"", ""1 lb. Italian sausage"", ""...","[""Brown hamburger and sausage; add garlic, oni...",www.cookbooks.com/Recipe-Details.aspx?id=879946,Gathered,"[""hamburger"", ""Italian sausage"", ""clove garlic..."


In [10]:
df.reset_index(drop=True, inplace=True)

In [11]:
df.shape

(40000, 7)

In [12]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."


In [13]:
import numpy as np
train, test = train_test_split(df, test_size=0.20) #use 8% for test set

In [14]:
print(train.shape, test.shape)

(32000, 7) (8000, 7)


In [15]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [16]:
# create a function to create a plaintext file from the dataset
def df_to_plaintext_file(input_df, output_file):
    print("Writing to", output_file)
    with open(output_file, 'w', encoding='utf-8') as f:
        for index, row in input_df.iterrows():
            if index%100000==0:
                print(index)
            if type(row.NER)!=str:
                continue
            title = row.title
            directions = json.loads(row.directions)
            ingredients = json.loads(row.ingredients)
            ner = json.loads(row.NER)
            res = "<RECIPE_START> <INPUT_START> " + " <NEXT_INPUT> ".join(ner) + " <INPUT_END> <INGR_START> " + \
              " <NEXT_INGR> ".join(ingredients) + " <INGR_END> <INSTR_START> " + \
              " <NEXT_INSTR> ".join(directions) + " <INSTR_END> <TITLE_START> " + title + " <TITLE_END> <RECIPE_END>"
            f.write("{}\n".format(res))


In [17]:
df_to_plaintext_file(train, 'unsupervised_train.txt')
df_to_plaintext_file(test, 'unsupervised_test.txt')

Writing to unsupervised_train.txt
0
Writing to unsupervised_test.txt
0


In [18]:
with open("unsupervised_train.txt", "r", encoding="utf-8") as file:
    # Read the first 2 lines
    first_record = file.readline()
    second_record = file.readline()

# Display the first 2 records
print("First Record:")
print(first_record)

print("\nSecond Record:")
print(second_record)

First Record:
<RECIPE_START> <INPUT_START> broccoli <NEXT_INPUT> cream of mushroom soup <NEXT_INPUT> mayonnaise <NEXT_INPUT> eggs <NEXT_INPUT> onion soup <NEXT_INPUT> Cheddar cheese <INPUT_END> <INGR_START> 2 small or 1 large frozen chopped broccoli <NEXT_INGR> 1 can cream of mushroom soup <NEXT_INGR> 3/4 c. mayonnaise (use only mayonnaise) <NEXT_INGR> 2 eggs, beaten <NEXT_INGR> 1/2 pkg. Lipton instant onion soup mix <NEXT_INGR> 1 c. grated Cheddar cheese <INGR_END> <INSTR_START> Cook broccoli. <NEXT_INSTR> Mix soup, mayonnaise and soup together. <NEXT_INSTR> Add broccoli and eggs; cover with grated cheese. <NEXT_INSTR> Bake uncovered at 325° for 45 minutes or until hot and fluffy. <INSTR_END> <TITLE_START> Broccoli Casserole <TITLE_END> <RECIPE_END>


Second Record:
<RECIPE_START> <INPUT_START> margarine <NEXT_INPUT> white shortening <NEXT_INPUT> sugar <NEXT_INPUT> milk <INPUT_END> <INGR_START> 1 stick margarine <NEXT_INGR> 1/2 c. white shortening <NEXT_INGR> 1 c. sugar <NEXT_INGR> 2/