In [1]:
import wikipedia, nltk, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
def filter_pos(raw_text, pos):
    '''
    Tokenize and part-of-speech tag raw text, keeping only the words whose
    tag appears in ``pos``.

    raw_text: raw string from the data source
    pos: list of part-of-speech tag strings to keep (matched against the tags
         produced by nltk.pos_tag)

    Returns a pandas DataFrame with columns 'word' and 'tag' holding one row
    per distinct (word, tag) pair, in order of first appearance, with a fresh
    0..n-1 index.
    '''
    tokens = nltk.word_tokenize(raw_text)
    word_tag_pairs = nltk.pos_tag(tokens)
    tagged_df = pd.DataFrame(list(word_tag_pairs), columns=['word', 'tag'])

    # Boolean mask of the rows whose tag is one of the requested ones.
    wanted = tagged_df['tag'].isin(pos)
    unique_rows = tagged_df[wanted].drop_duplicates()
    return unique_rows.reset_index(drop=True)
            
            

In [135]:
def makeSampleDataset(list_of_foods, pos):
    '''
    Build a (title, description) dataset from Wikipedia articles.

    For every food name, the matching Wikipedia page is fetched once, its
    content is reduced to the words whose part of speech is in ``pos`` (via
    filter_pos), and those words are joined into one comma-separated string.

    list_of_foods: iterable of page titles (a list or a pandas Series)
    pos: list of part-of-speech tag strings passed through to filter_pos

    Returns a pandas DataFrame with columns 'title' (the name as given in
    list_of_foods) and 'description'. Foods whose page cannot be resolved
    (missing page or disambiguation page) are skipped.
    '''
    dd_columns = ['title', 'description']
    dishes_row = []

    # Iterate over the values directly so that a Series with a non-default
    # index works too, and so each title stays paired with its own content.
    for food in list_of_foods:
        try:
            # Constructing the page performs the lookup; it raises rather than
            # returning a page without a title when the lookup fails.
            page = wikipedia.WikipediaPage(title=food)
        except (wikipedia.exceptions.PageError,
                wikipedia.exceptions.DisambiguationError):
            continue
        if page.title is None:
            continue

        filtered_item = filter_pos(page.content, pos)
        description = ', '.join(filtered_item['word'])
        dishes_row.append((food, description))

    return pd.DataFrame(dishes_row, columns=dd_columns)
    

In [136]:
# Function that takes in a food title as input and outputs the most similar foods
def get_recommendations(title, data, top_n=10):
    '''
    Return the titles of the items whose descriptions are most similar to the
    description of ``title``.

    title: value to look up in data['title']; if it occurs more than once the
           first occurrence is used
    data: pandas DataFrame with 'title' and 'description' columns
    top_n: maximum number of recommendations to return (default 10)

    Returns a pandas Series of titles (a slice of data['title']) ordered from
    most to least similar. The queried item itself is never included, so at
    most len(data) - 1 titles are returned.

    Raises KeyError if ``title`` is not present in data['title'].
    '''
    # Locate the query row by position so the lookup works for any DataFrame
    # index and does not break when a title is duplicated.
    matches = np.flatnonzero((data['title'] == title).to_numpy())
    if matches.size == 0:
        raise KeyError(title)
    idx = int(matches[0])

    # Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
    tfidf = TfidfVectorizer(stop_words='english')

    # Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf.fit_transform(data['description'])

    # Similarity of the query row against every row. Only this one row of the
    # pairwise matrix is needed, so the full N x N matrix is not built.
    sim_scores = linear_kernel(tfidf_matrix[idx], tfidf_matrix).ravel()

    # Sort positions by descending similarity; the stable sort keeps ties in
    # their original row order.
    order = np.argsort(-sim_scores, kind='stable')

    # Drop the query row explicitly instead of assuming it sorts first: with
    # tied scores or an all-zero description vector it may not.
    order = order[order != idx][:top_n]

    # Return the titles of the most similar items
    return data['title'].iloc[order]

In [137]:
# Part-of-speech tags to keep when building descriptions. These look like
# Penn Treebank codes as emitted by nltk.pos_tag: JJ = adjective, VB* = verb
# forms, NN = singular noun (confirm against the NLTK tagset in use).
pos = ['JJ', 'VB', 'VBG', 'VBP', 'VBZ']
# Smaller tag set (adjectives and nouns); this is the one passed to
# makeSampleDataset in the commented-out calls in this notebook.
pos_simple = ['JJ', 'NN']
# sampleData = makeSampleDataset(['filet mignon', 'steak', 'spaghetti', 'macaroni', 'apple', 'banana', 'pear'], pos_simple)

In [155]:
# NOTE: the assignment that creates sampleData is commented out in the cell
# above, so this raises NameError on a fresh kernel unless it is re-enabled.
sampleData;

In [131]:
# get_recommendations('macaroni', sampleData)

2       spaghetti
5          banana
6            pear
1           steak
4           apple
0    filet mignon
Name: title, dtype: object

In [148]:
# foodList = pd.DataFrame(pd.read_csv('foodlist.csv'))
# foodList = foodList.loc[:, ~foodList.columns.str.contains('^Unnamed')]

In [149]:
#foodSampleData = makeSampleDataset(foodList['items'], pos_simple)

In [153]:
# Load the cached dataset from the local 'foodSampleData' pickle file (the
# file this notebook writes with to_pickle). Unpickling can execute arbitrary
# code, so only load a file you created yourself.
foodSampleData = pd.read_pickle('foodSampleData')

In [154]:
# Foods whose descriptions are most similar to the 'Choco pie' entry.
get_recommendations('Choco pie', foodSampleData)

10             BK Chicken Fries
121                        Oreo
54                  Corn flakes
115                   Milkshake
94     Italian-American cuisine
97                       Jell-O
43       Chocolate chip cookies
86                    Hamburger
84               Grilled cheese
65                     Doughnut
Name: title, dtype: object

In [152]:
# Cache the dataset on disk so makeSampleDataset (which fetches every page
# from Wikipedia) does not have to be re-run.
# NOTE: this cell was executed (In [152]) before the read_pickle cell
# (In [153]); on a fresh kernel the pickle file must already exist, or the
# dataset must be rebuilt first.
foodSampleData.to_pickle('foodSampleData')

In [156]:
# Inspect the dataset: one row per food with its title and the
# comma-separated keyword description.
foodSampleData

Unnamed: 0,title,description
0,Apple Pie,"apple, pie, tart, principal, ingredient, occas..."
1,Bread,"staple, food, dough, flour, water, recorded, h..."
2,Ammonia cookie,"ammonia, cookie, leavener, Scandinavian-Americ..."
3,Cuisine of Antebellum America,"cuisine, change, American, eating, period, dif..."
4,Apple butter,"butter, concentrated, form, apple, sauce, slow..."
5,Apple sauce,"sauce, applesauce, flat, top, unpeeled, variet..."
6,Baked potato,"baked, potato, jacket, fluffy, interior, crisp..."
7,Barbecue,"barbeque, Australian, term, barbie, cooking, m..."
8,Bear claw (pastry),"bear, claw, sweet, yeast-raised, pastry, simil..."
9,Beef Manhattan,"dish, consisting, roast, beef, gravy, top, ste..."
