In [1]:
import warnings

import wikipedia, nltk, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
def filter_pos(raw_text, pos):
    '''
    Tokenize and part-of-speech tag raw text, keeping only the requested tags.

    raw_text: raw string from the data source
    pos: list of part-of-speech tag strings to keep

    Returns a pandas DataFrame with columns 'word' and 'tag' holding the
    distinct (word, tag) pairs whose tag is in pos, re-indexed from 0.
    '''
    tokens = nltk.word_tokenize(raw_text)
    tagged_tokens = nltk.pos_tag(tokens)
    tagged_df = pd.DataFrame(list(tagged_tokens), columns=['word', 'tag'])

    # Boolean mask of the rows whose tag was requested
    wanted = tagged_df['tag'].isin(pos)
    return tagged_df[wanted].drop_duplicates().reset_index(drop=True)
            
            

In [3]:
def makeSampleDataset(list_of_foods, pos):
    '''
    Build a (title, description) dataset from the Wikipedia pages of the given foods.

    list_of_foods: iterable of page titles to look up on Wikipedia
    pos: list of part-of-speech tag strings, passed through to filter_pos

    Returns a pandas DataFrame with columns 'title' and 'description'. 'title' is
    the name as given in list_of_foods; 'description' is the page's filtered words
    joined with ', '. Foods whose page cannot be resolved (missing page or
    disambiguation page) are skipped with a warning, so the result may have fewer
    rows than list_of_foods.
    '''
    dd_columns = ['title', 'description']
    dishes_row = []

    # Iterate over the values directly so a pandas Series with a non-default
    # index works as well as a plain list.
    for food in list_of_foods:
        # Each page is loaded once; the same object supplies the content below,
        # which keeps every title paired with its own page.
        try:
            page = wikipedia.WikipediaPage(title=food)
        except (wikipedia.exceptions.PageError,
                wikipedia.exceptions.DisambiguationError) as err:
            warnings.warn('Skipping %r: %s' % (food, type(err).__name__))
            continue

        filtered_item = filter_pos(page.content, pos)
        dishes_row.append((food, ', '.join(filtered_item['word'])))

    return pd.DataFrame(dishes_row, columns=dd_columns)
    

In [104]:
# Function that takes in an item title as input and outputs the most similar items
def get_recommendations(title, data, top_n=10):
    '''
    Recommend the items whose descriptions are most similar to a given item.

    title: value to look up in data['title']; the first matching row is used
    data: pandas DataFrame with 'title' and 'description' columns
    top_n: maximum number of similar items to return (default 10)

    Returns a tuple (titles, tfidf_matrix): a pandas Series with the titles of up
    to top_n other rows, most similar first, and the sparse TF-IDF matrix fitted
    on data['description'] (one row per row of data).

    Raises KeyError if title does not occur in data['title'].
    '''
    # Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
    tfidf = TfidfVectorizer(stop_words='english')

    # TODO: Change this matrix into the form of columns = (item, descriptor, tfidf_score)

    # Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf.fit_transform(data['description'])
    print('Size of tfidf_matrix: ', tfidf_matrix.shape)

    # Locate the query row by position: rows of tfidf_matrix are positional, so
    # index labels of data must not be used to address them.
    matches = np.flatnonzero((data['title'] == title).values)
    if len(matches) == 0:
        raise KeyError(title)
    idx = int(matches[0])

    # Only the query item's row of the similarity matrix is needed.
    # NOTE(review): the dot product equals cosine similarity only because
    # TfidfVectorizer L2-normalises rows by default - confirm if norm is changed.
    sim_row = linear_kernel(tfidf_matrix[idx], tfidf_matrix).ravel()

    # Stable descending sort, so tied scores keep their original row order
    order = np.argsort(-sim_row, kind='mergesort')

    # Drop the query item explicitly instead of assuming it sorts first
    item_indices = [i for i in order if i != idx][:top_n]

    # Return the most similar titles together with the TF-IDF matrix
    return data['title'].iloc[item_indices], tfidf_matrix

In [98]:
# Penn Treebank tags to keep: adjectives (JJ) and verb forms (VB, VBG, VBP, VBZ)
pos = ['JJ', 'VB', 'VBG', 'VBP', 'VBZ']
# Smaller tag set: adjectives (JJ) and singular/mass nouns (NN)
pos_simple = ['JJ', 'NN']
# sampleData = makeSampleDataset(['filet mignon', 'steak', 'spaghetti', 'macaroni', 'apple', 'banana', 'pear'], pos_simple)

In [99]:
# get_recommendations('macaroni', sampleData)

In [100]:
# foodList = pd.DataFrame(pd.read_csv('foodlist.csv'))
# foodList = foodList.loc[:, ~foodList.columns.str.contains('^Unnamed')]

In [101]:
#foodSampleData = makeSampleDataset(foodList['items'], pos_simple)

In [102]:
# Load the food dataset from the local pickle file 'foodSampleData'.
# NOTE(review): unpickling can execute arbitrary code - only load a file you produced yourself.
foodSampleData = pd.read_pickle('foodSampleData')

In [115]:
# Get the foods most similar to 'Jerky', and keep the TF-IDF matrix that is returned with them
recommendations, tfidf_csr_matrix = get_recommendations('Jerky', foodSampleData)

('Size of tfidf_matrix: ', (191, 7009))


In [116]:
# Expand the sparse TF-IDF matrix into a dense DataFrame (rows = foods, columns = descriptors).
# toarray() yields a plain ndarray, whereas todense() returns the legacy np.matrix type.
tfidfDf = pd.DataFrame(tfidf_csr_matrix.toarray())

In [187]:
# Single .loc lookup: row label (food id) first, then column label (descriptor id)
tfidfDf.loc[4, 5]

0.04963043229519751

In [205]:
columns = ['descriptor_id', 'food_id', 'scores']

# Reshape the dense TF-IDF table into long form: one (descriptor, food, score)
# row for every strictly positive entry.
scoreValues = tfidfDf.values

# Transposing first makes np.nonzero walk the table descriptor by descriptor and,
# within a descriptor, food by food - the same order as a nested column/row loop.
descriptorPos, foodPos = np.nonzero(scoreValues.T > 0)

formattedList = list(zip(tfidfDf.columns[descriptorPos],
                         tfidfDf.index[foodPos],
                         scoreValues[foodPos, descriptorPos]))

formattedDf = pd.DataFrame(formattedList, columns=columns)
#     index = list(index)
#     descriptor_indexOfItem = list((i, index))
#     print(descriptor_indexOfItem)

In [206]:
# Preview the first rows only rather than rendering the whole long-format table
formattedDf.head(10)

Unnamed: 0,descriptor_id,food_id,scores
0,0,116,0.089624
1,1,47,0.167793
2,2,158,0.113250
3,3,116,0.089624
4,4,47,0.167793
5,5,4,0.049630
6,5,116,0.074865
7,5,132,0.055504
8,5,167,0.090792
9,6,42,0.100625


In [82]:
# Save the dataset to the local pickle file 'foodSampleData' so it can be reloaded with pd.read_pickle
foodSampleData.to_pickle('foodSampleData')

In [10]:
# Inspect the first record (its title and its comma-separated description)
print(foodSampleData.iloc[0])

title                                                  Apple Pie
description    apple, pie, tart, principal, ingredient, occas...
Name: 0, dtype: object


In [33]:
# (number of rows, number of columns) of the dataset
foodSampleData.shape

(191, 2)