## EDA and Topic Modeling of Recipes 
- As a first pass, let's model topics using only the recipe `Titles`

Importing libraries

In [None]:
import pickle
import os
from itertools import chain
import pandas as pd
import io

import urllib.request
import numpy as np
from tqdm import tqdm, tnrange, tqdm_notebook
tqdm.pandas()

from PIL import Image
import time
import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from keras.preprocessing import image

import matplotlib.pyplot as plt
%matplotlib inline

from recipeScripts import *
from miscScripts import *

In [None]:
# Make sure the WordNet corpus is available: register the local NLTK data
# directory on the search path, then download WordNet into that directory.
nltk_data_dir = '/Users/jhonsen/Documents/DS/nltk_data/'
nltk.data.path.append(nltk_data_dir)
nltk.download('wordnet', download_dir=nltk_data_dir)

In [None]:
# Load the pickled recipe dataframe from the working directory.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('df_epi_cleaner.pkl', mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

### Turn titles into lowercase letters

In [None]:
# Lower-case every title with the vectorized string accessor rather than a
# per-row Python lambda. Missing titles stay NaN instead of raising
# AttributeError on a float.
df['title'] = df['title'].str.lower()

In [None]:
# Histogram of the per-title word count stored in the `title_numWords` column.
# Observation: most titles contain between 1 and 10 words.
df['title_numWords'].plot.hist(bins=20)
plt.title('number of words in title');

### Check different types of dishes based on `Titles`

In [None]:
# Check out how often different world cuisines show up, most frequent first.
# printNum() is imported from recipeScripts.py
# Fix: 'mediterranian' was misspelled; presumably the keywords are matched
# against the titles, where the misspelling would never be found.
(
    printNum(
        [
            'korean', 'chinese', 'japanese', 'italian', 'french', 'mexican',
            'indian', 'thai', 'cajun', 'vietnamese', 'american', 'german',
            'spanish', 'mediterranean', 'polish', 'greek', 'jamaican',
            'african', 'ethiopian', 'turkish', 'indonesian',
        ],
        dish_type='cuisine',
    )
    .reset_index(drop=True)
    .sort_values(by='number', ascending=False)
)

In [None]:
# Count how often each kind of dish shows up, most frequent first.
# printNum() comes from one of the star-imported helper modules.
(
    printNum(
        [
            'salad', 'barbecue', 'roast', 'pizza', 'soup',
            'curry', 'pasta', 'antipasti', 'bbq',
            'stew', 'cake', 'cookie', 'wrap', 'sandwich',
            'chicken',
        ],
        dish_type='type',
    )
    .reset_index(drop=True)
    .sort_values(by='number', ascending=False)
)

### What are the most common words in recipe titles?

In [None]:
# Split every title on whitespace into a list of words (with a progress bar)
df['words'] = df['title'].progress_apply(lambda title: title.split())

In [None]:
# Run each title's word list through preprocessText(), which comes from one of
# the star-imported helper modules (recipeScripts / miscScripts); its exact
# cleaning steps are not visible in this notebook.
# NOTE(review): this overwrites df['words'] in place, so re-running the cell
# feeds already-processed tokens through preprocessText() again.
df['words'] = df['words'].progress_apply(preprocessText)

In [None]:
# Flatten the per-title word lists into one bag-of-words.
# (An earlier reduce()-based list concatenation took a long time to run.)
bag_of_texts = [token for title_tokens in df['words'] for token in title_tokens]

In [None]:
# Count how often each token occurs across all titles, then plot the
# 20 most frequent tokens as a non-cumulative frequency chart.
freq_words = nltk.FreqDist(bag_of_texts)
freq_words.plot(20, cumulative=False)

---

### Vectorize Document

In [None]:
# Identity hook for running TF-IDF on documents that are already tokenized
def dummy_func(doc):
    """Return ``doc`` unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so that pre-tokenized documents are used exactly as given.
    """
    return doc

# df['words'] already holds one token list per title
tokenized_docs = df['words'].tolist()

# TF-IDF over unigrams. The tokenizer and preprocessor are identity functions
# because the documents are pre-tokenized, so no token pattern is supplied.
tfidfVectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_func,
    preprocessor=dummy_func,
    token_pattern=None,
    ngram_range=(1, 1),
)

# Fit the vectorizer and build the document-term matrix in a single pass
dtm = tfidfVectorizer.fit_transform(tokenized_docs)

In [None]:
# View the document-term matrix as a dataframe (rows: titles, columns: terms).
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    dtm_terms = tfidfVectorizer.get_feature_names_out()
else:
    dtm_terms = tfidfVectorizer.get_feature_names()

# NOTE: toarray() materializes the full dense matrix, which can be large.
df_dtm = pd.DataFrame(dtm.toarray(), index=df.title, columns=dtm_terms)
df_dtm.head(3)

### Topic Modeling with NMF

In [None]:
# Factorize the TF-IDF document-term matrix into 20 topics with NMF
nmf = NMF(n_components=20).fit(dtm)

# Project every document onto the fitted topics (one weight per topic)
nmf_topics = nmf.transform(dtm)

In [None]:
# Topic-by-term weights: one row per NMF topic, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    topic_terms = tfidfVectorizer.get_feature_names_out()
else:
    topic_terms = tfidfVectorizer.get_feature_names()

# Derive the number of topics from the fitted model instead of repeating the
# literal 20, so the labels stay in sync if the NMF size changes.
topic_labels = [str(k + 1) for k in range(nmf.components_.shape[0])]
df_topics = pd.DataFrame(
    nmf.components_.round(3),
    index=topic_labels,
    columns=topic_terms,
)
df_topics

In [None]:
# Create a dataframe of observations (titles) vs. topics.
# Column labels are '1'..'N', where N is taken from the transformed matrix
# instead of a hard-coded 20, so a different NMF size cannot cause a
# shape mismatch here.
df_obs_topics = pd.DataFrame(
    nmf_topics.round(3),
    index=df.title,
    columns=[str(k + 1) for k in range(nmf_topics.shape[1])],
)
df_obs_topics.head(3)

In [None]:
# Checking keywords in topics
# display_topics() is imported from miscScripts.py
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides, and hand
# the helper a plain list as before.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    keyword_terms = list(tfidfVectorizer.get_feature_names_out())
else:
    keyword_terms = tfidfVectorizer.get_feature_names()

# The keywords shown below (topics) have reasonable consistency
display_topics(nmf, keyword_terms, no_top_words=8)

### Add a `label` column using the NMF topics

In [None]:
# Add labels to the dataframe
# labelTopic() is imported from recipeScripts.py
# NOTE(review): presumably tags each title with its dominant NMF topic and
# exposes it as a 'label' column (read in the next cell) — confirm in
# recipeScripts.py.
df_labeled = labelTopic(df_obs_topics)

In [None]:
# Further group the 20 topics into 5 main categories
# subcatToMaincat() is imported from recipeScripts.py
# NOTE(review): this column assignment aligns on index. df_obs_topics is
# indexed by title; if labelTopic() keeps that index while df uses a different
# one, the labels will not line up row-for-row — confirm against
# recipeScripts.py.
df['subCat'] = df_labeled['label']
df = subcatToMaincat(df)

### Pickle new dataframe

In [None]:
# Save the dataframe that now carries the new category columns.
# Fix: the original dumped `df_all`, a name never defined in this notebook
# (NameError); the category columns were added to `df`.
with open('df_epi_cleaner-5.pkl', 'wb') as fout:
    pickle.dump(df, fout)