In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the review DataFrame from a local pickle.
# NOTE(review): unpickling can execute arbitrary code, so only load
# 'data.pkl' when it comes from a trusted source.
df = pd.read_pickle('data.pkl')

In [16]:
# Preview the first rows to check the columns used below
# (Rating for filtering, text_clean for the tokenised review text).
df.head()

Unnamed: 0,Rating,Text,Year,Month,text_clean,text_len
0,5,I have bought several of the Vitality canned d...,2011,4,"[bought, several, vitality, canned, dog, food,...",263
1,1,Product arrived labeled as Jumbo Salted Peanut...,2012,9,"[product, arrived, labeled, jumbo, salted, pea...",190
2,4,This is a confection that has been around a fe...,2008,8,"[confection, around, century, light, pillowy, ...",509
3,2,If you are looking for the secret ingredient i...,2011,6,"[looking, secret, ingredient, robitussin, beli...",219
4,5,Great taffy at a great price. There was a wid...,2012,10,"[great, taffy, great, price, wide, assortment,...",140


## TFIDF
Main goal: measure how important a word or phrase is within a collection of documents. TF-IDF essentially weighs down terms that appear frequently across documents and scales up terms that are rare.

TF (Term Frequency): how often a term occurs in a document.

IDF (Inverse Document Frequency): how rare, and therefore how informative, a term is across the whole collection.

In our case, 
1. discard words that appear in > 80% of the reviews
2. discard words that appear in < 10 reviews

In [5]:
# Select the 1-star reviews and collapse each token list back into a single
# space-separated string so the vectorizer can consume it.
review = (
    df.loc[df.Rating == 1, 'text_clean']
      .map(' '.join)
      .tolist()
)

# TF-IDF over unigrams and bigrams. Terms found in more than 80% of the
# reviews, or in fewer than 10 reviews, are dropped from the vocabulary.
tv = TfidfVectorizer(
    min_df=10,
    max_df=0.8,
    stop_words='english',
    ngram_range=(1, 2),
)
X_description = tv.fit_transform(review)

### Observing the TFIDF Weights

In [6]:
# Build a one-column frame mapping every vocabulary token to its IDF weight.
# Note that tv.idf_ holds the inverse-document-frequency component only (one
# value per token), not a per-document TF-IDF score.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available from scikit-learn 1.0).
tfidf = pd.DataFrame({'tfidf': tv.idf_}, index=tv.get_feature_names_out())

# Lowest IDF weights, i.e. the tokens that occur in the most reviews
tfidf.sort_values(by=['tfidf'], ascending=True).head(10)

Unnamed: 0,tfidf
like,2.094545
product,2.16788
taste,2.24902
br,2.618644
good,2.708697
dont,2.748679
buy,2.78655
flavor,2.863429
food,2.934591
time,3.003943


In [7]:
# Highest scores: the ten tokens with the largest weight in the `tfidf` frame.
# NOTE(review): many tokens share the same maximum value, so which ten of the
# tied rows are shown presumably depends on sort order - confirm if it matters.
tfidf.sort_values(by=['tfidf'], ascending=False).head(10)

Unnamed: 0,tfidf
mg caffeine,9.466263
creamy like,9.466263
bag pricecomparing,9.466263
pop time,9.466263
seriously buy,9.466263
flavored cup,9.466263
seriously ill,9.466263
seriously like,9.466263
package contained,9.466263
understand company,9.466263


## Topic Modelling:  Latent Dirichlet Allocation

### Grid Search Hyperparameter Tuning
Try different `n_components` values to find the best number of topics.

In [8]:
#################################################
## Grid Search Hyperparameter Tuning

## Define Search Parameters: candidate numbers of topics
params = {'n_components': [3, 4, 5, 6, 7]}

## Do the Grid Search
# LDA starts from a random initialisation, so without a fixed random_state the
# selected n_components (and the topics read from best_model afterwards) can
# change from one run of the notebook to the next.
# NOTE(review): X_description holds TF-IDF weights, while LDA is normally fit
# on raw term counts (CountVectorizer) - confirm this choice is intentional.
lda = LatentDirichletAllocation(random_state=42)

# No scoring is given, so GridSearchCV ranks candidates with the estimator's
# own score() method under its default cross-validation.
model = GridSearchCV(lda, param_grid=params)
model.fit(X_description)

## find the best model
best_model = model.best_estimator_

print("Best Model's Parameters: ", model.best_params_)

Best Model's Parameters:  {'n_components': 3}


In [12]:
# Build a table with one column per topic, listing its highest-weighted terms.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available from scikit-learn 1.0).
feature_names = tv.get_feature_names_out()
num_words = 20
all_topics = {}
for idx, topic in enumerate(best_model.components_):
    # argsort() is ascending, so reverse it and keep the first num_words
    # positions to get the term indices from largest to smallest weight.
    top_term_ids = topic.argsort()[::-1][:num_words]
    all_topics["Topic %d" % (idx + 1)] = [str(feature_names[i]) for i in top_term_ids]

topics = pd.DataFrame(all_topics)
topics

Unnamed: 0,Topic 1,Topic 2,Topic 3
0,dog,taste,coffee
1,treat,like,product
2,food,br,box
3,china,flavor,amazon
4,coffee,product,tea
5,product,food,like
6,eat,ingredient,order
7,like,sugar,taste
8,chew,good,br
9,chicken,taste like,price
