# Approach

Goal: Find the combination of text-vectorization method and topic model that produces the topics that best represent the data.

Text-to-vector methods:
* Count Vectorizer: Counts the number of times a word appears in a document.
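* Tfidf Vectorizer: Weights each word's count in a document by how rare the word is across all documents, so words that appear in most documents contribute less.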

Topic models:
* Non-negative Matrix Factorization (NMF)
* Latent Dirichlet Allocation (LDA)
* CorEx

# Import Data and Packages

In [1]:
import pandas as pd 
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification

In [2]:
# Load the cleaned dataset from CSV into a DataFrame.
# NOTE(review): the preview of this frame shows several 'Unnamed: 0*' columns,
# which suggests the index was written into the CSV on earlier saves — confirm,
# and consider saving with index=False upstream.
df = pd.read_csv('cleaned_dataset.csv')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,wid,age,country,gender,marital,parenthood,reflection_period,...,text_cleaned,verbs,nouns,proto_agent,passive_agent,root_verb,direct_object,parenthood_cat,gender_cat,marital_cat
0,0,0,0,1,37,USA,m,married,y,24h,...,"['wife', 'celebrating', 'year', 'anniversary',...",[celebrating],"[wife, year, anniversary, today]",[wife],[],[celebrating],[anniversary],1,1,1
1,1,1,1,1,37,USA,m,married,y,24h,...,"['mother', 'called', 'blue', 'tell', 'proud']","[called, tell]","[mother, blue]","[mother, she]",[],[called],[me],1,1,1
2,2,2,2,1,37,USA,m,married,y,24h,...,"['today', 'took', 'day', 'time', 'job', 'brunc...","[took, go, have]","[Today, day, part, time, job, brunch, date, wife]",[i],[],[took],"[day, date]",1,1,1
3,3,3,3,1,37,USA,m,married,y,24h,...,"['just', 'got', 'bonus', 'mturk', 'task']",[got],"[bonus, mturk, task]",[I],[],[got],[bonus],1,1,1
4,4,4,4,1,37,USA,m,married,y,24h,...,"['wife', 'cooked', 'surprise', 'dinner', 'work']","[cooked, take, work]","[wife, surprise, dinner]",[wife],[],[cooked],[dinner],1,1,1


# NMF

In [61]:
def nmf(column, num_categories, vector_type, model_type, num_words=6):
    """Vectorize text, fit a topic model, and print each topic's top words.

    Despite the name, any model class that accepts the number of topics as
    its first positional argument and exposes ``fit_transform`` and
    ``components_`` can be passed as ``model_type``.

    Parameters
    ----------
    column
        The text documents handed to ``vector_type.fit_transform``
        (called in this file with ``df.text_cleaned``).
    num_categories : int
        Number of topics; passed positionally to ``model_type``.
    vector_type
        An already-constructed vectorizer instance (not a class), e.g.
        ``CountVectorizer(stop_words=...)``.
    model_type
        The model class to instantiate, e.g. ``NMF``.
    num_words : int, default 6
        How many of the highest-weighted words to print per topic.

    Returns
    -------
    None
        The topics are printed, one line per topic, numbered from 0.

    Raises
    ------
    ValueError
        If ``num_words`` is smaller than 1.
    """
    if num_words < 1:
        raise ValueError('num_words must be at least 1')

    # Vectorize the data
    vectorizer = vector_type
    doc_word = vectorizer.fit_transform(column)

    # Instantiate and fit the model; only the fitted components are needed
    # here, so the document-topic matrix is not kept.
    model = model_type(num_categories)
    model.fit_transform(doc_word)

    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); support both so the notebook runs on either.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    # Column indices of each topic's largest weights, highest weight first.
    top_indices = model.components_.argsort(axis=1)[:, ::-1][:, :num_words]
    topic_words = [[words[idx] for idx in row] for row in top_indices]

    for topic_number, topic_list in enumerate(topic_words):
        print('Topic ', topic_number, ': ', *topic_list)

In [15]:
# Extra stop words supplied to the vectorizers through `stop_words`.
# NOTE(review): 'wa' and 'ha' look like lemmatizer leftovers of "was" / "has" — confirm.
extra_stoppers = [
    'wa',
    'able',
    'day',
    'nice',
    'month',
    'year',
    'today',
    'week',
    'yesterday',
    'ha',
    'moment',
    'life',
    'like',
    'just',
    'lot',
    'spend',
    'spent',
    'spending',
]

## Count Vectorizer

In [16]:
# Count-vector NMF for 5, 6 and 7 topics; a line holding a single space
# separates consecutive runs in the output.
for run_index, n_topics in enumerate((5, 6, 7)):
    if run_index:
        print(' ')
    nmf(df.text_cleaned, n_topics, CountVectorizer(stop_words=extra_stoppers), NMF)

Topic  1 :  time family long home make work
Topic  2 :  got work job home finally son
Topic  3 :  friend old best birthday party school
Topic  4 :  new bought car job game house
Topic  5 :  went family movie dinner shopping night
 
Topic  1 :  time family long make home night
Topic  2 :  got job finally son night school
Topic  3 :  friend old best birthday party school
Topic  4 :  new bought car job house game
Topic  5 :  went family movie dinner shopping night
Topic  6 :  work home make came project getting
 
Topic  1 :  time long family movement exam person
Topic  2 :  got job finally promotion free sleep
Topic  3 :  friend best old birthday party met
Topic  4 :  new bought car job game phone
Topic  5 :  went movie shopping temple dinner enjoyed
Topic  6 :  work home project job received early
Topic  7 :  family home make said came son


## Tfidf Vectorizer

In [10]:
# TF-IDF NMF for every topic count from 5 through 11; a line holding a single
# space separates consecutive runs in the output.
for n_topics in range(5, 12):
    if n_topics > 5:
        print(' ')
    nmf(df.text_cleaned, n_topics, TfidfVectorizer(stop_words=extra_stoppers), NMF)

Topic  1 :  friend birthday best old party met
Topic  2 :  got work job promotion raise home
Topic  3 :  went movie shopping temple walk park
Topic  4 :  new bought car job game purchased
Topic  5 :  time dinner family long night wife
 
Topic  1 :  friend birthday best old party met
Topic  2 :  work home early project received finished
Topic  3 :  went movie shopping temple walk park
Topic  4 :  new bought car job game purchased
Topic  5 :  time dinner family long night wife
Topic  6 :  got job sleep promotion finally raise
 
Topic  1 :  friend birthday best old party met
Topic  2 :  work home early project received finished
Topic  3 :  went movie shopping temple walk family
Topic  4 :  new bought car job game purchased
Topic  5 :  dinner family wife night ate favorite
Topic  6 :  got job sleep promotion finally raise
Topic  7 :  time long daughter spend son family
 
Topic  1 :  friend old best met seen talked
Topic  2 :  work home early project received finished
Topic  3 :  went movie

In [18]:
# TF-IDF NMF for 12 and 13 topics; each run is preceded by a line holding a
# single space.
for n_topics in (12, 13):
    print(' ')
    nmf(df.text_cleaned, n_topics, TfidfVectorizer(stop_words=extra_stoppers), NMF)

 
Topic  1 :  friend best old met seen talked
Topic  2 :  work early project finished raise received
Topic  3 :  went shopping temple walk park trip
Topic  4 :  new bought car purchased phone house
Topic  5 :  dinner ate night wife family delicious
Topic  6 :  got sleep promotion finally free raise
Topic  7 :  time long family seen quality havent
Topic  8 :  birthday family party celebrated gift surprise
Topic  9 :  home dog came son daughter took
Topic  10 :  movie watched favorite watching watch tv
Topic  11 :  game video played won playing play
Topic  12 :  job interview received getting new offer
 
Topic  1 :  friend best old met seen talked
Topic  2 :  work early project finished raise received
Topic  3 :  went shopping temple walk park trip
Topic  4 :  new bought car purchased phone house
Topic  5 :  dinner ate night delicious wife favorite
Topic  6 :  got sleep promotion finally free raise
Topic  7 :  time long family seen quality finally
Topic  8 :  birthday family party celebr

In [17]:
# TF-IDF NMF with 14 topics, preceded by a line holding a single space.
print(' ')
nmf(df.text_cleaned, 14, TfidfVectorizer(stop_words = extra_stoppers), NMF)

 
Topic  1 :  friend best old met seen talked
Topic  2 :  work project early finished raise received
Topic  3 :  went shopping temple walk trip date
Topic  4 :  new bought car purchased phone house
Topic  5 :  dinner ate night wife delicious family
Topic  6 :  got sleep promotion finally free raise
Topic  7 :  time long family seen quality finally
Topic  8 :  birthday family party celebrated gift surprise
Topic  9 :  daughter son wife school morning old
Topic  10 :  movie watched favorite watching watch tv
Topic  11 :  game video played won playing play
Topic  12 :  job interview received getting new offer
Topic  13 :  home came husband visit house sister
Topic  14 :  dog walk took park long outside


# LDA

## Count Vectorizer

In [58]:
def lda(column, num_categories, num_words=6):
    """Fit LDA on term counts of ``column`` and print each topic's top words.

    Parameters
    ----------
    column
        The text documents handed to ``CountVectorizer.fit_transform``
        (called in this file with ``df.text_cleaned``).
    num_categories : int
        Number of topics (``n_components`` of the LDA model).
    num_words : int, default 6
        How many of the highest-weighted words to print per topic.

    Returns
    -------
    None
        The topics are printed, one line per topic, numbered from 0.

    Raises
    ------
    ValueError
        If ``num_words`` is smaller than 1.
    """
    if num_words < 1:
        raise ValueError('num_words must be at least 1')

    # Vectorize the data.
    # LDA is a probabilistic graphical model over word counts, so it is fit on
    # raw term counts rather than TF-IDF weights.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000,
                                    stop_words=extra_stoppers)
    tf = tf_vectorizer.fit_transform(column)

    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); support both so the notebook runs on either.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    # Instantiate and fit the model; random_state is fixed so reruns give
    # the same topics.
    model = LatentDirichletAllocation(n_components=num_categories, max_iter=5,
                                      learning_method='online', learning_offset=50.,
                                      random_state=0).fit(tf)

    # Print top words for each topic, highest weight first.
    for topic_idx, topic in enumerate(model.components_):
        top_indices = topic.argsort()[:-num_words - 1:-1]
        print("Topic %d:" % (topic_idx), " ".join(tf_feature_names[i] for i in top_indices))

In [60]:
# Fit LDA with 10 topics on the cleaned text and print each topic's top words.
lda(df.text_cleaned, 10)

Topic 0: work got job finally new car
Topic 1: wife having took lunch boyfriend got
Topic 2: friend birthday went past movie enjoyed
Topic 3: went old event started time child
Topic 4: time dinner family night long food
Topic 5: best won watched said son free
Topic 6: came dog going favorite finished trip
Topic 7: home received ago did sister got
Topic 8: daughter morning school got time make
Topic 9: new game getting love bought watching
