# Sheet 3

# The Hitchhiker's Guide to Happiness

## Abstract

Using the “Happy moments” dataset we will train multiple classifiers predicting what kind of moments made a person happy, given their demographic information, and classifying descriptions of happy moments using NLP. We expect to have reliable models for these tasks at the end of our analysis and to be able to give more (scientific) insight into what makes different kinds of people happy.

In [83]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

import subprocess
#%%
print(subprocess.getoutput("python3 -m spacy download en"))

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
nltk.download('stopwords')
nltk.download('wordnet')

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/opt/anaconda3/envs/datalore-user/lib/python3.7/site-packages/en_core_web_sm -->
/opt/anaconda3/envs/datalore-user/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/datalore/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/datalore/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [84]:
happy_moments_data = pd.read_csv("cleaned_hm.csv")
happy_moments_data

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection
...,...,...,...,...,...,...,...,...,...
100530,128762,112,24h,My husband announced he is getting a decent bo...,My husband announced he is getting a decent bo...,True,1,,affection
100531,128763,714,24h,Had a can of Pepsi to drink.,Had a can of Pepsi to drink.,True,1,,enjoy_the_moment
100532,128764,3934,24h,Cuddling with my girlfriend last night.,Cuddling with my girlfriend last night.,True,1,affection,affection
100533,128765,1629,24h,I had a great meeting yesterday at work with m...,I had a great meeting yesterday at work with m...,True,1,,bonding


In [85]:
happy_moments_data.ground_truth_category.unique()

array([nan, 'bonding', 'leisure', 'affection', 'enjoy_the_moment',
       'achievement', 'nature', 'exercise'], dtype=object)

In [86]:
import string

# punctuation_regex = re.compile('[%s]' % re.escape(string.punctuation))

# Convert to list
#data = df.content.values.tolist()
data = happy_moments_data.cleaned_hm.values.tolist()

# Remove punctuation
# data = [punctuation_regex.sub('', x) for x in data]

# Remove new line characters
# data = [re.sub(r'\s+', ' ', sent) for sent in data]

pprint(data[:1])

['I went on a successful date with someone I felt sympathy and connection '
 'with.']


In [87]:
stop_list = gensim.parsing.preprocessing.STOPWORDS

domain_specific_stop_words = ['happy', 'day', 'got', 'went', 'today', 
 'made', 'one', 'two', 'time', 'last', 'first', 'going', 'getting', 'took', 'found', 
 'lot', 'really', 'saw', 'see', 'month', 'week', 'day', 'yesterday', 'year', 'ago', 
 'now', 'still', 'since', 'something', 'great', 'good', 'long', 'thing', 'toi', 'without', 
 'yesteri', '2s', 'toand', 'ing']

domain_specific_stop_words.extend(['happiest', "new", "moment", 
        #"life", "like",
    ]
)

stop_list = stop_list.union(domain_specific_stop_words)

stop_list

frozenset({'2s',
           'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'ago',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
      

In [88]:
stemmer = SnowballStemmer("english")
def lemmatize_stemming(text):
    # return stemmer.stem(WordNetLemmatizer().lemmatize(text))
    return WordNetLemmatizer().lemmatize(text)
    

# stop_list = [lemmatize_stemming(x) for x in stop_list]

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if len(token) >= 2 and token not in stop_list:
            result.append(lemmatize_stemming(token))
            
    return result

In [90]:
processed_data = []

for x in data:
    processed_data.append(preprocess(x))

processed_data

[['successful', 'date', 'felt', 'sympathy', 'connection'],
 ['son', 'mark', 'examination'],
 ['gym', 'morning', 'yoga'],
 ['talk', 'friend', 'flaky', 'lately', 'understood', 'evening', 'hanging'],
 ['grandchild', 'butterfly', 'display', 'crohn', 'conservatory'],
 ['meditated', 'night'],
 ['recipe', 'peasant', 'bread', 'came', 'spectacular'],
 ['gift', 'elder', 'brother', 'surprising'],
 ['mom', 'birthday', 'enjoyed'],
 ['watching', 'cupcake', 'war', 'teen', 'child'],
 ['came', 'rd', 'place', 'duty', 'video', 'game'],
 ['completed', 'mile', 'run', 'break', 'make', 'feel', 'strong'],
 ['movie', 'friend', 'fun'],
 ['shorting', 'gold', 'trade'],
 ['hearing',
  'song',
  'nearly',
  'impossible',
  'angry',
  'looking',
  'thought',
  'eas',
  'angry',
  'feeling',
  'move',
  'direction',
  'happiness',
  'headed',
  'positive',
  'direction',
  'youall',
  'world'],
 ['son', 'performed', 'test', 'preparation'],
 ['helped', 'neighbour', 'fix', 'car', 'damage'],
 ['managed', 'final', 'troph

In [91]:
from gensim.models.phrases import Phrases, Phraser

phrases = Phrases(processed_data, min_count=1, threshold=1)

bigram = Phraser(phrases)
print(bigram[processed_data[0]])

# Build the bigram and trigram models
bigram = gensim.models.Phrases(processed_data, min_count=2, threshold=1) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[processed_data], threshold=1)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[processed_data[0]]])

['successful_date', 'felt', 'sympathy', 'connection']
['successful_date', 'felt', 'sympathy', 'connection']


In [92]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [93]:
bigrammed_moments = make_bigrams(processed_data)

trigrammed_moments = make_trigrams(processed_data)
trigrammed_moments

[['successful_date', 'felt', 'sympathy', 'connection'],
 ['son', 'mark', 'examination'],
 ['gym_morning', 'yoga'],
 ['talk_friend', 'flaky', 'lately', 'understood', 'evening', 'hanging'],
 ['grandchild', 'butterfly', 'display', 'crohn', 'conservatory'],
 ['meditated', 'night'],
 ['recipe', 'peasant', 'bread', 'came', 'spectacular'],
 ['gift', 'elder_brother', 'surprising'],
 ['mom_birthday', 'enjoyed'],
 ['watching', 'cupcake', 'war', 'teen', 'child'],
 ['came', 'rd_place', 'duty', 'video_game'],
 ['completed_mile', 'run', 'break', 'make_feel', 'strong'],
 ['movie_friend', 'fun'],
 ['shorting', 'gold', 'trade'],
 ['hearing_song',
  'nearly_impossible_angry_looking',
  'thought_eas_angry_feeling',
  'move_direction_happiness_headed',
  'positive_direction',
  'youall_world'],
 ['son_performed', 'test', 'preparation'],
 ['helped', 'neighbour', 'fix_car', 'damage'],
 ['managed', 'final', 'trophy', 'game_playing'],
 ['hot', 'kiss_girl', 'friend', 'night'],
 ['bcaas',
  'came_mail',
  'yay'

In [94]:
processed_data

[['successful', 'date', 'felt', 'sympathy', 'connection'],
 ['son', 'mark', 'examination'],
 ['gym', 'morning', 'yoga'],
 ['talk', 'friend', 'flaky', 'lately', 'understood', 'evening', 'hanging'],
 ['grandchild', 'butterfly', 'display', 'crohn', 'conservatory'],
 ['meditated', 'night'],
 ['recipe', 'peasant', 'bread', 'came', 'spectacular'],
 ['gift', 'elder', 'brother', 'surprising'],
 ['mom', 'birthday', 'enjoyed'],
 ['watching', 'cupcake', 'war', 'teen', 'child'],
 ['came', 'rd', 'place', 'duty', 'video', 'game'],
 ['completed', 'mile', 'run', 'break', 'make', 'feel', 'strong'],
 ['movie', 'friend', 'fun'],
 ['shorting', 'gold', 'trade'],
 ['hearing',
  'song',
  'nearly',
  'impossible',
  'angry',
  'looking',
  'thought',
  'eas',
  'angry',
  'feeling',
  'move',
  'direction',
  'happiness',
  'headed',
  'positive',
  'direction',
  'youall',
  'world'],
 ['son', 'performed', 'test', 'preparation'],
 ['helped', 'neighbour', 'fix', 'car', 'damage'],
 ['managed', 'final', 'troph

In [95]:
# Create Corpus
texts = processed_data
# texts = bigrammed_moments
#texts = trigrammed_moments

# Create Dictionary
id2word = corpora.Dictionary(texts)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


In [96]:
# Build LDA model

def build_lda_model(id2word, corpus, texts, num_topics):
    return gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=42,
                                           alpha='auto',
                                           per_word_topics=True
                                           )

In [97]:
# lda_model = build_lda_model(id2word, corpus, texts, 6)

In [98]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = build_lda_model(id2word, corpus, texts, num_topics)

        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')

        coherence_value = coherencemodel.get_coherence()

        print(f'Model for {num_topics} topics: Coherence Score of {coherence_value}')

        coherence_values.append(coherence_value)

    return model_list, coherence_values

In [99]:
limit=10; start=2; step=1;

In [100]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, start=start, limit=limit, step=step)

Model for 2 topics: Coherence Score of 0.18426313023943336
Model for 3 topics: Coherence Score of 0.21841791461826254
Model for 4 topics: Coherence Score of 0.2719678806124178
Model for 5 topics: Coherence Score of 0.32508237466359935
Model for 6 topics: Coherence Score of 0.3348233495676687
Model for 7 topics: Coherence Score of 0.3399355543022682
Model for 8 topics: Coherence Score of 0.4032337987500104
Model for 9 topics: Coherence Score of 0.34169225376208795


In [101]:
chosen_model = model_list[6]

In [102]:
import matplotlib.pyplot as plt

# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.savefig('coherence_score.png', dpi=400)
plt.show()

In [108]:
# Visualize the topics 
pyLDAvis.enable_notebook() 
vis = pyLDAvis.gensim.prepare(chosen_model, corpus, id2word)
vis

In [128]:
# inputSentence = 'Today I submitted my final project presentation'
# inputSentence = 'I got promoted and now have a new job'
# inputSentence = 'I played a videogame with a friend'
# inputSentence = 'I spent some quality time with my girlfriend'
# inputSentence = 'I bought a new car'
# inputSentence = 'I\'ve gone on a world trip and traveled to America'
# inputSentence = 'I drove my daughter to school'
# inputSentence = 'I went on vacation with my family'
inputSentence = 'I went on vacation with my wife'
# inputSentence = 'I had dinner at a fancy restaurant'
inputSentence = preprocess(inputSentence)

print(inputSentence)

doc_vector = id2word.doc2bow(inputSentence)
print(doc_vector)
doc_topics = chosen_model[doc_vector]

print(data[doc_topics[2][0][0]])
# print(data[doc_topics[2][1][0]])
# print(data[doc_topics[2][2][0]])
# print(data[doc_topics[2][3][0]])

doc_topics

['vacation', 'family']
[(101, 1), (426, 1)]
Seeing most of my friends and being able to catch up with them filled me with joy.


([(0, 0.06495233),
  (1, 0.087256044),
  (2, 0.046474356),
  (3, 0.049616467),
  (4, 0.056518186),
  (5, 0.3320512),
  (6, 0.051607408),
  (7, 0.311524)],
 [(101, [5]), (426, [7])],
 [(101, [(5, 0.9895234)]), (426, [(7, 0.9948802)])])

In [111]:
# Print the Keyword in the 10 topics
pprint(chosen_model.print_topics())
doc_lda = chosen_model[corpus]

[(0,
  '0.123*"friend" + 0.032*"old" + 0.018*"best" + 0.016*"year" + 0.015*"met" + '
  '0.015*"phone" + 0.014*"brother" + 0.014*"birthday" + 0.012*"talked" + '
  '0.012*"seen"'),
 (1,
  '0.070*"work" + 0.022*"able" + 0.016*"working" + 0.014*"morning" + '
  '0.014*"hour" + 0.013*"finally" + 0.013*"night" + 0.012*"finished" + '
  '0.011*"ve" + 0.010*"sleep"'),
 (2,
  '0.053*"game" + 0.026*"won" + 0.026*"video" + 0.021*"favorite" + '
  '0.019*"played" + 0.018*"book" + 0.017*"watch" + 0.016*"playing" + '
  '0.014*"team" + 0.013*"afternoon"'),
 (3,
  '0.052*"dinner" + 0.025*"lunch" + 0.022*"ate" + 0.016*"favorite" + '
  '0.016*"delicious" + 0.014*"night" + 0.014*"movement" + 0.013*"restaurant" + '
  '0.013*"coffee" + 0.013*"nice"'),
 (4,
  '0.028*"movie" + 0.028*"car" + 0.027*"bought" + 0.022*"money" + '
  '0.021*"watched" + 0.020*"job" + 0.015*"able" + 0.014*"received" + '
  '0.014*"store" + 0.013*"food"'),
 (5,
  '0.026*"family" + 0.025*"home" + 0.023*"dog" + 0.017*"came" + 0.015*"event" 

In [112]:
from gensim.similarities import Similarity
from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile

index_tmpfile = get_tmpfile("index")
index = Similarity(index_tmpfile, corpus, num_features=len(id2word))

In [129]:


sims = index[id2word.doc2bow(inputSentence)]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims[:3])

print(data[sims[0][0]])
print(data[sims[1][0]])
print(data[sims[2][0]])

[(31193, 0.99999994), (34856, 0.99999994), (40448, 0.99999994)]
I just went on a 4 day vacation with my family.
I went on a vacation with  my family.
Going on vacation with my family made me happy.
