# Import Packages

In [2]:
import pandas as pd

In [3]:
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [4]:
from nltk.stem import SnowballStemmer 

In [5]:
from nltk.stem import WordNetLemmatizer 

In [6]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaitlynzeichick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kaitlynzeichick/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load and Merge Spreadsheets

In [77]:
# Load the demographic table from a comma-separated text file in the working directory.
demographic = pd.read_csv('demographic.txt')

In [78]:
# Load the happy-moment table from a comma-separated text file in the working directory.
happiness = pd.read_csv('cleaned_hm.txt')

In [79]:
# Left join on the shared `wid` key: every demographic row is kept, with matching
# happy-moment columns attached (rows without a match get NaN in those columns).
df = demographic.merge(happiness, on='wid', how='left')

In [80]:
# Show the full text of every cell. `None` is the supported "no limit" value;
# the old `-1` sentinel was deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option("display.max_colwidth", None)

In [81]:
df

Unnamed: 0,wid,age,country,gender,marital,parenthood,hmid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,1,37.0,USA,m,married,y,27854.0,24h,My wife and i are celebrating our ten year anniversary today.,My wife and i are celebrating our ten year anniversary today.,True,1.0,,affection
1,1,37.0,USA,m,married,y,28054.0,24h,My mother called out of the blue to tell me how proud she is of me.,My mother called out of the blue to tell me how proud she is of me.,True,1.0,affection,affection
2,1,37.0,USA,m,married,y,28254.0,24h,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife.,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife.,True,1.0,,affection
3,1,37.0,USA,m,married,y,28489.0,24h,I just got a $21 bonus on a mturk task!,I just got a $21 bonus on a mturk task!,True,1.0,,achievement
4,1,37.0,USA,m,married,y,28991.0,24h,My wife cooked me a surprise dinner to take to work with me.,My wife cooked me a surprise dinner to take to work with me.,True,1.0,,affection
5,1,37.0,USA,m,married,y,29493.0,24h,I bought my son a gift from the MTA store and left it on his bed so when he got home from school. He texted me a little while ago and told me he loved it.,I bought my son a gift from the MTA store and left it on his bed so when he got home from school. He texted me a little while ago and told me he loved it.,True,2.0,,affection
6,1,37.0,USA,m,married,y,29851.0,24h,The stock i started investing in went up slightly.,The stock i started investing in went up slightly.,True,1.0,,achievement
7,1,37.0,USA,m,married,y,30850.0,24h,My oldest son got accepted into a great High School and is excited about it.,My oldest son got accepted into a great High School and is excited about it.,True,1.0,,affection
8,1,37.0,USA,m,married,y,31849.0,24h,My wife surprised me and bought me a new I-phone 6 so i could upgrade from the 4.,My wife surprised me and bought me a new I-phone 6 so i could upgrade from the 4.,True,1.0,,affection
9,1,37.0,USA,m,married,y,33433.0,24h,My parents told me they will be stopping by to celebrate my son's birthday with us this weekend.,My parents told me they will be stopping by to celebrate my son's birthday with us this weekend.,True,1.0,,affection


# Clean Text

## Removing Columns

In [82]:
# Columns removed from the merged frame before text cleaning.
columns_to_drop = [
    'original_hm',
    'modified',
    'num_sentence',
    'ground_truth_category',
    'predicted_category',
    'hmid',
]
df.drop(columns=columns_to_drop, inplace=True)

In [83]:
# The `cleaned_hm` column is referred to as `text` for the rest of the notebook.
df.rename({'cleaned_hm': 'text'}, axis='columns', inplace=True)

In [84]:
df.head()

Unnamed: 0,wid,age,country,gender,marital,parenthood,reflection_period,text
0,1,37.0,USA,m,married,y,24h,My wife and i are celebrating our ten year anniversary today.
1,1,37.0,USA,m,married,y,24h,My mother called out of the blue to tell me how proud she is of me.
2,1,37.0,USA,m,married,y,24h,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife.
3,1,37.0,USA,m,married,y,24h,I just got a $21 bonus on a mturk task!
4,1,37.0,USA,m,married,y,24h,My wife cooked me a surprise dinner to take to work with me.


## Null Values

In [85]:
# Drop every row that has a missing value in at least one column
# ('any' is the default `how`, so it is left implicit here).
df.dropna(inplace=True)

In [86]:
df.shape

(100082, 8)

## Punctuation

In [87]:
# Work on a separate column so the untouched `text` stays available for comparison.
df['text_cleaned'] = df['text'].copy()

In [88]:
def remove_punctuation(row):
    """
    Strip every character listed in ``string.punctuation`` from a row's text.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a string under the key ``'text_cleaned'``.

    Returns
    -------
    str
        The ``'text_cleaned'`` value with all punctuation characters deleted
        (nothing is inserted in their place, so "son's" becomes "sons").
    """
    # The third argument of str.maketrans lists the characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    stripped = row['text_cleaned'].translate(deletion_table)
    return stripped
    
# Row-wise application; the function already takes a row, so no lambda wrapper is needed.
df['text_cleaned'] = df.apply(remove_punctuation, axis=1)

## Words with Numbers

In [89]:
df.head()

Unnamed: 0,wid,age,country,gender,marital,parenthood,reflection_period,text,text_cleaned
0,1,37.0,USA,m,married,y,24h,My wife and i are celebrating our ten year anniversary today.,My wife and i are celebrating our ten year anniversary today
1,1,37.0,USA,m,married,y,24h,My mother called out of the blue to tell me how proud she is of me.,My mother called out of the blue to tell me how proud she is of me
2,1,37.0,USA,m,married,y,24h,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife.,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife
3,1,37.0,USA,m,married,y,24h,I just got a $21 bonus on a mturk task!,I just got a 21 bonus on a mturk task
4,1,37.0,USA,m,married,y,24h,My wife cooked me a surprise dinner to take to work with me.,My wife cooked me a surprise dinner to take to work with me


In [90]:
def remove_numbers(row):
    """
    Remove all words that include numbers.

    The text is split on whitespace; any word containing at least one digit
    character is discarded and the remaining words are re-joined with single
    spaces.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a string under the key ``'text_cleaned'``.

    Returns
    -------
    str
        The text without digit-bearing words. Runs of whitespace collapse to
        one space as a side effect of split/join.
    """
    words = row['text_cleaned'].split()
    # A single filtering pass replaces the previous nested loop that called
    # list.remove() once per digit and silenced the resulting ValueError.
    kept_words = [
        word for word in words
        if not any(character.isdigit() for character in word)
    ]
    return ' '.join(kept_words)


# Row-wise application; the function already takes a row, so no lambda wrapper is needed.
df['text_cleaned'] = df.apply(remove_numbers, axis=1)

In [91]:
df

Unnamed: 0,wid,age,country,gender,marital,parenthood,reflection_period,text,text_cleaned
0,1,37.0,USA,m,married,y,24h,My wife and i are celebrating our ten year anniversary today.,My wife and i are celebrating our ten year anniversary today
1,1,37.0,USA,m,married,y,24h,My mother called out of the blue to tell me how proud she is of me.,My mother called out of the blue to tell me how proud she is of me
2,1,37.0,USA,m,married,y,24h,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife.,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife
3,1,37.0,USA,m,married,y,24h,I just got a $21 bonus on a mturk task!,I just got a bonus on a mturk task
4,1,37.0,USA,m,married,y,24h,My wife cooked me a surprise dinner to take to work with me.,My wife cooked me a surprise dinner to take to work with me
5,1,37.0,USA,m,married,y,24h,I bought my son a gift from the MTA store and left it on his bed so when he got home from school. He texted me a little while ago and told me he loved it.,I bought my son a gift from the MTA store and left it on his bed so when he got home from school He texted me a little while ago and told me he loved it
6,1,37.0,USA,m,married,y,24h,The stock i started investing in went up slightly.,The stock i started investing in went up slightly
7,1,37.0,USA,m,married,y,24h,My oldest son got accepted into a great High School and is excited about it.,My oldest son got accepted into a great High School and is excited about it
8,1,37.0,USA,m,married,y,24h,My wife surprised me and bought me a new I-phone 6 so i could upgrade from the 4.,My wife surprised me and bought me a new Iphone so i could upgrade from the
9,1,37.0,USA,m,married,y,24h,My parents told me they will be stopping by to celebrate my son's birthday with us this weekend.,My parents told me they will be stopping by to celebrate my sons birthday with us this weekend


## Lower Case

In [92]:
# Lower-case the whole column so that e.g. "Today" and "today" become the same token.
df['text_cleaned'] = df['text_cleaned'].str.lower()

In [96]:
df.head()

Unnamed: 0,wid,age,country,gender,marital,parenthood,reflection_period,text,text_cleaned
0,1,37.0,USA,m,married,y,24h,My wife and i are celebrating our ten year anniversary today.,my wife and i are celebrating our ten year anniversary today
1,1,37.0,USA,m,married,y,24h,My mother called out of the blue to tell me how proud she is of me.,my mother called out of the blue to tell me how proud she is of me
2,1,37.0,USA,m,married,y,24h,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife.,today i took the day off from my part time job to go and have a brunch date with my wonderful wife
3,1,37.0,USA,m,married,y,24h,I just got a $21 bonus on a mturk task!,i just got a bonus on a mturk task
4,1,37.0,USA,m,married,y,24h,My wife cooked me a surprise dinner to take to work with me.,my wife cooked me a surprise dinner to take to work with me


## Tokenize

In [101]:
# Replace each cleaned string with its list of NLTK word tokens.
# Only one column is needed, so apply directly on that Series.
df['text_cleaned'] = df['text_cleaned'].apply(nltk.word_tokenize)

In [102]:
df

Unnamed: 0,wid,age,country,gender,marital,parenthood,reflection_period,text,text_cleaned
0,1,37.0,USA,m,married,y,24h,My wife and i are celebrating our ten year anniversary today.,"[my, wife, and, i, are, celebrating, our, ten, year, anniversary, today]"
1,1,37.0,USA,m,married,y,24h,My mother called out of the blue to tell me how proud she is of me.,"[my, mother, called, out, of, the, blue, to, tell, me, how, proud, she, is, of, me]"
2,1,37.0,USA,m,married,y,24h,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife.,"[today, i, took, the, day, off, from, my, part, time, job, to, go, and, have, a, brunch, date, with, my, wonderful, wife]"
3,1,37.0,USA,m,married,y,24h,I just got a $21 bonus on a mturk task!,"[i, just, got, a, bonus, on, a, mturk, task]"
4,1,37.0,USA,m,married,y,24h,My wife cooked me a surprise dinner to take to work with me.,"[my, wife, cooked, me, a, surprise, dinner, to, take, to, work, with, me]"
5,1,37.0,USA,m,married,y,24h,I bought my son a gift from the MTA store and left it on his bed so when he got home from school. He texted me a little while ago and told me he loved it.,"[i, bought, my, son, a, gift, from, the, mta, store, and, left, it, on, his, bed, so, when, he, got, home, from, school, he, texted, me, a, little, while, ago, and, told, me, he, loved, it]"
6,1,37.0,USA,m,married,y,24h,The stock i started investing in went up slightly.,"[the, stock, i, started, investing, in, went, up, slightly]"
7,1,37.0,USA,m,married,y,24h,My oldest son got accepted into a great High School and is excited about it.,"[my, oldest, son, got, accepted, into, a, great, high, school, and, is, excited, about, it]"
8,1,37.0,USA,m,married,y,24h,My wife surprised me and bought me a new I-phone 6 so i could upgrade from the 4.,"[my, wife, surprised, me, and, bought, me, a, new, iphone, so, i, could, upgrade, from, the]"
9,1,37.0,USA,m,married,y,24h,My parents told me they will be stopping by to celebrate my son's birthday with us this weekend.,"[my, parents, told, me, they, will, be, stopping, by, to, celebrate, my, sons, birthday, with, us, this, weekend]"


# Processing

## Stemming

In [None]:
# English Snowball stemmer used by `stem` below.
# NOTE(review): this cell has no execution count, so the outputs shown further
# down were presumably produced without stemming — confirm before "Run All".
stemmer = SnowballStemmer("english")   

In [None]:
def stem(row):
    """
    Stem every token of a row with the module-level ``stemmer``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        ``row['text_cleaned']`` must be an iterable of word strings.

    Returns
    -------
    list
        One stemmed string per input token, in the original order.
    """
    return [stemmer.stem(token) for token in row['text_cleaned']]

df['text_cleaned'] = df.apply(stem, axis=1)

## Lemmatize

In [103]:
# WordNet-based lemmatizer used by `lemmatize` below (relies on the 'wordnet' NLTK data downloaded above).
lemmatizer = WordNetLemmatizer() 

In [104]:
def lemmatize(row):
    """
    Lemmatize every token of a row with the module-level ``lemmatizer``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        ``row['text_cleaned']`` must be an iterable of word strings.

    Returns
    -------
    list
        One lemma per input token, in the original order.
    """
    # No part-of-speech is passed, so the lemmatizer's default POS applies to
    # every word. NOTE(review): the stop-word output later in the notebook
    # shows "us" reduced to "u", which looks like a consequence — confirm.
    return [lemmatizer.lemmatize(token) for token in row['text_cleaned']]

df['text_cleaned'] = df.apply(lemmatize, axis=1)

In [105]:
df.head()

Unnamed: 0,wid,age,country,gender,marital,parenthood,reflection_period,text,text_cleaned
0,1,37.0,USA,m,married,y,24h,My wife and i are celebrating our ten year anniversary today.,"[my, wife, and, i, are, celebrating, our, ten, year, anniversary, today]"
1,1,37.0,USA,m,married,y,24h,My mother called out of the blue to tell me how proud she is of me.,"[my, mother, called, out, of, the, blue, to, tell, me, how, proud, she, is, of, me]"
2,1,37.0,USA,m,married,y,24h,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife.,"[today, i, took, the, day, off, from, my, part, time, job, to, go, and, have, a, brunch, date, with, my, wonderful, wife]"
3,1,37.0,USA,m,married,y,24h,I just got a $21 bonus on a mturk task!,"[i, just, got, a, bonus, on, a, mturk, task]"
4,1,37.0,USA,m,married,y,24h,My wife cooked me a surprise dinner to take to work with me.,"[my, wife, cooked, me, a, surprise, dinner, to, take, to, work, with, me]"


## Stop Words

In [106]:
from sklearn.feature_extraction import text 

In [107]:
# Extra words to filter out in addition to the generic English list.
specific_stop_words = [
    'good', 'great', 'really', 'happy',
    'felt', 'feel', 'happiness', 'happiest',
]

# Combine scikit-learn's built-in English stop words with the extra words above.
stop_words = text.ENGLISH_STOP_WORDS | frozenset(specific_stop_words)
stop_words

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [110]:
# Keep only the tokens that are not in the combined stop-word set.
df['text_cleaned'] = df['text_cleaned'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [111]:
df

Unnamed: 0,wid,age,country,gender,marital,parenthood,reflection_period,text,text_cleaned
0,1,37.0,USA,m,married,y,24h,My wife and i are celebrating our ten year anniversary today.,"[wife, celebrating, year, anniversary, today]"
1,1,37.0,USA,m,married,y,24h,My mother called out of the blue to tell me how proud she is of me.,"[mother, called, blue, tell, proud]"
2,1,37.0,USA,m,married,y,24h,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife.,"[today, took, day, time, job, brunch, date, wonderful, wife]"
3,1,37.0,USA,m,married,y,24h,I just got a $21 bonus on a mturk task!,"[just, got, bonus, mturk, task]"
4,1,37.0,USA,m,married,y,24h,My wife cooked me a surprise dinner to take to work with me.,"[wife, cooked, surprise, dinner, work]"
5,1,37.0,USA,m,married,y,24h,I bought my son a gift from the MTA store and left it on his bed so when he got home from school. He texted me a little while ago and told me he loved it.,"[bought, son, gift, mta, store, left, bed, got, home, school, texted, little, ago, told, loved]"
6,1,37.0,USA,m,married,y,24h,The stock i started investing in went up slightly.,"[stock, started, investing, went, slightly]"
7,1,37.0,USA,m,married,y,24h,My oldest son got accepted into a great High School and is excited about it.,"[oldest, son, got, accepted, high, school, excited]"
8,1,37.0,USA,m,married,y,24h,My wife surprised me and bought me a new I-phone 6 so i could upgrade from the 4.,"[wife, surprised, bought, new, iphone, upgrade]"
9,1,37.0,USA,m,married,y,24h,My parents told me they will be stopping by to celebrate my son's birthday with us this weekend.,"[parent, told, stopping, celebrate, son, birthday, u, weekend]"


## Linguistic Annotations

In [112]:
import spacy

  from ._conv import register_converters as _register_converters


In [115]:
# Load spaCy's small English pipeline; the `en_core_web_sm` model package must be installed.
nlp = spacy.load("en_core_web_sm")

In [126]:
# For testing purposes
df_test = df.head(10).copy(deep=True)

In [160]:
def print_speech(row):
    """
    Print the spaCy annotations of every token in a row's original text.

    For each token of ``nlp(row['text'])`` one line is printed containing the
    token, its coarse part of speech (``pos_``), its dependency label
    (``dep_``) and its fine-grained tag (``tag_``).

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a string under the key ``'text'``.

    Returns
    -------
    None
        The function is used for its printed output only.
    """
    for token in nlp(row['text']):
        print(token, token.pos_, token.dep_, token.tag_)


# print_speech returns None, so it is called for its printed output only;
# assigning the result would create a misleading all-None 'root_verb' column.
for _, test_row in df_test.iterrows():
    print_speech(test_row)

My PRON poss PRP$
wife NOUN nsubj NN
and CCONJ cc CC
i PRON conj PRP
are AUX aux VBP
celebrating VERB ROOT VBG
our PRON poss PRP$
ten NUM nummod CD
year NOUN compound NN
anniversary NOUN dobj NN
today NOUN npadvmod NN
. PUNCT punct .
My PRON poss PRP$
mother NOUN nsubj NN
called VERB ROOT VBD
out SCONJ prep IN
of ADP prep IN
the DET det DT
blue NOUN pobj NN
to PART aux TO
tell VERB advcl VB
me PRON dobj PRP
how ADV advmod WRB
proud ADJ acomp JJ
she PRON nsubj PRP
is AUX ccomp VBZ
of ADP prep IN
me PRON pobj PRP
. PUNCT punct .
Today NOUN npadvmod NN
i PRON nsubj PRP
took VERB ROOT VBD
the DET det DT
day NOUN dobj NN
off ADV advmod RB
from ADP prep IN
my PRON poss PRP$
part NOUN compound NN
time NOUN compound NN
job NOUN pobj NN
to PART aux TO
go VERB advcl VB
and CCONJ cc CC
have VERB conj VB
a DET det DT
brunch NOUN compound NN
date NOUN dobj NN
with ADP prep IN
my PRON poss PRP$
wonderful ADJ amod JJ
wife NOUN pobj NN
. PUNCT punct .
I PRON nsubj PRP
just ADV advmod RB
got VERB ROOT 

In [153]:
def find_verbs(row):
    """
    Collect the tokens of ``row['text']`` whose coarse part of speech is ``VERB``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a string under the key ``'text'``.

    Returns
    -------
    list
        The matching spaCy tokens (token objects, not strings) in document order.
    """
    return [token for token in nlp(row['text']) if token.pos_ == 'VERB']

df['verbs'] = df.apply(find_verbs, axis=1)

In [154]:
def find_nouns(row):
    """
    Collect the tokens of ``row['text']`` whose coarse part of speech is ``NOUN``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a string under the key ``'text'``.

    Returns
    -------
    list
        The matching spaCy tokens (token objects, not strings) in document order.
    """
    return [token for token in nlp(row['text']) if token.pos_ == 'NOUN']

df['nouns'] = df.apply(find_nouns, axis=1)

In [155]:
def find_proto_agent(row):
    """
    Collect the tokens of ``row['text']`` whose dependency label is ``nsubj``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a string under the key ``'text'``.

    Returns
    -------
    list
        The matching spaCy tokens (token objects, not strings) in document order.
    """
    return [token for token in nlp(row['text']) if token.dep_ == 'nsubj']

df['proto_agent'] = df.apply(find_proto_agent, axis=1)

In [156]:
def find_passive_agent(row):
    """
    Collect the tokens of ``row['text']`` whose dependency label is ``nsubjpass``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a string under the key ``'text'``.

    Returns
    -------
    list
        The matching spaCy tokens (token objects, not strings) in document order.
    """
    return [token for token in nlp(row['text']) if token.dep_ == 'nsubjpass']

df['passive_agent'] = df.apply(find_passive_agent, axis=1)

In [157]:
def find_root_verb(row):
    """
    Collect the tokens of ``row['text']`` whose dependency label is ``ROOT``.

    A text with several sentences yields one entry per sentence root.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a string under the key ``'text'``.

    Returns
    -------
    list
        The matching spaCy tokens (token objects, not strings) in document order.
    """
    return [token for token in nlp(row['text']) if token.dep_ == 'ROOT']

df['root_verb'] = df.apply(find_root_verb, axis=1)

In [162]:
def find_direct_object(row):
    """
    Collect the tokens of ``row['text']`` whose dependency label is ``dobj``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a string under the key ``'text'``.

    Returns
    -------
    list
        The matching spaCy tokens (token objects, not strings) in document order.
    """
    return [token for token in nlp(row['text']) if token.dep_ == 'dobj']

df['direct_object'] = df.apply(find_direct_object, axis=1)

In [163]:
df

Unnamed: 0,wid,age,country,gender,marital,parenthood,reflection_period,text,text_cleaned,verbs,nouns,proto_agent,passive_agent,root_verb,direct_object
0,1,37.0,USA,m,married,y,24h,My wife and i are celebrating our ten year anniversary today.,"[wife, celebrating, year, anniversary, today]",[celebrating],"[wife, year, anniversary, today]",[wife],[],[celebrating],[anniversary]
1,1,37.0,USA,m,married,y,24h,My mother called out of the blue to tell me how proud she is of me.,"[mother, called, blue, tell, proud]","[called, tell]","[mother, blue]","[mother, she]",[],[called],[me]
2,1,37.0,USA,m,married,y,24h,Today i took the day off from my part time job to go and have a brunch date with my wonderful wife.,"[today, took, day, time, job, brunch, date, wonderful, wife]","[took, go, have]","[Today, day, part, time, job, brunch, date, wife]",[i],[],[took],"[day, date]"
3,1,37.0,USA,m,married,y,24h,I just got a $21 bonus on a mturk task!,"[just, got, bonus, mturk, task]",[got],"[bonus, mturk, task]",[I],[],[got],[bonus]
4,1,37.0,USA,m,married,y,24h,My wife cooked me a surprise dinner to take to work with me.,"[wife, cooked, surprise, dinner, work]","[cooked, take, work]","[wife, surprise, dinner]",[wife],[],[cooked],[dinner]
5,1,37.0,USA,m,married,y,24h,I bought my son a gift from the MTA store and left it on his bed so when he got home from school. He texted me a little while ago and told me he loved it.,"[bought, son, gift, mta, store, left, bed, got, home, school, texted, little, ago, told, loved]","[bought, left, got, texted, told, loved]","[son, gift, store, bed, school, while]","[I, he, He, he]",[],"[bought, texted]","[son, gift, it, me, me, it]"
6,1,37.0,USA,m,married,y,24h,The stock i started investing in went up slightly.,"[stock, started, investing, went, slightly]","[started, investing, went]",[stock],"[stock, i]",[],[went],[]
7,1,37.0,USA,m,married,y,24h,My oldest son got accepted into a great High School and is excited about it.,"[oldest, son, got, accepted, high, school, excited]","[accepted, is]",[son],[],[son],[accepted],[]
8,1,37.0,USA,m,married,y,24h,My wife surprised me and bought me a new I-phone 6 so i could upgrade from the 4.,"[wife, surprised, bought, new, iphone, upgrade]","[surprised, bought, upgrade]","[wife, I, phone]","[wife, i]",[],[surprised],"[me, phone]"
9,1,37.0,USA,m,married,y,24h,My parents told me they will be stopping by to celebrate my son's birthday with us this weekend.,"[parent, told, stopping, celebrate, son, birthday, u, weekend]","[told, stopping, celebrate]","[parents, son, birthday, weekend]","[parents, they]",[],[told],"[me, birthday]"


# Clean Demographic Information

In [262]:
# Read the demographic columns as plain strings so that free-text entries
# survive loading and can be cleaned explicitly below.
# NOTE(review): this file is also the one written by the "Save CSV File" cell,
# so this cell depends on an earlier run of the notebook.
demographic_columns = ["age", "country", "gender", "marital", "parenthood"]
df = pd.read_csv('cleaned_dataset.csv', dtype=dict.fromkeys(demographic_columns, str))

In [263]:
df.shape

(100082, 16)

In [264]:
def clean_age(row, min_age=18, max_age=100):
    """
    Convert a row's raw age entry to an integer, or NaN when it is unusable.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        ``row['age']`` holds the raw age, typically a string such as ``'37.0'``.
    min_age, max_age : int, optional
        Inclusive bounds of the accepted age range (defaults: 18 and 100).

    Returns
    -------
    int or float
        The age truncated to an integer when it parses as a number and lies
        within ``[min_age, max_age]``; otherwise ``float('nan')``.
    """
    try:
        # float() first so that values like '37.0' parse; int() then truncates.
        # Any non-numeric entry ('prefer not to say', '60yrs', ...), a missing
        # value, NaN or infinity ends up in the except branch.
        age = int(float(row['age']))
    except (TypeError, ValueError, OverflowError):
        return float('nan')
    if age < min_age or age > max_age:
        return float('nan')
    return age
        
# Row-wise application; the function already takes a row, so no lambda wrapper is needed.
df['age'] = df.apply(clean_age, axis=1)

In [265]:
df['age'].isna().sum()

210

In [267]:
# Remove the rows whose age was set to NaN by clean_age (unparseable or out of range).
df.dropna(subset=['age'], inplace=True)

In [268]:
df['age'].unique()

array([37., 29., 25., 32., 35., 34., 61., 27., 45., 30., 40., 31., 28.,
       36., 44., 23., 26., 42., 41., 57., 24., 38., 43., 48., 53., 46.,
       54., 65., 59., 22., 47., 39., 21., 33., 55., 69., 68., 20., 49.,
       56., 62., 70., 83., 74., 51., 19., 52., 66., 58., 18., 60., 50.,
       72., 63., 67., 64., 77., 98., 73., 79., 80., 84., 88., 71., 75.,
       76., 95., 81., 78.])

In [269]:
df.shape

(99872, 16)

# Save CSV File

In [270]:
# Do not write the row index: it would come back as an extra "Unnamed: 0"
# column the next time this file is loaded with pd.read_csv.
df.to_csv('cleaned_dataset.csv', index=False)

In [139]:
from spacy import displacy

# Draw the dependency parse of the sample sentence stored under index label 1.
# NOTE(review): relies on `df_test` and `nlp` from the "Linguistic Annotations" section.
about_interest_text = df_test.loc[1, 'text']
about_interest_doc = nlp(about_interest_text)
displacy.render(about_interest_doc, style='dep', jupyter=True)