In [None]:
from collections import Counter
import lib
import pandas as pd
import nltk
import itertools

## Is happiness seasonal?
Determine whether people mention some seasons more often than others when describing their happy moments. The list of seasons is provided below. Which season makes people happiest?

In [None]:
# Load both tables through the project helper module.
demographics = lib.load_demographics()
happy_moments = lib.load_happy_moments()

# Join the two tables on the shared 'wid' column, then index the result
# by 'hmid'.
joined_data = pd.merge(
    demographics,
    happy_moments,
    left_on='wid',
    right_on='wid',
).set_index('hmid')

def get_hm_tokens(happy_moments):
    """Tokenize the text of every happy moment.

    Args:
        happy_moments: object whose ``itertuples()`` yields rows exposing
            ``hmid`` and ``cleaned_hm`` attributes (e.g. a DataFrame with
            those columns).

    Returns:
        dict mapping each row's ``hmid`` to the list of lowercased tokens
        that ``nltk.word_tokenize`` produces for its ``cleaned_hm`` text.
    """
    return {
        moment.hmid: [word.lower() for word in nltk.word_tokenize(moment.cleaned_hm)]
        for moment in happy_moments.itertuples()
    }

# Tokenize each happy moment, then flatten the per-moment token lists into
# one list so that tokens can be counted across the whole corpus.
hm_tokens = get_hm_tokens(happy_moments)
all_tokens = [
    token
    for moment_tokens in hm_tokens.values()
    for token in moment_tokens
]

In [None]:
# Season names to look for among the tokens.
seasons = [
    'spring',
    'summer',
    'fall',
    'winter',
]

In [None]:
def count_seasons(all_tokens, season_names=None):
    """Print how often each season name occurs among the given tokens.

    Args:
        all_tokens: iterable of string tokens. It is traversed exactly once,
            so a one-shot iterator is acceptable.
        season_names: optional iterable of the names to count. When omitted,
            the notebook-level ``seasons`` list is used, which keeps existing
            one-argument calls working unchanged.

    Returns:
        None. The resulting ``Counter`` is printed.
    """
    if season_names is None:
        season_names = seasons
    # A frozenset gives constant-time membership checks instead of scanning
    # the list of names once per token.
    wanted = frozenset(season_names)
    counts = Counter(token for token in all_tokens if token in wanted)
    print(counts)
# Print the season counts for the NLTK-tokenized happy moments.
count_seasons(all_tokens)

### The magic of tokenization
`nltk.word_tokenize` is designed to handle special cases such as punctuation. A much simpler "tokenizer" is Python's `str.split` method, which splits a string on whitespace.

Create a list of tokens using `str.split` instead of `nltk.word_tokenize`. Do you get different results? Look at the CSV files (<span style="color:red">TODO:</span> make sure this is possible in Colab). Can you see why?

In [None]:
# Whitespace-only tokenization: split each moment's text with str.split,
# lowercase every piece, and chain the pieces into one token stream.
split_tokens = itertools.chain.from_iterable(
    map(str.lower, text.split())
    for text in happy_moments['cleaned_hm']
)
# The stream is a one-shot iterator; this call consumes it.
count_seasons(split_tokens)