# Workshop 1: analysis of natural language using string objects

In [None]:
# Bind a sample sentence to `s` and print it.
s = "Hi my name is Kristoffer, how are you?"
print(s)

In [None]:
import os
# Read one text file from the DATA folder of the working directory into `s`.
wd = os.getcwd()
data_dir = os.path.join(wd, "DATA")
# Sorted so that the picked index does not depend on the order the
# filesystem happens to return entries in.
fileobjects = sorted(os.listdir(data_dir))
# The second entry of the sorted listing is the text that gets analysed.
fname = fileobjects[1]
# Open the file through the same directory path that was listed, and decode
# explicitly instead of relying on the platform default encoding.
# NOTE(review): assumes the corpus files are UTF-8 like the lexicon CSV - confirm.
with open(os.path.join(data_dir, fname), 'r', encoding='utf-8') as f:
    s = f.read()

In [None]:
import re
# Normalise the raw text in one step: delete every digit, then lowercase.
content = re.sub(r"\d", "", s).lower()

In [None]:
# Split the normalised text on runs of non-word characters.
tokenizer = re.compile(r"\W+")
unigrams = tokenizer.split(content)

# Keep only tokens longer than one character; this also discards the empty
# strings that splitting can leave at the edges.
clean_unigrams = [token for token in unigrams if len(token) > 1]

In [None]:
import pandas as pd

# Load the tab-separated lexicon; its first column becomes the index.
labmt = pd.read_csv('TOOLS/labmt_dict.csv', sep='\t', encoding='utf-8', index_col=0)
labmt.head()

# Centre the happiness scores on the lexicon mean and expose them as a
# plain index -> score dictionary.
avg = labmt.happiness_average.mean()
centered_scores = labmt.happiness_average - avg
sent_dict = centered_scores.to_dict()

# One score per token; tokens missing from the lexicon count as 0.0.
sent_vec = [sent_dict.get(token, 0.0) for token in clean_unigrams]


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot the per-token sentiment series with its mean as a red reference line.
fig, ax = plt.subplots()
ax.plot(sent_vec)
ax.axhline(y=np.mean(sent_vec), color="r", linewidth=1)
ax.set(xlabel='Time', ylabel='Sentiment')

plt.show()
plt.close()


In [None]:
def slicer(input, n = 100, cut_off = False):
    """
    Slice a tokenized text into consecutive slices of n tokens.

    - input: sliceable sequence of tokens (anything supporting len() and
      slicing).
    - n: number of tokens per slice.
    - cut_off: when True, drop a trailing slice that is shorter than n so
      that every returned slice has the full length (full length
      normalization).

    Returns a list of slices in their original order; empty input gives [].
    """
    slices = [input[start:start + n] for start in range(0, len(input), n)]
    # Only an incomplete trailing slice is removed: a last slice that already
    # holds n tokens is kept, and an empty result is left untouched.
    if cut_off and slices and len(slices[-1]) < n:
        del slices[-1]
    return slices

# Cut the cleaned tokens into slices using slicer's default arguments.
slices = slicer(clean_unigrams)

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_sent_dict(path='TOOLS/labmt_dict.csv'):
    """Read the lexicon once and return its mean-centred happiness scores as a dict."""
    labmt = pd.read_csv(path, sep='\t', encoding='utf-8', index_col=0)
    avg = labmt.happiness_average.mean()
    return (labmt.happiness_average - avg).to_dict()


def sentiment_score(unigrams):
    """
    Score each token with its mean-centred happiness value from the lexicon.

    - unigrams: iterable of tokens.

    Returns a list with one score per token, in input order; tokens that
    are not in the lexicon score 0.0.
    """
    # The lexicon is loaded through a cached helper so that calling this
    # function once per slice does not re-read and re-parse the CSV each time.
    sent_dict = _load_sent_dict()
    return [sent_dict.get(unigram, 0.0) for unigram in unigrams]

# Collapse every slice into one number: the sum of its token scores.
sent_vec = [sum(sentiment_score(chunk)) for chunk in slices]

In [None]:
# Plot the per-slice sentiment sums with their mean as a red reference line.
fig, ax = plt.subplots()
ax.plot(sent_vec)
ax.axhline(y=np.mean(sent_vec), color="r", linewidth=1)
ax.set(xlabel='Time', ylabel='Sentiment (sliced)')

plt.show()
plt.close()


In [None]:
def smooth(l, n = 5):
    """
    Moving average filter with window size n.

    - l: indexable sequence of numbers.
    - n: window size, must be at least 1.

    Returns a list of the same length as l. The first n entries are the
    running mean of the values seen so far (the window is still filling);
    every later entry is the mean of the most recent n values. Inputs
    shorter than n are handled and simply never reach a full window.

    Raises ValueError if n is smaller than 1.
    """
    if n < 1:
        raise ValueError("window size n must be at least 1")
    sigma = 0
    res = []
    for i, value in enumerate(l):
        if i < n:
            # Warm-up: average over the i + 1 values available so far.
            sigma = sigma + value
            res.append(sigma / (i + 1))
        else:
            # Slide the window: drop the value that left, add the new one.
            sigma = sigma - l[i - n] + value
            res.append(sigma / n)
    return res

# Smooth once and reuse the result, so the red line is the mean of exactly
# the series that is drawn (same window size for both).
smoothed = smooth(sent_vec)

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.plot(smoothed)
ax.axhline(y = np.mean(smoothed), c="r",linewidth=1)
ax.set_xlabel('Time')
ax.set_ylabel('Sentiment (sliced)')

plt.show()
plt.close()