## Hack the Crisis

Twitter data component.

In [None]:
import pickle
import glob
import re
import pandas as pd
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

Read tweets.

In [None]:
# Load the tweet export.
data = pd.read_csv("dataset/swe_tweets.csv")
# Parse the timestamps so that the `.dt` accessor can be used on them, and
# keep only the two columns that the rest of the notebook works with.
data = data.assign(timestamp=pd.to_datetime(data['timestamp']))
data = data.loc[:, ['timestamp', 'tweet']]

Create docs: one text file per day, containing that day's tweets (one tweet per line).

In [None]:
# Group the tweets by the calendar date of their timestamp.
# The key is passed as a single Series, not wrapped in a one-element list:
# pandas >= 2.0 yields 1-tuples such as `(date,)` as group keys when iterating
# over a groupby built from a list of length one, and those tuples would end
# up in the file names that are built from `str(day)`.
daydata = data.groupby(data['timestamp'].dt.date)

In [None]:
# Write one text file per group, named after `str(day)`, holding one tweet
# per line.
# The per-group frame gets its own name: the loop used to bind it to `data`,
# which silently replaced the full tweet DataFrame with the last group, so
# re-running the groupby cell afterwards would only see one day of tweets.
for day, day_tweets in daydata:
    with open("docs/" + str(day) + ".txt", "w") as docfile:
        docfile.writelines(str(t) + "\n" for t in day_tweets.tweet)

Sentiment scores per day.

In [None]:
# Prepare categorisation: load the two scoring resources.
#   category_dict     - looked up by feature; the first whitespace-separated
#                       token of each value is used as the category name.
#   wordfeatures_dict - looked up by English word; each value is iterated as
#                       that word's features.
# NOTE(review): pickle.load can run arbitrary code from the file, so these
# pickles must come from a trusted source.
with open('scoring_tools/categories.pickle', 'rb') as categories_file:
    category_dict = pickle.load(categories_file)

with open('scoring_tools/wordfeatures.pickle', 'rb') as wordfeatures_file:
    wordfeatures_dict = pickle.load(wordfeatures_file)

In [None]:
# Swedish translation of the word list (only the categories that are used
# were translated). The `en` and `sv` columns of this table are read below.
transl_df = pd.read_csv(filepath_or_buffer="scoring_tools/swe_words.csv")

In [None]:
# Pair every English word with the Swedish word found on the same row.
enwords = transl_df["en"].to_list()
swewords = transl_df["sv"].to_list()
transl_dict = {en_word: sv_word for en_word, sv_word in zip(enwords, swewords)}

In [None]:
# Re-key the word features by the Swedish translation of each English word.
# When several English words share one Swedish translation the last one wins,
# and an English word without features is stored with the value None.
swedish_scoring = {
    transl_dict.get(en_word): wordfeatures_dict.get(en_word)
    for en_word in enwords
}

In [None]:
# Get the daily docs: read every docs/*.txt file into a DataFrame with one
# row per file (`day` = file name up to the first dot, `text` = file content
# with newlines replaced by spaces), sorted by day.
#
# The folder is globbed once so that each day label is guaranteed to belong
# to the text read from the same path, and every file is closed after reading.
doc_paths = glob.glob("docs/*.txt")

# Take the last path component whichever separator glob returned ("/" or "\").
days = [re.split(r"[\\/]", path)[-1].split(".")[0] for path in doc_paths]

docs = []
for path in doc_paths:
    with open(path, "r") as docfile:
        docs.append(docfile.read().replace("\n", " "))

# Naming the columns at construction time keeps this working when no files
# were found (assigning two column names to an empty frame raises).
daydocs = pd.DataFrame({'day': days, 'text': docs})
daydocs = daydocs.sort_values(by="day").reset_index(drop=True)

In [None]:
# Plain lists of the day labels and the matching texts, in the sorted row
# order of `daydocs`.
days = daydocs["day"].to_list()
docs = daydocs["text"].to_list()

In [None]:
# Characters removed from every token before the dictionary lookup. Written
# as a raw string (the previous pattern relied on invalid escape sequences
# such as "\." in a normal string) and compiled once for all documents.
PUNCTUATION_RE = re.compile(r"""[.,:/"?\-…'()!+]""")


def score_document(text, scoring, categories):
    """Return the category scores of one document.

    Args:
        text: The document as a single string.
        scoring: Mapping from a (lower-cased, punctuation-free) word to an
            iterable of features; words mapped to None are ignored.
        categories: Mapping from a feature to a label string whose first
            whitespace-separated token is the category name.

    Returns:
        A dict holding the original text under ``'doc'`` plus one entry per
        category that occurred: the share of tokens, in percent, rounded to
        two decimals.

    Raises:
        KeyError: If a feature has no entry in ``categories``.
    """
    tokens = [PUNCTUATION_RE.sub("", w.lower().strip()) for w in text.split()]

    features = []
    for token in tokens:
        token_features = scoring.get(token)
        if token_features is None:
            continue
        try:
            features.extend(token_features)
        except TypeError:
            # A non-iterable entry is skipped, as before; other errors are no
            # longer swallowed by a bare `except`.
            continue

    record = {'doc': text}
    for feature, hits in Counter(features).items():
        category = categories[feature].split()[0]
        # this gives same scores as in LIWC's own software
        record[category] = round((hits / len(tokens)) * 100, 2)
    return record


dataset = []

for c, doc in enumerate(docs):
    print("\r" + str(c+1) + "/" + str(len(docs)), end="")
    dataset.append(score_document(doc, swedish_scoring, category_dict))

In [None]:
# One row per daily document; a category that did not occur in a document is
# filled with 0 for that row. The day labels are attached as a last column.
scores_df = pd.DataFrame.from_records(dataset).fillna(0).assign(day=days)

In [None]:
# Keep the day plus the categories used downstream, in a fixed order.
score_columns = [
    'day',
    'health',
    'death',
    'bio',
    'body',
    'affect',
    'anger',
    'swear',
    'anx',
    'sad',
    'feel',
    'friend',
    'family',
    'social',
    'money',
    'work',
    'relig',
    'power',
    'cause',
    'certain',
    'insight',
    'compare',
    'risk',
    'interrog',
    'focusfuture',
    'tentat',
    'quant',
]

# reindex instead of plain column selection: a category that never matched in
# any document has no column at all, and selecting it would raise KeyError.
# Such a category is added as an all-zero column, matching the 0 that is
# already used for documents in which a category did not occur.
scores_df = scores_df.reindex(columns=score_columns, fill_value=0.0)

In [None]:
# add VADER posemo and negemo: per document, the mean of the sentence-level
# 'pos' and 'neg' polarity scores.
# NOTE(review): the input file is named swe_tweets.csv; confirm that the
# VADER lexicon is meaningful for the language of these documents.
posemo = []
negemo = []

analyzer = SentimentIntensityAnalyzer()
for c, doc in enumerate(docs):
    sentence_scores = [analyzer.polarity_scores(sent) for sent in sent_tokenize(doc)]

    if sentence_scores:
        docpos = sum(vs.get('pos') for vs in sentence_scores) / len(sentence_scores)
        docneg = sum(vs.get('neg') for vs in sentence_scores) / len(sentence_scores)
    else:
        # A document without any sentence (e.g. an empty file) used to raise
        # ZeroDivisionError; it is scored as neutral instead.
        docpos = docneg = 0.0

    posemo.append(docpos)
    negemo.append(docneg)
    print("\r" + str(c+1) + "/" + str(len(docs)), end="")

In [None]:
# Attach the two per-document sentiment lists as columns.
scores_df = scores_df.assign(posemo=posemo, negemo=negemo)

In [None]:
# Composite indices: each one is the sum of its member columns, added in the
# listed order.
# Columns are read with scores_df[...] rather than attribute access because
# `scores_df.compare` resolves to the DataFrame.compare method (pandas >= 1.1)
# instead of the 'compare' column, which made the political index raise
# TypeError.
index_members = {
    "diseaseIndex": ['health', 'death', 'bio', 'body'],
    "emotionIndex": ['negemo', 'affect', 'anger', 'swear', 'anx', 'sad', 'posemo', 'feel'],
    "relationIndex": ['friend', 'family', 'social'],
    "economyIndex": ['money', 'work'],
    "politicalIndex": ['relig', 'power', 'cause', 'certain', 'insight', 'compare',
                       'risk', 'interrog', 'focusfuture', 'tentat', 'quant'],
}

for index_name, members in index_members.items():
    total = scores_df[members[0]]
    for member in members[1:]:
        total = total + scores_df[member]
    scores_df[index_name] = total

In [None]:
# Every column except the day label gets normalised.
cols_to_norm = scores_df.columns.tolist()
del cols_to_norm[cols_to_norm.index('day')]

In [None]:
def rescale_to_signed_unit(column):
    """Min-max scale a numeric column to the range [-1, 1].

    A column whose values are all equal has no spread to scale by; the plain
    formula would divide 0 by 0 and fill it with NaN. Such a column is mapped
    to 0.0, the midpoint of the target range, instead.
    """
    lowest = column.min()
    spread = column.max() - lowest
    if spread == 0:
        return column * 0.0
    return 2 * (column - lowest) / spread - 1


scores_df[cols_to_norm] = scores_df[cols_to_norm].apply(rescale_to_signed_unit)

In [None]:
# Show the score table as the cell output.
scores_df

In [None]:
# Line plot of the positive-emotion score over the rows (one row per day).
scores_df["posemo"].plot()

In [None]:
# Line plot of the insight score over the rows (one row per day).
scores_df["insight"].plot()

In [None]:
# Line plot of the relation index over the rows (one row per day).
scores_df["relationIndex"].plot()

In [None]:
# Export the score table (row index included) as CSV.
scores_df.to_csv(path_or_buf="tabular-data-output.csv")

#### Parse to fit frontend

In [254]:
# Layout expected by the frontend: group name -> score columns, in output
# order. Each group starts with its composite index.
frontend_groups = {
    "disease": ["diseaseIndex", "health", "death", "bio", "body"],
    "emotion": ["emotionIndex", "feel", "negemo", "posemo", "affect",
                "swear", "anx", "sad", "anger"],
    "relation": ["relationIndex", "friend", "family", "social"],
    "economy": ["economyIndex", "money", "work"],
    "political": ["politicalIndex", "power", "cause", "certain", "insight",
                  "compare", "risk", "interrog", "focusfuture", "relig",
                  "tentat", "quant"],
}
# Columns that are published under a different key than their column name.
frontend_key_for = {"anx": "anxiety"}

twitter_data = {}

# One entry per row, keyed by the day.
# Values are read with row[...] rather than attribute access: `row.compare`
# is the bound Series.compare method (pandas >= 1.1), not the 'compare' score,
# so the method object itself ended up in the output. float() keeps the
# written text to plain numbers whatever scalar type pandas hands back.
for _, row in scores_df.iterrows():
    twitter_data[row["day"]] = {
        group: {
            frontend_key_for.get(column, column): float(row[column])
            for column in columns
        }
        for group, columns in frontend_groups.items()
    }

In [255]:
# Dump the string form of the nested dict to the delivery file, without a
# trailing newline.
with open('data-delivery-to-frontend.txt', 'w') as out_handle:
    print(twitter_data, end="", file=out_handle)