In [1]:
# analyze the wellness score for one user

import json
import os
import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from math import log, sqrt
import pandas as pd
import numpy as np
import re
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

def split_data(data):
    split_data = {}

    for tweet in data['tweets']:
        month_string = tweet['created'][3:11]
        if month_string not in split_data.keys():
            split_data[month_string] = {}
            split_data[month_string]['tweets'] = []
            split_data[month_string]['retweets'] = []
            split_data[month_string]['replies'] = []

        split_data[month_string]['tweets'].append(tweet)

    for tweet in data['retweets']:
        month_string = tweet['created'][3:11]
        if month_string not in split_data.keys():
            split_data[month_string] = {}
            split_data[month_string]['tweets'] = []
            split_data[month_string]['retweets'] = []
            split_data[month_string]['replies'] = []

        split_data[month_string]['retweets'].append(tweet)

    for tweet in data['replies']:
        month_string = tweet['created'][3:11]
        if month_string not in split_data.keys():
            split_data[month_string] = {}
            split_data[month_string]['tweets'] = []
            split_data[month_string]['retweets'] = []
            split_data[month_string]['replies'] = []

        split_data[month_string]['replies'].append(tweet)

    return split_data

# Return proportion of negative sentiment tweets
def sentiment_score(tweet_list):
    if len(tweet_list) > 0:
        analyzer = SentimentIntensityAnalyzer()
        neg_count = 0
        for tweet in tweet_list:
            vs = analyzer.polarity_scores(tweet['text'])
            # print("{:-<65} {}".format(tweet['text'], str(vs)))
            if vs['compound'] <= -0.05:
                neg_count += 1

        return neg_count / len(tweet_list)
    return 0.0

# Return proportion of tweets made in the middle of the night
def insomnia_index(tweet_list):
    if len(tweet_list) > 0:
        insomia_count = 0
        for tweet in tweet_list:
            if int(tweet['created'][14]) > 0 and int(tweet['created'][14]) < 6:
                insomia_count += 1
        return insomia_count / len(tweet_list)
    return 0.0


# Data is a dictionary with entries: tweets, retweets, and replies.
def calculate_wellness(data):

    # Get tweet score
    tweet_sentiment = sentiment_score(data['tweets'])
    tweet_insomia = insomnia_index(data['tweets'])
    tweet_score = tweet_sentiment + tweet_insomia*0.5

    # Get retweet score
    retweet_sentiment = sentiment_score(data['retweets'])
    retweet_insomia = insomnia_index(data['retweets'])
    retweet_score = retweet_sentiment + retweet_insomia*0.5

    # Get reply score
    reply_sentiment = sentiment_score(data['replies'])
    reply_insomia = insomnia_index(data['tweets'])
    reply_score = reply_sentiment + reply_insomia*0.5

    return 0.5*tweet_score + 0.25*retweet_score + 0.25*reply_score

def user_wellness(username, friendname):
    # should return a dictionary that gives a wellness score for each week
    # and each month that the user was active.

    # Read tweet data
    data_path = os.path.join(os.path.join(username, 'friend_data'), friendname + '.json')
    data = {}
    with open(data_path) as data_json:
        data = json.load(data_json)

    # Split tweets into time intervals (weeks and months)
    data = split_data(data)

    # Create output dictionary
    wellness_scores = dict.fromkeys(data.keys())

    # Pass data intervals to function for calculating wellness
    for month in data.keys():
        wellness_scores[month] = calculate_wellness(data[month])

    return wellness_scores

if __name__ == '__main__':
    # pass in the username of the account you want to download
    data = user_wellness("eegilbert","munmun10")
    wellness = pd.DataFrame.from_dict(data, orient='index')

    # write the json file
    # with open("munmun10" + '_wellness.json', 'w') as outfile:
    #     json.dump(data, outfile, indent=2)

In [2]:
#Read tweet data
def read_tweet_data(username, friendname):
    data_path = os.path.join(os.path.join(username, 'friend_data'), friendname + '.json')
    data = {}
    with open(data_path) as data_json:
        data = json.load(data_json)
    data = split_data(data)
    return data

In [3]:
tweets = read_tweet_data("eegilbert","munmun10")

In [4]:
wellness = wellness.rename(columns = {0:'wellness_score'})
wellness

Unnamed: 0,wellness_score
Apr-2020,0.380376
Mar-2020,0.693945
Feb-2020,0.210227
Jan-2020,0.256944
Dec-2019,0.367063
...,...
Dec-2012,0.321429
Nov-2012,0.253535
Oct-2012,0.350072
Sep-2012,0.216346


In [5]:
wellness['is_well'] = np.where(wellness['wellness_score'] < wellness.wellness_score.mean(), 0, 1)

In [6]:
wellness = wellness.reset_index().rename(columns = {'index':'date'})
wellness

Unnamed: 0,date,wellness_score,is_well
0,Apr-2020,0.380376,1
1,Mar-2020,0.693945,1
2,Feb-2020,0.210227,0
3,Jan-2020,0.256944,0
4,Dec-2019,0.367063,1
...,...,...,...
86,Dec-2012,0.321429,0
87,Nov-2012,0.253535,0
88,Oct-2012,0.350072,0
89,Sep-2012,0.216346,0


In [7]:
list_of_dates = wellness.date.tolist()

In [8]:
list_of_tweets = []
for month in list_of_dates:
    for i in range(len(tweets[month]['tweets'])):
        list_of_tweets.append(tweets[month]['tweets'][i])

In [9]:
df = pd.DataFrame(list_of_tweets)

In [10]:
df = df.drop(columns=['id_str'])

In [11]:
df['created'] = df['created'].str.replace(r'([\(\[]).*?([\)\]])','')

In [12]:
from datetime import datetime

In [13]:
date = pd.to_datetime(df.created)

In [14]:
df['year'] = date.dt.year.astype(str)
df['month'] = date.dt.month.astype(str)

In [15]:
df['month'] = df['month'].str.replace(r'1','Jan')
df['month'] = df['month'].str.replace(r'2','Feb')
df['month'] = df['month'].str.replace(r'3','Mar')
df['month'] = df['month'].str.replace(r'4','Apr')
df['month'] = df['month'].str.replace(r'5','May')
df['month'] = df['month'].str.replace(r'6','Jun')
df['month'] = df['month'].str.replace(r'7','Jul')
df['month'] = df['month'].str.replace(r'8','Aug')
df['month'] = df['month'].str.replace(r'9','Sep')
df['month'] = df['month'].str.replace(r'10','Oct')
df['month'] = df['month'].str.replace(r'11','Nov')
df['month'] = df['month'].str.replace(r'12','Dec')

In [16]:
df['date'] = df['month'] + '-' + df['year']
df = df.drop(columns=['month','year'])

In [17]:
text_by_date = df.groupby(['date'])['text'].sum().to_frame()
text_by_date = text_by_date.reset_index()

In [18]:
df = wellness.merge(text_by_date,how='left',left_on='date',right_on='date')
df = df.dropna()

In [19]:
df

Unnamed: 0,date,wellness_score,is_well,text
0,Apr-2020,0.380376,1,while the community will no longer be meeting ...
1,Mar-2020,0.693945,1,excited to share paper on the causal factors o...
2,Feb-2020,0.210227,0,not a fan of the title of this piece but the w...
3,Jan-2020,0.256944,0,i just found out that our opioid usealternativ...
7,Sep-2019,0.294048,0,wow our paper who is the human in humancentere...
...,...,...,...,...
82,Apr-2013,0.338979,0,sign of old age cant recover from jetlag even ...
83,Mar-2013,0.375616,1,doctors use big data to improve cancer treatme...
84,Feb-2013,0.285686,0,ms dynamics crm online users get option for ip...
85,Jan-2013,0.306452,0,wohoo nice deadlines for submission to got ext...


In [20]:
count_vectorizer = CountVectorizer(ngram_range=(1,2))

In [21]:
vectorized_data = count_vectorizer.fit_transform(df.text)
indexed_data = hstack((np.array(range(0,vectorized_data.shape[0]))[:,None], vectorized_data))

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
target = df[['is_well']]

In [24]:
data_train, data_test, targets_train, targets_test = train_test_split(indexed_data, target, test_size=0.4, random_state=0)

In [25]:
data_train_index = data_train[:,0]
data_train = data_train[:,1:]
data_test_index = data_test[:,0]
data_test = data_test[:,1:]

In [26]:
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier

In [27]:
#Create a svm Classifier
clf = OneVsRestClassifier(svm.SVC(gamma=0.01, C=100., probability=True, class_weight='balanced', kernel='linear'))

In [28]:
#Train the model using the training sets
clf_output = clf.fit(data_train, targets_train)
clf_output

OneVsRestClassifier(estimator=SVC(C=100.0, break_ties=False, cache_size=200,
                                  class_weight='balanced', coef0=0.0,
                                  decision_function_shape='ovr', degree=3,
                                  gamma=0.01, kernel='linear', max_iter=-1,
                                  probability=True, random_state=None,
                                  shrinking=True, tol=0.001, verbose=False),
                    n_jobs=None)

In [29]:
clf.score(data_test, targets_test)

0.5185185185185185