In [1]:
import pandas as pd
import numpy as np
import json
import os
import string
import re
import random

import nltk
from nltk import word_tokenize
from nltk.corpus import (wordnet, stopwords)
from nltk.stem.snowball import SnowballStemmer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.linear_model import (LogisticRegression, LogisticRegressionCV)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import (TfidfVectorizer, CountVectorizer)
from sklearn.metrics import (confusion_matrix, 
                             recall_score, 
                             f1_score, 
                             accuracy_score, 
                             precision_score,
                             roc_curve, auc, roc_auc_score)

import warnings
warnings.filterwarnings('ignore')

from datetime import *
from bisect import bisect

In [2]:
##### Functions to parse the time
def parse_datetime(raw_datetime, dtformat):
    """Return the hour of day encoded in a time string.

    Parameters
    ----------
    raw_datetime : str
        Time stamp text, e.g. ``'14:05:09'``.
    dtformat : str
        ``datetime.strptime`` format describing ``raw_datetime``.

    Returns
    -------
    int or float
        The ``hour`` attribute of the parsed value, or ``np.nan`` when
        ``raw_datetime`` is not a string or is shorter than five characters
        (treated as a missing time stamp).

    Raises
    ------
    ValueError
        If a long-enough string does not match ``dtformat``.
    """
    # Every path returns exactly one scalar: the result is stored in a single
    # element of a float array by the aggregation loop, which a tuple of
    # placeholders cannot be.
    if not isinstance(raw_datetime, str) or len(raw_datetime) < 5:
        return np.nan
    return datetime.strptime(raw_datetime, dtformat).hour

def bizhour(hh):
    """Label an hour of day as 'Late', 'Early', 'Business' or 'Evening'.

    Hours below 5 and hours from 20 upward are 'Late'; [5, 10) is 'Early';
    [10, 17) is 'Business'; [17, 20) is 'Evening'. A NaN hour fails every
    comparison and therefore also ends up as 'Late'.
    """
    if hh < 5:
        return 'Late'
    if hh < 10:
        return 'Early'
    if hh < 17:
        return 'Business'
    if hh < 20:
        return 'Evening'
    # 20 and later wrap back into the late-night bucket.
    return 'Late'

def get_descr_bizhour(hhmat):
    """Apply ``bizhour`` to every hour in ``hhmat``.

    Returns a NumPy array with one label per input element, in input order.
    """
    labels = [bizhour(hour) for hour in hhmat]
    return np.array(labels)

In [3]:
#Karthik's code
class RepeatReplacer(object):
    """Collapse repeated characters until WordNet recognises the word.

    Each pass removes one character of a doubled pair; the process stops as
    soon as ``wordnet.synsets`` returns something for the current spelling
    or no adjacent duplicate is left. The examples below need the WordNet
    corpus to be installed.

    >>> replacer = RepeatReplacer()
    >>> replacer.replace('looooove')
    'love'
    >>> replacer.replace('oooooh')
    'ooh'
    >>> replacer.replace('goose')
    'goose'
    """

    def __init__(self):
        # \2 back-references the captured character, so the pattern finds two
        # identical adjacent word characters with arbitrary word context.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        # Re-emit the match keeping only one copy of the doubled character.
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return ``word`` with surplus repeated letters removed.

        Parameters
        ----------
        word : str
            A single token.

        Returns
        -------
        str
            ``word`` unchanged if WordNet knows it; otherwise the first
            shortened spelling WordNet knows, or the fully collapsed form
            when no spelling is recognised.
        """
        # Iterative rather than recursive: the number of passes grows with
        # the length of the repeated run, so a very long token would exhaust
        # the interpreter's recursion limit.
        while not wordnet.synsets(word):
            collapsed = self.repeat_regexp.sub(self.repl, word)
            if collapsed == word:
                break
            word = collapsed
        return word
    
def process_tweets(tweets, textcol):
    """Normalise the tweet text stored in ``tweets[textcol]``.

    For every entry: URLs are replaced by the token ``hyperlink``,
    @-mentions by ``mention`` and ``pic.twitter`` links by ``image``;
    non-breaking spaces, ASCII punctuation, a few extra dash / ellipsis /
    apostrophe characters and digits are removed; the text is lower-cased,
    tokenised with ``word_tokenize``, passed through ``RepeatReplacer``,
    stemmed with the English Snowball stemmer and filtered against NLTK's
    English stop-word list.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame holding the raw text.
    textcol : str
        Name of the text column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; ``textcol`` is overwritten in
        place with the processed strings.
    """
    # Loop-invariant helpers are built once instead of once per token (the
    # stop-word list in particular is costly to rebuild).
    url_re = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
    mention_re = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)')
    image_re = re.compile(r'pic.twitter\S+')
    dropped_chars = set(string.punctuation + '—–-…’0123456789')
    replacer = RepeatReplacer()
    stemmer = SnowballStemmer("english")
    stop_words = set(stopwords.words('english'))

    processed_text = []
    for text in tweets[textcol]:
        # replace hyperlinks and strip non-breaking spaces
        cleaned = url_re.sub(' hyperlink ', text).replace('\xa0', '')
        # replace mentions
        cleaned = mention_re.sub(' mention ', cleaned)
        # replace picture links
        cleaned = image_re.sub(' image ', cleaned)
        # drop punctuation and digits, then lower-case
        cleaned = "".join(ch for ch in cleaned if ch not in dropped_chars).lower()
        # remove repeat letters, stem, then get rid of stop words
        tokens = (replacer.replace(w) for w in word_tokenize(cleaned))
        stems = (stemmer.stem(w) for w in tokens)
        processed_text.append(' '.join(w for w in stems if w not in stop_words))

    # Overwrites the caller's column: the input frame itself is modified.
    tweets[textcol] = processed_text

    return tweets

def tfidf_vector(tweets, textcol):
    """Return a dense TF-IDF feature frame for ``tweets[textcol]``.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame holding the documents.
    textcol : str
        Name of the text column.

    Returns
    -------
    pandas.DataFrame
        One row per document in input order (default integer index, not the
        index of ``tweets``) and one column per term kept by the vectorizer
        (word analyzer, ``min_df=15``, English stop words removed).
    """
    vectorizer = TfidfVectorizer(analyzer='word', min_df=15, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(tweets[textcol])

    # get_feature_names() no longer exists in current scikit-learn releases;
    # use its replacement when available and fall back on older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    # toarray() yields a plain ndarray (todense() returns np.matrix).
    return pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

def sentiment_analysis(text):
    """Score every sentence in ``text`` with ``SentimentIntensityAnalyzer``.

    Returns three parallel lists taken from the ``'neg'``, ``'pos'`` and
    ``'neu'`` entries of each ``polarity_scores`` result, in the order
    (negative, positive, neutral).
    """
    scorer = SentimentIntensityAnalyzer()
    scores = [scorer.polarity_scores(sentence) for sentence in text]
    negative = [score['neg'] for score in scores]
    neutral = [score['neu'] for score in scores]
    positive = [score['pos'] for score in scores]
    # Note the return order: neutral comes last.
    return negative, positive, neutral

def safe_div(x,y):
    """Divide ``x`` by ``y``, returning 0 instead of failing when ``y`` is 0."""
    return 0 if y == 0 else x / y

In [4]:
# Folder containing the tweet JSON files.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
directory = '/Users/jenniferpolson/Documents/School/2018-W/BE 223B/Project 1/tweet_files-1/'

# Frames are collected in a list and concatenated once: growing a DataFrame
# row-block by row-block copies all earlier data on every step, and
# DataFrame.append is gone from current pandas.
user_frames = []
# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.json'):
        continue
    # The context manager closes the file handle as soon as it is parsed.
    with open(os.path.join(directory, filename)) as handle:
        df = pd.DataFrame(json.load(handle))
    # Unique tweet key: "<user_record_id>_<row position within the file>".
    df['tweet_id'] = df['user_record_id'].map(str) + '_' + df.index.astype(str)
    user_frames.append(df)

# Row labels of the individual files are kept, so the combined index repeats.
jsons_data = pd.concat(user_frames) if user_frames else pd.DataFrame()

#jsons_data['Negative Sentiment'], jsons_data['Positive Sentiment'], jsons_data['Neutral Sentiment'] = sentiment_analysis(jsons_data['text'])
#jsons_data = jsons_data[jsons_data['Neutral Sentiment'] != 1]

In [5]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print the highest-weighted terms of every topic.

    Parameters
    ----------
    H : array-like
        Iterated row by row; each row holds one topic's term weights.
    W : numpy.ndarray
        Indexed as ``W[:, topic_idx]``; only used to rank documents.
    feature_names : sequence of str
        Term for each column of ``H``.
    documents : sequence
        Not used at present.
    no_top_words : int
        Number of terms printed per topic.
    no_top_documents : int
        Number of document indices ranked per topic.

    For each topic a ``Topic <n>:`` header is printed, followed by the top
    terms separated by spaces, heaviest first.
    """
    for topic_idx, weights in enumerate(H):
        print ("Topic %d:" % (topic_idx))
        # Descending order of weight, truncated to the requested count.
        ranked_terms = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in ranked_terms]
        print (" ".join(top_terms))
        # Document ranking for this topic; computed but not displayed.
        doc_weights = W[:, topic_idx]
        top_doc_indices = np.argsort(doc_weights)[::-1][0:no_top_documents]

# Clean the tweet text. process_tweets overwrites jsons_data['text'] in place
# and returns the same frame, so df and jsons_data refer to one object.
df = process_tweets(jsons_data, 'text')
documents = df.text

# LDA is a probabilistic graphical model and is fitted on raw term counts,
# hence CountVectorizer rather than TF-IDF.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
# get_feature_names() no longer exists in current scikit-learn releases;
# use its replacement when available and fall back on older installs.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20

# Run LDA. The number of topics is passed as n_components; the older
# n_topics keyword is rejected by current scikit-learn.
lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=5,
                                      learning_method='online', learning_offset=50.,
                                      random_state=0).fit(tf)
# Document-topic matrix (one row per tweet) and topic-term matrix.
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

#n_top_words = 8
#n_top_documents = 4
#display_topics(lda_H, lda_W, tf_feature_names, documents, n_top_words, n_top_documents)

In [6]:
#create LDA column names
cols = ['LDA_%d' % n for n in np.arange(no_topics)]

#add the topic model information to the tweet matrix
all_data = pd.concat([jsons_data, pd.DataFrame(lda_W, index = jsons_data.index, columns = cols) ], axis = 1)

# One summary record per user. Records are collected in a list and turned
# into a frame once at the end; appending to a DataFrame inside the loop
# copies all earlier rows each time and is not available in current pandas.
user_records = []

# sort=False keeps users in order of first appearance.
for tid, df in all_data.groupby('user_record_id', sort=False):
    n_tweets = len(df.index)
    # Cast once so the per-day counts and the mean see the same integer
    # values (the raw column may hold strings - TODO confirm).
    weekdays = df.weekday.astype(int)
    wk_list = weekdays.tolist()

    # Hour of day for each tweet, then its time-of-day label.
    hours = np.zeros(n_tweets)
    for i, rd in enumerate(df['time']):
        hours[i] = parse_datetime(rd, '%H:%M:%S')
    timeofday = list(get_descr_bizhour(hours))

    record = {'n_tweets': n_tweets,
              'text': df['text'].str.cat(sep=', '),
              'likes': df.likes.astype(int).sum(),
              'replies': df.replies.astype(int).sum(),
              'retweets': df.retweets.astype(int).sum(),
              'weekday_mean': weekdays.mean()}
    # Share of the user's tweets posted on each weekday code 0-6.
    for day in range(7):
        record['wkday_%d' % day] = wk_list.count(day) / n_tweets
    # Share of the user's tweets in each time-of-day bucket.
    for label in ('Late', 'Early', 'Business', 'Evening'):
        record['time_%s' % label.lower()] = timeofday.count(label) / n_tweets
    record['user_id'] = tid
    # Mean topic weights are taken from this user's own rows, so they cannot
    # be paired with a different user (a separate key-sorted groupby assigned
    # by position does not follow first-appearance order).
    record.update(df[cols].mean().to_dict())
    user_records.append(record)

tweets_concat = pd.DataFrame(user_records)

tweets_concat.to_csv('tweets_concatenated.csv')

In [7]:
def create_model_df (tweets, labels, 
                     textcol = 'text', idcol = 'user_id', 
                     repcol = 'replies', lcol = 'likes', retcol = 'retweets', 
                     bincol = 'variable'):
    """Build the modelling table from per-user tweet features and labels.

    Steps: process the text column, index the frame by ``idcol``, add the
    three sentiment columns, inner-join with ``labels`` on the index, add a
    ``ratio`` column (replies / (likes + retweets), 0 when the denominator
    is 0) and a ``binary_label`` column (1 where ``bincol`` >= 3).

    Parameters
    ----------
    tweets : pandas.DataFrame
        Per-user features. NOTE: this frame is modified in place - its text
        column is overwritten by ``process_tweets``, its index is replaced
        by ``idcol`` and the sentiment columns are added to it.
    labels : pandas.DataFrame
        Label table whose index matches the values of ``idcol``.
    textcol, idcol, repcol, lcol, retcol, bincol : str
        Column names for text, user id, replies, likes, retweets and the
        label to binarise.

    Returns
    -------
    pandas.DataFrame
        Joined table without the ``idcol`` column and without duplicated
        column names.
    """
    # process_tweets rewrites the text column and hands back the same frame.
    # TF-IDF features are deliberately not built here: they never reached the
    # returned table, so computing them was wasted work that could also fail
    # on small inputs.
    new_features = process_tweets(tweets, textcol)
    new_features.index = new_features[idcol]

    #generate sentiment (scored on the processed text)
    neg, pos, neu = sentiment_analysis(tweets[textcol])
    new_features['Negative Sentiment'] = neg
    new_features['Positive Sentiment'] = pos
    new_features['Neutral Sentiment'] = neu

    #match and merge with labels
    full_data = pd.merge(new_features, labels, how='inner',
                         left_index=True, right_index=True).drop([idcol], axis=1)

    # replies relative to likes + retweets, guarded against division by zero
    full_data['ratio'] = [safe_div(replies, likes + retweets)
                          for replies, likes, retweets
                          in zip(full_data[repcol], full_data[lcol], full_data[retcol])]
    #this gets rid of duplicate columns
    full_data = full_data.loc[:,~full_data.columns.duplicated()]
    #binarize
    full_data['binary_label'] = (full_data[bincol] >= 3).astype(int)

    return full_data

In [8]:
# Load the label table, indexed by record_id so it can be joined to the
# per-user tweet features on the index.
labels = pd.read_csv("twitter-data-deidentified.csv", index_col='record_id')
# Build the modelling table. Note that create_model_df also modifies
# tweets_concat in place (text column, index and sentiment columns).
full_data = create_model_df(tweets_concat, labels)

In [10]:
#export everything thus far
#full_data.drop('text', axis = 1).to_csv('features_id_text.csv')
#full_data.to_csv("full_data.csv")