In [1]:
import pandas as pd
import numpy as np
import json
import os
import string
import re
import random

import nltk
from nltk import word_tokenize
from nltk.corpus import (wordnet, stopwords)
from nltk.stem.snowball import SnowballStemmer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (confusion_matrix, 
                             recall_score, 
                             f1_score, 
                             accuracy_score, 
                             precision_score,
                             roc_curve, auc, roc_auc_score)

import warnings
warnings.filterwarnings('ignore')

In [2]:
jsons_data = pd.DataFrame(columns = ['user_record_id', 'text', 'likes', 'replies', 'retweets'])
tweets_concat = pd.DataFrame(columns = ['user_id', 'n_tweets', 'text', 'likes', 'replies', 'retweets'])

directory = '/Users/jenniferpolson/Documents/School/2018-W/BE 223B/Project 1/tweet_files/'
for filename in os.listdir(directory):
    if filename.endswith('.json'):
        df = pd.DataFrame(json.load(open(directory + str(filename))))
        df['tweet_id'] = df['user_record_id'].map(str) + '_' + df.index.astype(str)
        jsons_data = jsons_data.append(df) 
        tweets_concat = tweets_concat.append({'user_id':df.user_record_id.iloc[0], 
                                              'n_tweets': len(df.index),
                                              'text':df['text'].str.cat(sep=', '), 
                                              'likes':df.likes.astype(int).sum(), 
                                              'replies':df.replies.astype(int).sum(), 
                                              'retweets':df.retweets.astype(int).sum()}, ignore_index = True)

In [3]:
#Karthik's code
class RepeatReplacer(object):
    """ Removes repeating characters until a valid word is found.
    >>> replacer = RepeatReplacer()
    >>> replacer.replace(‘looooove’)
    ‘love’
    >>> replacer.replace(‘oooooh’)
    ‘ooh’
    >>> replacer.replace(‘goose’)
    ‘goose’
    """

    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        if wordnet.synsets(word):
            return word

        repl_word = self.repeat_regexp.sub(self.repl, word)

        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word
    
    
def process_tweets (tweets, colname):
    processed_text = []
    for text in tweets[colname]:
        #replace hyperlinks - leaves xa0 off for some reason
        test = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' hyperlink ', text).replace('\xa0', '')
        #replace mentions
        test = re.sub(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)', ' mention ', test)
        #replace picture links
        test = re.sub(r'pic.twitter\S+', ' image ', test)
        test = "".join((char for char in test if char not in string.punctuation + '—–-…’0123456789')).lower()\

        #remove repeat letters
        tokens = [RepeatReplacer().replace(w) for w in word_tokenize(test)]
        #employ stemmer
        stemmer = SnowballStemmer("english")
        tokens = [stemmer.stem(w) for w in tokens]
        #get rid of stop words
        filtered_string = ' '.join([w for w in tokens
                             if not w in set(stopwords.words('english'))])

        processed_text.append(filtered_string)

    tweets[colname] = processed_text
    
    return tweets

def tfidf_vector (tweets, colname, n_terms):
    tf = TfidfVectorizer(analyzer='word', min_df = 0, stop_words = 'english')

    tfidf_matrix =  tf.fit_transform(tweets[colname])
    feature_names = tf.get_feature_names() 
    dense = tfidf_matrix.todense()
    df = pd.DataFrame(dense)
    df.columns = feature_names
    df = df.append(df.sum(numeric_only=True), ignore_index=True)

    filtered_terms = df.transpose().nlargest(n_terms, df.transpose().iloc[:,:(len(df)-1)]).transpose()
    return filtered_terms

In [4]:
def sentiment_analysis(text):
    analyzer = SentimentIntensityAnalyzer()
    neg = []
    neu = []
    pos = []
    for sentence in text:
        vs = analyzer.polarity_scores(sentence)
        neg.append(vs['neg'])
        neu.append(vs['neu'])
        pos.append(vs['pos'])
    return neg, pos, neu

In [5]:
df = process_tweets(tweets_concat, 'text')
new_features = pd.concat([df, tfidf_vector(df, 'text', 100)[:-1]], axis = 1)

new_features['Negative Sentiment'], new_features['Positive Sentiment'], new_features['Neutral Sentiment'] = sentiment_analysis(tweets_concat.text)
#match the labels
labels = pd.read_csv("twitter-data-deidentified.csv")
labels.index = labels.record_id
new_features.index = new_features.user_id
full_data = pd.merge(new_features, labels, how='inner', on=None, left_on=None, right_on=None,
                     left_index=True, right_index=True).drop(['record_id', 'user_id'], axis=1)
#binarize
full_data['binary_label'] = (full_data['variable'] >= 3).astype(int)

In [6]:
#split into folds
def create_folds(full_data, n): 
    nlist = list(range(0,len(full_data)-1))
    random.shuffle(nlist)
    fold_size = len(nlist) / n
    ints = [nlist[int(round(fold_size * i)): 
                  int(round(fold_size * (i + 1)))] 
             for i in range(n)]
    fold = [full_data.iloc[ints[i],:] for i in range(n)]
    return fold

In [7]:
def binary_metrics (test, label, pred):
    acc = accuracy_score(test[label], test[pred])
    f1 = f1_score(test[label], test[pred])
    prec = precision_score(test[label], test[pred])
    rec = recall_score(test[label], test[pred])
    roc_auc = roc_auc_score(test[label], test[pred])
    
    tn, fp, fn, tp = confusion_matrix(test[label], test[pred]).ravel()
    
    return roc_auc, acc, prec, rec, f1

def rfc_metrics (test, label, pred):
    tn, fp, fn, tp = confusion_matrix(test[label], test[pred]).ravel()
    
    roc_auc, acc, prec, rec, f1 = binary_metrics(test, label, pred)
    
    roc = roc_curve(test[label], test[pred])
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    n_classes = test[label].nunique()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(test[label], test[pred])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(test[label].ravel(), test[pred].ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    plt.figure(figsize=(8,5))
    lw = 2
    plt.plot(fpr[1], tpr[1], color='gold',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[1])
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Example')
    plt.legend(loc="lower right")
    plt.show()
    
    show_confusion_matrix(test, label, pred)
    
    return details

def show_confusion_matrix(test, label, pred):
    
    C = confusion_matrix(test[label], test[pred])
    tn, fp, fn, tp = C.ravel()
    
    NP = fn+tp # Num positive examples
    NN = tn+fp # Num negative examples
    N  = NP+NN

    fig = plt.figure(figsize=(8,8))
    ax  = fig.add_subplot(111)
    ax.imshow(C, cmap=plt.cm.gray)

    # Draw the grid boxes
    ax.set_xlim(-0.5,2.5)
    ax.set_ylim(2.5,-0.5)
    ax.plot([-0.5,2.5],[0.5,0.5], '-k', lw=2)
    ax.plot([-0.5,2.5],[1.5,1.5], '-k', lw=2)
    ax.plot([0.5,0.5],[-0.5,2.5], '-k', lw=2)
    ax.plot([1.5,1.5],[-0.5,2.5], '-k', lw=2)


    # Set xlabels
    ax.set_xlabel('Predicted Label', fontsize=16)
    ax.set_xticks([0,1,2])
    ax.set_xticklabels(['Show', 'No Show'])
    ax.xaxis.set_label_position('top')
    ax.xaxis.tick_top()
    # These coordinate might require some tinkering. Ditto for y, below.
    ax.xaxis.set_label_coords(0.34,1.06)

    # Set ylabels
    ax.set_ylabel('True Label', fontsize=16, rotation=90)
    ax.set_yticklabels(['Show', 'No Show'])
    ax.set_yticks([0,1,2])
    ax.yaxis.set_label_coords(-0.09,0.65)


    # Fill in initial metrics: tp, tn, etc...
    ax.text(0,0,'True Negatives: %d\n(Total Negatives: %d)'%(tn,NN),
            va='center', ha='center', bbox=dict(fc='w',boxstyle='round,pad=1'))

    ax.text(0,1,'False Negatives: %d'%fn,
            va='center', ha='center', bbox=dict(fc='w',boxstyle='round,pad=1'))

    ax.text(1,0,'False Positives: %d'%fp,
           va='center', ha='center', bbox=dict(fc='w',boxstyle='round,pad=1'))

    ax.text(1,1,'True Positives: %d\n(Total Positives: %d)'%(tp,NP),
            va='center', ha='center', bbox=dict(fc='w',boxstyle='round,pad=1'))

    # Fill in secondary metrics: accuracy, true pos rate, etc...
    ax.text(2,0,'True Negative Rate' + '\n' +'(Specificity):%.2f'%(tn / (fp+tn+0.)),
            va='center', ha='center', bbox=dict(fc='w',boxstyle='round,pad=1'))

    ax.text(2,1,'True Positive Rate' + '\n' + '(Sensitivity):%.2f'%(tp / (tp+fn+0.)),
            va='center', ha='center', bbox=dict(fc='w',boxstyle='round,pad=1'))

    ax.text(2,2,'F-1 Score: %.2f'%(round(2*tp/((2*tp) + fp + fn),3)),
            va='center', ha='center', bbox=dict(fc='w',boxstyle='round,pad=1'))

    ax.text(0,2,'Negative Predictive ' + '\n' + 'Value: %.2f'%(1-fn/(fn+tn+0.)),
           va='center', ha='center', bbox=dict(fc='w',boxstyle='round,pad=1'))

    ax.text(1,2,'Positive Predictive ' + '\n' + 'Value: %.2f'%(tp/(tp+fp+0.)),
            va='center', ha='center', bbox=dict(fc='w',boxstyle='round,pad=1'))


    plt.tight_layout()
    plt.show()
    
    return None

In [8]:
def run_model(full_data, test, label, pred, plot = False):

    train = pd.concat([full_data, test]).drop_duplicates(keep=False)
    trainArr = train.drop(['variable', 'binary_label', 'text'], axis = 1).as_matrix()
    trainRes = train.as_matrix(['binary_label'])
    testArr = test.drop(['variable', 'binary_label'], axis = 1).as_matrix()
    #gridsearch

    param_test1 = {'C':[0.001,0.01,0.1,1,10,100],
                  }
    gridsearch = GridSearchCV(estimator = LogisticRegression(),
                              param_grid = param_test1)

    gridsearch.fit(trainArr, 
                   trainRes)

    clf_lr = LogisticRegression(C=list(gridsearch.best_params_.values())[0])
    data = test
    clf_lr.fit(trainArr, trainRes)
    data[pred] = clf_lr.predict(testArr)
    
    if plot:
        roc_auc, acc, prec, rec, f1 = rfc_metrics(data, label, pred)
    else:
        roc_auc, acc, prec, rec, f1 = binary_metrics(data, label, pred)
    
    return [roc_auc, acc, prec, rec, f1]

In [9]:
fold1, fold2, fold3, fold4, fold5 = create_folds(full_data.drop('text', axis = 1),5)
label, pred = 'binary_label', 'predictions'

b1 = run_model(full_data, fold1, label, pred)
b2 = run_model(full_data, fold2, label, pred)
b3 = run_model(full_data, fold3, label, pred)
b4 = run_model(full_data, fold4, label, pred)
b5 = run_model(full_data, fold5, label, pred)

In [10]:
binary = pd.DataFrame([b1, b2, b3, b4, b5])
binary.columns = ['ROC AUC Score','Accuracy', 'Recall', 'Precision', 'F-1 Score']
binary.index = ['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5']
binary

Unnamed: 0,ROC AUC Score,Accuracy,Recall,Precision,F-1 Score
Fold 1,0.457576,0.5,0.333333,0.181818,0.235294
Fold 2,0.625,0.52,1.0,0.25,0.4
Fold 3,0.412121,0.461538,0.2,0.090909,0.125
Fold 4,0.455128,0.44,0.333333,0.076923,0.125
Fold 5,0.571429,0.538462,1.0,0.142857,0.25
