In [1]:
import pandas as pd
import re
from sklearn.feature_extraction import FeatureHasher
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import math
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from time import time

In [2]:
def get_cm(dat):
    vals = dat.values.tolist()
    dat['TP'] = list(map(lambda y: 1 if y[0] == y[1] and y[0] == True else 0, vals))
    dat['TN'] = list(map(lambda y: 1 if y[0] == y[1] and y[0] == False else 0, vals))
    dat['FP'] = list(map(lambda y: 1 if y[0] != y[1] and y[0] == False else 0, vals))
    dat['FN'] = list(map(lambda y: 1 if y[0] != y[1] and y[0] == True else 0, vals))
    tp = sum(dat['TP'])
    tn = sum(dat['TN'])
    fp = sum(dat['FP'])
    fn = sum(dat['FN'])
    tpr = tp/(tp+fn)
    tnr = tn/(tn+fp)
    stats = pd.DataFrame(columns=['TP', 'FP', 'TN', 'FN', 'Sensitivity', 'Specificity'],
                data=[[tp, fp, tn, fn, tpr, tnr]])
    return stats

def get_ba(dat):
    cm = get_cm(dat).values.tolist()[0]
    sens = float(cm[4])
    spec = float(cm[5])
    return (sens+spec)/2

def get_roc(actual, predicted, title="ROC"):
    """
    Plot ROC curve based on actuals and estimated probabilities.
    """
    fpr, tpr, threshold = roc_curve(actual, predicted)    
    roc_auc = auc(fpr, tpr)
    plt.title(title)
    lbl = 'AUC: {0:.3f}'.format(roc_auc)
    plt.plot(fpr, tpr, 'b', label = lbl)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()  
    
def hash_features(matrix, n_features, column_name='Hash'):
    hasher = FeatureHasher(n_features=n_features)
    hash_vector = hasher.transform(matrix)
    hashed = hash_vector.toarray()
    cols = []
    for i in range(len(hashed[0])):
        cols.append(column_name+str(i))

    return pd.DataFrame(columns=cols, data=hashed)

def fit_lda(df, target='Score', n_components=3):
    train = df.sample(frac=0.7, random_state=123) #random state is a seed value
    test = df.drop(train.index)

    train_X = train.drop(target, 1).values
    train_y = train[target].values

    test_X = test.drop(target, 1).values
    test_y = test[target].values
    
    try:
        lda = LDA(n_components=n_components)
        lda_fit = lda.fit(train_X, train_y)
        return lda_fit.score(test_X, test_y)

    except:
        return 0
    
def clean(s):
        s = s.lower()
        s = re.sub("^\@\w+\s+", "", s) # remove tweeter name (e.g. "@elephantbird ")
        s = re.sub("(^\s+|\s+$)", "", s) # trim leading/trailing spaces
        s = re.sub("[^a-z]", " ", s) # remove all but letters and spaces
        return s
        


In [3]:
data_dir = "C:\\Users\\bonfardeci-j\\Documents\\DL\\Data Science Group Study\\data\\"
filepath = data_dir+"twitter_dataset.csv"
imdb = data_dir+"imdb_sentiment.csv"

In [4]:
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,sentiment_label,tweet_text
0,4,"@elephantbird Hey dear, Happy Friday to You A..."
1,4,Ughhh layin downnnn Waiting for zeina to co...
2,0,"@greeniebach I reckon he'll play, even if he's..."
3,0,@vaLewee I know! Saw it on the news!
4,0,very sad that http://www.fabchannel.com/ has c...


In [5]:
col_name = "tweet_text"
target_col = "sentiment_label"
# Replace punctuation, special characters and digits with space
sentiment = df[col_name].values
df[col_name] = list( map(clean, sentiment) )
df.head()

Unnamed: 0,sentiment_label,tweet_text
0,4,hey dear happy friday to you already had you...
1,4,ughhh layin downnnn waiting for zeina to co...
2,0,i reckon he ll play even if he s not b...
3,0,i know saw it on the news
4,0,very sad that http www fabchannel com has c...


In [45]:
# Find optimal number of feature hashes for highest balanced accuracy
matrix = list( map(lambda review: {'tweet_text': review}, df[col_name].values) )

best_ba = 0
optimal_n_features = 0
max_features = 500

start = time()

for i in range(50, max_features, 50):
    hash_df = pd.concat([df, hash_features(matrix=matrix, n_features=i, column_name=col_name)], 1).drop(col_name, 1)
    hash_df = hash_df.dropna()
    ba = fit_lda(hash_df, target_col)
    
    if(ba > best_ba):
        best_ba = ba
        optimal_n_features = i   
        print(ba, optimal_n_features, (time() - start)/1000.00)
    

print(best_ba, optimal_n_features) #

0.508395833333 50 0.0015580408573150635
0.508395833333 50


In [35]:
hash_df = pd.concat([df, hash_features(matrix=matrix, n_features=optimal_n_features, column_name=col_name)], 1)
hash_df = hash_df.dropna()
hash_df.to_csv(data_dir+"twitter_sentiment_hash_opt.csv")