In [24]:
from basic_nlp import *
from json_io import *
import datetime
import pickle
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, StratifiedShuffleSplit
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectPercentile, f_classif
import re
from random import shuffle
from sklearn.ensemble import VotingClassifier
from sklearn.lda import LDA
from sklearn.qda import QDA
import warnings

## Function to run on each tweet

In [2]:
def keyToStr(d):
    new = {}
    for key,value in d.items():
        new[" ".join(key)] = value
    return new

from re import sub

TWEET_LINK_RE = "https://t.co/(\w)+"
TWEET_HANDLE_RE = "@(\w)+"
HASHTAG_RE = "#(\w)+"

def feature(tweet):
    tweet = sub(TWEET_HANDLE_RE, "NameTOK", tweet)
    tweet = sub(TWEET_LINK_RE, "LinkTOK", tweet)
    tweet = sub(HASHTAG_RE, "", tweet)
    tweet = sub("the", "", tweet)
    tweet = sub("The", "", tweet)
    tokens = tokenize(tweet)
    ull = upperLowerLen(tokens)
    cases = wordCases(ull)
    tagged = pos(tokens)
    chunked = chunk(tagged)
    (tokens, postags) = tokNoNE(chunked)
    puncuationFreqDict = punctuationFeatures(tweet)
    suffreq = dict(freq(tokenSuffixes(tokens)))
    normSuffFreq = {}
    norm2SuffFreq = {}
    sumSuf = sum(suffreq.values())
    for key, val in suffreq.items():
        normSuffFreq[key] = val/sumSuf
        norm2SuffFreq[key] = val/len(tokens)
    
    sent = {
        'fullSent' : sentimentGrams([tokens]),
        'halfSent1' : sentimentGrams([tokens[:int(len(tokens)/2)]]),
        'halfSent2' : sentimentGrams([tokens[int(len(tokens)/2):]]),
        'thirdSent1' : sentimentGrams([tokens[:int(len(tokens)/3)]]),
        'thirdSent2' : sentimentGrams([tokens[int(len(tokens)/3):2*int(len(tokens)/3)]]),
        'thirdSent3' : sentimentGrams([tokens[2*int(len(tokens)/3):]]),
        'quartSent1' : sentimentGrams([tokens[:int(len(tokens)/4)]]),
        'quartSent2' : sentimentGrams([tokens[int(len(tokens)/4):2*int(len(tokens)/4)]]),
        'quartSent3' : sentimentGrams([tokens[2*int(len(tokens)/4):3*int(len(tokens)/4)]]),
        'quartSent4' : sentimentGrams([tokens[3*int(len(tokens)/4):]])
    }
    sentCompound = {}
    for key, val in sent.items():
        sentCompound[key+"Vader"] = val[0]['Vader']['compound'] + 1
        sentCompound[key+"LiuHu"] = val[0]['LiuHu']['compound'] + 1
    
    capFreq = capLetterFreq(ull)
    allCapsFreq = cases.count('AC')/len(cases)
    normSuffFreq = keyToStr(normSuffFreq)
    norm2SuffFreq = keyToStr(norm2SuffFreq)
    toksuff = keyToStr(dict(freq(tokenSuffixes(tokens))))
    unigrams = keyToStr(dict(freq(grams(tokens, 1))))
    bigrams = keyToStr(dict(freq(grams(tokens, 2))))
    trigrams = keyToStr(dict(freq(grams(tokens, 3))))
    unigramsPos = keyToStr(dict(freq(grams(postags, 1))))
    bigramsPos = keyToStr(dict(freq(grams(postags, 2))))
    trigramsPos = keyToStr(dict(freq(grams(postags, 3))))
    feat = {}
    feat.update(unigrams)
    feat.update(bigrams)
    feat.update(trigrams)
    feat.update(unigramsPos)
    feat.update(bigramsPos)
    feat.update(trigramsPos)
    feat.update(puncuationFreqDict)
    feat.update(toksuff)
    feat.update(normSuffFreq)
    feat.update(norm2SuffFreq)
    feat.update(sentCompound)
    feat.update({"capFreq":capFreq, "allCapsFreq":allCapsFreq})
    return feat

## Run features on tweets

In [7]:
start = datetime.datetime.now()
sarcasticTweets = tweet_iterate("../json/sarcastic/unique.json", key="text")
sarcasticFeats = [(feature(repr(tweet)), True) for tweet in sarcasticTweets]
print((datetime.datetime.now()-start).total_seconds())

213.051007


In [12]:
start = datetime.datetime.now()
seriousTweets = list(tweet_iterate("../json/non_sarcastic/unique.json", key="text"))
shuffle(seriousTweets)
seriousFeats = [(feature(repr(tweet)), False) for tweet in seriousTweets[:len(sarcasticFeats)]]
print((datetime.datetime.now()-start).total_seconds())

232.341769


## Save features from tweets

In [43]:
featTup = sarcasticFeats + seriousFeats
shuffle(featTup)
pickle.dump(featTup, open('pickledfeatures/feats.pickle', 'wb'))
(features, sarcasm) = list(zip(*featTup))

## Load features from tweets

In [2]:
pk = pickle.load(open('pickledfeatures/feats.pickle', 'rb'))
(features, sarcasm) = list(zip(*pk))

## Create Vectorizer and Save

In [60]:
dv = DictVectorizer()
(X,y) = (dv.fit_transform(features), np.array(sarcasm))
pickle.dump((X,y,dv), open('pickledfeatures/allFeaturesAndVectorizer.pickle', 'wb'))

## Load Vectorizer and features

In [2]:
(X,y,dv) = pickle.load(open('pickledfeatures/allFeaturesAndVectorizer.pickle', 'rb'))

## Randomly split vector

In [35]:
sss = StratifiedShuffleSplit(n_splits=10, train_size=0.8)

## Train & Test

In [37]:
warnings.filterwarnings("ignore")
results = {}
startTime=datetime.datetime.now()
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    startItterationTime=datetime.datetime.now()
    print("\n\nStarting itteration %d: "%(i)+ str(startItterationTime))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("Features before reduction: " + str(X_train.shape))
    reducer = SelectPercentile(score_func=f_classif, percentile=1)
    X_train = reducer.fit_transform(X_train,y_train)
    X_test = reducer.transform(X_test)
    print("Features after reduction: " + str(str(X_train.shape)))
    classifiers = [
        LogisticRegression(n_jobs=-1),
        #LogisticRegression(solver='sag', max_iter=1000, n_jobs=-1, warm_start=True),
        #SGDClassifier(loss='log', penalty='elasticnet', n_jobs=-1),
        #BernoulliNB(alpha=0.2, binarize=0.4),
        #MultinomialNB(alpha=0),
    ]
    print("\n\nStarting to train...")
    for n, c in enumerate(classifiers):
            s = datetime.datetime.now()
            c.fit(X_train, y_train)
            time = (datetime.datetime.now()-s).total_seconds()
            results[(i,n,str(type(c)))] = {'trainTime':time}
            print("Trained:\t%d\t%s\tTime: %d" % (n,str(type(c)), time))
    print("\n\nStarting to test...")
    for n, c in enumerate(classifiers):
            s = datetime.datetime.now()
            score = c.score(X_test, y_test)
            time = (datetime.datetime.now()-s).total_seconds()
            results[(i,n,str(type(c)))]['testTime'] = time
            results[(i,n,str(type(c)))]['score'] = score
            print("Tested:\t%d\t%s\tTime: %d\tScore:\t%f" % (n,str(type(c)), time, score))
    pickle.dump((classifiers,dv),open('pickled/'+re.sub(r'[ :<>".*,|\/]+', "", str(startTime))+" "+str(i)+'.pickle', 'wb'))
    
    vc1 = VotingClassifier(estimators=[(' '.join([str(n),str(type(c))]),c) for n, c in enumerate(classifiers)],
                          voting='soft',
                          weights=[3,2,1,1,1])
    vc2 = VotingClassifier(estimators=[(' '.join([str(n),str(type(c))]),c) for n, c in enumerate(classifiers)],
                         voting='hard')
    vc1.fit(X_train, y_train)
    vc2.fit(X_train, y_train)
    print('VC1:%f'%vc1.score(X_test, y_test))
    print('VC2:%f'%vc2.score(X_test, y_test))
    
    stopItterationTime=datetime.datetime.now()
    print("Itteration time:\t%d" % (stopItterationTime-startItterationTime).total_seconds())
    print("Total elapsed time:\t%d" % (stopItterationTime-startTime).total_seconds())



Starting itteration 0: 2017-04-06 15:51:04.126401
Features before reduction: (58206, 1307531)
Features after reduction: (58206, 13075)


Starting to train...
Trained:	0	<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 19
Trained:	1	<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 77
Trained:	2	<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>	Time: 0
Trained:	3	<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0
Trained:	4	<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0


Starting to test...
Tested:	0	<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 0	Score:	0.804975
Tested:	1	<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 0	Score:	0.808824
Tested:	2	<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>	Time: 0	Score:	0.782848
Tested:	3	<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0	Score:	0.761682
Tested:	4	<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.751649
VC1:0.801677


In [33]:
warnings.filterwarnings("ignore")
results = {}
startTime=datetime.datetime.now()
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    startItterationTime=datetime.datetime.now()
    print("\n\nStarting itteration %d: "%(i)+ str(startItterationTime))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("Features before reduction: " + str(X_train.shape))
    reducer = SelectPercentile(score_func=f_classif, percentile=1)
    X_train = reducer.fit_transform(X_train,y_train)
    X_test = reducer.transform(X_test)
    print("Features after reduction: " + str(str(X_train.shape)))
    classifiers = [
        #(SGDClassifier(penalty='elasticnet', n_jobs=-1), {'loss':['log','modified_huber','perceptron'], 'penalty':['none','l1','elasticnet','l2']}), 
        #(BernoulliNB(),{'alpha':list(np.arange(0,20,0.1)), 'binarize':list(np.arange(0.1,0.9,0.1))}),
        #(MultinomialNB(),{'alpha':list(np.arange(0,20,0.1))})
    ]    
    print("\n\nStarting to train & Test...")
    for n, (c,options) in enumerate(classifiers):
            try:
                clf= RandomizedSearchCV(c,options, cv=10, n_iter=100)
                s = datetime.datetime.now()
                clf.fit(X_train, y_train)
            except ValueError:
                clf= GridSearchCV(c,options, cv=10)
                s = datetime.datetime.now()
                clf.fit(X_train, y_train)
            time = (datetime.datetime.now()-s).total_seconds()
            y_true, y_pred = y_test, clf.predict(X_test)
            print("\t%d\t%s\tTime: %d\tParams::\t%s" % (n,str(type(c)), time, str(clf.best_params_)))
            print(classification_report(y_true, y_pred))
    
    stopItterationTime=datetime.datetime.now()
    print("Itteration time:\t%d" % (stopItterationTime-startItterationTime).total_seconds())
    print("Total elapsed time:\t%d" % (stopItterationTime-startTime).total_seconds())



Starting itteration 0: 2017-04-06 13:33:04.153329
Features before reduction: (58206, 1307531)
Features after reduction: (58206, 13075)


Starting to train & Test...
	0	<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>	Time: 31	Params::	{'penalty': 'none', 'loss': 'perceptron'}
             precision    recall  f1-score   support

      False       0.80      0.71      0.75      3638
       True       0.74      0.82      0.78      3638

avg / total       0.77      0.76      0.76      7276

	1	<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 210	Params::	{'binarize': 0.40000000000000002, 'alpha': 0.20000000000000001}
             precision    recall  f1-score   support

      False       0.76      0.77      0.77      3638
       True       0.77      0.76      0.77      3638

avg / total       0.77      0.77      0.77      7276

	2	<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 90	Params::	{'alpha': 0.0}
             precision    recall  f1-score   support

      False  

KeyboardInterrupt: 