In [1]:
import pandas as pd
import math
import numpy as np
from time import time
import string
import itertools
from html import unescape
import preprocessor as p
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn import cross_validation
from sklearn import svm
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

#### Read data from Excel file and create data frame

In [2]:
def read_excel_file(filename, sheetname):
    """Load a single worksheet of an Excel workbook.

    Parameters
    ----------
    filename : path (or other object accepted by ``pd.read_excel``) of the
        workbook.
    sheetname : sheet selector, forwarded positionally as the second argument
        of ``pd.read_excel``; callers in this notebook pass 0 and 1.

    Returns
    -------
    Whatever ``pd.read_excel`` returns for that sheet.
    """
    # The selector is passed positionally on purpose: the keyword name differs
    # between pandas releases.
    sheet_frame = pd.read_excel(filename, sheetname)
    return sheet_frame

In [3]:
def _keep_labelled_rows(sheet_df):
    """Normalise the ``Class`` column and keep only rows labelled 1, -1 or 0."""
    # Same normalisation order as before: trim surrounding spaces first, then
    # remove the '!!!!' marker.
    labels = (sheet_df['Class'].astype('str')
              .map(lambda x: x.strip(' '))
              .map(lambda x: x.replace('!!!!', '')))
    normalised = sheet_df.assign(Class=labels)

    # Rows are gathered class by class ('1', then '-1', then '0') so the
    # resulting row order matches the original chained ``append`` calls.
    # ``pd.concat`` is used because ``DataFrame.append`` no longer exists in
    # recent pandas releases.
    per_class = [normalised.loc[normalised.Class == label]
                 for label in ('1', '-1', '0')]
    selected = pd.concat(per_class)

    # Discard rows that still have a missing value in any column.
    return selected.dropna()


def read_from_input_data(filename):
    """Read the Obama and Romney tweet sheets from one workbook.

    Sheet 0 is treated as the Obama data and sheet 1 as the Romney data. In
    each sheet the ``Class`` column is converted to a string, stripped of
    surrounding spaces and of the ``'!!!!'`` marker; only rows whose class is
    ``'1'``, ``'-1'`` or ``'0'`` are kept, and rows containing any missing
    value are dropped.

    Parameters
    ----------
    filename : workbook path handed to ``read_excel_file``.

    Returns
    -------
    tuple
        ``(obama_frame, romney_frame)``.
    """
    tweet_data_df_obama = _keep_labelled_rows(read_excel_file(filename, 0))
    tweet_data_df_romney = _keep_labelled_rows(read_excel_file(filename, 1))
    return tweet_data_df_obama, tweet_data_df_romney

In [4]:
def split_training_data(df, train_data_prcnt, seed=None):
    """Randomly split ``df`` row-wise into a train part and a test part.

    Each row is assigned to the train part independently with probability
    ``train_data_prcnt / 100``, so the realised sizes only approximate the
    requested percentage.

    Parameters
    ----------
    df : DataFrame to split.
    train_data_prcnt : percentage (0-100) of rows expected in the train part.
    seed : optional seed. When given, a private ``RandomState`` is used so the
        split is reproducible; when ``None`` (default) the global NumPy random
        state is used, as before.

    Returns
    -------
    tuple
        ``(train, test)``; every row of ``df`` lands in exactly one of them.
    """
    draw = np.random.rand if seed is None else np.random.RandomState(seed).rand
    # One uniform draw in [0, 1) per row; rows below the threshold go to train.
    msk = draw(len(df)) < train_data_prcnt / 100.0
    train = df[msk]
    test = df[~msk]
    return train, test

In [5]:
# Workbooks holding the labelled tweets (paths are relative to the notebook).
TRAIN_WORKBOOK = "training-Obama-Romney-tweets.xlsx"
TEST_WORKBOOK = "testing-Obama-Romney-tweets.xlsx"

# Each workbook yields an (Obama, Romney) pair of frames.
tweet_df_obama, tweet_df_romney = read_from_input_data(TRAIN_WORKBOOK)
tweet_df_obama_test, tweet_df_romney_test = read_from_input_data(TEST_WORKBOOK)

# Punctuation characters that clean_tweet_text removes from tweets.
exclude = set(string.punctuation)

In [6]:
# Report how many tweets of each class every data set contains.
# Headings after the first start with a newline so the sections are separated
# by a blank line.
_count_sections = (
    ("Train Obama Data", tweet_df_obama),
    ("\nTrain Romney Data", tweet_df_romney),
    ("\nTest Obama Data", tweet_df_obama_test),
    ("\nTest Romney Data", tweet_df_romney_test),
)
# (label stored in the Class column, right-aligned text shown for it)
_count_labels = (('-1', "-1"), ('1', " 1"), ('0', " 0"))

for _heading, _frame in _count_sections:
    print(_heading)
    for _label, _shown in _count_labels:
        print(_shown + " : " + str(len(_frame.loc[_frame.Class == _label])))

Train Obama Data
-1 : 1968
 1 : 1679
 0 : 1977

Train Romney Data
-1 : 2893
 1 : 1075
 0 : 1680

Test Obama Data
-1 : 688
 1 : 582
 0 : 681

Test Romney Data
-1 : 960
 1 : 385
 0 : 555


In [7]:
def clean_tweet_text(text):
    """Normalise one raw tweet string for bag-of-words vectorisation.

    Missing values (anything ``pd.isnull`` reports as null) are returned
    unchanged. Otherwise the text goes through, in order: HTML-entity
    unescaping, removal of non-ASCII characters, ``p.clean`` followed by
    lower-casing, removal of the ``<e>``/``<a>`` annotation tags, shortening of
    every run of a repeated character to at most two, and removal of every
    character found in the module-level ``exclude`` set.

    Parameters
    ----------
    text : raw tweet text, or a null value.

    Returns
    -------
    The cleaned string, or ``text`` itself when it is null.
    """
    # Nothing to clean for a missing cell.
    if pd.isnull(text):
        return text

    # Turn HTML entities back into the characters they stand for.
    cleaned = unescape(text)

    # Encoding to ASCII with errors ignored silently drops every non-ASCII
    # character; decoding the pure-ASCII bytes then changes nothing further.
    cleaned = cleaned.encode("ascii", "ignore").decode("utf8")

    # Run the imported tweet preprocessor's clean(), then lower-case.
    cleaned = str.lower(p.clean(cleaned))

    # Strip the annotation tags left in the tweets.
    for tag in ("<e>", "</e>", "<a>", "</a>"):
        cleaned = cleaned.replace(tag, "")

    # Keep at most two consecutive copies of any character ("soooo" -> "soo").
    shortened_runs = []
    for _, run in itertools.groupby(cleaned):
        shortened_runs.append(''.join(run)[:2])
    cleaned = ''.join(shortened_runs)

    # Finally drop punctuation characters.
    kept_chars = [ch for ch in cleaned if ch not in exclude]
    return ''.join(kept_chars)

In [8]:
# Clean the tweet text of all four frames. The column really is spelled
# 'Anootated tweet' in the workbooks, so the name must stay as it is.
for _frame in (tweet_df_obama, tweet_df_romney,
               tweet_df_obama_test, tweet_df_romney_test):
    _frame['Anootated tweet'] = _frame['Anootated tweet'].map(clean_tweet_text)


#### Split data into train and test data

In [9]:
# Disabled: random 75/25 split of the training sheets. The cells above load a
# separate testing workbook instead, so this code is kept for reference only.
#
# tweet_random_df_obama = tweet_df_obama.copy()
# for i in range(0, 500):
#     split1_df, split2_df = split_training_data(tweet_random_df_obama, 50)
#     tweet_random_df_obama = pd.concat([split1_df, split2_df])
# train_df_obama, test_df_obama = split_training_data(tweet_random_df_obama, 75)
#
# tweet_random_df_romney = tweet_df_romney.copy()
# for i in range(0, 500):
#     split1_df, split2_df = split_training_data(tweet_random_df_romney, 50)
#     tweet_random_df_romney = pd.concat([split1_df, split2_df])
# train_df_romney, test_df_romney = split_training_data(tweet_random_df_romney, 75)

# Row counts, in order: Obama train, Obama test, Romney train, Romney test.
for _frame in (tweet_df_obama, tweet_df_obama_test,
               tweet_df_romney, tweet_df_romney_test):
    print(len(_frame))

5624
1951
5648
1900


#### Convert Bag of Words model to sparse vectors

In [10]:
# Settings shared by the text vectorizers below.
MAX_FEATURES = 5000
STOP_WORDS = 'english'

# Raw term counts; combined with tfidf_transformer in the next cell.
count_vect = CountVectorizer(stop_words=STOP_WORDS, max_features=MAX_FEATURES)
# One-step TF-IDF alternative (sub-linear tf, ignores terms present in more
# than half of the documents).
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words=STOP_WORDS, max_features=MAX_FEATURES)
# Hashing-based alternative with the same number of output columns.
hash_vectorizer = HashingVectorizer(stop_words=STOP_WORDS, non_negative=True,
                                    n_features=MAX_FEATURES)
# Re-weights count matrices into TF-IDF matrices.
tfidf_transformer = TfidfTransformer()

In [11]:
def _bow_tfidf(train_text, test_text):
    """Vectorise one train/test pair with the shared count_vect and
    tfidf_transformer.

    Both objects are fitted on ``train_text`` only and then applied to
    ``test_text``. Returns
    ``(train_counts, test_counts, train_tfidf, test_tfidf)``.
    """
    train_counts = count_vect.fit_transform(train_text)
    test_counts = count_vect.transform(test_text)
    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    test_tfidf = tfidf_transformer.transform(test_counts)
    return train_counts, test_counts, train_tfidf, test_tfidf


# Obama features and labels.
(X_train_counts_obama, X_test_counts_obama,
 X_train_obama, X_test_obama) = _bow_tfidf(
    tweet_df_obama['Anootated tweet'],
    tweet_df_obama_test['Anootated tweet'])
y_train_obama = tweet_df_obama['Class']
y_test_obama = tweet_df_obama_test['Class']

# Romney features and labels. The shared vectorizer and transformer are refit
# here, so after this cell they hold the Romney vocabulary, not the Obama one.
(X_train_counts_romney, X_test_counts_romney,
 X_train_romney, X_test_romney) = _bow_tfidf(
    tweet_df_romney['Anootated tweet'],
    tweet_df_romney_test['Anootated tweet'])
y_train_romney = tweet_df_romney['Class']
y_test_romney = tweet_df_romney_test['Class']


### Classification function

In [12]:
def pipeline_function(clf, X_train, X_test, y_train, y_test, categories):
    """Fit ``clf``, evaluate it on the test split and print a report.

    Prints the estimator, the fit and predict wall-clock times, the accuracy,
    the coefficient dimensionality and density (only for estimators exposing
    ``coef_``), a per-class classification report and the confusion matrix.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict``.
    X_train, X_test : feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test : label vectors for the two matrices.
    categories : class labels. They fix both the row order and the row names
        of the report and the row/column order of the confusion matrix.
        ``None`` lets scikit-learn infer the labels from the data.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)`` where ``clf_descr`` is
        the text of ``str(clf)`` before the first ``'('``, ``score`` is the
        accuracy and the times are in seconds.
    """
    print('*' * 80)
    print("Training: ")
    print(clf)

    fit_started = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_started
    print("train time: %0.3fs" % train_time)

    predict_started = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_started
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    # Only linear-style models expose coef_.
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    # Pass the labels explicitly together with their names. Without `labels`
    # the names are matched positionally against whatever labels happen to
    # occur in y_test/pred, which mislabels rows as soon as a class is absent.
    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        labels=categories,
                                        target_names=categories))

    # Same explicit label order, so rows/columns line up with the report.
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred, labels=categories))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


### Naive Bayes Algorithm

In [13]:
# Start a fresh result list and derive the sorted class labels from the Obama
# training labels (reused by the later Obama cells).
results = []
categories_obama = np.unique(y_train_obama.values)

# Multinomial Naive Bayes on the Obama TF-IDF features.
print('*' * 80)
print("Naive Bayes Obama")
nb_obama_summary = pipeline_function(
    MultinomialNB(alpha=.01),
    X_train_obama, X_test_obama,
    y_train_obama, y_test_obama,
    categories_obama,
)
results.append(nb_obama_summary)

********************************************************************************
Naive Bayes Obama
********************************************************************************
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.017s
test time:  0.000s
accuracy:   0.519
dimensionality: 5000
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.54      0.58      0.56       688
          0       0.49      0.48      0.48       681
          1       0.54      0.50      0.52       582

avg / total       0.52      0.52      0.52      1951

confusion matrix:
[[397 189 102]
 [206 325 150]
 [139 153 290]]



In [14]:
# The result list is reset, and the sorted class labels are derived from the
# Romney training labels (reused by the later Romney cells).
results = []
categories_romney = np.unique(y_train_romney.values)

# Multinomial Naive Bayes on the Romney TF-IDF features.
print('*' * 80)
print("Naive Bayes Romney")
nb_romney_summary = pipeline_function(
    MultinomialNB(alpha=.01),
    X_train_romney, X_test_romney,
    y_train_romney, y_test_romney,
    categories_romney,
)
results.append(nb_romney_summary)

********************************************************************************
Naive Bayes Romney
********************************************************************************
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.016s
test time:  0.000s
accuracy:   0.568
dimensionality: 5000
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.61      0.78      0.68       960
          0       0.46      0.34      0.39       555
          1       0.56      0.36      0.44       385

avg / total       0.55      0.57      0.55      1900

confusion matrix:
[[751 156  53]
 [312 188  55]
 [177  68 140]]



### k-Nearest Neighbour Classification


In [15]:
# k-Nearest Neighbour (k=10, all CPU cores) on the Obama features.
# The result list is reset first.
results = []
print('*' * 80)
print("k Nearest Neighbour Obama")
knn_obama_summary = pipeline_function(
    KNeighborsClassifier(n_neighbors=10, n_jobs=-1),
    X_train_obama, X_test_obama,
    y_train_obama, y_test_obama,
    categories_obama,
)
results.append(knn_obama_summary)

********************************************************************************
k Nearest Neighbour Obama
********************************************************************************
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='uniform')
train time: 0.009s
test time:  1.005s
accuracy:   0.482
classification report:
             precision    recall  f1-score   support

         -1       0.63      0.25      0.36       688
          0       0.47      0.60      0.52       681
          1       0.45      0.62      0.52       582

avg / total       0.52      0.48      0.47      1951

confusion matrix:
[[173 279 236]
 [ 66 406 209]
 [ 35 185 362]]



In [16]:
# k-Nearest Neighbour (k=10, all CPU cores) on the Romney features.
# The result list is reset first.
results = []
print('*' * 80)
print("k Nearest Neighbour Romney")
knn_romney_summary = pipeline_function(
    KNeighborsClassifier(n_neighbors=10, n_jobs=-1),
    X_train_romney, X_test_romney,
    y_train_romney, y_test_romney,
    categories_romney,
)
results.append(knn_romney_summary)

********************************************************************************
k Nearest Neighbour Romney
********************************************************************************
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='uniform')
train time: 0.006s
test time:  1.099s
accuracy:   0.558
classification report:
             precision    recall  f1-score   support

         -1       0.61      0.78      0.69       960
          0       0.41      0.39      0.40       555
          1       0.64      0.25      0.36       385

avg / total       0.56      0.56      0.54      1900

confusion matrix:
[[751 187  22]
 [310 214  31]
 [171 118  96]]



### Random Forest Classification

In [17]:
# Random Forest with 100 trees on the Obama features. No random_state is
# passed, so the scores can differ from run to run.
results = []
print('=' * 80)
print("Random Forest Obama")
rf_obama_summary = pipeline_function(
    RandomForestClassifier(n_estimators=100),
    X_train_obama, X_test_obama,
    y_train_obama, y_test_obama,
    categories_obama,
)
results.append(rf_obama_summary)

Random Forest Obama
********************************************************************************
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
train time: 4.874s
test time:  0.143s
accuracy:   0.534
classification report:
             precision    recall  f1-score   support

         -1       0.60      0.54      0.57       688
          0       0.49      0.57      0.52       681
          1       0.52      0.49      0.51       582

avg / total       0.54      0.53      0.53      1951

confusion matrix:
[[370 201 117]
 [152 387 142]
 [ 91 206 285]]



In [18]:
# Random Forest with 100 trees on the Romney features. No random_state is
# passed, so the scores can differ from run to run.
results = []
print('=' * 80)
print("Random Forest Romney")
rf_romney_summary = pipeline_function(
    RandomForestClassifier(n_estimators=100),
    X_train_romney, X_test_romney,
    y_train_romney, y_test_romney,
    categories_romney,
)
results.append(rf_romney_summary)

Random Forest Romney
********************************************************************************
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
train time: 5.251s
test time:  0.148s
accuracy:   0.588
classification report:
             precision    recall  f1-score   support

         -1       0.60      0.85      0.70       960
          0       0.50      0.31      0.38       555
          1       0.64      0.34      0.44       385

avg / total       0.58      0.59      0.56      1900

confusion matrix:
[[814 113  33]
 [342 173  40]
 [197  58 130]]



### SGD Model

In [19]:
# Linear model trained with SGD (L1 penalty, 50 passes) on the Obama features.
# The result list is reset first.
results = []
print('*' * 80)
print("SGD Model Obama")
sgd_obama_summary = pipeline_function(
    SGDClassifier(alpha=.0001, n_iter=50, penalty='l1'),
    X_train_obama, X_test_obama,
    y_train_obama, y_test_obama,
    categories_obama,
)
results.append(sgd_obama_summary)

********************************************************************************
SGD Model Obama
********************************************************************************
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
train time: 0.133s
test time:  0.000s
accuracy:   0.547
dimensionality: 5000
density: 0.279267
classification report:
             precision    recall  f1-score   support

         -1       0.57      0.57      0.57       688
          0       0.51      0.53      0.52       681
          1       0.56      0.54      0.55       582

avg / total       0.55      0.55      0.55      1951

confusion matrix:
[[394 188 106]
 [184 361 136]
 [114 156 312]]



In [20]:
# Linear model trained with SGD (L1 penalty, 50 passes) on the Romney
# features. The result list is reset first.
results = []
print('*' * 80)
print("SGD Model Romney")
sgd_romney_summary = pipeline_function(
    SGDClassifier(alpha=.0001, n_iter=50, penalty='l1'),
    X_train_romney, X_test_romney,
    y_train_romney, y_test_romney,
    categories_romney,
)
results.append(sgd_romney_summary)

********************************************************************************
SGD Model Romney
********************************************************************************
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
train time: 0.119s
test time:  0.000s
accuracy:   0.562
dimensionality: 5000
density: 0.261533
classification report:
             precision    recall  f1-score   support

         -1       0.61      0.78      0.68       960
          0       0.43      0.32      0.37       555
          1       0.53      0.37      0.44       385

avg / total       0.54      0.56      0.54      1900

confusion matrix:
[[744 153  63]
 [312 179  64]
 [158  83 144]]



### Linear SVM Classifier

In [21]:
# Linear SVM (squared hinge loss, L2 penalty, primal form) on the Obama
# features. Unlike the earlier cells, `results` is not reset here, so the
# summary is appended to whatever the previously run cell left in the list.
print('*' * 80)
print("Linear SVM Model Obama")
svm_obama_summary = pipeline_function(
    LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3),
    X_train_obama, X_test_obama,
    y_train_obama, y_test_obama,
    categories_obama,
)
results.append(svm_obama_summary)

********************************************************************************
Linear SVM Model Obama
********************************************************************************
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)
train time: 0.058s
test time:  0.000s
accuracy:   0.536
dimensionality: 5000
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.55      0.57      0.56       688
          0       0.51      0.51      0.51       681
          1       0.56      0.52      0.54       582

avg / total       0.54      0.54      0.54      1951

confusion matrix:
[[392 186 110]
 [201 348 132]
 [122 155 305]]



In [22]:
# Linear SVM (squared hinge loss, L2 penalty, primal form) on the Romney
# features. `results` is not reset here either, so the summary is appended to
# the existing list.
print('*' * 80)
print("Linear SVM Model Romney")
svm_romney_summary = pipeline_function(
    LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3),
    X_train_romney, X_test_romney,
    y_train_romney, y_test_romney,
    categories_romney,
)
results.append(svm_romney_summary)

********************************************************************************
Linear SVM Model Romney
********************************************************************************
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)
train time: 0.056s
test time:  0.000s
accuracy:   0.556
dimensionality: 5000
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.64      0.70      0.67       960
          0       0.43      0.41      0.42       555
          1       0.51      0.42      0.46       385

avg / total       0.55      0.56      0.55      1900

confusion matrix:
[[670 208  82]
 [255 225  75]
 [130  93 162]]

