In [1]:
import pandas as pd
import math
import numpy as np
from time import time
import string
import itertools
from html import unescape
import preprocessor as p
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn import cross_validation
from sklearn import svm
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

#### Read data from Excel file and create data frame

In [2]:
def readExcelFile(filename):
    """Read the first two worksheets of an Excel workbook into one DataFrame.

    Parameters
    ----------
    filename : path or file-like accepted by ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Rows of sheet 0 followed by rows of sheet 1. Each sheet keeps its own
        index labels, so the combined index may contain repeated values.
    """
    # The sheet selector is passed positionally: its keyword was renamed from
    # ``sheetname`` to ``sheet_name`` across pandas releases, while the
    # positional form is accepted by both.
    sheet_frames = [pd.read_excel(filename, sheet_idx) for sheet_idx in (0, 1)]
    # pd.concat stacks the sheets exactly like DataFrame.append did, without
    # relying on a method that newer pandas releases no longer provide.
    return pd.concat(sheet_frames)

In [3]:
def extractReqData(orig_excel_df):
    """Strip the ``date`` and ``time`` columns from the frame, in place.

    The frame passed in is modified directly and the same object is returned.
    """
    for unwanted_column in ('date', 'time'):
        del orig_excel_df[unwanted_column]
    return orig_excel_df

In [4]:
def readFormatInputData(filename):
    """Load the tweet workbook and keep only cleanly labelled, complete rows.

    Steps:
      1. read both worksheets via ``readExcelFile`` and drop the ``date`` and
         ``time`` columns via ``extractReqData``;
      2. turn ``Class`` into a string, strip surrounding spaces, then remove
         any ``'!!!!'`` marker;
      3. keep rows whose label is exactly ``'1'``, ``'-1'`` or ``'0'``
         (in that order in the result);
      4. drop rows that still contain a missing value in any column.

    Parameters
    ----------
    filename : path of the Excel workbook.

    Returns
    -------
    pandas.DataFrame with string labels in ``Class``.
    """
    tweet_df = extractReqData(readExcelFile(filename))

    # One pass over the labels: strip spaces first, then remove the marker,
    # which is the same order the two separate passes used before.
    tweet_df['Class'] = tweet_df['Class'].astype('str').map(
        lambda label: label.strip(' ').replace('!!!!', ''))

    # pd.concat instead of chained DataFrame.append, which newer pandas
    # releases no longer provide; the class order 1, -1, 0 is preserved.
    tweet_data_df = pd.concat(
        [tweet_df.loc[tweet_df.Class == label] for label in ('1', '-1', '0')])

    return tweet_data_df.dropna()

In [5]:
def splitTrainingData(df, train_data_prcnt, random_state=None):
    """Randomly partition the rows of ``df`` into a train part and a test part.

    Every row is assigned independently: it goes to the train part with
    probability ``train_data_prcnt / 100``. The resulting sizes are therefore
    only approximately the requested percentage.

    Parameters
    ----------
    df : pandas.DataFrame to split.
    train_data_prcnt : number in [0, 100]; 100 sends every row to train,
        0 sends every row to test.
    random_state : optional int seed. ``None`` (default) draws from NumPy's
        global random state, exactly as before; an int makes the split
        reproducible without touching the global state.

    Returns
    -------
    (train, test) : two DataFrames that together contain every row of ``df``.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    msk = rng.rand(len(df)) < train_data_prcnt / 100
    return df[msk], df[~msk]

In [6]:
# ASCII punctuation characters; cleanTweetText filters these out of each tweet.
exclude = set(string.punctuation)
# Labelled tweets (classes '1', '-1', '0') with incomplete rows removed.
tweet_df = readFormatInputData("training-Obama-Romney-tweets.xlsx")

In [7]:
# Class balance: number of tweets carrying each label.
print("-1 : " + str((tweet_df.Class == '-1').sum()))
print(" 1 : " + str((tweet_df.Class == '1').sum()))
print(" 0 : " + str((tweet_df.Class == '0').sum()))

-1 : 4861
 1 : 2754
 0 : 3657


In [8]:
# Rebuild tweet_df with the three classes roughly balanced: classes '-1' and
# '0' are randomly down-sampled towards the size of class '1'.
tweet_df = readExcelFile("training-Obama-Romney-tweets.xlsx")
del tweet_df['date']
del tweet_df['time']

# Normalise labels: string type, surrounding spaces stripped, '!!!!' removed.
tweet_df['Class'] = tweet_df['Class'].astype('str').map(
    lambda label: label.strip(' ').replace('!!!!', ''))

tweet_positive_df = tweet_df.loc[tweet_df.Class == '1']
tweet_negative_df = tweet_df.loc[tweet_df.Class == '-1']
tweet_zero_df = tweet_df.loc[tweet_df.Class == '0']

# splitTrainingData keeps each row with the given probability, so the sampled
# class sizes only approximate the size of class '1'.
split1_df, split2_df = splitTrainingData(
    tweet_negative_df, (len(tweet_positive_df) / len(tweet_negative_df)) * 100)
# pd.concat throughout (previously a mix of DataFrame.append and pd.concat).
tweet_data_df = pd.concat([split1_df, tweet_positive_df])

split1_df, split2_df = splitTrainingData(
    tweet_zero_df, (len(tweet_positive_df) / len(tweet_zero_df)) * 100)
tweet_data_df = pd.concat([tweet_data_df, split1_df])

# Rows with any missing value are dropped only after sampling.
tweet_data_df.dropna(inplace=True)

tweet_df = tweet_data_df.copy()

In [9]:
def cleanTweetText(text):
    """Normalise one raw tweet into lower-case text without punctuation.

    Null values (as judged by ``pd.isnull``) are returned unchanged. Any other
    value is converted to ``str`` first, so a non-string cell (for example a
    number) is cleaned instead of raising inside ``unescape``.

    Processing order:
      1. HTML entities are unescaped;
      2. every non-ASCII character is dropped;
      3. the text goes through ``p.clean`` (the tweet preprocessor) and is
         lower-cased;
      4. the literal markers ``<e>``, ``</e>``, ``<a>``, ``</a>`` are removed;
      5. any run of one repeated character is cut to at most two;
      6. characters in the module-level ``exclude`` set are removed.

    Returns
    -------
    The cleaned string, or the original null value.
    """
    if pd.isnull(text):
        return text

    # Guard against non-string cells; a no-op for values that are already str.
    text = str(text)

    # Replace HTML escape sequences with the characters they stand for.
    text = unescape(text)

    # Discard everything outside ASCII (this does not convert to UTF-8).
    text = text.encode("ascii", "ignore").decode("ascii")

    # Clean with the tweet preprocessor and convert to lower case.
    text = str.lower(p.clean(text))

    # Remove the annotation markers left in the tweet text.
    for marker in ("<e>", "</e>", "<a>", "</a>"):
        text = text.replace(marker, "")

    # Cap every run of one repeated character at two occurrences.
    text = ''.join(char * min(len(list(run)), 2)
                   for char, run in itertools.groupby(text))

    # Remove punctuation.
    return ''.join(ch for ch in text if ch not in exclude)

In [10]:
# Replace every raw tweet with its cleaned form.
tweet_df['Anootated tweet'] = tweet_df['Anootated tweet'].apply(cleanTweetText)

In [11]:
# Class balance: number of tweets carrying each label.
print("-1 : " + str((tweet_df.Class == '-1').sum()))
print(" 1 : " + str((tweet_df.Class == '1').sum()))
print(" 0 : " + str((tweet_df.Class == '0').sum()))

-1 : 2772
 1 : 2754
 0 : 2785


#### Split data into train and test data

In [12]:
# Shuffle the rows with a single random permutation; this replaces 500 rounds
# of random split-and-concatenate that served the same purpose.
tweet_random_df = tweet_df.iloc[np.random.permutation(len(tweet_df))]

# Roughly 75% of the rows go to training, the rest to testing.
train_df, test_df = splitTrainingData(tweet_random_df, 75)

# NOTE(review): duplicate tweets are not removed before splitting, so the same
# text may appear in both train_df and test_df — confirm whether that is wanted.

#### Convert Bag of Words model to sparse vectors

In [13]:
# Text-to-feature converters; English stop words are dropped by all three
# vectorizers.
count_vect = CountVectorizer(max_features=500, stop_words='english')
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=500,
    max_df=0.5,
    sublinear_tf=True,
)
hash_vectorizer = HashingVectorizer(
    n_features=2000,
    stop_words='english',
    non_negative=True,
)
# Turns raw term counts into TF-IDF weights.
tfidf_transformer = TfidfTransformer()

In [14]:
# Targets: the sentiment label of every tweet.
y_train, y_test = train_df['Class'], test_df['Class']

# Term counts: the vocabulary is fitted on the training tweets only and then
# applied unchanged to the test tweets.
# (hash_vectorizer or vectorizer can be swapped in for count_vect below.)
X_train_counts = count_vect.fit_transform(train_df['Anootated tweet'])
X_test_counts = count_vect.transform(test_df['Anootated tweet'])

# TF-IDF weighting, likewise fitted on the training counts only.
X_train = tfidf_transformer.fit_transform(X_train_counts)
X_test = tfidf_transformer.transform(X_test_counts)

In [15]:
def trim(s):
    """Shorten ``s`` to at most 80 characters, marking any cut with "..."."""
    if len(s) > 80:
        return s[:77] + "..."
    return s

### Classification function

In [16]:
def benchmark(clf):
    """Fit ``clf``, evaluate it on the held-out tweets and print a report.

    Reads the notebook-level names ``X_train``, ``y_train``, ``X_test``,
    ``y_test`` and ``categories``. Prints the estimator, fit and predict
    timings, accuracy, a per-class classification report and the confusion
    matrix; for estimators exposing ``coef_`` it also prints the number of
    coefficient columns and their density.

    Returns
    -------
    tuple ``(name, accuracy, train_time, test_time)`` where ``name`` is the
    estimator's string form up to the first ``'('``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_started = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_started
    print("train time: %0.3fs" % train_time)

    predict_started = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_started
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    # Only estimators that expose a coefficient matrix report its size.
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    report_text = metrics.classification_report(
        y_test, pred, target_names=categories)
    print(report_text)

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).partition('(')[0]
    return clf_descr, score, train_time, test_time


### Naive Bayes Algorithm

In [17]:
# One (name, accuracy, train_time, test_time) tuple is collected per classifier.
results = list()
# Distinct training labels; benchmark uses them as row names in its report.
categories = np.unique(y_train.values)

# Multinomial Naive Bayes on the sparse feature matrix.
print("=" * 80)
print("Naive Bayes")
results.append(
    benchmark(MultinomialNB(alpha=0.01))
)

Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.022s
test time:  0.000s
accuracy:   0.531
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.48      0.56      0.52       668
          0       0.50      0.41      0.45       699
          1       0.60      0.63      0.61       708

avg / total       0.53      0.53      0.53      2075

confusion matrix:
[[373 164 131]
 [250 285 164]
 [148 116 444]]



### k-Nearest Neighbour Classification

In [18]:
# k-nearest-neighbours classifier using the 10 closest training tweets.
print("=" * 80)
print("kNN")
results.append(
    benchmark(KNeighborsClassifier(n_jobs=-1, n_neighbors=10))
)

kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='uniform')
train time: 0.008s
test time:  1.384s
accuracy:   0.455
classification report:
             precision    recall  f1-score   support

         -1       0.48      0.28      0.35       668
          0       0.40      0.65      0.49       699
          1       0.56      0.44      0.49       708

avg / total       0.48      0.45      0.45      2075

confusion matrix:
[[184 370 114]
 [119 451 129]
 [ 82 317 309]]



### Random Forest Classification

In [19]:
# Random forest built from 100 trees.
print("=" * 80)
print("Random Forest")
results.append(
    benchmark(RandomForestClassifier(n_estimators=100))
)

Random Forest
________________________________________________________________________________
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
train time: 3.527s
test time:  0.104s
accuracy:   0.529
classification report:
             precision    recall  f1-score   support

         -1       0.51      0.56      0.54       668
          0       0.48      0.48      0.48       699
          1       0.60      0.55      0.57       708

avg / total       0.53      0.53      0.53      2075

confusion matrix:
[[373 176 119]
 [219 339 141]
 [133 189 386]]



### SGD Model

In [20]:
# Linear model fitted by stochastic gradient descent with an L1 penalty.
print("=" * 80)
print("SGD Model")
results.append(
    benchmark(SGDClassifier(penalty='l1', alpha=0.0001, n_iter=50))
)

SGD Model
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
train time: 0.170s
test time:  0.000s
accuracy:   0.526
dimensionality: 500
density: 0.730000
classification report:
             precision    recall  f1-score   support

         -1       0.50      0.49      0.50       668
          0       0.50      0.46      0.48       699
          1       0.57      0.62      0.59       708

avg / total       0.52      0.53      0.52      2075

confusion matrix:
[[329 183 156]
 [196 324 179]
 [127 142 439]]



### Linear SVM Classifier

In [21]:
# Train Linear SVM Classifier (squared hinge loss, L2 penalty, primal form).
# 'squared_hinge' is the current name of the loss formerly spelled 'l2'; the
# old spelling is a deprecated alias for the same loss.
print('=' * 80)
print("Linear SVM Model")
results.append(benchmark(LinearSVC(loss='squared_hinge', penalty='l2',
                                   dual=False, tol=1e-3)))

Linear SVM Model
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.001, verbose=0)
train time: 0.063s
test time:  0.000s
accuracy:   0.514
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.48      0.52      0.50       668
          0       0.48      0.46      0.47       699
          1       0.59      0.56      0.58       708

avg / total       0.52      0.51      0.51      2075

confusion matrix:
[[348 194 126]
 [231 324 144]
 [151 162 395]]





### Cross Validation (10 fold) on Linear SVM

In [22]:
# Shuffle the rows with a single random permutation; this replaces 100 rounds
# of random split-and-concatenate that served the same purpose.
tweet_random_df = tweet_df.iloc[np.random.permutation(len(tweet_df))]

# NOTE(review): count_vect and tfidf_transformer are re-fitted here on the
# whole data set. That overwrites the state fitted on the training split, and
# the vocabulary / IDF weights see every fold's held-out tweets.
X_kfcv_counts = count_vect.fit_transform(tweet_random_df['Anootated tweet'])
X_kfcv = tfidf_transformer.fit_transform(X_kfcv_counts)
Y_kfcv = tweet_random_df['Class']

# 'squared_hinge' is the current name of the loss formerly spelled 'l2'.
clf = LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3)
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print(scores)
# The scorer is weighted F1, so the summary is labelled accordingly.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



[ 0.51491109  0.50926273  0.5255323   0.53831795  0.53195528  0.52820111
  0.51032541  0.51665605  0.50321802  0.54050534]
Accuracy: 0.52 (+/- 0.02)


### Cross Validation (10 fold) on Naive Bayes

In [23]:
# 10-fold cross-validation of Multinomial Naive Bayes on the full feature matrix.
clf = MultinomialNB(alpha=0.01)
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print(scores)
# The scorer is weighted F1, so the summary is labelled accordingly.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.51932427  0.52600686  0.521578    0.53475984  0.52127401  0.51726834
  0.50077517  0.51779986  0.47561899  0.55112091]
Accuracy: 0.52 (+/- 0.04)


### Cross Validation (10 fold) on Random Forest

In [24]:
# 10-fold cross-validation of a 100-tree random forest on the full feature matrix.
clf = RandomForestClassifier(n_estimators=100)
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print(scores)
# The scorer is weighted F1, so the summary is labelled accordingly.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.52671174  0.53067783  0.52927018  0.51947334  0.5515291   0.51810845
  0.51657814  0.52599158  0.5071992   0.53518984]
Accuracy: 0.53 (+/- 0.02)


### Cross Validation (10 fold) on SGD Classifier (hinge loss)

In [25]:
# 10-fold cross-validation of the L1-penalised SGD classifier.
clf = SGDClassifier(alpha=.0001, n_iter=50, penalty='l1')
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print(scores)
# The scorer is weighted F1, so the summary is labelled accordingly.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.51130847  0.5083669   0.51208124  0.52683791  0.53430661  0.51757969
  0.50372526  0.49712372  0.44395876  0.52941589]
Accuracy: 0.51 (+/- 0.05)


In [26]:
# NOTE(review): this only constructs a kNN estimator and displays its repr; it
# is never fitted or passed to cross_val_score here — presumably an unfinished
# kNN cross-validation cell. Confirm, then either complete or remove it.
KNeighborsClassifier(n_neighbors=10)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')