In [33]:
import pandas as pd
import numpy as np
from time import time
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

#### Read data from Excel file and create data frame

In [34]:
def readExcelFile(filename):
    orig_excel_df_sh_1 = pd.read_excel(filename, sheetname=0)
    orig_excel_df_sh_2 = pd.read_excel(filename, sheetname=1)

    orig_excel_df = orig_excel_df_sh_1.copy()
    orig_excel_df = orig_excel_df.append(orig_excel_df_sh_2)
    return orig_excel_df

In [35]:
def extractReqData(orig_excel_df):
    del orig_excel_df['date']
    del orig_excel_df['time']
    return orig_excel_df

In [36]:
def readFormatInputData (filename):
    tweet_df = readExcelFile(filename)
    del tweet_df['date']
    del tweet_df['time']
    
    tweet_df['Class'] = tweet_df['Class'].astype('str').map(lambda x: x.lstrip(' ').rstrip(' '))
    tweet_df['Class'] = tweet_df['Class'].astype('str').map(lambda x: x.replace('!!!!', ''))
    
    tweet_data_df = tweet_df.loc[tweet_df.Class=='1'].append(tweet_df.loc[tweet_df.Class=='-1'])

    return tweet_data_df

In [37]:
tweet_df = readFormatInputData("training-Obama-Romney-tweets.xlsx")

    Anootated tweet                                         Unnamed: 3  \
 0              NaN    1: positive, -1: negative, 0: neutral, 2: mixed   
 1   10:28:53-05:00  Kirkpatrick, who wore a baseball cap embroider...   
 2   10:09:00-05:00  Question: If <e>Romney</e> and <e>Obama</e> ha...   
 3   10:04:30-05:00  #<e>obama</e> debates that Cracker Ass Cracker...   
 4   10:00:36-05:00  RT @davewiner Slate: Blame <e>Obama</e> for fo...   
 5   09:50:08-05:00  @Hollivan @hereistheanswer  Youre missing the ...   
 6   09:48:54-05:00  <e>Mitt Romney</e> made all of his money himse...   
 7   10:00:16-05:00  I was raised as a Democrat  left the party yea...   
 8   09:48:07-05:00  The <e>Obama camp</e> can't afford to lower ex...   
 9   09:52:47-05:00  Tonight's debate has that "Game 7" feel! This ...   
 10  10:12:50-05:00  <e>Obama</e> pot <a>policy</a> disappointing -...   
 11  10:12:11-05:00  Not all of Hollywood has his back! RT @RedAler...   
 12  10:06:14-05:00  @hblodget i'd be 

KeyError: 'Class'

#### Slipt data into train and test data

In [28]:
def splitTrainingData(df, train_data_prcnt=80):
    msk = np.random.rand(len(df)) < train_data_prcnt/100
    train = df[msk]
    test = df[~msk]
    return train, test

In [29]:
train_df, test_df = splitTrainingData(tweet_df)

NameError: name 'tweet_df' is not defined

#### Convert Bag of Words model to sparce vectors

In [30]:
count_vect = CountVectorizer(stop_words='english')
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
hash_vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features = 4000)
tfidf_transformer = TfidfTransformer()

In [31]:
X_train_counts = count_vect.fit_transform(train_df['Anootated tweet'])
X_test_counts = count_vect.transform(test_df['Anootated tweet'])

# X_train_counts = hash_vectorizer.fit_transform(train_df['Anootated tweet'])
# X_test_counts = hash_vectorizer.transform(test_df['Anootated tweet'])

# X_train_counts = vectorizer.fit_transform(train_df['Anootated tweet'])
# X_test_counts = vectorizer.transform(test_df['Anootated tweet'])

X_train = tfidf_transformer.fit_transform(X_train_counts)
X_test = tfidf_transformer.transform(X_test_counts)

y_train = train_df['Class']
y_test = test_df['Class']

NameError: name 'train_df' is not defined

In [180]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."

### Classification function

In [143]:
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

#         if feature_names is not None:
#             print("top 10 keywords per class:")
#             for i, category in enumerate(categories):
#                 top10 = np.argsort(clf.coef_[i])[-10:]
#                 print(trim("%s: %s"
#                       % (category, " ".join(feature_names[top10]))))
#         print()

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=categories))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

### Naive Bayes Algorithm

In [196]:
results = []
categories = np.unique(y_train.values)
# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))

Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.048s
test time:  0.001s
accuracy:   0.743
dimensionality: 11567
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.76      0.87      0.81       966
          1       0.70      0.52      0.60       554

avg / total       0.74      0.74      0.73      1520

confusion matrix:
[[839 127]
 [264 290]]



### k-Nearest Neighbour Classification

In [216]:
# Train k-Nearest Neighbour classifiers
print('=' * 80)
print("kNN")
results.append(benchmark(KNeighborsClassifier(n_neighbors=10)))

kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')
train time: 0.019s
test time:  0.490s
accuracy:   0.759
classification report:
             precision    recall  f1-score   support

         -1       0.78      0.86      0.82       966
          1       0.71      0.58      0.64       554

avg / total       0.75      0.76      0.75      1520

confusion matrix:
[[833 133]
 [233 321]]



### Random Forest Classification

In [197]:
# Train Random Forest classifiers
print('=' * 80)
print("Random Forest")
results.append(benchmark(RandomForestClassifier(n_estimators=100)))

Random Forest
________________________________________________________________________________
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
train time: 12.541s
test time:  0.276s
accuracy:   0.762
classification report:
             precision    recall  f1-score   support

         -1       0.79      0.86      0.82       966
          1       0.71      0.59      0.64       554

avg / total       0.76      0.76      0.76      1520

confusion matrix:
[[832 134]
 [227 327]]



### SGD Model

In [210]:
# Train SGD Classifier
print('=' * 80)
print("SGD Model")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty='l1')))

SGD Model
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
train time: 0.119s
test time:  0.000s
accuracy:   0.761
dimensionality: 11567
density: 0.118354
classification report:
             precision    recall  f1-score   support

         -1       0.79      0.85      0.82       966
          1       0.70      0.61      0.65       554

avg / total       0.76      0.76      0.76      1520

confusion matrix:
[[820 146]
 [217 337]]



### Linear SVM Classifier

In [215]:
# Train Linear SVM Classifier
print('=' * 80)
print("Linear SVM Model")
results.append(benchmark(LinearSVC(loss='l2', penalty='l1',
                                            dual=False, tol=1e-3)))

Linear SVM Model
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l1', random_state=None, tol=0.001, verbose=0)
train time: 0.162s
test time:  0.001s
accuracy:   0.761
dimensionality: 11567
density: 0.174462
classification report:
             precision    recall  f1-score   support

         -1       0.80      0.84      0.82       966
          1       0.69      0.62      0.66       554

avg / total       0.76      0.76      0.76      1520

confusion matrix:
[[812 154]
 [209 345]]



