In [1]:
import pandas as pd
import math
import numpy as np
from time import time
import string
import itertools
from html import unescape
import preprocessor as p
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn import cross_validation
from sklearn import svm
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

#### Read data from Excel file and create data frame

In [2]:
def read_excel_file(filename, sheetname):
    """Load one worksheet of an Excel workbook via ``pd.read_excel``.

    Parameters
    ----------
    filename
        Workbook location, forwarded unchanged to ``pd.read_excel``.
    sheetname
        Worksheet selector, forwarded unchanged as the second positional
        argument (this notebook passes the integers 0 and 1).

    Returns
    -------
    Whatever ``pd.read_excel`` returns for that workbook/sheet pair.
    """
    sheet_frame = pd.read_excel(filename, sheetname)
    return sheet_frame

In [3]:
def remove_unwanted_columns(tweet_df):
    """Delete the ``date`` and ``time`` columns from ``tweet_df`` in place.

    The frame passed in is modified directly and the same object is returned.
    A ``KeyError`` is raised if either column is absent.
    """
    for column_name in ('date', 'time'):
        del tweet_df[column_name]
    return tweet_df

In [4]:
def _select_labelled_tweets(tweet_df):
    """Normalise the ``Class`` column and keep only rows labelled 1, -1 or 0.

    ``Class`` is cast to ``str``, surrounding spaces are stripped and every
    ``'!!!!'`` marker is removed (the column is overwritten on ``tweet_df``).
    Rows are then gathered class by class in the order '1', '-1', '0', and
    any row with a missing value in any column is dropped.
    """
    tweet_df['Class'] = (
        tweet_df['Class']
        .astype('str')
        .map(lambda label: label.strip(' '))
        .map(lambda label: label.replace('!!!!', ''))
    )

    # pd.concat replaces the chained DataFrame.append calls: append was
    # deprecated and then removed from pandas, while concat yields the same
    # row order (all '1' rows, then '-1', then '0') with the original index.
    per_class = [tweet_df.loc[tweet_df.Class == label] for label in ('1', '-1', '0')]
    return pd.concat(per_class).dropna()


def read_from_input_data (filename):
    """Read both candidate sheets of the workbook and keep the labelled tweets.

    Sheet 0 is loaded as the Obama frame and sheet 1 as the Romney frame
    (via ``read_excel_file``). Each frame is reduced to rows whose cleaned
    ``Class`` label is '1', '-1' or '0' and that contain no missing values.

    Parameters
    ----------
    filename
        Workbook location passed to ``read_excel_file``.

    Returns
    -------
    tuple
        ``(tweet_data_df_obama, tweet_data_df_romney)``.
    """
    tweet_df_obama = read_excel_file(filename, 0)
    tweet_df_romney = read_excel_file(filename, 1)

    tweet_data_df_obama = _select_labelled_tweets(tweet_df_obama)
    tweet_data_df_romney = _select_labelled_tweets(tweet_df_romney)

    return tweet_data_df_obama, tweet_data_df_romney

In [5]:
def split_training_data(df, train_data_prcnt):
    """Randomly partition ``df`` into a training part and a test part.

    One uniform random number is drawn per row with ``np.random.rand``; rows
    whose draw is below ``train_data_prcnt / 100`` go to the training part,
    all others to the test part. The split sizes are therefore only
    approximately the requested percentage.

    Parameters
    ----------
    df
        Frame to partition; it is indexed with a boolean mask, not modified.
    train_data_prcnt
        Target share of training rows, expressed as a percentage (0-100).

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    train_fraction = train_data_prcnt/100
    draws = np.random.rand(len(df))
    in_train = draws < train_fraction
    return df[in_train], df[~in_train]

In [6]:
# Punctuation characters that clean_tweet_text strips from every tweet.
exclude = {symbol for symbol in string.punctuation}

# Labelled tweets per candidate, read from the two sheets of the workbook.
tweet_df_obama, tweet_df_romney = read_from_input_data(
    "training-Obama-Romney-tweets.xlsx"
)

In [7]:
# Report how many tweets carry each sentiment label, candidate by candidate.
for banner, tweets in (("Obama Data", tweet_df_obama), ("\nRomney Data", tweet_df_romney)):
    print(banner)
    for label in ('-1', '1', '0'):
        matching = tweets.loc[tweets.Class == label]
        # rjust(2) pads the one-character labels so the columns line up.
        print(label.rjust(2) + " : " + str(len(matching)))

Obama Data
-1 : 1968
 1 : 1679
 0 : 1977

Romney Data
-1 : 2893
 1 : 1075
 0 : 1680


In [8]:
def clean_tweet_text(text):
    """Normalise one raw tweet string for bag-of-words vectorisation.

    Missing values (as judged by ``pd.isnull``) are returned unchanged.
    Otherwise the text goes through these steps, in order:

    1. HTML entities are converted back to their characters.
    2. Every non-ASCII character is discarded.
    3. The tweet preprocessor's ``clean`` (module imported as ``p``) is
       applied and the result is lower-cased.
    4. The literal markers ``<e>``, ``</e>``, ``<a>`` and ``</a>`` are removed.
    5. Any run of one repeated character is shortened to at most two.
    6. Characters contained in the notebook-level ``exclude`` set are removed.
    """
    # Nothing to clean for missing entries.
    if pd.isnull(text):
        return text

    cleaned = unescape(text)

    # Encoding to ASCII with errors ignored silently drops non-ASCII symbols.
    cleaned = cleaned.encode("ascii", "ignore").decode("utf8")

    cleaned = p.clean(cleaned).lower()

    # Annotation markers must go before punctuation is stripped, otherwise
    # their angle brackets would vanish and leave stray letters behind.
    for marker in ("<e>", "</e>", "<a>", "</a>"):
        cleaned = cleaned.replace(marker, "")

    # Keep at most two consecutive copies of any character.
    cleaned = ''.join(
        symbol * min(2, len(list(run)))
        for symbol, run in itertools.groupby(cleaned)
    )

    return ''.join(filter(lambda symbol: symbol not in exclude, cleaned))

In [9]:
# Clean the tweet text of both candidates, overwriting the column in place.
# ('Anootated tweet' is the column name exactly as spelled in the workbook.)
for labelled_tweets in (tweet_df_obama, tweet_df_romney):
    labelled_tweets['Anootated tweet'] = labelled_tweets['Anootated tweet'].map(clean_tweet_text)

#### Split data into train and test data

In [10]:
# Shuffle each candidate's tweets, then draw a random ~75/25 train/test split.
#
# DataFrame.sample(frac=1) returns a new frame holding every row in random
# order. It replaces the former 500 rounds of random split + concat, which
# produced the same kind of shuffle at 500 times the cost.
tweet_random_df_obama = tweet_df_obama.sample(frac=1)
train_df_obama, test_df_obama = split_training_data(tweet_random_df_obama, 75)

print(len(train_df_obama))
print(len(test_df_obama))

tweet_random_df_romney = tweet_df_romney.sample(frac=1)
train_df_romney, test_df_romney = split_training_data(tweet_random_df_romney, 75)

print(len(train_df_romney))
print(len(test_df_romney))

4281
1343
4264
1384


#### Convert Bag of Words model to sparse vectors

In [11]:
# Raw term counts over at most 2000 features, English stop words removed.
count_vect = CountVectorizer(
    max_features=2000,
    stop_words='english',
)

# Turns a count matrix into TF-IDF weights (default settings).
tfidf_transformer = TfidfTransformer()

# Alternative feature extractors; the cells below only use count_vect and
# tfidf_transformer.
vectorizer = TfidfVectorizer(
    max_features=2000,
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
hash_vectorizer = HashingVectorizer(
    n_features=2000,
    stop_words='english',
    non_negative=True,
)

In [12]:
# ---- Obama ----
# Targets are the string sentiment labels held in the 'Class' column.
y_train_obama = train_df_obama.Class
y_test_obama = test_df_obama.Class

# The vocabulary is learned from the training tweets only and then reused
# to encode the test tweets.
X_train_counts_obama = count_vect.fit_transform(train_df_obama['Anootated tweet'])
X_test_counts_obama = count_vect.transform(test_df_obama['Anootated tweet'])

# TF-IDF weighting, likewise fitted on the training counts only.
X_train_obama = tfidf_transformer.fit_transform(X_train_counts_obama)
X_test_obama = tfidf_transformer.transform(X_test_counts_obama)


# ---- Romney ----
# NOTE: the same count_vect / tfidf_transformer objects are refitted here, so
# once this cell has run they hold the Romney fit, not the Obama one.
y_train_romney = train_df_romney.Class
y_test_romney = test_df_romney.Class

X_train_counts_romney = count_vect.fit_transform(train_df_romney['Anootated tweet'])
X_test_counts_romney = count_vect.transform(test_df_romney['Anootated tweet'])

X_train_romney = tfidf_transformer.fit_transform(X_train_counts_romney)
X_test_romney = tfidf_transformer.transform(X_test_counts_romney)


In [13]:
def trim(s):
    """Shorten ``s`` so it fits on one line of an 80-column terminal.

    Strings of up to 80 characters are returned unchanged; longer ones are
    cut to their first 77 characters followed by ``"..."``.
    """
    if len(s) > 80:
        return s[:77] + "..."
    return s

### Classification function

In [14]:
def pipeline_function(clf, X_train, X_test, y_train, y_test, categories):
    """Fit ``clf``, evaluate it on the held-out data and print a report.

    Printed, in order: the classifier, the fit and predict wall-clock times,
    the accuracy, (for classifiers exposing ``coef_``) the coefficient
    dimensionality and density, the classification report and the
    confusion matrix.

    Parameters
    ----------
    clf
        Estimator providing ``fit`` and ``predict``.
    X_train, y_train
        Training features and labels passed to ``clf.fit``.
    X_test, y_test
        Held-out features passed to ``clf.predict`` and the labels the
        predictions are compared against.
    categories
        Passed as ``target_names`` to ``metrics.classification_report``.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)`` where ``clf_descr`` is
        the text of ``str(clf)`` before the first ``'('``, ``score`` is the
        accuracy and the two times are in seconds.
    """
    print('*' * 60)
    print("Training: ")
    print(clf)

    # Time the fit.
    fit_started = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_started
    print("train time: %0.3fs" % train_time)

    # Time the prediction on the held-out set.
    predict_started = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_started
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    # Only classifiers that expose coef_ get the coefficient statistics.
    if hasattr(clf, 'coef_'):
        coefficients = clf.coef_
        print("dimensionality: %d" % coefficients.shape[1])
        print("density: %f" % density(coefficients))

    print("classification report:")
    report = metrics.classification_report(y_test, pred, target_names=categories)
    print(report)

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    # Short name: the text of the repr before its opening parenthesis.
    clf_descr = str(clf).partition('(')[0]
    return clf_descr, score, train_time, test_time


### Naive Bayes Algorithm

In [15]:
# Multinomial Naive Bayes on the Obama split.
results = []
# The distinct training labels double as row names of the report.
categories_obama = np.unique(y_train_obama.values)
print('*' * 60)
print("Naive Bayes Obama")
results.append(
    pipeline_function(
        clf=MultinomialNB(alpha=.01),
        X_train=X_train_obama,
        X_test=X_test_obama,
        y_train=y_train_obama,
        y_test=y_test_obama,
        categories=categories_obama,
    )
)

************************************************************
Naive Bayes Obama
************************************************************
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.015s
test time:  0.000s
accuracy:   0.561
dimensionality: 2000
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.58      0.64      0.61       488
          0       0.51      0.45      0.48       457
          1       0.58      0.59      0.59       398

avg / total       0.56      0.56      0.56      1343

confusion matrix:
[[313 112  63]
 [145 205 107]
 [ 81  82 235]]



In [16]:
# Multinomial Naive Bayes on the Romney split.
results = []
# The distinct training labels double as row names of the report.
categories_romney = np.unique(y_train_romney.values)
print('*' * 60)
print("Naive Bayes Romney")
results.append(
    pipeline_function(
        clf=MultinomialNB(alpha=.01),
        X_train=X_train_romney,
        X_test=X_test_romney,
        y_train=y_train_romney,
        y_test=y_test_romney,
        categories=categories_romney,
    )
)

************************************************************
Naive Bayes Romney
************************************************************
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.013s
test time:  0.000s
accuracy:   0.569
dimensionality: 2000
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.61      0.80      0.69       716
          0       0.46      0.32      0.37       403
          1       0.52      0.34      0.41       265

avg / total       0.55      0.57      0.54      1384

confusion matrix:
[[570 108  38]
 [231 127  45]
 [133  42  90]]



### k-Nearest Neighbour Classification


In [17]:
# k-Nearest Neighbour (k=10, all CPU cores) on the Obama split.
results = []
print('*' * 60)
print("k Nearest Neighbour Obama")
results.append(
    pipeline_function(
        clf=KNeighborsClassifier(n_neighbors=10, n_jobs = -1),
        X_train=X_train_obama,
        X_test=X_test_obama,
        y_train=y_train_obama,
        y_test=y_test_obama,
        categories=categories_obama,
    )
)

************************************************************
k Nearest Neighbour Obama
************************************************************
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='uniform')
train time: 0.006s
test time:  0.498s
accuracy:   0.444
classification report:
             precision    recall  f1-score   support

         -1       0.67      0.23      0.35       488
          0       0.47      0.39      0.43       457
          1       0.38      0.76      0.51       398

avg / total       0.52      0.44      0.42      1343

confusion matrix:
[[114 124 250]
 [ 36 178 243]
 [ 21  73 304]]



In [18]:
# k-Nearest Neighbour (k=10, all CPU cores) on the Romney split.
results = []
print('*' * 60)
print("k Nearest Neighbour Romney")
results.append(
    pipeline_function(
        clf=KNeighborsClassifier(n_neighbors=10, n_jobs = -1),
        X_train=X_train_romney,
        X_test=X_test_romney,
        y_train=y_train_romney,
        y_test=y_test_romney,
        categories=categories_romney,
    )
)

************************************************************
k Nearest Neighbour Romney
************************************************************
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='uniform')
train time: 0.005s
test time:  0.500s
accuracy:   0.363
classification report:
             precision    recall  f1-score   support

         -1       0.65      0.17      0.27       716
          0       0.30      0.86      0.45       403
          1       0.62      0.14      0.22       265

avg / total       0.54      0.36      0.31      1384

confusion matrix:
[[121 582  13]
 [ 49 345   9]
 [ 16 213  36]]



### Random Forest Classification

In [19]:
# Random forest with 100 trees on the Obama split.
results = []
print('=' * 80)
print("Random Forest Obama")
results.append(
    pipeline_function(
        clf=RandomForestClassifier(n_estimators=100),
        X_train=X_train_obama,
        X_test=X_test_obama,
        y_train=y_train_obama,
        y_test=y_test_obama,
        categories=categories_obama,
    )
)

Random Forest Obama
************************************************************
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
train time: 2.454s
test time:  0.075s
accuracy:   0.553
classification report:
             precision    recall  f1-score   support

         -1       0.63      0.53      0.58       488
          0       0.48      0.61      0.54       457
          1       0.58      0.51      0.54       398

avg / total       0.56      0.55      0.55      1343

confusion matrix:
[[260 166  62]
 [ 89 280  88]
 [ 63 132 203]]



In [20]:
# Random forest with 100 trees on the Romney split.
results = []
print('=' * 80)
print("Random Forest Romney")
results.append(
    pipeline_function(
        clf=RandomForestClassifier(n_estimators=100),
        X_train=X_train_romney,
        X_test=X_test_romney,
        y_train=y_train_romney,
        y_test=y_test_romney,
        categories=categories_romney,
    )
)

Random Forest Romney
************************************************************
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
train time: 2.720s
test time:  0.108s
accuracy:   0.547
classification report:
             precision    recall  f1-score   support

         -1       0.59      0.79      0.68       716
          0       0.43      0.28      0.34       403
          1       0.47      0.28      0.35       265

avg / total       0.52      0.55      0.52      1384

confusion matrix:
[[568 103  45]
 [251 114  38]
 [141  49  75]]



### SGD Model

In [21]:
# L1-penalised SGD classifier (50 iterations) on the Obama split.
results = []
print('*' * 60)
print("SGD Model Obama")
results.append(
    pipeline_function(
        clf=SGDClassifier(alpha=.0001, n_iter=50, penalty='l1'),
        X_train=X_train_obama,
        X_test=X_test_obama,
        y_train=y_train_obama,
        y_test=y_test_obama,
        categories=categories_obama,
    )
)

************************************************************
SGD Model Obama
************************************************************
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
train time: 0.156s
test time:  0.000s
accuracy:   0.564
dimensionality: 2000
density: 0.531333
classification report:
             precision    recall  f1-score   support

         -1       0.61      0.60      0.60       488
          0       0.52      0.52      0.52       457
          1       0.56      0.58      0.57       398

avg / total       0.56      0.56      0.56      1343

confusion matrix:
[[291 118  79]
 [119 237 101]
 [ 69  99 230]]



In [22]:
# L1-penalised SGD classifier (50 iterations) on the Romney split.
results = []
print('*' * 60)
print("SGD Model Romney")
results.append(
    pipeline_function(
        clf=SGDClassifier(alpha=.0001, n_iter=50, penalty='l1'),
        X_train=X_train_romney,
        X_test=X_test_romney,
        y_train=y_train_romney,
        y_test=y_test_romney,
        categories=categories_romney,
    )
)

************************************************************
SGD Model Romney
************************************************************
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
train time: 0.087s
test time:  0.000s
accuracy:   0.548
dimensionality: 2000
density: 0.522667
classification report:
             precision    recall  f1-score   support

         -1       0.64      0.71      0.67       716
          0       0.40      0.35      0.38       403
          1       0.46      0.40      0.43       265

avg / total       0.53      0.55      0.54      1384

confusion matrix:
[[510 143  63]
 [201 142  60]
 [ 92  67 106]]



### Linear SVM Classifier

In [23]:
# Linear SVM (squared hinge loss, L2 penalty, primal form) on the Obama split.
# Unlike the earlier cells, `results` is not reset here, so this run is
# appended to whatever the list already contains.
print('*' * 80)
print("Linear SVM Model Obama")
results.append(
    pipeline_function(
        clf=LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3),
        X_train=X_train_obama,
        X_test=X_test_obama,
        y_train=y_train_obama,
        y_test=y_test_obama,
        categories=categories_obama,
    )
)

********************************************************************************
Linear SVM Model Obama
************************************************************
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)
train time: 0.059s
test time:  0.000s
accuracy:   0.558
dimensionality: 2000
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.61      0.60      0.60       488
          0       0.51      0.50      0.50       457
          1       0.55      0.57      0.56       398

avg / total       0.56      0.56      0.56      1343

confusion matrix:
[[294 119  75]
 [121 229 107]
 [ 69 103 226]]



In [24]:
# Linear SVM (squared hinge loss, L2 penalty, primal form) on the Romney split.
# Unlike the earlier cells, `results` is not reset here, so this run is
# appended to whatever the list already contains.
print('*' * 80)
print("Linear SVM Model Romney")
results.append(
    pipeline_function(
        clf=LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3),
        X_train=X_train_romney,
        X_test=X_test_romney,
        y_train=y_train_romney,
        y_test=y_test_romney,
        categories=categories_romney,
    )
)

********************************************************************************
Linear SVM Model Romney
************************************************************
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)
train time: 0.050s
test time:  0.000s
accuracy:   0.538
dimensionality: 2000
density: 1.000000
classification report:
             precision    recall  f1-score   support

         -1       0.62      0.69      0.66       716
          0       0.40      0.35      0.37       403
          1       0.47      0.41      0.44       265

avg / total       0.53      0.54      0.53      1384

confusion matrix:
[[494 154  68]
 [205 143  55]
 [ 92  65 108]]



### Cross Validation (10 fold) on Linear SVM

In [29]:
# 10-fold cross-validation of the linear SVM on all labelled Obama tweets.
# Shuffle the rows once before the folds are built; sample(frac=1) returns
# every row in random order and replaces 100 rounds of split + concat.
tweet_random_df = tweet_df_obama.sample(frac=1)

# NOTE(review): vocabulary and IDF weights are fitted on every tweet here,
# i.e. also on the rows that later act as validation folds.
X_kfcv_counts = count_vect.fit_transform(tweet_random_df['Anootated tweet'])
X_kfcv = tfidf_transformer.fit_transform(X_kfcv_counts)
Y_kfcv = tweet_random_df['Class']

clf = LinearSVC(loss='squared_hinge', penalty='l2',dual=False, tol=1e-3)
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print("Cross Validation Linear SVM Obama")
print(scores)
# scoring='f1_weighted', so the summary is a weighted F1 score, not accuracy.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Validation Linear SVM Obama
[ 0.52966595  0.59747967  0.53881707  0.57758382  0.54466201  0.5723762
  0.5755843   0.53059608  0.5498899   0.5426779 ]
Accuracy: 0.56 (+/- 0.04)


In [30]:
# 10-fold cross-validation of the linear SVM on all labelled Romney tweets.
# Shuffle the rows once before the folds are built; sample(frac=1) returns
# every row in random order and replaces 100 rounds of split + concat.
tweet_random_df = tweet_df_romney.sample(frac=1)

# NOTE(review): vocabulary and IDF weights are fitted on every tweet here,
# i.e. also on the rows that later act as validation folds.
X_kfcv_counts = count_vect.fit_transform(tweet_random_df['Anootated tweet'])
X_kfcv = tfidf_transformer.fit_transform(X_kfcv_counts)
Y_kfcv = tweet_random_df['Class']

clf = LinearSVC(loss='squared_hinge', penalty='l2',dual=False, tol=1e-3)
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print("Cross Validation Linear SVM Romney")
print(scores)
# scoring='f1_weighted', so the summary is a weighted F1 score, not accuracy.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Validation Linear SVM Romney
[ 0.53687145  0.56213105  0.57125204  0.56156171  0.5551547   0.54577134
  0.55447597  0.5294919   0.55826338  0.56034268]
Accuracy: 0.55 (+/- 0.02)


### Cross Validation (10 fold) on Naive Bayes

In [27]:
# 10-fold cross-validation of Multinomial Naive Bayes on all labelled Obama tweets.
# Shuffle the rows once before the folds are built; sample(frac=1) returns
# every row in random order and replaces 100 rounds of split + concat.
tweet_random_df = tweet_df_obama.sample(frac=1)

# NOTE(review): vocabulary and IDF weights are fitted on every tweet here,
# i.e. also on the rows that later act as validation folds.
X_kfcv_counts = count_vect.fit_transform(tweet_random_df['Anootated tweet'])
X_kfcv = tfidf_transformer.fit_transform(X_kfcv_counts)
Y_kfcv = tweet_random_df['Class']

clf = MultinomialNB(alpha = 0.01)
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print("Cross Validation Naive Bayes Obama")
print(scores)
# scoring='f1_weighted', so the summary is a weighted F1 score, not accuracy.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Validation Naive Bayes Obama
[ 0.52412138  0.56025644  0.52397638  0.57085236  0.53627839  0.56958426
  0.5631734   0.5357964   0.57170695  0.55743054]
Accuracy: 0.55 (+/- 0.04)


In [31]:
# 10-fold cross-validation of Multinomial Naive Bayes on all labelled Romney tweets.
# Shuffle the rows once before the folds are built; sample(frac=1) returns
# every row in random order and replaces 100 rounds of split + concat.
tweet_random_df = tweet_df_romney.sample(frac=1)

# NOTE(review): vocabulary and IDF weights are fitted on every tweet here,
# i.e. also on the rows that later act as validation folds.
X_kfcv_counts = count_vect.fit_transform(tweet_random_df['Anootated tweet'])
X_kfcv = tfidf_transformer.fit_transform(X_kfcv_counts)
Y_kfcv = tweet_random_df['Class']

clf = MultinomialNB(alpha = 0.01)
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print("Cross Validation Naive Bayes Romney")
print(scores)
# scoring='f1_weighted', so the summary is a weighted F1 score, not accuracy.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Validation Naive Bayes Romney
[ 0.56197955  0.53099091  0.55779734  0.57523544  0.52271181  0.54396701
  0.51041333  0.57248649  0.52477631  0.55006461]
Accuracy: 0.55 (+/- 0.04)


### Cross Validation (10 fold) on Random Forest

In [32]:
# 10-fold cross-validation of the random forest on all labelled Obama tweets.
# Shuffle the rows once before the folds are built; sample(frac=1) returns
# every row in random order and replaces 100 rounds of split + concat.
tweet_random_df = tweet_df_obama.sample(frac=1)

# NOTE(review): vocabulary and IDF weights are fitted on every tweet here,
# i.e. also on the rows that later act as validation folds.
X_kfcv_counts = count_vect.fit_transform(tweet_random_df['Anootated tweet'])
X_kfcv = tfidf_transformer.fit_transform(X_kfcv_counts)
Y_kfcv = tweet_random_df['Class']

clf = RandomForestClassifier(n_estimators=100)
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print("Cross Validation Random Forest Obama")
print(scores)
# scoring='f1_weighted', so the summary is a weighted F1 score, not accuracy.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Validation Random Forest Obama
[ 0.54416288  0.57806726  0.57098955  0.58060935  0.57727401  0.59708174
  0.55537649  0.55917022  0.55353935  0.52421015]
Accuracy: 0.56 (+/- 0.04)


In [33]:
# 10-fold cross-validation of the random forest on all labelled Romney tweets.
# Shuffle the rows once before the folds are built; sample(frac=1) returns
# every row in random order and replaces 100 rounds of split + concat.
tweet_random_df = tweet_df_romney.sample(frac=1)

# NOTE(review): vocabulary and IDF weights are fitted on every tweet here,
# i.e. also on the rows that later act as validation folds.
X_kfcv_counts = count_vect.fit_transform(tweet_random_df['Anootated tweet'])
X_kfcv = tfidf_transformer.fit_transform(X_kfcv_counts)
Y_kfcv = tweet_random_df['Class']

clf = RandomForestClassifier(n_estimators=100)
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print("Cross Validation Random Forest Romney")
print(scores)
# scoring='f1_weighted', so the summary is a weighted F1 score, not accuracy.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Validation Random Forest Romney
[ 0.51715193  0.59279902  0.55210046  0.50244742  0.54871538  0.55004637
  0.54856987  0.56277669  0.52764518  0.51241765]
Accuracy: 0.54 (+/- 0.05)


### Cross Validation (10 fold) on Logistic Regression

In [34]:
# 10-fold cross-validation of the SGD classifier on all labelled Obama tweets.
# Shuffle the rows once before the folds are built; sample(frac=1) returns
# every row in random order and replaces 100 rounds of split + concat.
tweet_random_df = tweet_df_obama.sample(frac=1)

# NOTE(review): vocabulary and IDF weights are fitted on every tweet here,
# i.e. also on the rows that later act as validation folds.
X_kfcv_counts = count_vect.fit_transform(tweet_random_df['Anootated tweet'])
X_kfcv = tfidf_transformer.fit_transform(X_kfcv_counts)
Y_kfcv = tweet_random_df['Class']

# NOTE(review): no `loss` is passed, and the SGDClassifier repr printed in the
# "SGD Model" section shows loss='hinge', i.e. a linear-SVM objective rather
# than logistic regression. Confirm whether a logistic loss was intended.
clf = SGDClassifier(alpha=.0001, n_iter=50, penalty='l1')
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print("Cross Validation Logistic Regression Obama")
print(scores)
# scoring='f1_weighted', so the summary is a weighted F1 score, not accuracy.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Validation Logistic Regression Obama
[ 0.60710851  0.57358587  0.54813402  0.55599074  0.57758801  0.57621341
  0.56733556  0.58392341  0.56336149  0.5712577 ]
Accuracy: 0.57 (+/- 0.03)


In [35]:
# 10-fold cross-validation of the SGD classifier on all labelled Romney tweets.
# Shuffle the rows once before the folds are built; sample(frac=1) returns
# every row in random order and replaces 100 rounds of split + concat.
tweet_random_df = tweet_df_romney.sample(frac=1)

# NOTE(review): vocabulary and IDF weights are fitted on every tweet here,
# i.e. also on the rows that later act as validation folds.
X_kfcv_counts = count_vect.fit_transform(tweet_random_df['Anootated tweet'])
X_kfcv = tfidf_transformer.fit_transform(X_kfcv_counts)
Y_kfcv = tweet_random_df['Class']

# NOTE(review): no `loss` is passed, and the SGDClassifier repr printed in the
# "SGD Model" section shows loss='hinge', i.e. a linear-SVM objective rather
# than logistic regression. Confirm whether a logistic loss was intended.
clf = SGDClassifier(alpha=.0001, n_iter=50, penalty='l1')
scores = cross_validation.cross_val_score(clf, X_kfcv, Y_kfcv, cv=10, scoring='f1_weighted')
print("Cross Validation Logistic Regression Romney")
print(scores)
# scoring='f1_weighted', so the summary is a weighted F1 score, not accuracy.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Validation Logistic Regression Romney
[ 0.49903378  0.5644837   0.52795483  0.55739991  0.5647148   0.53822631
  0.5645592   0.59904538  0.52530778  0.50146229]
Accuracy: 0.54 (+/- 0.06)
