# Prerequisites

## STOP! Please read this before you execute any further cells.
If you are running on a personal computer, run the following cell:

In [None]:
!pip install -r requirements.txt

If you are running on Google Colab, everything except `datasets` is already installed;
run the following cell:

In [None]:
!pip install datasets

# Imports

In [1]:
# for data loading
import pandas as pd
import numpy as np

# for data cleaning
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# for test handling
from sklearn.model_selection import train_test_split

# transformers/ encoders
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# our classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline

# metrics
from sklearn.metrics import classification_report

# for the roberta classifier using hugging face APIs and torch
import joblib
from sklearn.preprocessing import LabelEncoder
from transformers import AdamW, get_scheduler, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

# Read data and basic cleaning

In [2]:
sheet_names = ["Obama", "Romney"]
df = {}
for sheet in sheet_names:
    # Read columns D:E of the sheet, give them working names and discard
    # the row labelled 0.
    sh = (
        pd.read_excel('./training-Obama-Romney-tweets.xlsx', usecols="D:E", sheet_name=sheet)
        .rename(columns={"Anootated tweet": "tweet", "Unnamed: 4": "class"})
        .drop(0)
    )
    # Discard rows that have no tweet text.
    sh = sh.loc[sh["tweet"].notna()]
    # Compare labels as strings so only the three sentiment codes survive.
    sh = sh.assign(**{"class": sh["class"].astype(str)})
    sh = sh.loc[sh["class"].isin(['1', '0', '-1'])]
    sh = sh.drop_duplicates()
    # Final dtypes: integer label, string tweet.
    sh = sh.assign(**{"class": sh["class"].astype(int), "tweet": sh["tweet"].astype(str)})
    df[sheet] = sh

# Data Cleaning

In [3]:
# Fetch the NLTK resources that the objects created below rely on.
nltk.download('stopwords')
nltk.download('wordnet')
# English stop words kept as a set for fast membership tests in preprocessing().
stop_words = set(stopwords.words('english'))
# Tokenizer that treats every run of non-whitespace characters as one token.
tokenizer = RegexpTokenizer(r'\S+')
# Stemmer applied by preprocessing() when stemWords is enabled.
st = nltk.PorterStemmer()
# Lemmatizer applied by preprocessing() when lemmatizeWords is enabled.
lm = nltk.WordNetLemmatizer()

# Element-wise tweet cleaner intended for use with pandas' apply(); every
# cleaning step can be switched on or off individually.
def preprocessing(x,
                  removeHtmlTags=True,
                  removeUrlLinks=True,
                  removeMentions=False,
                  removeHashtags=False,
                  removeNonWords=False,
                  removeSpecialSymbols=False,
                  removeSmallWords=0,
                  removeStopWords=False,
                  stemWords=False,
                  lemmatizeWords=False):
    """Clean a single tweet and return it as a whitespace-normalised string.

    Parameters
    ----------
    x : str
        Raw tweet text.
    removeHtmlTags : bool
        Strip anything that looks like ``<...>``.
    removeUrlLinks : bool
        Strip tokens starting at ``http`` or ``www``.
    removeMentions : bool
        Strip ``@`` together with the non-whitespace run following it.
    removeHashtags : bool
        Strip ``#`` together with the non-whitespace run following it.
    removeNonWords : bool
        Replace every run of non-word characters with a single space.
    removeSpecialSymbols : bool
        Strip remaining ``@`` / ``#`` characters but keep the attached word.
    removeSmallWords : int
        When non-zero, keep only words longer than this many characters.
    removeStopWords : bool
        Drop words found in the module-level ``stop_words`` set.
    stemWords : bool
        Stem every token with the module-level stemmer ``st``.
    lemmatizeWords : bool
        Lemmatize every token with the module-level lemmatizer ``lm``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if removeHtmlTags:
        x = re.sub(r'<[^>]+>', '', x)
    if removeUrlLinks:
        x = re.sub(r'http\S+', '', x)
        x = re.sub(r'www\S+', '', x)
    # Whole mentions/hashtags must be removed BEFORE the bare '@'/'#' symbols
    # are stripped; otherwise the two patterns below can never match when
    # removeSpecialSymbols is enabled as well.
    if removeMentions:
        x = re.sub(r'@\S+', '', x)
    if removeHashtags:
        x = re.sub(r'#\S+', '', x)
    if removeSpecialSymbols:
        x = re.sub(r'[@#]+', '', x)
    if removeNonWords:
        x = re.sub(r'\W+', ' ', x)
    if removeSmallWords != 0:
        x = ' '.join([w for w in x.split() if len(w) > removeSmallWords])
    if removeStopWords:
        # The stop-word list is lowercase, so compare case-insensitively to
        # also catch capitalised words such as "The".
        x = " ".join([word for word in str(x).split() if word.lower() not in stop_words])
    tokens = tokenizer.tokenize(x)
    if stemWords:
        tokens = [st.stem(word) for word in tokens]
    if lemmatizeWords:
        tokens = [lm.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``preprocessing`` over every element of X.

    Every constructor argument is stored unchanged on an attribute of the
    same name and later forwarded to ``preprocessing`` as the keyword argument
    of that name, so the meaning of each flag is the one documented there.
    """

    def __init__(self,
                 removeHtmlTags=True,
                 removeUrlLinks=True,
                 removeMentions=False,
                 removeHashtags=False,
                 removeSpecialSymbols=False,
                 removeNonWords=False,
                 removeSmallWords=0,
                 removeStopWords=False,
                 stemWords=False,
                 lemmatizeWords=False):
        # Each attribute must mirror its own constructor argument; this one
        # used to be assigned from removeHashtags, which silently changed the
        # HTML-stripping behaviour and made get_params() disagree with __init__.
        self.removeHtmlTags = removeHtmlTags
        self.removeUrlLinks = removeUrlLinks
        self.removeMentions = removeMentions
        self.removeHashtags = removeHashtags
        self.removeSpecialSymbols = removeSpecialSymbols
        self.removeNonWords = removeNonWords
        self.removeSmallWords = removeSmallWords
        self.removeStopWords = removeStopWords
        self.stemWords = stemWords
        self.lemmatizeWords = lemmatizeWords

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X.apply(preprocessing, ...)`` using the configured flags.

        X must provide ``apply`` (the notebook passes pandas Series of tweets).
        """
        # Forward by keyword so the call cannot drift out of sync with the
        # parameter order of preprocessing().
        options = {
            'removeHtmlTags': self.removeHtmlTags,
            'removeUrlLinks': self.removeUrlLinks,
            'removeMentions': self.removeMentions,
            'removeHashtags': self.removeHashtags,
            'removeNonWords': self.removeNonWords,
            'removeSpecialSymbols': self.removeSpecialSymbols,
            'removeSmallWords': self.removeSmallWords,
            'removeStopWords': self.removeStopWords,
            'stemWords': self.stemWords,
            'lemmatizeWords': self.lemmatizeWords,
        }
        return X.apply(preprocessing, **options)

[nltk_data] Downloading package stopwords to /home/debian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/debian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Hugging Face Classifier Adapter

In [5]:
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. All other tokens are kept, and the tokens are
    re-joined with single spaces.
    """
    def _mask(token):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        return 'http' if token.startswith('http') else token

    return " ".join(_mask(token) for token in text.split(" "))

class HuggingFacePreprocessor(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that applies ``preprocess`` to each element of X."""

    def fit(self, X, y=None):
        """Nothing to learn; hands back the transformer itself."""
        return self

    def transform(self, X):
        """Return X with ``preprocess`` applied element-wise via ``X.apply``."""
        masked = X.apply(preprocess)
        return masked

class HuggingFaceClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper that fine-tunes a Hugging Face sequence classifier.

    Parameters
    ----------
    model_name : str
        Checkpoint identifier handed to ``AutoTokenizer.from_pretrained`` and
        ``AutoModelForSequenceClassification.from_pretrained``.
    num_labels : int, default=3
        Forwarded to ``from_pretrained`` as ``num_labels``.
    epochs : int, default=3
        Number of passes over the training data in ``fit``.
    batch_size : int, default=16
        Batch size of the data loaders used by ``fit`` and ``predict``.
    lr : float, default=5e-5
        Learning rate given to the optimizer.
    max_length : int, default=128
        Truncation length given to the tokenizer.

    Notes
    -----
    The tokenizer and model are loaded in the constructor, so every
    instantiation loads the checkpoint. ``fit`` keeps training ``self.model``
    as it currently is; it does not reload the checkpoint first.
    """

    def __init__(self, model_name, num_labels=3, epochs=3, batch_size=16, lr=5e-5, max_length=128):
        self.epochs = epochs
        self.batch_size = batch_size
        self.model_name = model_name
        self.num_labels = num_labels
        self.lr = lr
        self.max_length = max_length
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        # Maps the caller's labels to the contiguous ids 0..k-1 and back.
        self.enc = LabelEncoder()

    def _tokenize(self, X):
        """Tokenize an iterable of texts into padded, truncated PyTorch tensors."""
        return self.tokenizer(list(X), truncation=True, padding=True,
                              max_length=self.max_length, return_tensors="pt")

    def fit(self, X, y):
        """Fine-tune the model on texts ``X`` with labels ``y``; returns ``self``."""
        encodings = self._tokenize(X)
        targets = torch.tensor(self.enc.fit_transform(y), dtype=torch.long)
        # Expose the original label values seen during fitting.
        self.classes_ = self.enc.classes_

        dataset = torch.utils.data.TensorDataset(encodings["input_ids"], encodings["attention_mask"], targets)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        # Set up optimizer and a linearly decaying schedule without warm-up
        optimizer = AdamW(self.model.parameters(), lr=self.lr)
        num_training_steps = len(dataloader) * self.epochs
        lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

        self.model.to(self.device)
        self.model.train()

        # The context manager closes the progress bar even if training raises.
        with tqdm(total=num_training_steps, desc="Training") as progress_bar:
            for _ in range(self.epochs):
                for batch in dataloader:
                    input_ids, attention_mask, batch_labels = [t.to(self.device) for t in batch]
                    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
                    outputs.loss.backward()

                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()
                    progress_bar.update(1)

        return self

    def predict(self, X):
        """Return the predicted label, in the original label space, for each text in ``X``."""
        encodings = self._tokenize(X)
        dataset = torch.utils.data.TensorDataset(encodings["input_ids"], encodings["attention_mask"])
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size)

        self.model.to(self.device)
        self.model.eval()

        predictions = []
        with torch.no_grad():
            for batch in dataloader:
                input_ids, attention_mask = [t.to(self.device) for t in batch]
                logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
                predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())

        # Translate the encoder's ids back to the labels passed to fit().
        return self.enc.inverse_transform(predictions)


# Train Test Split

In [6]:
# One 80/20 split per sheet, stored in four dictionaries keyed by sheet name.
X_train, X_test, y_train, y_test = {}, {}, {}, {}
for name in sheet_names:
    tweets, sentiment = df[name]["tweet"], df[name]["class"]
    parts = train_test_split(tweets, sentiment, test_size=0.2, random_state=46548694)
    X_train[name], X_test[name], y_train[name], y_test[name] = parts

# Model Pipeline and Eval

## Logistic Regression

In [8]:
# Logistic regression on tf-idf features of 1- and 2-grams (at most 2500 terms).
logRes = Pipeline(steps=[
    ('preprocessor', TextPreprocessor(removeHashtags=True, stemWords=True, lemmatizeWords=True)),
    ('vectorizer', TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 2), max_features=2500, smooth_idf=True)),
    ('classifier', LogisticRegression(random_state=5235253)),
])

# Each candidate gets its own unfitted copy of the pipeline.
obama_pipeline, romney_pipeline = clone(logRes), clone(logRes)
obama_pipeline.fit(X_train['Obama'], y_train['Obama'])
romney_pipeline.fit(X_train['Romney'], y_train['Romney'])

# Held-out predictions keyed by candidate.
y_pred = {
    'Obama': obama_pipeline.predict(X_test['Obama']),
    'Romney': romney_pipeline.predict(X_test['Romney']),
}

print("Obama Classification Report")
print(classification_report(y_test["Obama"], y_pred["Obama"]))
print("Romney Classification Report")
print(classification_report(y_test["Romney"], y_pred["Romney"]))

Obama Classification Report
              precision    recall  f1-score   support

          -1       0.58      0.67      0.62       390
           0       0.56      0.54      0.55       397
           1       0.65      0.56      0.60       335

    accuracy                           0.59      1122
   macro avg       0.60      0.59      0.59      1122
weighted avg       0.59      0.59      0.59      1122

Romney Classification Report
              precision    recall  f1-score   support

          -1       0.62      0.83      0.71       589
           0       0.48      0.30      0.37       344
           1       0.57      0.36      0.45       195

    accuracy                           0.59      1128
   macro avg       0.56      0.50      0.51      1128
weighted avg       0.57      0.59      0.56      1128



## SVM

In [7]:
# SVC on tf-idf features of 1- to 3-grams (at most 2500 terms).
svc = Pipeline(steps=[
    ('preprocessor', TextPreprocessor(removeHashtags=True, stemWords=True, lemmatizeWords=True)),
    ('vectorizer', TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 3), max_features=2500, smooth_idf=True)),
    ('classifier', SVC(random_state=5235253)),
])

# Each candidate gets its own unfitted copy of the pipeline.
obama_pipeline, romney_pipeline = clone(svc), clone(svc)
obama_pipeline.fit(X_train['Obama'], y_train['Obama'])
romney_pipeline.fit(X_train['Romney'], y_train['Romney'])

# Held-out predictions keyed by candidate.
y_pred = {
    'Obama': obama_pipeline.predict(X_test['Obama']),
    'Romney': romney_pipeline.predict(X_test['Romney']),
}

print("Obama Classification Report")
print(classification_report(y_test["Obama"], y_pred["Obama"]))
print("Romney Classification Report")
print(classification_report(y_test["Romney"], y_pred["Romney"]))

Obama Classification Report
              precision    recall  f1-score   support

          -1       0.56      0.69      0.62       390
           0       0.57      0.56      0.56       397
           1       0.70      0.53      0.60       335

    accuracy                           0.60      1122
   macro avg       0.61      0.59      0.60      1122
weighted avg       0.60      0.60      0.59      1122

Romney Classification Report
              precision    recall  f1-score   support

          -1       0.60      0.90      0.72       589
           0       0.52      0.23      0.32       344
           1       0.62      0.29      0.39       195

    accuracy                           0.59      1128
   macro avg       0.58      0.47      0.48      1128
weighted avg       0.58      0.59      0.54      1128



## Naive Bayes

In [9]:
# Multinomial naive Bayes (alpha=2) on tf-idf features of 1- to 5-grams;
# this variant additionally removes mentions during preprocessing.
multiNBC = Pipeline(steps=[
    ('preprocessor', TextPreprocessor(removeHashtags=True, removeMentions=True, stemWords=True, lemmatizeWords=True)),
    ('vectorizer', TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 5), max_features=2500, smooth_idf=True)),
    ('classifier', MultinomialNB(alpha=2)),
])

# Each candidate gets its own unfitted copy of the pipeline.
obama_pipeline, romney_pipeline = clone(multiNBC), clone(multiNBC)
obama_pipeline.fit(X_train['Obama'], y_train['Obama'])
romney_pipeline.fit(X_train['Romney'], y_train['Romney'])

# Held-out predictions keyed by candidate.
y_pred = {
    'Obama': obama_pipeline.predict(X_test['Obama']),
    'Romney': romney_pipeline.predict(X_test['Romney']),
}

print("Obama Classification Report")
print(classification_report(y_test["Obama"], y_pred["Obama"]))
print("Romney Classification Report")
print(classification_report(y_test["Romney"], y_pred["Romney"]))

Obama Classification Report
              precision    recall  f1-score   support

          -1       0.55      0.75      0.63       390
           0       0.57      0.53      0.55       397
           1       0.75      0.50      0.60       335

    accuracy                           0.60      1122
   macro avg       0.62      0.59      0.59      1122
weighted avg       0.62      0.60      0.59      1122

Romney Classification Report
              precision    recall  f1-score   support

          -1       0.57      0.96      0.71       589
           0       0.64      0.15      0.25       344
           1       0.60      0.14      0.23       195

    accuracy                           0.57      1128
   macro avg       0.60      0.42      0.40      1128
weighted avg       0.59      0.57      0.49      1128



In [10]:
# Bernoulli naive Bayes (alpha=1) on tf-idf features of 1- to 4-grams.
bernoulliNBC = Pipeline(steps=[
    ('preprocessor', TextPreprocessor(removeHashtags=True, stemWords=True, lemmatizeWords=True)),
    ('vectorizer', TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 4), max_features=2500, smooth_idf=True)),
    ('classifier', BernoulliNB(alpha=1)),
])

# Each candidate gets its own unfitted copy of the pipeline.
obama_pipeline, romney_pipeline = clone(bernoulliNBC), clone(bernoulliNBC)
obama_pipeline.fit(X_train['Obama'], y_train['Obama'])
romney_pipeline.fit(X_train['Romney'], y_train['Romney'])

# Held-out predictions keyed by candidate.
y_pred = {
    'Obama': obama_pipeline.predict(X_test['Obama']),
    'Romney': romney_pipeline.predict(X_test['Romney']),
}

print("Obama Classification Report")
print(classification_report(y_test["Obama"], y_pred["Obama"]))
print("Romney Classification Report")
print(classification_report(y_test["Romney"], y_pred["Romney"]))

Obama Classification Report
              precision    recall  f1-score   support

          -1       0.61      0.61      0.61       390
           0       0.54      0.61      0.57       397
           1       0.66      0.57      0.61       335

    accuracy                           0.60      1122
   macro avg       0.60      0.60      0.60      1122
weighted avg       0.60      0.60      0.60      1122

Romney Classification Report
              precision    recall  f1-score   support

          -1       0.66      0.63      0.65       589
           0       0.41      0.44      0.42       344
           1       0.46      0.49      0.48       195

    accuracy                           0.55      1128
   macro avg       0.51      0.52      0.51      1128
weighted avg       0.55      0.55      0.55      1128



## Random Forests

In [11]:
# Random forest on tf-idf features of 1- to 4-grams (at most 2500 terms).
randomForests = Pipeline(steps=[
    ('preprocessor', TextPreprocessor(removeHashtags=True, stemWords=True, lemmatizeWords=True)),
    ('vectorizer', TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 4), max_features=2500, smooth_idf=True)),
    ('classifier', RandomForestClassifier(random_state=5235253)),
])

# Each candidate gets its own unfitted copy of the pipeline.
obama_pipeline, romney_pipeline = clone(randomForests), clone(randomForests)
obama_pipeline.fit(X_train['Obama'], y_train['Obama'])
romney_pipeline.fit(X_train['Romney'], y_train['Romney'])

# Held-out predictions keyed by candidate.
y_pred = {
    'Obama': obama_pipeline.predict(X_test['Obama']),
    'Romney': romney_pipeline.predict(X_test['Romney']),
}

print("Obama Classification Report")
print(classification_report(y_test["Obama"], y_pred["Obama"]))
print("Romney Classification Report")
print(classification_report(y_test["Romney"], y_pred["Romney"]))

Obama Classification Report
              precision    recall  f1-score   support

          -1       0.57      0.68      0.62       390
           0       0.57      0.56      0.56       397
           1       0.63      0.50      0.55       335

    accuracy                           0.58      1122
   macro avg       0.59      0.58      0.58      1122
weighted avg       0.59      0.58      0.58      1122

Romney Classification Report
              precision    recall  f1-score   support

          -1       0.58      0.82      0.68       589
           0       0.41      0.26      0.32       344
           1       0.61      0.28      0.38       195

    accuracy                           0.55      1128
   macro avg       0.53      0.45      0.46      1128
weighted avg       0.53      0.55      0.52      1128



In [12]:
# Extra-trees ensemble on tf-idf features of 1- to 5-grams; preprocessing
# stems but does not lemmatize. The variable name `randomForests` is reused
# here even though the classifier is ExtraTreesClassifier.
randomForests = Pipeline(steps=[
    ('preprocessor', TextPreprocessor(removeHashtags=True, stemWords=True)),
    ('vectorizer', TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 5), max_features=2500, smooth_idf=True)),
    ('classifier', ExtraTreesClassifier(random_state=5235253)),
])

# Each candidate gets its own unfitted copy of the pipeline.
obama_pipeline, romney_pipeline = clone(randomForests), clone(randomForests)
obama_pipeline.fit(X_train['Obama'], y_train['Obama'])
romney_pipeline.fit(X_train['Romney'], y_train['Romney'])

# Held-out predictions keyed by candidate.
y_pred = {
    'Obama': obama_pipeline.predict(X_test['Obama']),
    'Romney': romney_pipeline.predict(X_test['Romney']),
}

print("Obama Classification Report")
print(classification_report(y_test["Obama"], y_pred["Obama"]))
print("Romney Classification Report")
print(classification_report(y_test["Romney"], y_pred["Romney"]))

Obama Classification Report
              precision    recall  f1-score   support

          -1       0.59      0.67      0.63       390
           0       0.56      0.59      0.57       397
           1       0.68      0.52      0.59       335

    accuracy                           0.60      1122
   macro avg       0.61      0.60      0.60      1122
weighted avg       0.61      0.60      0.60      1122

Romney Classification Report
              precision    recall  f1-score   support

          -1       0.59      0.83      0.69       589
           0       0.44      0.25      0.32       344
           1       0.58      0.32      0.42       195

    accuracy                           0.56      1128
   macro avg       0.54      0.47      0.48      1128
weighted avg       0.54      0.56      0.53      1128



## Roberta

In [7]:
# Define pipeline: mask mentions/links, then fine-tune the RoBERTa sentiment checkpoint
pipeline = Pipeline([
    ('preprocessor', HuggingFacePreprocessor()),
    ('classifier', HuggingFaceClassifier(model_name="cardiffnlp/twitter-roberta-base-sentiment", num_labels=3))
])

# One independent copy of the pipeline per candidate.
obama_pipeline = clone(pipeline)
romney_pipeline = clone(pipeline)
# Start a fresh predictions container. The held-out labels in y_test must be
# left intact: the classification reports in the following cells read them.
y_pred = {}

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# Fine-tune the Obama pipeline on its training split.
obama_pipeline.fit(X_train['Obama'], y_train['Obama'])

# Predict the held-out Obama tweets and report per-class metrics.
y_pred['Obama'] = obama_pipeline.predict(X_test['Obama'])
print(classification_report(y_test['Obama'], y_pred['Obama']))

In [None]:
# Fine-tune the Romney pipeline on its training split.
romney_pipeline.fit(X_train['Romney'], y_train['Romney'])

# Predict the held-out Romney tweets and report per-class metrics.
y_pred['Romney'] = romney_pipeline.predict(X_test['Romney'])
print(classification_report(y_test['Romney'], y_pred['Romney']))

In [None]:
# Persist both pipelines to the working directory so they can be reloaded
# with joblib.load instead of being trained again.
joblib.dump(obama_pipeline, "roberta_obama_pipeline.pkl")
joblib.dump(romney_pipeline, "roberta_romney_pipeline.pkl")

# Running against test data

In [9]:
# Unlabelled evaluation data: one DataFrame per sheet, keyed by sheet name.
test_df = {}
for sheet in sheet_names:
    # The sheets are read without a header row; the columns are named '#' and 'tweet'.
    sh = pd.read_excel('./sample-testdata.xlsx', sheet_name=sheet, header=None, names=['#', 'tweet'])
    test_df[sheet] = sh

In [10]:
# Inspect the Romney test sheet as loaded.
test_df['Romney']

Unnamed: 0,#,tweet
0,1,Insidious!<e>Mitt Romney</e>'s Bain Helped Phi...
1,2,Senior <e>Romney</e> Advisor Claims <e>Obama</...
2,3,.@WardBrenda @shortwave8669 @allanbourdius you...
3,4,<e>Mitt Romney</e> still doesn't <a>believe</a...


In [8]:
# Reload the pipelines saved with joblib.dump.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load .pkl files that were produced by this notebook or another trusted source.
roberta_obama_pipeline = joblib.load('roberta_obama_pipeline.pkl')
roberta_romney_pipeline = joblib.load('roberta_romney_pipeline.pkl')

In [15]:
# Predicted labels for the test sheets, keyed by sheet name.
test_y = {}

In [16]:
# Each candidate's model scores the tweets of its OWN test sheet
# (the Obama model was previously run on the Romney sheet).
test_y['Obama'] = roberta_obama_pipeline.predict(test_df['Obama']['tweet'])
test_y['Romney'] = roberta_romney_pipeline.predict(test_df['Romney']['tweet'])

In [17]:
# Store the Romney predictions as a new 'Class' column of the Romney test frame.
test_df['Romney']['Class'] = test_y['Romney']

In [18]:
# Inspect the Romney test sheet together with its predicted 'Class' column.
test_df['Romney']

Unnamed: 0,#,tweet,Class
0,1,Insidious!<e>Mitt Romney</e>'s Bain Helped Phi...,-1
1,2,Senior <e>Romney</e> Advisor Claims <e>Obama</...,1
2,3,.@WardBrenda @shortwave8669 @allanbourdius you...,-1
3,4,<e>Mitt Romney</e> still doesn't <a>believe</a...,-1


In [20]:
# Write the Romney predictions to output.xlsx, without index or header row.
with pd.ExcelWriter('output.xlsx') as writer: 
    test_df['Romney'].to_excel(writer, sheet_name='Romney', index=False, header=False)
    # Obama export is currently disabled; note that no 'Class' column is
    # assigned to test_df['Obama'] in this notebook.
    # test_df['Obama'].to_excel(writer, sheet_name='Obama', index=False, header=False)

# Testing

In [245]:
# Base pipeline; every step is replaced as a whole by the grid below.
# LinearSVC must be imported in the notebook's import cell (sklearn.svm).
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 2), max_features=5000, smooth_idf=True)),
    ('classifier', LinearSVC(random_state=5235253))
])

# Each dictionary is one complete (preprocessor, vectorizer, classifier)
# configuration. Every list has a single entry, so the search evaluates
# exactly one candidate per dictionary.
param_combos = [
    {
        'preprocessor': [TextPreprocessor(removeHashtags=True, removeMentions=True, stemWords=True, lemmatizeWords=True)],
        'vectorizer': [CountVectorizer(token_pattern=r'\S+', ngram_range=(1, 1), strip_accents='unicode', max_features=5000)],
        'classifier': [MultinomialNB(alpha=2.5)]
    },
    {
        'preprocessor': [TextPreprocessor(removeHashtags=True, removeMentions=True, stemWords=True, lemmatizeWords=True)],
        'vectorizer': [TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 5), max_features=2500, smooth_idf=True)],
        'classifier': [MultinomialNB(alpha=2)]
    },
    {
        'preprocessor': [TextPreprocessor(removeHashtags=True, stemWords=True)],
        'vectorizer': [TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 5), max_features=2500, smooth_idf=True)],
        'classifier': [ExtraTreesClassifier(random_state=5235253)]
    },
    {
        'preprocessor': [TextPreprocessor(removeHashtags=True, stemWords=True, lemmatizeWords=True)],
        'vectorizer': [TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 2), max_features=2500, smooth_idf=True)],
        'classifier': [LogisticRegression(random_state=5235253)]
    },
    {
        'preprocessor': [TextPreprocessor(removeHashtags=True, stemWords=True, lemmatizeWords=True)],
        'vectorizer': [TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 4), max_features=2500, smooth_idf=True)],
        'classifier': [RandomForestClassifier(random_state=5235253)]
    },
    {
        'preprocessor': [TextPreprocessor(removeHashtags=True, stemWords=True, lemmatizeWords=True)],
        'vectorizer': [TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 4), max_features=2500, smooth_idf=True)],
        'classifier': [BernoulliNB(alpha=1)]
    },
    {
        'preprocessor': [TextPreprocessor(removeHashtags=True, stemWords=True, lemmatizeWords=True)],
        'vectorizer': [TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 3), max_features=2500, smooth_idf=True)],
        'classifier': [SVC(random_state=5235253)]
    },
    {
        'preprocessor': [TextPreprocessor(removeHashtags=True, stemWords=True, lemmatizeWords=True)],
        'vectorizer': [TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 2), max_features=5000, smooth_idf=True)],
        'classifier': [LinearSVC(random_state=5235253)]
    }
]

# One independent copy of the base pipeline per candidate.
obama_pipeline = clone(pipeline)
romney_pipeline = clone(pipeline)

# 5-fold cross-validated comparison of the configurations, scored by accuracy.
obama_grid_search = GridSearchCV(obama_pipeline, param_combos, cv=5, scoring='accuracy', verbose=2)
romney_grid_search = GridSearchCV(romney_pipeline, param_combos, cv=5, scoring='accuracy', verbose=2)

In [None]:
# Run the search on the Obama training split, then evaluate the best
# estimator found on the held-out Obama split.
obama_grid_search.fit(X_train["Obama"], y_train["Obama"])
y_pred = obama_grid_search.best_estimator_.predict(X_test["Obama"])
print(classification_report(y_test["Obama"], y_pred))

In [None]:
# Re-evaluate the already fitted search on the held-out Obama split without
# running the search again. Note that y_pred is a plain array here, not a dict.
y_pred = obama_grid_search.best_estimator_.predict(X_test["Obama"])
print(classification_report(y_test["Obama"], y_pred))

In [256]:
# Show which classifier was used by each evaluated candidate.
obama_grid_search.cv_results_['param_classifier']

masked_array(data=[MultinomialNB(alpha=2.5), MultinomialNB(alpha=2),
                   ExtraTreesClassifier(random_state=5235253),
                   LogisticRegression(random_state=5235253),
                   RandomForestClassifier(random_state=5235253),
                   BernoulliNB(alpha=1), SVC(random_state=5235253),
                   LinearSVC(random_state=5235253)],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object)