### Goal:

Using the previously created `44v45tweets.csv`, we'll train (at least) two ML models to predict whether the text of a tweet is directed at 44 (@BarackObama), 45 (@realDonaldTrump), or both.

In [None]:
#! pip install --user pandas_ml
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

#import nltk
#nltk.download('stopwords')

In [None]:
# Start from NLTK's English stop-word list and extend it with the two
# Twitter handles of interest.
handle_tokens = ['barackobama', 'realdonaldtrump']
stop_words = stopwords.words('english') + handle_tokens
# peek at the tail of the list to confirm the handles were appended
stop_words[-5:]

In [None]:
# Read the previously fetched tweets; the first CSV column becomes the index.
tweets_path = '44v45tweets.csv'
df = pd.read_csv(tweets_path, index_col=0)
df.head()

In [None]:
# Collapse the flag columns at_44 and at_45 into a single target column
# holding one of three values: 'both', 'trump', or 'obama'.
at_44 = df['at_44']
at_45 = df['at_45']
conditions = [
    at_44 & at_45,             # addressed to both accounts
    (at_44 == False) & at_45,  # addressed to 45 only
    at_44 & (at_45 == False),  # addressed to 44 only
]
choices = ['both', 'trump', 'obama']
# np.select uses `default` for rows that match none of the conditions. Its
# implicit default is the integer 0, which shares no dtype with the string
# choices (newer NumPy versions raise a TypeError; older ones silently
# produce the label '0'). Give such rows an explicit string label instead.
df['label'] = np.select(conditions, choices, default='neither')
df.head()

In [None]:
# Keep only what the models need: the target label and the tweet text.
# .copy() makes `data` a separate frame rather than a slice of `df`.
keep_cols = ['label', 'full_text']
data = df[keep_cols].copy()
data.head()

In [None]:
# Prepare the text data as token counts.
from sklearn.feature_extraction.text import CountVectorizer

# the full-text column as a plain list of strings
corpus = data['full_text'].tolist()
# the vectorizer is configured with the custom stop-word list
vectorizer = CountVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(corpus)
# shape of the count matrix: how many tweets, and how many unique,
# 'interesting' words were kept from the corpus
X.shape

In [None]:
# Target vector: the label column; preview the first few entries
Y = data.loc[:, 'label']
Y.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
from sklearn.model_selection import train_test_split

xTrain, xTest, yTrain, yTest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=12,
)

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
clf = nb_model.fit(xTrain, yTrain)

In [None]:
# Predict labels for the held-out split, then report the fraction that
# match the true labels (accuracy)
predicted = clf.predict(xTest)
(predicted == yTest).mean()

In [None]:
# Summarize the model's effectiveness with a confusion matrix.
# NOTE(review): pandas_ml looks unmaintained -- confirm it imports cleanly
# with the installed pandas / scikit-learn versions.
import matplotlib.pyplot as plt
from pandas_ml import ConfusionMatrix

# true test labels first, predicted labels second
cm = ConfusionMatrix(yTest, predicted)
cm

In [None]:
# Let's try a linear SVM: SGDClassifier with its default hinge loss.
from sklearn.linear_model import SGDClassifier

# SGD training is stochastic, so pin the seed (same value as the
# train/test split) to get the same model and accuracy on every
# Restart & Run All.
clf_svm = SGDClassifier(random_state=12).fit(xTrain, yTrain)
# NOTE: this overwrites the Naive Bayes predictions held in `predicted`
predicted = clf_svm.predict(xTest)
# fraction of held-out labels predicted correctly (accuracy)
np.mean(predicted == yTest)

In [None]:
# Rebuild the confusion matrix from whatever `predicted` currently holds,
# replacing the earlier `cm`.
cm = ConfusionMatrix(yTest, predicted)
cm