In [None]:
import tweepy
import dataset
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from configparser import ConfigParser
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import metrics
%pylab inline

### Setting up our database

In [None]:
tweet_db = dataset.connect('sqlite:///../data/tweets_bk.db')

In [None]:
tweet_db['tweets']

In [None]:
table = tweet_db['tweets']

In [None]:
!ls ../data

### Setting up Tweepy to collect from the Twitter API

If you don't already have a Twitter API account, you will need to create one to get your consumer key and secret.

Then you can create a configuration file and put it in your `config` folder and name it `prod.cfg`. The format should follow the `example.cfg`.

For now, you can leave the access tokens blank until you have completed the first OAuth.

In [None]:
config_path = '../config/prod.cfg'
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail loudly here instead of hitting a
# confusing NoSectionError on the lookups below.
if not config.read(config_path):
    raise FileNotFoundError(
        "Could not read %s - copy ../config/example.cfg to that path "
        "and fill in your Twitter keys." % config_path)
consumer_token = config.get('twitter', 'consumer_key')
consumer_secret = config.get('twitter', 'consumer_secret')

In [None]:
!cat ../config/example.cfg

In [None]:
auth = tweepy.OAuthHandler(consumer_token, consumer_secret)

### The following lines are for if you DON'T ALREADY HAVE an authorization token and secret and you'd like to set them up 

i.e. do this later if you'd like !

You can then add your access_token and access_token_secret to your config file.

In [None]:
redirect_url = auth.get_authorization_url()
redirect_url

In [None]:
auth.get_access_token("VALUE HERE")

In [None]:
# Displays the access token so it can be copied into the config file.
# NOTE: this output is a credential - clear it before sharing the notebook.
auth.access_token

In [None]:
# Displays the access token secret so it can be copied into the config file.
# NOTE: this output is a credential - clear it before sharing the notebook.
auth.access_token_secret

#### If you have access tokens and want to use them...

NOTE: You do *not* need to do this to use this notebook. :)

In [None]:
access_token = config.get('twitter', 'access_token')
access_token_secret = config.get('twitter', 'access_token_secret')
auth.set_access_token(access_token, access_token_secret)

In [None]:
api = tweepy.API(auth)

In [None]:
trump = api.get_user('RealDonaldTrump')
trudeau = api.get_user('JustinTrudeau')

In [None]:
trump

In [None]:
latest_tweets = api.user_timeline(trump.id, count=200)

In [None]:
type(latest_tweets)

In [None]:
len(latest_tweets)

In [None]:
tweet = latest_tweets[0]

In [None]:
# Tip: type `tweet.` and press <Tab> to explore the attributes of a status object.
# (A bare `tweet.` is a SyntaxError, so show one concrete attribute instead.)
tweet.text

In [None]:
# Store only statuses authored by the account itself, labelled with the
# display name reported by the API so the 'author' value matches the labels
# used later when evaluating the classifiers (trump.name).
for status in latest_tweets:
    if status.author.id == trump.id:
        table.insert({'author': trump.name,
                      'status': status.text})

### Exercise

Can you write a function `add_latest_tweets_to_db` that takes an author, an API instance and a database table, and asks the API for that user's latest tweets? For each tweet, it should check that the tweet's author id matches the user's id and, if so, insert it into the table using the author's name and the tweet text (similar to the code above, but more abstracted).

In [None]:
%load ../solutions/tweets_to_db.py



Since we don't have much in our DB, we can now delete from the table and enter only tweets that use this abstracted function.

In [None]:
table.delete()

In [None]:
add_latest_tweets_to_db(trump, api, table)

In [None]:
add_latest_tweets_to_db(trudeau, api, table)

In [None]:
tweets = tweet_db.query('select * from tweets limit 2;')

In [None]:
[t for t in tweets]

### We might want to do some preprocessing, but let's take a look at what results we get without

In [None]:
tweet_df = pd.read_sql_table('tweets', tweet_db.url)

In [None]:
tweet_df.head()

### Exercise

- create y and set it equal to the author series. This is our label
- create X_train, X_test, y_train, y_test using the train_test_split and the y and status series. Use random_state=53 and a test_size of .33.

In [None]:
%load ../solutions/tweet_df_to_train.py


### Setting up our vectors 

In [None]:
# Bag-of-words token counts, with scikit-learn's built-in English stop words removed.
count_vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from the training texts only ...
count_train = count_vectorizer.fit_transform(X_train)
# ... and reuse that same vocabulary to encode the held-out texts.
count_test = count_vectorizer.transform(X_test)

In [None]:
# TF-IDF weighting; max_df=0.9 ignores terms that occur in more than 90% of the training documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
# Fit vocabulary and IDF weights on the training texts only ...
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
# ... then apply the fitted vectorizer to the held-out texts.
tfidf_test = tfidf_vectorizer.transform(X_test)

### Introspecting our vectors

In [None]:
tfidf_vectorizer.get_feature_names()[-10:]

In [None]:
count_vectorizer.get_feature_names()[:10]

### Discussion

- What is going on here? How might we fix it?

In [None]:
# Rebuild the TF-IDF vectors, now also dropping rare terms: min_df=0.1 keeps
# only terms that occur in at least 10% of the training documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.1, max_df=0.9)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
tfidf_vectorizer.get_feature_names()[-10:]

### Exercise

Experiment with min_df and choose a number that seems to allow for enough tokens, while still removing noise. Set that value to `min_df`.

In [None]:
%load ../solutions/min_df.py


In [None]:
# Final TF-IDF vectors, using the min_df value chosen in the exercise cell above.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=min_df, max_df=0.9)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Final count vectors, built with the same min_df / max_df cut-offs as the TF-IDF vectorizer.
count_vectorizer = CountVectorizer(stop_words='english', min_df=min_df, max_df=0.9)
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
count_vectorizer.get_feature_names()[:10]

In [None]:
tfidf_vectorizer.get_feature_names()[-10:]

### Discussion

- Can you see any other issues with the text? 
- What other preprocessing might you want to do?

### Evaluating several models

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a status line and draw a confusion matrix as an annotated heat map.

    See full source and example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are plotted as the true label and columns as
        the predicted label.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        If True, each row is divided by its sum before plotting, so both the
        colours and the cell annotations show per-true-class rates.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to `plt.imshow`.

    Returns
    -------
    None. Draws on the current matplotlib figure.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the heat map and the annotations agree.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no samples stay at 0 instead of becoming NaN (0 / 0).
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[i, j]
        # Two decimals for rates; raw value for counts.
        label = '%.2f' % value if normalize else '%s' % value
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if value > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

## Multinomial Naive Bayes

In [None]:
tfidf_nb = MultinomialNB()
tfidf_nb.fit(tfidf_train, y_train)
pred = tfidf_nb.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
# Take the label order from the fitted model instead of hard-coding author
# names, so the matrix always matches whatever labels are stored in the data.
cm = metrics.confusion_matrix(y_test, pred, labels=tfidf_nb.classes_)
plot_confusion_matrix(cm, classes=tfidf_nb.classes_)

### Exercise

- Make the CountVectorizer based MultinomialNB model
- Fit it and predict
- Print the accuracy
- Plot the confusion matrix

In [None]:
%load ../solutions/multinomial_count.py


## LinearSVC

In [None]:
tfidf_svc = LinearSVC()
tfidf_svc.fit(tfidf_train, y_train)
pred = tfidf_svc.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
# Use the labels the model was trained on rather than the `trump` / `trudeau`
# API objects, so this cell also runs when the Twitter API steps were skipped.
cm = metrics.confusion_matrix(y_test, pred, labels=tfidf_svc.classes_)
plot_confusion_matrix(cm, classes=tfidf_svc.classes_)

### Exercise

- Make the CountVectorizer based LinearSVC model
- Fit it and predict
- Print the accuracy
- Plot the confusion matrix

In [None]:
%load ../solutions/linear_svc_count.py

## Passive Aggressive Classifier

### Exercise

- Make the CountVectorizer based PassiveAggressiveClassifier model
- Fit it and predict
- Print the accuracy
- Plot the confusion matrix

In [None]:
%load ../solutions/pa_models.py



### Exercise:

- Choose one of the classifiers you built above and run parameter tuning on it
- Then, retrain and evaluate the model. How much did it improve by?

In [None]:
%load ../solutions/grid_search_count_nb.py


### Introspecting what our model has learned

In [None]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """
    See: https://stackoverflow.com/a/26980472

    Return the most heavily weighted features of a fitted binary classifier,
    grouped by class.

    Features are ranked by `classifier.coef_[0]` in ascending order. The `n`
    lowest-weighted features are reported under `classifier.classes_[0]` and
    the `n` highest-weighted under `classifier.classes_[1]`. Both lists stay in
    ascending weight order, so the strongest feature for the second class is
    the LAST entry of its list.

    Modified by @kjam to return a dict instead of printing.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing `get_feature_names_out()` or
        `get_feature_names()`.
    classifier : fitted binary classifier exposing `classes_` and `coef_`.
    n : int, default 100
        Number of features to return per class. Values <= 0 give empty lists.

    Returns
    -------
    dict mapping each of the two class labels to a list of
    `(coefficient, feature_name)` tuples.
    """
    # Newer scikit-learn releases expose get_feature_names_out(); fall back to
    # the older get_feature_names() when it is not available.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = names_getter()

    class_labels = classifier.classes_
    # Sort once and slice both ends.
    ranked = sorted(zip(classifier.coef_[0], feature_names))

    if n <= 0:
        # Guard: ranked[-0:] would be the WHOLE list, not an empty one.
        topn_class1, topn_class2 = [], []
    else:
        topn_class1 = ranked[:n]
        topn_class2 = ranked[-n:]

    return {class_labels[0]: topn_class1,
            class_labels[1]: topn_class2
    }



In [None]:
most_informative_feature_for_binary_classification(count_vectorizer, count_nb_final, n=30)

In [None]:
most_informative_feature_for_binary_classification(tfidf_vectorizer, tfidf_pa, n=30)

### Discussion

- Do any of these tokens look useful? Why? Why not?
- What can be done to improve the model?

### Bonus

- Add preprocessing to improve the tokens!