#  Tweet Classifier

This tweet classifier uses a public tweet training / testing dataset created by Sentiment140

In [27]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from __future__ import division, print_function



#modeling includes
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

#persistance through joblib
from sklearn.externals import joblib

In [7]:
df = pd.read_csv('../../data/training.1600000.processed.noemoticon.csv', header=None, encoding='ISO-8859-1')
df.columns = ['polarity', 'tweet_id', 'date', 'query', 'user', 'text']

### Some Quick EDA

We're going to take a quick look at our training dataset.   

In [8]:
df.polarity.value_counts()

4    800000
0    800000
Name: polarity, dtype: int64

The authors of this dataset noted that positive tweets are labled 4 and negative tweets are labeled 0.  So, we have 800,000 positive tweets, and 800,000 negative tweets.   Lets look at some examples

In [9]:
df[df.polarity == 0].head()

Unnamed: 0,polarity,tweet_id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
df[df.polarity == 4].head()

Unnamed: 0,polarity,tweet_id,date,query,user,text
800000,4,1467822272,Mon Apr 06 22:22:45 PDT 2009,NO_QUERY,ersle,I LOVE @Health4UandPets u guys r the best!!
800001,4,1467822273,Mon Apr 06 22:22:45 PDT 2009,NO_QUERY,becca210,im meeting up with one of my besties tonight! ...
800002,4,1467822283,Mon Apr 06 22:22:46 PDT 2009,NO_QUERY,Wingman29,"@DaRealSunisaKim Thanks for the Twitter add, S..."
800003,4,1467822287,Mon Apr 06 22:22:46 PDT 2009,NO_QUERY,katarinka,Being sick can be really cheap when it hurts t...
800004,4,1467822293,Mon Apr 06 22:22:46 PDT 2009,NO_QUERY,_EmilyYoung,@LovesBrooklyn2 he has that effect on everyone


### Model Creation

In [11]:
def create_training_data():
    df = pd.read_csv('../../data/training.1600000.processed.noemoticon.csv', 
                     names=['polarity', 'tweet_id', 'date', 'query', 'user', 'text'],
                     header=None,
                     encoding='ISO-8859-1')
    df.drop(['tweet_id','date','query','user'], axis=1, inplace=True)
    df.polarity.replace(4,1, inplace=True )
    return df

In [12]:
def create_testing_data():
    df = pd.read_csv('../../data/testdata.manual.2009.06.14.csv', 
                     names=['polarity', 'tweet_id', 'date', 'query', 'user', 'text'],
                     header=None,
                     encoding='ISO-8859-1')
    df.drop(['tweet_id','date','query','user'], axis=1, inplace=True)
    df.polarity.replace(4,1, inplace=True )
    return df

In [13]:
training_tweets = create_training_data()
y_train = training_tweets.pop('polarity')

In [14]:
testing_tweets = create_testing_data()
neutral_tests = testing_tweets[testing_tweets.polarity == 2]
testing_tweets = testing_tweets[testing_tweets.polarity.isin([0,1])] 
y_test = testing_tweets.pop('polarity')

In [15]:
def clean_tweets(x):
    x = x.lower()  #lowercase everything
    x = re.sub('@[a-z_-]+',"",x) #remove handles
    x = re.sub('[0-9]+',"",x)  #remove numbers
    x = re.sub('(https?):\/\/(www\.)?[a-z0-9\.:].*?(?=\s)', "",x) #remove urls
    x = re.sub('&[a-z]+','',x)  #remove html punctuations &amp, etc...
    return x

training_tweets.text = training_tweets.text.map(clean_tweets)
testing_tweets.text = testing_tweets.text.map(clean_tweets)

In [16]:
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, encoding='ISO-8859-1',
                             lowercase=True, strip_accents='ascii', stop_words=stopset,
                            min_df = 3)

In [17]:
X_train = vectorizer.fit_transform(training_tweets.text)

In [18]:
X_test = vectorizer.transform(testing_tweets.text)

In [19]:
#we will train a naive_bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])


0.87725833488545346

### Create an object package that can be reused/pickled

In [22]:
class tweet_classifier(object):
    vectorizer = ""
    classifier = ""
    
    def __init__(self, vectorizer, classifier):
        self.vectorizer = vectorizer
        self.classifier = classifier
    
    def clean_tweets(x):
        x = x.lower()  #lowercase everything
        x = re.sub('@[a-z_-]+',"",x) #remove handles
        x = re.sub('[0-9]+',"",x)  #remove numbers
        x = re.sub('(https?):\/\/(www\.)?[a-z0-9\.:].*?(?=\s)', "",x) #remove urls
        x = re.sub('&[a-z]+','',x)  #remove html punctuations &amp, etc...
        return x
    
    

In [37]:
joblib.dump(vectorizer, "vectorizer.joblib", compress=3 )
joblib.dump(clf, "bayes_classifier.joblib", compress=3 )

['bayes_classifier.joblib']

In [38]:
test_obj = joblib.load( "vectorizer.joblib")

In [39]:
type(test_obj)

sklearn.feature_extraction.text.TfidfVectorizer