In [1]:
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
import re

In [2]:
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv('twitter_general_training_data.csv', encoding='latin1', names = column_names, header = None)

In [3]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
def process_tweet(tweet):
    tweet = tweet.lower() # convert text to lower-case
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    return tweet

In [5]:
df.text = df.text.apply(process_tweet)

In [6]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"AT_USER URL - awww, that's a bummer. you shou..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,AT_USER i dived many times for the ball. manag...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"AT_USER no, it's not behaving at all. i'm mad...."


In [7]:
# Using built-in TfidVectoriser
stopwords_set = set(stopwords.words('english'))
vectoriser = TfidfVectorizer(use_idf = True, lowercase = True, strip_accents = 'ascii', stop_words = stopwords_set)

In [8]:
y = df.target

In [9]:
X = vectoriser.fit_transform(df.text)

In [10]:
print(y.shape)
print(X.shape)

(1600000,)
(1600000, 287118)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 37)

In [12]:
multinomial_clf = naive_bayes.MultinomialNB()
multinomial_clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
print('MultinomialNB score: {}%'.format(roc_auc_score(y_test, multinomial_clf.predict_proba(X_test)[:,1])))

MultinomialNB score: 0.8455591045965252%


In [14]:
from sklearn.linear_model import SGDClassifier

In [38]:
sgd_clf = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=1000, average=True)
sgd_clf.fit(X_train, y_train)



SGDClassifier(alpha=0.0001, average=True, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
       max_iter=1000, n_iter=None, n_iter_no_change=5, n_jobs=None,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       tol=None, validation_fraction=0.1, verbose=0, warm_start=False)

In [39]:
print('SGDClassifier score: {}%'.format(roc_auc_score(y_test, sgd_clf.decision_function(X_test))))

SGDClassifier score: 0.8538109460699008%
