# Competition Twitter Sentiment Analysis

In this notebook, to predict the sentiment of a tweet, we use the algorithm : 

- Linear Support Vector Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV
from bs4 import BeautifulSoup
import re
import string
import nltk
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()
from sklearn.metrics import confusion_matrix
#import ConfusionMatrix
from pandas_ml import ConfusionMatrix
from sklearn.metrics import accuracy_score



#### Load DataSet : TRAIN & TEST

In [2]:
train = pd.read_csv("/Users/laurent/Documents/Data/TextClassificationGit/train_E6oV3lV.csv", encoding='ISO-8859-1')
test = pd.read_csv("/Users/laurent/Documents/Data/TextClassificationGit/test_tweets_anuFYb8.csv", encoding='ISO-8859-1')

## Function to clean data

In [3]:
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

#### Clean Train and Test Dataset

In [4]:
train.tweet = train.tweet.str.replace(r"can't", "cannot ")
train.tweet = train.tweet.str.replace(r"\'ve", " have ")
train.tweet = train.tweet.str.replace(r"\'s", " ")
train.tweet = train.tweet.str.replace(r"n't", " not ")
train.tweet = train.tweet.str.replace(r"i'm", "I am")
train.tweet = train.tweet.str.replace(r"\'re", " are ")
train.tweet = train.tweet.str.replace(r"\'d", " would ")
train.tweet = train.tweet.str.replace(r"\rly", " really ")
train.tweet = train.tweet.str.replace(r"\ gp", " grand prix ")
train.tweet = train.tweet.str.replace(r"\ yeeesss", " yes ")
train.tweet = train.tweet.str.replace(r"\ pt", " point ")

test.tweet = test.tweet.str.replace(r"can't", "cannot ")
test.tweet = test.tweet.str.replace(r"\'ve", " have ")
test.tweet = test.tweet.str.replace(r"\'s", " ")
test.tweet = test.tweet.str.replace(r"n't", " not ")
test.tweet = test.tweet.str.replace(r"i'm", "I am")
test.tweet = test.tweet.str.replace(r"\'re", " are ")
test.tweet = test.tweet.str.replace(r"\'d", " would ")
test.tweet = test.tweet.str.replace(r"\rly", " really ")
test.tweet = test.tweet.str.replace(r"\ gp", " grand prix ")
test.tweet = test.tweet.str.replace(r"\ yeeesss", " yes ")
test.tweet = test.tweet.str.replace(r"\ pt", " point ")

train_label = train['label'] == 1
df_try = train[train_label]
train = train.append([df_try]*10, ignore_index = True)

train.sample(frac=1)

training = train.tweet
dftext = []
for t in training:
    dftext.append(tweet_cleaner(t))
    
testing = test.tweet
dftest = []
for t in testing:
    dftest.append(tweet_cleaner(t))

Transfrom Train label to list

In [5]:
dfList = train['label'].tolist()

Merge Train tweet and label in dataframe

In [6]:
dataset = sklearn.datasets.base.Bunch(data=dftext, target=dfList)
train = pd.DataFrame(dataset)
train.columns = ['tweet','label']

## Build Model with pipeline

In [26]:
pipeline = Pipeline([
    ('count_vectorizer',   CountVectorizer(stop_words = 'english', lowercase=True,binary= False,max_df= 0.5,max_features= 50000, ngram_range = (1, 2))),
    ('tfidf_transformer',  TfidfTransformer(norm = 'l2', use_idf = True, sublinear_tf=True)),
    ('classifier',         LinearSVC(tol = 0.1, max_iter = 100))
])

## Test model with Train Dataset

In [27]:
%%time
k_fold = KFold(n=len(train), n_folds=6)
scores = []
confusion = np.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = train.iloc[train_indices]['tweet'].values
    train_y = train.iloc[train_indices]['label'].values

    test_text = train.iloc[test_indices]['tweet'].values
    test_y = train.iloc[test_indices]['label'].values

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, average= 'binary')
    scores.append(score)

print('Total tweets classified:', len(train))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)

Total tweets classified: 54382
Score: 0.95854969325
Confusion matrix:
[[29357   363]
 [   65 24597]]
CPU times: user 11.2 s, sys: 300 ms, total: 11.5 s
Wall time: 11.6 s


## Make predictions with Test Dataset

In [28]:
predictions = pipeline.predict(dftest)
predictions = pd.DataFrame(predictions)

## Download Predictions

In [29]:
frames = [test, predictions]
submission = pd.concat(frames, axis=1, join_axes=[test.index])
submission
submission = submission[['id',0]]
submission.describe()
submission.columns = ['id', 'label']
submission = submission[['id', 'label']]
submission.to_csv('sub21.csv')

In [24]:
pipeline = Pipeline([
    ('vect',   CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('clf',  LinearSVC())
])
parameters = {  
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),  
    'clf__tol' : (0.1, 0.01, 0.001, 0.0001),
    'clf__max_iter' : (100, 500, 1000, 2000)
    } 

   

In [25]:
%%time
if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    #t0 = time()
    grid_search.fit(train.tweet, train.label)
    #print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__tol': (0.1, 0.01, 0.001, 0.0001), 'clf__max_iter': (100, 500, 1000, 2000)}
Fitting 3 folds for each of 1536 candidates, totalling 4608 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 31.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 40.6min
[Parallel(n_jobs=-1)]: Done 4608 out of 4608 | elapsed: 46.5min finished



Best score: 0.994
Best parameters set:
	clf__max_iter: 100
	clf__tol: 0.1
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: 50000
	vect__ngram_range: (1, 2)
CPU times: user 1min 46s, sys: 23.4 s, total: 2min 9s
Wall time: 46min 53s
