# Twitter Sentiment Analysis (2 labels)

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

def strip(text):
    t = text.lower()
    t = re.sub('\&amp;', ' ', t)
    t = re.sub('http\S+', ' ', t)
    t = re.sub('@\w+', ' ', t)
    t = re.sub("[^\w']", ' ', t)
    t = re.sub('\s+', ' ', t)
    return t

def explore(df):
    print('Observations:', len(df.index))
    for label in set(df.label):
        print('%.2f'%(len(df[df.label == label])/len(df.index)*100), '%', label)
    return

def load_dataset(path):
    columns = ['id', 'topic', 'label', 'text']
    df = pd.read_csv(path, sep='\t', header=None, names=columns)
    df = df[df.text != 'Not Available']
    df.reset_index(drop=True, inplace=True)
    df.text = df.text.apply(lambda x: strip(x))
    return df

In [43]:
train_df = load_dataset('/home/kostas/Desktop/Semester 2/text engineering analytics/assignments/assignment2/twitter_datasets/train_2_labels.csv')
explore(train_df)

Observations: 3422
83.02 % positive
16.98 % negative


In [44]:
dev_df = load_dataset('/home/kostas/Desktop/Semester 2/text engineering analytics/assignments/assignment2/twitter_datasets/dev_2_labels.csv')
explore(dev_df)

Observations: 1058
73.82 % positive
26.18 % negative


In [47]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate(pipeline, x_train, y_train, x_test, y_test):
    fit = pipeline.fit(x_train, y_train)
    y_pred = fit.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    prec_rec_fscore = precision_recall_fscore_support(y_test, y_pred)
    print(prec_rec_fscore)

def checker(vectorizers, max_features, stopwords, ngrams):
    result = []
    for vec in vectorizers:
        for n in max_features:
            for sw in stopwords:
                for ngram in ngrams:
                    print('Trying vec=',vec,'max_features=',n,'stopwords=',sw,'ngrams=',ngram)
                    vec.set_params(stop_words=sw, max_features=n, ngram_range=ngram)
                    pipeline = Pipeline([('vectorizer', vec),('classifier', LogisticRegression())])
                    evaluate(pipeline, train_df.text, train_df.label, dev_df.text, dev_df.label)

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizers = [CountVectorizer(), TfidfVectorizer()]
max_features = np.arange(1000, 9001, 500)
stopwords = [None, 'english']
ngrams = [(1,1), (1,2), (1,3)]

checker(vectorizers, max_features, stopwords, ngrams)

Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None) max_features= 1000 stopwords= None ngrams= (1, 1)
0.74763705104
(array([ 0.54716981,  0.76995798]), array([ 0.20938628,  0.93854033]), array([ 0.30287206,  0.84593191]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None) max_features= 1000 stopwords= None ng

0.74763705104
(array([ 0.54464286,  0.77167019]), array([ 0.22021661,  0.9346991 ]), array([ 0.31362468,  0.84539664]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=2000, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None) max_features= 2000 stopwords= english ngrams= (1, 1)
0.756143667297
(array([ 0.62666667,  0.76602238]), array([ 0.16967509,  0.96414853]), array([ 0.26704545,  0.8537415 ]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=2000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='eng

0.755198487713
(array([ 0.61538462,  0.76632653]), array([ 0.1732852 ,  0.96158771]), array([ 0.27042254,  0.85292447]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3000, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None) max_features= 3500 stopwords= None ngrams= (1, 1)
0.756143667297
(array([ 0.59793814,  0.77211238]), array([ 0.20938628,  0.95006402]), array([ 0.31016043,  0.85189437]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3500, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=N

0.753308128544
(array([ 0.5754717 ,  0.77310924]), array([ 0.22021661,  0.94238156]), array([ 0.31853786,  0.84939411]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=4500, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None) max_features= 4500 stopwords= english ngrams= (1, 1)
0.754253308129
(array([ 0.63076923,  0.76233635]), array([ 0.14801444,  0.96927017]), array([ 0.23976608,  0.85343856]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=4500, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='en

0.758979206049
(array([ 0.65714286,  0.76619433]), array([ 0.16606498,  0.96927017]), array([ 0.26512968,  0.85585076]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5500, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None) max_features= 6000 stopwords= None ngrams= (1, 1)
0.751417769376
(array([ 0.57954545,  0.76701031]), array([ 0.18411552,  0.95262484]), array([ 0.27945205,  0.84980011]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=6000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=N

0.75236294896
(array([ 0.57894737,  0.7694704 ]), array([ 0.19855596,  0.94878361]), array([ 0.29569892,  0.84977064]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=7000, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None) max_features= 7000 stopwords= english ngrams= (1, 1)
0.756143667297
(array([ 0.66101695,  0.76176176]), array([ 0.14079422,  0.97439181]), array([ 0.23214286,  0.85505618]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=7000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='eng

0.756143667297
(array([ 0.64615385,  0.7633434 ]), array([ 0.15162455,  0.97055058]), array([ 0.24561404,  0.85456595]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=8000, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None) max_features= 8500 stopwords= None ngrams= (1, 1)
0.74763705104
(array([ 0.55952381,  0.76386037]), array([ 0.16967509,  0.95262484]), array([ 0.26038781,  0.84786325]), array([277, 781]))
Trying vec= CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=8500, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=No

0.746691871456
(array([ 0.69565217,  0.74782609]), array([ 0.05776173,  0.99103713]), array([ 0.10666667,  0.85242291]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) max_features= 1000 stopwords= None ngrams= (1, 3)
0.74763705104
(array([ 0.75      ,  0.74759152]), array([ 0.05415162,  0.99359795]), array([ 0.1010101 ,  0.85321605]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_d

0.746691871456
(array([ 0.8       ,  0.74592522]), array([ 0.0433213 ,  0.99615877]), array([ 0.08219178,  0.85307018]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=2000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) max_features= 2000 stopwords= english ngrams= (1, 1)
0.742911153119
(array([ 0.85714286,  0.74215033]), array([ 0.02166065,  0.99871959]), array([ 0.04225352,  0.85152838]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=2000, m

0.743856332703
(array([ 0.875     ,  0.74285714]), array([ 0.02527076,  0.99871959]), array([ 0.04912281,  0.85199345]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) max_features= 3000 stopwords= english ngrams= (1, 2)
0.743856332703
(array([ 1.        ,  0.74239544]), array([ 0.02166065,  1.        ]), array([ 0.04240283,  0.85215494]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30

0.742911153119
(array([ 0.85714286,  0.74215033]), array([ 0.02166065,  0.99871959]), array([ 0.04225352,  0.85152838]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=4000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) max_features= 4000 stopwords= english ngrams= (1, 3)
0.742911153119
(array([ 0.85714286,  0.74215033]), array([ 0.02166065,  0.99871959]), array([ 0.04225352,  0.85152838]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=40

0.742911153119
(array([ 0.85714286,  0.74215033]), array([ 0.02166065,  0.99871959]), array([ 0.04225352,  0.85152838]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) max_features= 5500 stopwords= None ngrams= (1, 1)
0.744801512287
(array([ 0.73333333,  0.74496644]), array([ 0.03971119,  0.99487836]), array([ 0.07534247,  0.85197368]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5500,

0.743856332703
(array([ 0.6875    ,  0.74472169]), array([ 0.03971119,  0.99359795]), array([ 0.07508532,  0.85134394]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=6500, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) max_features= 6500 stopwords= None ngrams= (1, 2)
0.741965973535
(array([ 0.83333333,  0.74144487]), array([ 0.01805054,  0.99871959]), array([ 0.03533569,  0.85106383]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=6500, min_

0.741965973535
(array([ 0.83333333,  0.74144487]), array([ 0.01805054,  0.99871959]), array([ 0.03533569,  0.85106383]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=7500, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) max_features= 7500 stopwords= None ngrams= (1, 3)
0.742911153119
(array([ 0.77777778,  0.74261201]), array([ 0.02527076,  0.99743918]), array([ 0.04895105,  0.85136612]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=7500, min_

0.742911153119
(array([ 0.77777778,  0.74261201]), array([ 0.02527076,  0.99743918]), array([ 0.04895105,  0.85136612]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=8500, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) max_features= 8500 stopwords= english ngrams= (1, 1)
0.741965973535
(array([ 0.83333333,  0.74144487]), array([ 0.01805054,  0.99871959]), array([ 0.03533569,  0.85106383]), array([277, 781]))
Trying vec= TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=8500, m