### Read the dataset (tweets.csv)

In [49]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
#from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
from importlib import reload
import warnings
warnings.filterwarnings('ignore')
#import sys
#reload(sys)
#sys.setdefaultencoding('utf8')

In [50]:
def converttoutf8(a):
    return unicode(a, "utf-8")

In [51]:
# read yelp.csv into a DataFrame
data = pd.read_csv('tweets.csv', engine = 'python')

# create a new DataFrame that only contains the 5-star and 1-star reviews
#yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

# define X and y
#X = yelp_best_worst.text
#y = yelp_best_worst.stars

# split the new DataFrame into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [52]:
data.shape

(9093, 3)

In [53]:
data.head(5)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


##### Preprocess the text and add the preprocessed text in a column with name `text` in the dataframe.

In [54]:
def preprocess(text):
    try:
        return text.decode('ascii')
    except Exception as e:
        return text

In [55]:
data.tweet_text.shape

(9093,)

In [56]:
data["text"] = pd.DataFrame([preprocess(text) for text in data.tweet_text], index=data.index)

In [57]:
data.text.value_counts()

RT @mention Marissa Mayer: Google Will Connect the Digital &amp; Physical Worlds Through Mobile - {link} #sxsw                                             5
RT @mention Marissa Mayer: Google Will Connect the Digital &amp; Physical Worlds Through Mobile - {link} #SXSW                                             4
RT @mention Google to Launch Major New Social Network Called Circles, Possibly Today {link} #sxsw                                                          4
RT @mention Google to Launch Major New Social Network Called Circles, Possibly Today {link} #SXSW                                                          3
RT @mention RT @mention It's not a rumor: Apple is opening up a temporary store in downtown Austin for #SXSW and the iPad 2 launch {link}                  2
RT @mention RT @mention Google to Launch Major New Social Network Called Circles, Possibly Today {link} #sxsw                                              2
Oh. My. God. The #SXSW app for iPad is pure, unadulterated

In [58]:
data.text.head(50)

0     .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1     @jessedee Know about @fludapp ? Awesome iPad/i...
2     @swonderlin Can not wait for #iPad 2 also. The...
3     @sxsw I hope this year's festival isn't as cra...
4     @sxtxstate great stuff on Fri #SXSW: Marissa M...
5     @teachntech00 New iPad Apps For #SpeechTherapy...
6                                                   NaN
7     #SXSW is just starting, #CTIA is around the co...
8     Beautifully smart and simple idea RT @madebyma...
9     Counting down the days to #sxsw plus strong Ca...
10    Excited to meet the @samsungmobileus at #sxsw ...
11    Find &amp; Start Impromptu Parties at #SXSW Wi...
12    Foursquare ups the game, just in time for #SXS...
13    Gotta love this #SXSW Google Calendar featurin...
14    Great #sxsw ipad app from @madebymany: http://...
15    haha, awesomely rad iPad app by @madebymany ht...
16    Holler Gram for iPad on the iTunes App Store -...
17    I just noticed DST is coming this weekend.

### Consider only rows having Positive emotion and Negative emotion and remove other rows from the dataframe.

In [59]:
data1=data.copy(deep=True)

In [60]:
data1.is_there_an_emotion_directed_at_a_brand_or_product.unique()

array(['Negative emotion', 'Positive emotion',
       'No emotion toward brand or product', "I can't tell"], dtype=object)

In [61]:
#data = data.drop(data[(data.is_there_an_emotion_directed_at_a_brand_or_product != "Positive emotion")].index)
data2 = data1[~data1['is_there_an_emotion_directed_at_a_brand_or_product'].isin(["No emotion toward brand or product", "I can't tell"])]

In [62]:
data2.is_there_an_emotion_directed_at_a_brand_or_product.unique()

array(['Negative emotion', 'Positive emotion'], dtype=object)

In [63]:
data2.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,.@wesley83 I have a 3G iPhone. After 3 hrs twe...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@jessedee Know about @fludapp ? Awesome iPad/i...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@swonderlin Can not wait for #iPad 2 also. The...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@sxsw I hope this year's festival isn't as cra...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@sxtxstate great stuff on Fri #SXSW: Marissa M...


### Represent text as numerical data using `CountVectorizer` and get the document term frequency matrix

#### Use `vect` as the variable name for initialising CountVectorizer.

In [64]:
vect = CountVectorizer()
text_dtm=vect.fit_transform(data2.text)

In [65]:
text_dtm.shape

(3548, 6020)

### Find number of different words in vocabulary

In [66]:
#vect._count_vocab(data2.text,)
vect.get_feature_names()

['000',
 '02',
 '03',
 '0310apple',
 '08',
 '10',
 '100',
 '100s',
 '100tc',
 '101',
 '106',
 '10am',
 '10k',
 '10mins',
 '10pm',
 '10x',
 '11',
 '11ntc',
 '11th',
 '12',
 '120',
 '12b',
 '12th',
 '13',
 '130',
 '14',
 '1406',
 '1413',
 '1415',
 '15',
 '150',
 '1500',
 '150m',
 '157',
 '15am',
 '15k',
 '15slides',
 '16162',
 '169',
 '16gb',
 '16mins',
 '17',
 '188',
 '1986',
 '1990style',
 '1991',
 '1k',
 '1m',
 '1of',
 '1pm',
 '1st',
 '20',
 '200',
 '2010',
 '2011',
 '2012',
 '20s',
 '21',
 '210',
 '22',
 '23',
 '24',
 '25',
 '250k',
 '25th',
 '2am',
 '2b',
 '2day',
 '2honor',
 '2moro',
 '2nd',
 '2nite',
 '2s',
 '2yrs',
 '2åê',
 '30',
 '300',
 '3000',
 '30a',
 '30am',
 '30p',
 '30pm',
 '310409h2011',
 '32',
 '32gb',
 '35',
 '36',
 '360',
 '37',
 '3blks',
 '3d',
 '3g',
 '3gs',
 '3k',
 '3rd',
 '3x',
 '40',
 '400',
 '40min',
 '41',
 '437',
 '45',
 '45am',
 '47',
 '48',
 '4am',
 '4android',
 '4chan',
 '4g',
 '4nqv92l',
 '4sq',
 '4sq3',
 '4square',
 '50',
 '54',
 '55',
 '58',
 '59',
 '59p'

#### Tip: To see all available functions for an Object use dir

In [67]:
dir(vect)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_limit_features',
 '_sort_features',
 '_validate_vocabulary',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 'dtype',
 'encoding',
 'fit',
 'fit_transform',
 'fixed_vocabulary_',
 'get_feature_names',
 'get_params',
 'get_stop_words',
 'input',
 'inverse_transform',
 'lowercase',
 'max_df',
 'max_features',
 'min_df',
 'ngram_range',
 'preprocessor',
 'set_params',
 'stop_words',
 'stop_word

### Find out how many Positive and Negative emotions are there.

Hint: Use value_counts on that column

In [68]:
data2["is_there_an_emotion_directed_at_a_brand_or_product"].value_counts()

Positive emotion    2978
Negative emotion     570
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

### Change the labels for Positive and Negative emotions as 1 and 0 respectively.

Hint: use map on that column and give labels

In [69]:
data2['is_there_an_emotion_directed_at_a_brand_or_product'] = data2['is_there_an_emotion_directed_at_a_brand_or_product'].map({'Positive emotion': 1, 'Negative emotion': 0})

In [70]:
data2.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,1,@jessedee Know about @fludapp ? Awesome iPad/i...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,1,@swonderlin Can not wait for #iPad 2 also. The...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,0,@sxsw I hope this year's festival isn't as cra...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,1,@sxtxstate great stuff on Fri #SXSW: Marissa M...


### Define feature set as `text` column and above labels as target  and divide into train and test datasets

In [71]:
x=data2.text
y=data2.is_there_an_emotion_directed_at_a_brand_or_product

In [72]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.30)

## **Predicting the sentiment:**


### Use Naive Bayes and Logistic Regression and their accuracy scores for predicting the sentiment of the given text

In [80]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [82]:
X_train_dtm = vect.fit_transform(x_train)
X_test_dtm = vect.transform(x_test)

nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred_class))

0.863849765258216


In [85]:
from sklearn.linear_model import LogisticRegression
lm = LogisticRegression()

In [86]:
lm.fit(X_train_dtm,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [87]:
y_pred_class = lm.predict(X_test_dtm)

In [88]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred_class))

0.8788732394366198
