In [153]:
import pandas as pd
tweets = pd.read_csv("C:\\Users\\Kaumudi\\Documents\\kmd\\Sentiment Analysis\\LinearSVC\\labelled.csv",encoding = "ISO-8859-1")
list(tweets.columns.values)

['tweet_text', 'tweet_emotion']

In [154]:
sentiment_counts = tweets.tweet_emotion.value_counts()
number_of_tweets = tweets.tweet_text.count()
print(sentiment_counts)

No emotion toward brand or product    5384
Positive emotion                      3476
Negative emotion                      1070
I can't tell                          156 
Name: tweet_emotion, dtype: int64


In [155]:
tweets = tweets[tweets.tweet_emotion != 'I can\'t tell']
tweets.shape

(9930, 2)

In [156]:
sentiment_counts = tweets.tweet_emotion.value_counts()
number_of_tweets = tweets.tweet_text.count()
print(sentiment_counts)

No emotion toward brand or product    5384
Positive emotion                      3476
Negative emotion                      1070
Name: tweet_emotion, dtype: int64


In [157]:
import re, nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

In [158]:
def normalizer(tweet):
    only_letters = re.sub("[^a-zA-Z]", " ",str(tweet)) 
    tokens = nltk.word_tokenize(only_letters)[2:]
    lower_case = [l.lower() for l in tokens]
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    return lemmas

In [159]:
pd.set_option('display.max_colwidth', -1) # Setting this so we can see the full content of cells
tweets['normalized_tweet'] = tweets.tweet_text.apply(normalizer)
tweets[['tweet_text','normalized_tweet']].head()

Unnamed: 0,tweet_text,normalized_tweet
0,So there is no way for me to plug it in here in the US unless I go by a converter.,"[way, plug, u, unless, go, converter]"
1,Good case Excellent value.,"[excellent, value]"
2,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,"[charger, conversation, lasting, minute, major, problem]"
3,The mic is great.,[great]
4,I have to jiggle the plug to get it to line up right to get decent volume.,"[jiggle, plug, get, line, right, get, decent, volume]"


In [160]:
from nltk import ngrams
def ngrams(input_list):
    #onegrams = input_list
    bigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:]))]
    trigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:], input_list[2:]))]
    return bigrams+trigrams
tweets['grams'] = tweets.normalized_tweet.apply(ngrams)
tweets[['grams']].head()

Unnamed: 0,grams
0,"[way plug, plug u, u unless, unless go, go converter, way plug u, plug u unless, u unless go, unless go converter]"
1,[excellent value]
2,"[charger conversation, conversation lasting, lasting minute, minute major, major problem, charger conversation lasting, conversation lasting minute, lasting minute major, minute major problem]"
3,[]
4,"[jiggle plug, plug get, get line, line right, right get, get decent, decent volume, jiggle plug get, plug get line, get line right, line right get, right get decent, get decent volume]"


In [161]:
import collections
def count_words(input):
    cnt = collections.Counter()
    for row in input:
        for word in row:
            cnt[word] += 1
    return cnt

In [162]:
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,3))

In [163]:
import joblib
file = 'count_vectBig.sav'
joblib.dump(count_vectorizer, file)

['count_vectBig.sav']

In [164]:
vectorized_data = count_vectorizer.fit_transform(tweets.tweet_text.astype(str))
indexed_data = hstack((np.array(range(0,vectorized_data.shape[0]))[:,None], vectorized_data))

In [165]:
tweets.shape

(9930, 4)

In [166]:
tweets = tweets[tweets.tweet_emotion != 'I can\'t tell']
tweets.shape

(9930, 4)

In [167]:
tweets.head(5)

Unnamed: 0,tweet_text,tweet_emotion,normalized_tweet,grams
0,So there is no way for me to plug it in here in the US unless I go by a converter.,Negative emotion,"[way, plug, u, unless, go, converter]","[way plug, plug u, u unless, unless go, go converter, way plug u, plug u unless, u unless go, unless go converter]"
1,Good case Excellent value.,Positive emotion,"[excellent, value]",[excellent value]
2,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,Negative emotion,"[charger, conversation, lasting, minute, major, problem]","[charger conversation, conversation lasting, lasting minute, minute major, major problem, charger conversation lasting, conversation lasting minute, lasting minute major, minute major problem]"
3,The mic is great.,Positive emotion,[great],[]
4,I have to jiggle the plug to get it to line up right to get decent volume.,Negative emotion,"[jiggle, plug, get, line, right, get, decent, volume]","[jiggle plug, plug get, get line, line right, right get, get decent, decent volume, jiggle plug get, plug get line, get line right, line right get, right get decent, get decent volume]"


In [168]:
def sentiment2target(sentiment):
    return {
        #'1': 0,
        'Negative emotion': 0,
        'No emotion toward brand or product': 1,
        #'I can\'t tell': 1,
        'Positive emotion' : 2,
        #1: 0,
        #'3': 1,
        #'5': 2,
        #'not_relevant': 1,
    }[sentiment]
targets = tweets.tweet_emotion.apply(sentiment2target)

In [169]:
targets.shape

(9930,)

In [174]:
from sklearn.model_selection import train_test_split
data_train, data_test, targets_train, targets_test = train_test_split(indexed_data, targets, test_size=0.2, random_state=0)
data_train_index = data_train[:,0]
data_train = data_train[:,1:]
data_test_index = data_test[:,0]
data_test = data_test[:,1:]

In [175]:
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
clf = OneVsRestClassifier(svm.SVC(gamma=0.01, C=10., probability=True, class_weight='balanced', kernel='linear'))
clf_output = clf.fit(data_train, targets_train)

In [176]:
clf.score(data_test, targets_test)

0.6928499496475328

In [177]:
file = 'trained_modelBig.sav'
joblib.dump(clf, file)

['trained_modelBig.sav']

In [178]:
targets.shape[0]

9930

In [181]:
targets.shape

(9930,)

In [182]:
targets

0        0
1        2
2        0
3        2
4        0
5        0
6        0
7        0
8        2
9        2
10       0
11       2
12       0
13       2
14       0
15       2
16       2
17       0
18       0
19       2
20       0
21       2
22       0
23       2
24       2
25       0
26       0
27       0
28       0
29       2
        ..
10054    1
10056    1
10057    2
10058    2
10059    1
10060    2
10061    2
10062    1
10064    1
10065    1
10066    1
10067    2
10068    2
10069    2
10070    1
10071    1
10072    2
10073    1
10074    2
10075    0
10076    1
10077    1
10078    1
10079    2
10080    1
10081    1
10082    2
10083    1
10084    1
10085    1
Name: tweet_emotion, Length: 9930, dtype: int64

In [179]:
targets.tail(10)

10076    1
10077    1
10078    1
10079    2
10080    1
10081    1
10082    2
10083    1
10084    1
10085    1
Name: tweet_emotion, dtype: int64

In [180]:
targets.head(10)

0    0
1    2
2    0
3    2
4    0
5    0
6    0
7    0
8    2
9    2
Name: tweet_emotion, dtype: int64