In [42]:
import pandas as pd
tweets = pd.read_csv("C:\\Users\\Kaumudi\\Documents\\kmd\\Sentiment Analysis\\LinearSVC\\labelled.csv",encoding = "ISO-8859-1")
list(tweets.columns.values)

['tweet_text', 'tweet_emotion']

In [43]:
sentiment_counts = tweets.tweet_emotion.value_counts()
number_of_tweets = tweets.tweet_text.count()
print(sentiment_counts)

No emotion toward brand or product    5384
Positive emotion                      3476
Negative emotion                      1070
I can't tell                          156 
Name: tweet_emotion, dtype: int64


In [44]:
tweets = tweets[tweets.tweet_emotion != 'I can\'t tell']
tweets.shape

(9930, 2)

In [45]:
sentiment_counts = tweets.tweet_emotion.value_counts()
number_of_tweets = tweets.tweet_text.count()
print(sentiment_counts)

No emotion toward brand or product    5384
Positive emotion                      3476
Negative emotion                      1070
Name: tweet_emotion, dtype: int64


In [46]:
import re, nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

In [47]:
def normalizer(tweet):
    only_letters = re.sub("[^a-zA-Z]", " ",str(tweet)) 
    tokens = nltk.word_tokenize(only_letters)[2:]
    lower_case = [l.lower() for l in tokens]
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    return lemmas

In [48]:
pd.set_option('display.max_colwidth', -1) # Setting this so we can see the full content of cells
tweets['normalized_tweet'] = tweets.tweet_text.apply(normalizer)
tweets[['tweet_text','normalized_tweet']].head()

Unnamed: 0,tweet_text,normalized_tweet
0,So there is no way for me to plug it in here in the US unless I go by a converter.,"[way, plug, u, unless, go, converter]"
1,Good case Excellent value.,"[excellent, value]"
2,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,"[charger, conversation, lasting, minute, major, problem]"
3,The mic is great.,[great]
4,I have to jiggle the plug to get it to line up right to get decent volume.,"[jiggle, plug, get, line, right, get, decent, volume]"


In [49]:
from nltk import ngrams
def ngrams(input_list):
    #onegrams = input_list
    bigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:]))]
    trigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:], input_list[2:]))]
    return bigrams+trigrams
tweets['grams'] = tweets.normalized_tweet.apply(ngrams)
tweets[['grams']].head()

Unnamed: 0,grams
0,"[way plug, plug u, u unless, unless go, go converter, way plug u, plug u unless, u unless go, unless go converter]"
1,[excellent value]
2,"[charger conversation, conversation lasting, lasting minute, minute major, major problem, charger conversation lasting, conversation lasting minute, lasting minute major, minute major problem]"
3,[]
4,"[jiggle plug, plug get, get line, line right, right get, get decent, decent volume, jiggle plug get, plug get line, get line right, line right get, right get decent, get decent volume]"


In [50]:
import collections
def count_words(input):
    cnt = collections.Counter()
    for row in input:
        for word in row:
            cnt[word] += 1
    return cnt

In [51]:
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,3))

In [52]:
import joblib
file = 'count_vectBig.sav'
joblib.dump(count_vectorizer, file)

['count_vectBig.sav']

In [53]:
vectorized_data = count_vectorizer.fit_transform(tweets.tweet_text.astype(str))
indexed_data = hstack((np.array(range(0,vectorized_data.shape[0]))[:,None], vectorized_data))

In [54]:
tweets.shape

(9930, 4)

In [55]:
tweets = tweets[tweets.tweet_emotion != 'I can\'t tell']
tweets.shape

(9930, 4)

In [56]:
tweets.head(5)

Unnamed: 0,tweet_text,tweet_emotion,normalized_tweet,grams
0,So there is no way for me to plug it in here in the US unless I go by a converter.,Negative emotion,"[way, plug, u, unless, go, converter]","[way plug, plug u, u unless, unless go, go converter, way plug u, plug u unless, u unless go, unless go converter]"
1,Good case Excellent value.,Positive emotion,"[excellent, value]",[excellent value]
2,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,Negative emotion,"[charger, conversation, lasting, minute, major, problem]","[charger conversation, conversation lasting, lasting minute, minute major, major problem, charger conversation lasting, conversation lasting minute, lasting minute major, minute major problem]"
3,The mic is great.,Positive emotion,[great],[]
4,I have to jiggle the plug to get it to line up right to get decent volume.,Negative emotion,"[jiggle, plug, get, line, right, get, decent, volume]","[jiggle plug, plug get, get line, line right, right get, get decent, decent volume, jiggle plug get, plug get line, get line right, line right get, right get decent, get decent volume]"


In [57]:
def sentiment2target(sentiment):
    return {
        #'1': 0,
        'Negative emotion': 0,
        'No emotion toward brand or product': 1,
        #'I can\'t tell': 1,
        'Positive emotion' : 2,
        #1: 0,
        #'3': 1,
        #'5': 2,
        #'not_relevant': 1,
    }[sentiment]
targets = tweets.tweet_emotion.apply(sentiment2target)

In [58]:
targets.shape

(9930,)

In [59]:
from sklearn.model_selection import train_test_split
data_train, data_test, targets_train, targets_test = train_test_split(indexed_data, targets, test_size=0.2, random_state=0)
data_train_index = data_train[:,0]
data_train = data_train[:,1:]
data_test_index = data_test[:,0]
data_test = data_test[:,1:]

In [60]:
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
clf = OneVsRestClassifier(svm.SVC(gamma='auto', C=1., probability=True, class_weight='balanced', kernel='linear'))

In [61]:
clf.fit(data_train, targets_train)
targets_pred=clf.predict(data_test)

In [62]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(targets_test, targets_pred))

Accuracy: 0.6993957703927492


In [65]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=100)

In [66]:
clf.fit(data_train, targets_train)
targets_pred=clf.predict(data_test)

In [67]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(targets_test, targets_pred))

Accuracy: 0.6807653575025177


In [70]:
from sklearn.naive_bayes import MultinomialNB
clf = OneVsRestClassifier(MultinomialNB())

In [71]:
clf.fit(data_train, targets_train)
targets_pred=clf.predict(data_test)

In [72]:
print("Accuracy:",metrics.accuracy_score(targets_test, targets_pred))

Accuracy: 0.6626384692849949


In [74]:
from sklearn.ensemble import GradientBoostingClassifier
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(data_train, targets_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(data_train, targets_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(data_test, targets_test)))

Learning rate:  0.05
Accuracy score (training): 0.539
Accuracy score (validation): 0.557
Learning rate:  0.075
Accuracy score (training): 0.539
Accuracy score (validation): 0.557
Learning rate:  0.1
Accuracy score (training): 0.539
Accuracy score (validation): 0.557
Learning rate:  0.25
Accuracy score (training): 0.541
Accuracy score (validation): 0.557
Learning rate:  0.5
Accuracy score (training): 0.545
Accuracy score (validation): 0.560
Learning rate:  0.75
Accuracy score (training): 0.546
Accuracy score (validation): 0.560
Learning rate:  1
Accuracy score (training): 0.546
Accuracy score (validation): 0.560
