In [1]:
import os

import pandas as pd

# Location of the labelled tweet CSV. The original machine-specific path stays
# the default; set the LABELLED_TWEETS_CSV environment variable to point the
# notebook at a copy of the file elsewhere without editing this cell.
LABELLED_CSV_PATH = os.environ.get(
    "LABELLED_TWEETS_CSV",
    "C:\\Users\\Kaumudi\\Documents\\kmd\\Sentiment Analysis\\LinearSVC\\labelled.csv",
)

tweets = pd.read_csv(LABELLED_CSV_PATH, encoding="ISO-8859-1")
# Display the column names as a plain list.
list(tweets.columns.values)

['tweet_text', 'tweet_emotion']

In [2]:
# Tally the tweets per emotion label and count the non-null tweet texts.
sentiment_counts = tweets["tweet_emotion"].value_counts()
number_of_tweets = tweets["tweet_text"].count()
print(sentiment_counts)

No emotion toward brand or product    5384
Positive emotion                      3476
Negative emotion                      1070
I can't tell                           156
Name: tweet_emotion, dtype: int64


In [3]:
# Drop the rows labelled "I can't tell". The explicit .copy() makes the result
# an independent frame, so the column assignments made in later cells do not
# trigger pandas' SettingWithCopyWarning on a filtered slice.
tweets = tweets[tweets.tweet_emotion != 'I can\'t tell'].copy()
tweets.shape

(9930, 2)

In [4]:
# Recompute the per-label tally and the non-null text count on the current frame.
sentiment_counts = tweets["tweet_emotion"].value_counts()
number_of_tweets = tweets["tweet_text"].count()
print(sentiment_counts)

No emotion toward brand or product    5384
Positive emotion                      3476
Negative emotion                      1070
Name: tweet_emotion, dtype: int64


In [5]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared resources used by the tweet normaliser: the English stop-word set
# (a set, for fast membership tests) and a single reusable lemmatizer.
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

In [6]:
def normalizer(tweet):
    """Turn one tweet into a list of lower-cased, lemmatized content words.

    The input is coerced with ``str()``, every character outside ``a-zA-Z`` is
    replaced by a space, the result is tokenized with ``nltk.word_tokenize``,
    tokens are lower-cased, stop words are dropped, and the remaining tokens
    are lemmatized.

    Args:
        tweet: The tweet text; any value is accepted because it is passed
            through ``str()`` first.

    Returns:
        list: The lemmas in their original order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", str(tweet))
    lemmas = []
    for token in nltk.word_tokenize(letters_only):
        word = token.lower()
        # Stop words are matched after lower-casing.
        if word in stop_words:
            continue
        lemmas.append(wordnet_lemmatizer.lemmatize(word))
    return lemmas

In [7]:
# Show the full content of every cell. None is the supported "no limit" value
# for this option; the old -1 sentinel has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
# Add the token list produced by normalizer() for each tweet, then preview it
# next to the raw text.
tweets['normalized_tweet'] = tweets.tweet_text.apply(normalizer)
tweets[['tweet_text','normalized_tweet']].head()

Unnamed: 0,tweet_text,normalized_tweet
0,So there is no way for me to plug it in here in the US unless I go by a converter.,"[way, plug, u, unless, go, converter]"
1,Good case Excellent value.,"[good, case, excellent, value]"
2,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,"[tied, charger, conversation, lasting, minute, major, problem]"
3,The mic is great.,"[mic, great]"
4,I have to jiggle the plug to get it to line up right to get decent volume.,"[jiggle, plug, get, line, right, get, decent, volume]"


In [8]:
from nltk import ngrams
def ngrams(input_list):
    """Build the space-joined bigrams and trigrams of a token sequence.

    Note: this definition replaces the ``ngrams`` name imported from ``nltk``
    on the line just above it in this cell.

    Args:
        input_list: Sliceable sequence of string tokens.

    Returns:
        list: All bigrams in order, followed by all trigrams in order. Inputs
        shorter than two tokens give an empty list.
    """
    grams = []
    for size in (2, 3):
        # Zipping the sequence against its own shifted copies yields every
        # run of `size` consecutive tokens.
        shifted = [input_list[offset:] for offset in range(size)]
        grams.extend(' '.join(window) for window in zip(*shifted))
    return grams
# Store the bigram/trigram list of every normalised tweet and preview it.
tweets['grams'] = tweets['normalized_tweet'].apply(ngrams)
tweets[['grams']].head()

Unnamed: 0,grams
0,"[way plug, plug u, u unless, unless go, go converter, way plug u, plug u unless, u unless go, unless go converter]"
1,"[good case, case excellent, excellent value, good case excellent, case excellent value]"
2,"[tied charger, charger conversation, conversation lasting, lasting minute, minute major, major problem, tied charger conversation, charger conversation lasting, conversation lasting minute, lasting minute major, minute major problem]"
3,[mic great]
4,"[jiggle plug, plug get, get line, line right, right get, get decent, decent volume, jiggle plug get, plug get line, get line right, line right get, right get decent, get decent volume]"


In [9]:
import collections
def count_words(input):
    """Count every item across an iterable of iterables.

    Args:
        input: Iterable of rows, where each row is itself an iterable of
            hashable items (for example lists of n-gram strings).

    Returns:
        collections.Counter: Number of occurrences of each item over all rows.
    """
    # Flatten the rows lazily and let Counter do the tallying.
    return collections.Counter(word for row in input for word in row)

In [10]:
# 20 most frequent bigrams/trigrams among tweets labelled 'Positive emotion'.
count_words(tweets.loc[tweets.tweet_emotion == 'Positive emotion', 'grams']).most_common(20)

[('rt mention', 908),
 ('sxsw link', 314),
 ('apple store', 226),
 ('mention sxsw', 224),
 ('link sxsw', 170),
 ('mention mention', 151),
 ('iphone app', 136),
 ('pop store', 132),
 ('store sxsw', 122),
 ('ipad sxsw', 120),
 ('via mention', 113),
 ('mention google', 105),
 ('austin sxsw', 103),
 ('sxsw apple', 101),
 ('mention apple', 91),
 ('sxsw ipad', 89),
 ('google map', 87),
 ('social network', 86),
 ('apple pop', 79),
 ('link via', 79)]

In [11]:
# 20 most frequent bigrams/trigrams among tweets labelled 'Negative emotion'.
count_words(tweets.loc[tweets.tweet_emotion == 'Negative emotion', 'grams']).most_common(20)

[('rt mention', 138),
 ('mention sxsw', 33),
 ('apple store', 25),
 ('sxsw link', 24),
 ('link sxsw', 24),
 ('iphone app', 23),
 ('mention google', 23),
 ('ipad design', 20),
 ('mention mention', 20),
 ('ipad sxsw', 18),
 ('rt mention google', 18),
 ('design headache', 17),
 ('google circle', 16),
 ('ipad design headache', 16),
 ('new social', 16),
 ('mention quot', 16),
 ('social network', 15),
 ('quot sxsw', 15),
 ('iphone sxsw', 15),
 ('news apps', 13)]

In [12]:
# 20 most frequent bigrams/trigrams among tweets labelled
# 'No emotion toward brand or product'.
count_words(tweets.loc[tweets.tweet_emotion == 'No emotion toward brand or product', 'grams']).most_common(20)

[('rt mention', 1808),
 ('sxsw link', 577),
 ('link sxsw', 575),
 ('mention mention', 433),
 ('social network', 350),
 ('apple store', 344),
 ('mention sxsw', 334),
 ('mention google', 328),
 ('new social', 319),
 ('new social network', 298),
 ('google launch', 252),
 ('network called', 252),
 ('social network called', 252),
 ('called circle', 244),
 ('network called circle', 240),
 ('via mention', 226),
 ('major new', 225),
 ('major new social', 225),
 ('launch major', 219),
 ('launch major new', 219)]

In [17]:
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer that counts both unigrams and bigrams.
count_vectorizer = CountVectorizer(ngram_range=(1, 2))

In [18]:
# Learn the vocabulary from the raw tweet text (cast to str) and build the
# document-term matrix in one step.
# NOTE(review): the vocabulary is fitted on every row here, before the
# train/test split made further down, so test tweets contribute to it.
vectorized_data = count_vectorizer.fit_transform(
    tweets['tweet_text'].astype(str)
)

In [19]:
import joblib

# Persist the fitted vectorizer so the same vocabulary can be reloaded later.
file = 'count_vectBig.sav'
joblib.dump(value=count_vectorizer, filename=file)

['count_vectBig.sav']

In [20]:
# Prepend each row's position as an extra first column so it travels with the
# feature rows through the train/test split.
indexed_data = hstack(
    (
        np.array(range(vectorized_data.shape[0])).reshape(-1, 1),
        vectorized_data,
    )
)

In [21]:
# Display the (rows, columns) dimensions of the working frame.
tweets.shape

(9930, 4)

In [22]:
# Repeats the "I can't tell" filter applied in an earlier cell, then shows the
# resulting dimensions.
tweets = tweets.loc[tweets['tweet_emotion'] != 'I can\'t tell']
tweets.shape

(9930, 4)

In [23]:
# Preview the first five rows with all derived columns.
tweets.head()

Unnamed: 0,tweet_text,tweet_emotion,normalized_tweet,grams
0,So there is no way for me to plug it in here in the US unless I go by a converter.,Negative emotion,"[way, plug, u, unless, go, converter]","[way plug, plug u, u unless, unless go, go converter, way plug u, plug u unless, u unless go, unless go converter]"
1,Good case Excellent value.,Positive emotion,"[good, case, excellent, value]","[good case, case excellent, excellent value, good case excellent, case excellent value]"
2,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,Negative emotion,"[tied, charger, conversation, lasting, minute, major, problem]","[tied charger, charger conversation, conversation lasting, lasting minute, minute major, major problem, tied charger conversation, charger conversation lasting, conversation lasting minute, lasting minute major, minute major problem]"
3,The mic is great.,Positive emotion,"[mic, great]",[mic great]
4,I have to jiggle the plug to get it to line up right to get decent volume.,Negative emotion,"[jiggle, plug, get, line, right, get, decent, volume]","[jiggle plug, plug get, get line, line right, right get, get decent, decent volume, jiggle plug get, plug get line, get line right, line right get, right get decent, get decent volume]"


In [24]:
def sentiment2target(sentiment):
    """Map an emotion label to its integer class id.

    Args:
        sentiment: One of 'Negative emotion',
            'No emotion toward brand or product' or 'Positive emotion'.

    Returns:
        int: 0 for negative, 1 for no emotion, 2 for positive.

    Raises:
        KeyError: If ``sentiment`` is any other label.
    """
    # The position in this tuple is the class id.
    ordered_labels = (
        'Negative emotion',
        'No emotion toward brand or product',
        'Positive emotion',
    )
    label_to_target = {label: target for target, label in enumerate(ordered_labels)}
    return label_to_target[sentiment]
# Encode every tweet's emotion label as its integer class id.
targets = tweets['tweet_emotion'].apply(sentiment2target)

In [25]:
# Display the shape of the encoded target Series.
targets.shape

(9930,)

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation, with a fixed seed.
data_train, data_test, targets_train, targets_test = train_test_split(
    indexed_data,
    targets,
    test_size=0.2,
    random_state=0,
)

# Column 0 carries the row position prepended when indexed_data was built;
# split it off so only the n-gram counts remain as features.
data_train_index, data_train = data_train[:, 0], data_train[:, 1:]
data_test_index, data_test = data_test[:, 0], data_test[:, 1:]

In [27]:
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest wrapper around a linear-kernel SVC with balanced class weights
# and probability estimates enabled.
# NOTE(review): per scikit-learn's SVC documentation gamma applies to the
# rbf/poly/sigmoid kernels, so it looks inert with kernel='linear' - confirm.
clf = OneVsRestClassifier(
    svm.SVC(
        kernel='linear',
        C=1.,
        gamma='auto',
        class_weight='balanced',
        probability=True,
    )
)
clf_output = clf.fit(data_train, targets_train)

In [28]:
# Score the fitted classifier on the held-out split.
clf.score(X=data_test, y=targets_test)

0.6968781470292045

In [29]:
# Persist the trained classifier to disk.
file = 'trained_modelBig.sav'
joblib.dump(value=clf, filename=file)

['trained_modelBig.sav']