   ### SST: Logistic Regression and Linear SVM Classification with Bag of Words, Bigrams, and TF-IDF


In [None]:
# import required packages
import pandas as pd
import os
from pandas import Series, DataFrame

In [7]:
df = pd.DataFrame()  # NOTE(review): not referenced by any later cell visible here

# Load the phrase dictionary: one "phrase|phrase_ID" record per line, no header row.
# The path is assembled with os.path.join so the notebook also runs on
# non-Windows systems (hard-coded backslashes only resolve on Windows).
sentlex = pd.read_csv(
    os.path.join('input', 'stanfordSentimentTreebank', 'dictionary.txt'),
    sep="|",
    names=['phrase', 'phrase_ID'],
)
sentlex.head()

Unnamed: 0,phrase,phrase_ID
0,!,0
1,! ',22935
2,! '',18235
3,! Alas,179257
4,! Brilliant,22936


In [8]:
# Load the per-phrase sentiment scores ("phrase ids|sentiment values", header on the first line).
# The path is assembled with os.path.join so the notebook also runs on
# non-Windows systems (hard-coded backslashes only resolve on Windows).
raw_score = pd.read_csv(
    os.path.join('input', 'stanfordSentimentTreebank', 'sentiment_labels.txt'),
    sep="|",
)
raw_score.head()

Unnamed: 0,phrase ids,sentiment values
0,0,0.5
1,1,0.5
2,2,0.44444
3,3,0.5
4,4,0.42708


In [9]:
# Give the score table identifier-friendly headers; "phrase_ID" matches the key used in sentlex
raw_score = raw_score.rename(
    columns={
        "phrase ids": "phrase_ID",
        "sentiment values": "sentiment_value",
    }
)
raw_score.head()


Unnamed: 0,phrase_ID,sentiment_value
0,0,0.5
1,1,0.5
2,2,0.44444
3,3,0.5
4,4,0.42708


In [11]:
# Map every distinct sentiment score to one of five ratings.
# Bins: [0, 0.2] very negative, (0.2, 0.4] negative, (0.4, 0.6] neutral,
#       (0.6, 0.8] positive, (0.8, 1.0] very positive.
from collections import OrderedDict

# (inclusive upper bound, rating) pairs in ascending order of the bound
RATING_BINS = (
    (0.2, 'very negative'),
    (0.4, 'negative'),
    (0.6, 'neutral'),
    (0.8, 'positive'),
    (1.0, 'very positive'),
)


def rate_sentiment(score):
    """Return the rating label for a sentiment score in [0, 1].

    Raises:
        ValueError: if ``score`` is NaN or lies outside [0, 1]. A catch-all
            ``else`` would silently label such values 'very positive'.
    """
    # NaN fails both comparisons, so it is rejected here as well
    if not 0 <= score <= 1:
        raise ValueError('sentiment value out of range [0, 1]: %r' % (score,))
    for upper_bound, label in RATING_BINS:
        if score <= upper_bound:
            return label


x = list(raw_score.sentiment_value)

# Duplicate scores collapse onto a single key; keys stay in first-seen order
dic = OrderedDict((score, rate_sentiment(score)) for score in x)

In [13]:
# Turn the score -> rating lookup into a two-column frame so it can be joined on sentiment_value

rating_df = pd.DataFrame(
    [[score, label] for score, label in dic.items()]
).rename(columns={0: "sentiment_value", 1: "rating"})
rating_df.head()

Unnamed: 0,sentiment_value,rating
0,0.5,neutral
1,0.44444,neutral
2,0.42708,neutral
3,0.375,negative
4,0.41667,neutral


In [14]:
# Attach the textual rating to each phrase_ID by joining on the shared score column
rating_merged = raw_score.merge(rating_df, on='sentiment_value')
rating_merged.head()

Unnamed: 0,phrase_ID,sentiment_value,rating
0,0,0.5,neutral
1,1,0.5,neutral
2,3,0.5,neutral
3,17,0.5,neutral
4,18,0.5,neutral


In [15]:
# Join the phrase text with its score and rating via phrase_ID
sentlex_merged = sentlex.merge(rating_merged, on='phrase_ID')
sentlex_merged.head()

Unnamed: 0,phrase,phrase_ID,sentiment_value,rating
0,!,0,0.5,neutral
1,! ',22935,0.52778,neutral
2,! '',18235,0.5,neutral
3,! Alas,179257,0.44444,neutral
4,! Brilliant,22936,0.86111,very positive


In [19]:
# Class balance of the five ratings
sentlex_merged['rating'].value_counts()

neutral          119449
positive          50148
negative          43028
very positive     15255
very negative     11352
Name: rating, dtype: int64

In [21]:
from sklearn.preprocessing import LabelEncoder

# Features: the raw phrase text
X = sentlex_merged['phrase']

# Target: rating strings mapped to integer class ids (label encoding, not one-hot)
Label_encoded = LabelEncoder()
Y = Label_encoded.fit_transform(sentlex_merged['rating'].values)

In [23]:
# Feature and target lengths must agree; one shape per line
print(X.shape, Y.shape, sep='\n')

(239232,)
(239232,)


In [40]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the phrases for testing; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=3,
)


(167462,)
(167462,)
(71770,)
(71770,)


In [None]:
# Sizes of the train/test partitions, one shape per line
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape, sep='\n')

In [43]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# from sklearn.feature_extraction.text import CountVectorizer

# vect = CountVectorizer()


# # fit and transform X_train

# X_train_dtm = vect.fit_transform(X_train)
# X_test_dtm = vect.transform(X_test)

# LR = LogisticRegression()

# LR.fit(X_train_dtm, Y_train)

# Y_predicted = LR.predict(X_test_dtm)

# print ('accracy:', accuracy_score(Y_test, Y_predicted))

accracy: 0.63500069667


In [24]:
def tokenize_test_SST_LR(vect):
    """Vectorise the phrases with ``vect`` and print logistic-regression test accuracy.

    Reads the notebook-level ``X_train``, ``X_test``, ``Y_train`` and ``Y_test``.
    ``vect`` is fitted on the training phrases only and then reused to
    transform the test phrases. The accuracy is printed; nothing is returned.

    Parameters
    ----------
    vect : vectorizer exposing ``fit_transform`` and ``transform``
        (e.g. ``CountVectorizer`` or ``TfidfVectorizer``).
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score

    # learn the vocabulary on the training split, then apply it to the test split
    train_features = vect.fit_transform(X_train)
    test_features = vect.transform(X_test)

    # multi-class logistic regression (a classifier, despite the "LR" name)
    classifier = LogisticRegression().fit(train_features, Y_train)
    predictions = classifier.predict(test_features)

    # label text kept exactly as it appears in the recorded notebook outputs
    print('accracy:', accuracy_score(Y_test, predictions))

In [45]:
# Bag-of-words counts (CountVectorizer with default settings) scored with
# logistic regression; tokenize_test_SST_LR prints the test accuracy.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
tokenize_test_SST_LR(vect)

accracy: 0.63500069667


In [46]:
# Count both 1-grams and 2-grams (ngram_range=(1, 2)), then score with logistic regression
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test_SST_LR(vect)

accracy: 0.655817193814


In [47]:
# TF-IDF weighted term features (TfidfVectorizer with default settings) scored with logistic regression
# NOTE(review): the earlier title said "Average Embedding", but no embeddings are computed in this cell
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
tokenize_test_SST_LR(vect)

accracy: 0.623686777205


In [48]:
def tokenize_test_SST_svm(vect):
    """Vectorise the phrases with ``vect`` and print linear-SVM test accuracy.

    Reads the notebook-level ``X_train``, ``X_test``, ``Y_train`` and ``Y_test``.
    ``vect`` is fitted on the training phrases only and then reused to
    transform the test phrases. The accuracy is printed; nothing is returned.

    Parameters
    ----------
    vect : vectorizer exposing ``fit_transform`` and ``transform``
        (e.g. ``CountVectorizer``).
    """
    from sklearn import svm
    from sklearn.metrics import accuracy_score

    # learn the vocabulary on the training split, then apply it to the test split
    train_features = vect.fit_transform(X_train)
    test_features = vect.transform(X_test)

    # train a linear support vector classifier and predict the held-out labels
    classifier = svm.LinearSVC()
    classifier.fit(train_features, Y_train)
    predictions = classifier.predict(test_features)

    # label text kept exactly as it appears in the recorded notebook outputs
    print('accracy:', accuracy_score(Y_test, predictions))



In [49]:
# Bag-of-words counts (CountVectorizer with default settings) scored with a
# linear support vector classifier; tokenize_test_SST_svm prints the test accuracy.

from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
tokenize_test_SST_svm(vect)

accracy: 0.631879615438


In [51]:
# Count both 1-grams and 2-grams (ngram_range=(1, 2)), then score with the linear SVM
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test_SST_svm(vect)

accracy: 0.640016720078
