In [1]:
import sklearn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# from google.colab import drive 
# drive.mount('/content/gdrive')

df=pd.read_csv('sentiment_5_class.csv')

In [4]:
df

Unnamed: 0,Phrase,Sentiment
0,injects just enough freshness into the proceed...,3
1,that,2
2,never plays as dramatic even when dramatic thi...,0
3,"None of this is very original , and it is n't ...",0
4,", Madonna gives her best performance since Abe...",3
...,...,...
18384,"to balance pointed , often incisive satire and...",3
18385,have to be a most hard-hearted person not to b...,4
18386,could young romantics out on a date,3
18387,could be this good,3


## Data split
Dataset split in train and test set
KFold Cross Validation used to determine hyperparameters

In [5]:
X=df.Phrase.tolist()
print('No of instances',len(X))
y=df.Sentiment.tolist()
values, counts = np.unique(y, return_counts=True)
print(values)
print(counts)

No of instances 18389
[0 1 2 3 4]
[1235 1456 2345 8792 4561]


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=1,stratify=y)

In [8]:
print(len(X_train))
print(len(X_test))
# print(y_train)
# print(y_test)

14711
3678


## Feature Extraction

**Binary Feature Vectorization<br>
Count Feature Vectorization<br>
TF-IDF Feature Vectorization**

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
def Vectorizer(vectorizer):
    pass

#### Example:

In [11]:
data = ['Happy, happy','I am so happy','Sad sad', 'I am so sad']
c_vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w\\w*\\b')
t_vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w*\\b')

In [12]:
c_vectorizer.fit(data)
print(c_vectorizer.get_feature_names())

t_vectorizer.fit(data)
print(t_vectorizer.get_feature_names())

['am', 'happy', 'i', 'sad', 'so']
['am', 'happy', 'i', 'sad', 'so']


In [13]:
c_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w*\\b',
                tokenizer=None, vocabulary=None)

In [14]:
c_data_v=c_vectorizer.transform(data)  #Vector conversion

c_data_v.toarray()

array([[0, 2, 0, 0, 0],
       [1, 1, 1, 0, 1],
       [0, 0, 0, 2, 0],
       [1, 0, 1, 1, 1]])

In [15]:
t_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w*\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

### Data into vector is done only by transform
['am', 'happy', 'i', 'sad', 'so']
and its respective frequency

In [16]:
t_vectorizer = TfidfVectorizer()  # (token pattern ='(?u)\\b\\w\\w*\\b')
t_vectorizer.fit(data)
t_vectorizer.get_feature_names()

['am', 'happy', 'sad', 'so']

In [17]:
t_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [18]:
t_data_v=t_vectorizer.transform(data)
t_data_v.toarray()

array([[0.        , 1.        , 0.        , 0.        ],
       [0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.        , 0.        , 1.        , 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.57735027]])

### Count Vectorizer VS TF-IDF Vectorizer

In [19]:
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()
# Fit should be done only on training data BE CAREFULLLLL
count_vectorizer.fit(X_train)
tfidf_vectorizer.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [20]:
# COUNT VECTORIZER
X_train_count_v=count_vectorizer.transform(X_train)
X_test_count_v= count_vectorizer.transform(X_test)

#TF-IDF VECTORIZER
X_train_tfidf_v=tfidf_vectorizer.transform(X_train)
X_test_tfidf_v=tfidf_vectorizer.transform(X_test)

## Building Model

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, f1_score
from sklearn import metrics


grid_param={'kernel':('linear','rbf'),'C':(1,10)}


In [22]:
i=0
def SVM_Model(X_train,X_test,y_train,y_test,grid):
    model=SVC()
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    
    global i
    vectors=['Count Vectorizer','TF-IDF Vectorizer']
    print('Classification Report with {} \n '.format(vectors[i]))
    print(metrics.classification_report(y_test, y_pred))
    
    scorer=make_scorer(f1_score,average='micro')
    clf=GridSearchCV(SVC(),grid_param,scoring=scorer)
    clf.fit(X_train,y_train)
    print('Best Score',clf.best_score_,'with',clf.best_params_)
    x=clf.best_params_
    
    model=SVC(kernel=x['kernel'],C=x['C'],random_state=1)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    vectors=['Count Vectorizer','TF-IDF Vectorizer']
    print('\tClassification Report of best feature with {} \n'.format(vectors[i]))
    print(metrics.classification_report(y_test,y_pred))
    i=i+1

In [23]:
SVM_Model(X_train_count_v,X_test_count_v,y_train,y_test,grid_param)

Classification Report with Count Vectorizer 
 
              precision    recall  f1-score   support

           0       0.83      0.49      0.62       247
           1       0.69      0.31      0.43       291
           2       0.68      0.23      0.34       469
           3       0.61      0.91      0.73      1759
           4       0.77      0.51      0.62       912

    accuracy                           0.65      3678
   macro avg       0.72      0.49      0.55      3678
weighted avg       0.68      0.65      0.62      3678

Best Score 0.7002919739727378 with {'C': 10, 'kernel': 'rbf'}
	Classification Report of best feature with Count Vectorizer 

              precision    recall  f1-score   support

           0       0.77      0.63      0.69       247
           1       0.64      0.55      0.59       291
           2       0.66      0.73      0.69       469
           3       0.73      0.79      0.76      1759
           4       0.69      0.59      0.64       912

    accuracy 

In [24]:
SVM_Model(X_train_tfidf_v,X_test_tfidf_v,y_train,y_test,grid_param)

Classification Report with TF-IDF Vectorizer 
 
              precision    recall  f1-score   support

           0       0.83      0.54      0.66       247
           1       0.70      0.47      0.56       291
           2       0.67      0.51      0.58       469
           3       0.67      0.88      0.76      1759
           4       0.79      0.55      0.65       912

    accuracy                           0.70      3678
   macro avg       0.73      0.59      0.64      3678
weighted avg       0.71      0.70      0.69      3678

Best Score 0.6938339439608625 with {'C': 10, 'kernel': 'rbf'}
	Classification Report of best feature with TF-IDF Vectorizer 

              precision    recall  f1-score   support

           0       0.76      0.67      0.71       247
           1       0.68      0.58      0.62       291
           2       0.65      0.63      0.64       469
           3       0.72      0.81      0.76      1759
           4       0.71      0.61      0.65       912

    accurac

In [46]:
model=SVC()
model.fit(X_train_v,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [49]:
from sklearn import metrics

# There are three classes [0,1,2]
a_true = [1,1,1]
a_pred = [1,1,0]

# Confusion matrix
# [0, 0, 0
#  1, 2, 0
#  0, 0, 0]

TP_0 = 0
FP_0 = 1
Precision_0 = 0

TP_1 = 0
FP_1 = 1
Precision_1 = 1

TP_2 = 0
FP_2 = 0
Precision_2 = 0

macro_precision = (Precision_0 + Precision_1 + Precision_2)/3
print(macro_precision)

#Preferred
micro_precision = (TP_0 + TP_1 + TP_2)/(TP_0 + FP_0 + TP_1 + FP_1 + TP_2 + FP_2)
print(micro_precision)

weight_precision = (Precision_0*0 + Precision_1*3 + Precision_2*0)/(0+3+0)
print(weight_precision)

0.3333333333333333
0.0
1.0


In [50]:
print(metrics.classification_report(a_true, a_pred, labels=[0,1,2]))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.67      0.80         3
           2       0.00      0.00      0.00         0

   micro avg       0.67      0.67      0.67         3
   macro avg       0.33      0.22      0.27         3
weighted avg       1.00      0.67      0.80         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
y_pred = model.predict(X_test_v)
print('Classification Report with Count Vectorizer')
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.54      0.66       247
           1       0.70      0.47      0.56       291
           2       0.67      0.51      0.58       469
           3       0.67      0.88      0.76      1759
           4       0.79      0.55      0.65       912

    accuracy                           0.70      3678
   macro avg       0.73      0.59      0.64      3678
weighted avg       0.71      0.70      0.69      3678



In [52]:
x = [1,2,3,4,5,6,7,8,9,10]  #X_train and test
y = [1,0,0,0,1,1,1,1,1,0]   #y_train and test

# k fold cross validation
K = 5
x_1 = [1, 2]
y_1 = [1, 0]

x_2 = [3, 4]
y_2 = [0, 0]

x_3 = [5, 6]
y_3 = [1, 1]

x_4 = [7, 8]
y_4 = [1, 1]

x_5 = [9, 10]
y_5 = [1, 0]

In [54]:
from sklearn.model_selection import KFold
import numpy as np

a=np.array([1,2,3,4,5,6,7,8,9,10])
b=np.array([1,1,1,1,3,0,0,2,2,3])

k=2
skf=KFold(n_splits=k, shuffle=True,random_state=1)

for train_index, test_index in skf.split(a,b):
    print(a[train_index],b[train_index])
    print(a[test_index],b[test_index])

[2 4 6 8 9] [1 1 0 2 2]
[ 1  3  5  7 10] [1 1 3 0 3]
[ 1  3  5  7 10] [1 1 3 0 3]
[2 4 6 8 9] [1 1 0 2 2]


In [56]:
from sklearn.model_selection import StratifiedKFold

a=np.array([1,2,3,4,5,6,7,8,9,10])
b=np.array([1,1,1,1,3,0,0,2,2,3])

k=2

skf=StratifiedKFold(n_splits=k, shuffle=True,random_state=1)


for train_index, test_index in skf.split(a,b):
    print(a[train_index],b[train_index])
    print(a[test_index],b[test_index])

[ 1  2  7  9 10] [1 1 0 2 3]
[3 4 5 6 8] [1 1 3 0 2]
[3 4 5 6 8] [1 1 3 0 2]
[ 1  2  7  9 10] [1 1 0 2 3]


In [57]:
from sklearn.model_selection import GridSearchCV

grid_param={'kernel':('linear','rbf'),'C':(1,10)}
grid_param

#Makes a combination [linear,1],[linear,10],[rbf,1],[rbf,10]

#[linear,1] --> 5 fold stratified cross validation, average_val_loss_calculate
#[linear,10] --> 5 fold cross validation, average_val_loss_calculate
#[rbf,1] --> 5 fold cross validation, average_val_loss_calculate
#[rbf,10] --> 5 fold cross validation, average_val_loss_calculate


#Define scorer in scoring=None for 

{'kernel': ('linear', 'rbf'), 'C': (1, 10)}

In [58]:
from sklearn.metrics import make_scorer, f1_score

scorer=make_scorer(f1_score,average='micro')
clf=GridSearchCV(SVC(),grid_param,scoring=scorer)

In [59]:
# clf=GridSearchCV(SVC(),grid_param)
clf.fit(X_train_v,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 10), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(f1_score, average=micro), verbose=0)

In [60]:
print(clf.best_score_,clf.best_params_)

0.6938339439608625 {'C': 10, 'kernel': 'rbf'}


In [61]:
#Final Model

model=SVC(random_state=1,kernel='rbf',C=10)
model.fit(X_train_v,y_train)
y_pred=model.predict(X_test_v)
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.76      0.67      0.71       247
           1       0.68      0.58      0.62       291
           2       0.65      0.63      0.64       469
           3       0.72      0.81      0.76      1759
           4       0.71      0.61      0.65       912

    accuracy                           0.71      3678
   macro avg       0.70      0.66      0.68      3678
weighted avg       0.71      0.71      0.70      3678



In [62]:
model=SVC(random_state=1,kernel='linear',C=1)
model.fit(X_train_v,y_train)
y_pred=model.predict(X_test_v)
print(metrics.classification_report(y_test,y_pred))

#Report the f1-score

              precision    recall  f1-score   support

           0       0.77      0.56      0.65       247
           1       0.63      0.47      0.54       291
           2       0.65      0.51      0.57       469
           3       0.68      0.86      0.76      1759
           4       0.75      0.56      0.64       912

    accuracy                           0.69      3678
   macro avg       0.70      0.59      0.63      3678
weighted avg       0.69      0.69      0.68      3678

