In [1]:
import sklearn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
# from google.colab import drive 
# drive.mount('/content/gdrive')

# df=pd.read_csv('sentiment_5_class.csv')
# Load the pre-split 5-class sentiment data; both CSVs are expected in the
# notebook's working directory.
train_df = pd.read_csv('sentiment_5_class_train.csv')
test_df = pd.read_csv('sentiment_5_class_test.csv')

In [4]:
# Show (rows, columns) for the training frame first, then the test frame.
for frame in (train_df, test_df):
    print(frame.shape)

(14711, 2)
(3678, 2)


## Train Test Dataset

In [5]:
# Inputs: the raw phrases, as plain Python lists.
X_train = train_df['Phrase'].tolist()
X_test = test_df['Phrase'].tolist()
print('No of instances',len(X_train), len(X_test))

# Targets: the sentiment labels, as plain Python lists.
y_train = train_df['Sentiment'].tolist()
y_test = test_df['Sentiment'].tolist()

# Distinct labels and how often each occurs, to compare the class balance
# of the two splits.
values_train, counts_train = np.unique(y_train, return_counts=True)
values_test, counts_test = np.unique(y_test, return_counts=True)

print(values_train, values_test)
print(counts_train, counts_test)

No of instances 14711 3678
[0 1 2 3 4] [0 1 2 3 4]
[ 988 1165 1876 7033 3649] [ 247  291  469 1759  912]


In [7]:
# from sklearn.model_selection import train_test_split
# X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=1,stratify=y)

In [8]:
# print(len(X_train))
# print(len(X_test))
# print(y_train)
# print(y_test)

## Feature Extraction

**Binary Feature Vectorization<br>
Count Feature Vectorization<br>
TF-IDF Feature Vectorization**

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Example:  
By default, the token pattern is `(?u)\b\w\w+\b` (note the `+`), so single-character tokens such as "I" are ignored. Using `\w\w*` instead keeps them.
              

### Text is converted into vectors only by `transform`
`fit` learns the vocabulary, here ['am', 'happy', 'i', 'sad', 'so'];
`transform` then returns the frequency of each vocabulary word in every document.

In [10]:
# Toy corpus used to illustrate how the vectorizers build a vocabulary.
data = ['Happy, happy','I am so happy','Sad sad', 'I am so sad']

# '\w\w*' (instead of the default '\w\w+') accepts one-character tokens,
# so the word "i" is kept in the vocabulary.
c_vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w\\w*\\b')
t_vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w*\\b')

# Feature names are read from vocabulary_ (term -> column index) and listed
# in column order. get_feature_names() was removed in scikit-learn 1.2,
# while vocabulary_ is available in every release.
c_vectorizer.fit(data)
print(sorted(c_vectorizer.vocabulary_, key=c_vectorizer.vocabulary_.get))

t_vectorizer.fit(data)
print(sorted(t_vectorizer.vocabulary_, key=t_vectorizer.vocabulary_.get))

print(c_vectorizer)

# fit() only learns the vocabulary; transform() produces the vectors.
c_data_v=c_vectorizer.transform(data)  #Vector conversion
c_data_v.toarray()

['am', 'happy', 'i', 'sad', 'so']
['am', 'happy', 'i', 'sad', 'so']
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w*\\b',
                tokenizer=None, vocabulary=None)


array([[0, 2, 0, 0, 0],
       [1, 1, 1, 0, 1],
       [0, 0, 0, 2, 0],
       [1, 0, 1, 1, 1]])

In [11]:
# Default token_pattern ('(?u)\\b\\w\\w+\\b') needs at least two characters,
# so the single-letter word "i" is dropped from the vocabulary this time.
t_vectorizer = TfidfVectorizer()
t_vectorizer.fit(data)

# Feature names in column order, read from vocabulary_ because
# get_feature_names() was removed in scikit-learn 1.2.
print(sorted(t_vectorizer.vocabulary_, key=t_vectorizer.vocabulary_.get))

print(t_vectorizer)

# One row per document, one TF-IDF weight per vocabulary term.
t_data_v=t_vectorizer.transform(data)
print(t_data_v.toarray())

['am', 'happy', 'sad', 'so']
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)
[[0.         1.         0.         0.        ]
 [0.57735027 0.57735027 0.         0.57735027]
 [0.         0.         1.         0.        ]
 [0.57735027 0.         0.57735027 0.57735027]]


### Count Vectorizer VS TF-IDF Vectorizer

In [12]:
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training data ONLY: fitting on the test phrases as well would
# leak test-set information into the learned features.
count_vectorizer.fit(X_train)
tfidf_vectorizer.fit(X_train)

# Vocabulary size = number of feature columns. vocabulary_ holds one entry per
# learned term and, unlike get_feature_names() (removed in scikit-learn 1.2),
# is available in every release.
print(len(count_vectorizer.vocabulary_))
print(len(tfidf_vectorizer.vocabulary_))

7115
7115


In [13]:
# COUNT VECTORIZER
X_train_count_v=count_vectorizer.transform(X_train)
X_test_count_v= count_vectorizer.transform(X_test)

#TF-IDF VECTORIZER
X_train_tfidf_v=tfidf_vectorizer.transform(X_train)
X_test_tfidf_v=tfidf_vectorizer.transform(X_test)

print(X_train_count_v.shape, X_test_count_v.shape)

(14711, 7115) (3678, 7115)


## Building Model

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn import metrics


# Hyper-parameter grid for the SVM search: two kernels crossed with two
# values of the regularisation strength C.
# Optional extra axis, currently disabled: 'gamma': [1, 0.1, 0.01, 0.001]
grid_param = dict(kernel=('linear', 'rbf'), C=(10, 20))

In [15]:
# Number of completed SVM_Model calls. Only used to choose a default report
# label when the caller does not pass `vectorizer_name`.
i=0
def SVM_Model(X_train,X_test,y_train,y_test,grid,vectorizer_name=None):
    """Train and report an SVM twice: with default settings and after a grid search.

    Steps:
      1. Fit ``SVC()`` with default hyper-parameters and print its
         classification report on the test split.
      2. Run ``GridSearchCV`` over ``grid`` (scored with micro-averaged F1) on
         the training split and print the best score and parameters.
      3. Print the test-split classification report of the best estimator,
         which ``GridSearchCV`` has already refitted on the full training split.

    Parameters
    ----------
    X_train, X_test : feature matrices accepted by ``SVC.fit`` / ``SVC.predict``.
    y_train, y_test : class labels for the two splits.
    grid : parameter grid passed to ``GridSearchCV`` (e.g. ``grid_param``).
    vectorizer_name : str, optional
        Label used in the printed headings. When omitted, the label is taken
        from the module-level call counter ``i``: the first call is reported
        as 'Count Vectorizer', the second as 'TF-IDF Vectorizer', and any
        later call as 'Vectorizer #N'.

    Returns
    -------
    None. Results are printed.
    """
    global i
    vectors=['Count Vectorizer','TF-IDF Vectorizer']
    if vectorizer_name is None:
        # Fall back to call order, but never index past the known labels:
        # a third call or a re-run of a cell would otherwise raise IndexError.
        if i < len(vectors):
            vectorizer_name = vectors[i]
        else:
            vectorizer_name = 'Vectorizer #{}'.format(i+1)

    # Baseline: SVC with default hyper-parameters.
    model=SVC()
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print('Classification Report with {} \n '.format(vectorizer_name))
    print(metrics.classification_report(y_test, y_pred))

    # Search the grid that was passed in (not the global `grid_param`).
    # random_state=1 is carried over from the original best-model constructor.
    scorer=make_scorer(f1_score,average='micro')
    clf=GridSearchCV(SVC(random_state=1),grid,scoring=scorer)
    clf.fit(X_train,y_train)
    print('Best Score',clf.best_score_,'with',clf.best_params_)

    # With the default refit=True, best_estimator_ is already trained on the
    # whole training split with *all* best parameters, so no extra fit is
    # needed and no tuned parameter is dropped.
    y_pred = clf.best_estimator_.predict(X_test)
#     print('AUC ROC Score', roc_auc_score(y_pred, y_test))
    print('\tClassification Report of best feature with {} \n'.format(vectorizer_name))
    print(metrics.classification_report(y_test,y_pred))
    i=i+1

In [16]:
# Evaluate the SVM on the bag-of-words (count) features.
SVM_Model(
    X_train=X_train_count_v,
    X_test=X_test_count_v,
    y_train=y_train,
    y_test=y_test,
    grid=grid_param,
)

Classification Report with Count Vectorizer 
 
              precision    recall  f1-score   support

           0       0.79      0.45      0.57       247
           1       0.62      0.30      0.40       291
           2       0.78      0.20      0.32       469
           3       0.60      0.91      0.73      1759
           4       0.76      0.52      0.62       912

    accuracy                           0.64      3678
   macro avg       0.71      0.48      0.53      3678
weighted avg       0.68      0.64      0.61      3678

Best Score 0.6987284810677747 with {'C': 10, 'kernel': 'rbf'}
	Classification Report of best feature with Count Vectorizer 

              precision    recall  f1-score   support

           0       0.75      0.66      0.70       247
           1       0.63      0.56      0.59       291
           2       0.70      0.72      0.71       469
           3       0.74      0.80      0.77      1759
           4       0.71      0.64      0.67       912

    accuracy 

In [17]:
# Evaluate the SVM on the TF-IDF features.
SVM_Model(
    X_train=X_train_tfidf_v,
    X_test=X_test_tfidf_v,
    y_train=y_train,
    y_test=y_test,
    grid=grid_param,
)

Classification Report with TF-IDF Vectorizer 
 
              precision    recall  f1-score   support

           0       0.81      0.55      0.65       247
           1       0.64      0.44      0.52       291
           2       0.74      0.52      0.61       469
           3       0.68      0.89      0.77      1759
           4       0.78      0.58      0.66       912

    accuracy                           0.71      3678
   macro avg       0.73      0.60      0.64      3678
weighted avg       0.72      0.71      0.70      3678

Best Score 0.6977766782555387 with {'C': 10, 'kernel': 'rbf'}
	Classification Report of best feature with TF-IDF Vectorizer 

              precision    recall  f1-score   support

           0       0.74      0.67      0.70       247
           1       0.64      0.54      0.58       291
           2       0.71      0.63      0.67       469
           3       0.73      0.80      0.76      1759
           4       0.69      0.65      0.67       912

    accurac

In [39]:
# import seaborn as sns

In [40]:
# model=SVC(kernel='rbf',C=10,random_state=1)
# model.fit(X_train_count_v,y_train)
# y_pred = model.predict(X_test_count_v)

In [41]:
# vectors=['Count Vectorizer','TF-IDF Vectorizer']
# for i in range(2):
#     plt.figure(figsize=(6,6))
#     clf_report = metrics.classification_report(y_test,y_pred,output_dict=True)
#     sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)
#     plt.title('Classification Report of best feature of SVM with {} \n '.format(vectors[i]))

#     plt.xlabel('Evaluation Metrics')
#     plt.ylabel('Classes')

In [None]:
# model=SVC()
# model.fit(X_train_v,y_train)

In [None]:
# from sklearn import metrics

# # There are three classes [0,1,2]
# a_true = [1,1,1]
# a_pred = [1,1,0]

# # Confusion matrix
# # [0, 0, 0
# #  1, 2, 0
# #  0, 0, 0]

# TP_0 = 0
# FP_0 = 1
# Precision_0 = 0

# TP_1 = 0
# FP_1 = 1
# Precision_1 = 1

# TP_2 = 0
# FP_2 = 0
# Precision_2 = 0

# macro_precision = (Precision_0 + Precision_1 + Precision_2)/3
# print(macro_precision)

# #Preferred
# micro_precision = (TP_0 + TP_1 + TP_2)/(TP_0 + FP_0 + TP_1 + FP_1 + TP_2 + FP_2)
# print(micro_precision)

# weight_precision = (Precision_0*0 + Precision_1*3 + Precision_2*0)/(0+3+0)
# print(weight_precision)

In [None]:
# print(metrics.classification_report(a_true, a_pred, labels=[0,1,2]))

In [None]:
# y_pred = model.predict(X_test_v)
# print('Classification Report with Count Vectorizer')
# print(metrics.classification_report(y_test, y_pred))

In [None]:
# x = [1,2,3,4,5,6,7,8,9,10]  #X_train and test
# y = [1,0,0,0,1,1,1,1,1,0]   #y_train and test

# # k fold cross validation
# K = 5
# x_1 = [1, 2]
# y_1 = [1, 0]

# x_2 = [3, 4]
# y_2 = [0, 0]

# x_3 = [5, 6]
# y_3 = [1, 1]

# x_4 = [7, 8]
# y_4 = [1, 1]

# x_5 = [9, 10]
# y_5 = [1, 0]

In [None]:
# from sklearn.model_selection import KFold
# import numpy as np

# a=np.array([1,2,3,4,5,6,7,8,9,10])
# b=np.array([1,1,1,1,3,0,0,2,2,3])

# k=2
# skf=KFold(n_splits=k, shuffle=True,random_state=1)

# for train_index, test_index in skf.split(a,b):
#     print(a[train_index],b[train_index])
#     print(a[test_index],b[test_index])

In [None]:
# from sklearn.model_selection import StratifiedKFold

# a=np.array([1,2,3,4,5,6,7,8,9,10])
# b=np.array([1,1,1,1,3,0,0,2,2,3])

# k=2

# skf=StratifiedKFold(n_splits=k, shuffle=True,random_state=1)


# for train_index, test_index in skf.split(a,b):
#     print(a[train_index],b[train_index])
#     print(a[test_index],b[test_index])