In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


In [None]:
# Loading the dataset from online
!wget https://raw.githubusercontent.com/lee1613/Sentiment-Analysis-NLP-/main/dataset.csv

--2023-04-01 15:56:47--  https://raw.githubusercontent.com/lee1613/Sentiment-Analysis-NLP-/main/dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1273144 (1.2M) [text/plain]
Saving to: ‘dataset.csv’


2023-04-01 15:56:47 (18.2 MB/s) - ‘dataset.csv’ saved [1273144/1273144]



In [None]:
# Load the headerless CSV and give its two positional columns descriptive names.
# The rename is chained onto the read so `df` is built in a single expression
# rather than being mutated in place afterwards.
df = pd.read_csv("dataset.csv", header=None).rename(
    columns={0: "Label", 1: "Sentiment"}
)
print(df.shape)
df.head(10)


(10662, 2)


Unnamed: 0,Label,Sentiment
0,1,the rock is destined to be the 21st century's ...
1,1,"the gorgeously elaborate continuation of "" the..."
2,1,effective but too-tepid biopic
3,1,if you sometimes like to go to the movies to h...
4,1,"emerges as something rare , an issue movie tha..."
5,1,the film provides some great insight into the ...
6,1,offers that rare combination of entertainment ...
7,1,perhaps no picture ever made has more literall...
8,1,steers turns in a snappy screenplay that curls...
9,1,take care of my cat offers a refreshingly diff...


In [None]:
# Build TF-IDF features from the review text.
# The hyperparameters were tuned by trial and error:
#   min_df=4   -> ignore terms that appear in fewer than 4 documents
#   max_df=0.8 -> ignore terms that appear in more than 80% of the documents
#   sublinear_tf=True -> damp the weight of frequently repeated words by replacing tf with 1 + log(tf)
# Author's note (not re-verified here): the polynomial-kernel SVM scored roughly 78.2 versus 78.9
# depending on whether sublinear_tf was enabled.
# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test split, so the
# vocabulary and IDF weights also see the test reviews - consider fitting on the training rows only.
vectorizer = TfidfVectorizer(min_df=4, max_df=0.8, sublinear_tf=True)
# fit_transform() learns the vocabulary and returns the document-term matrix in one pass,
# so no separate transform() call over the same corpus is needed.
vector = vectorizer.fit_transform(df.loc[:, "Sentiment"])


In [None]:
# Show the TF-IDF vector of one review next to its raw text.
# Only row 2 is converted to a dense array; calling vector.toarray() first would
# materialise the entire document-term matrix in memory just to print a single row.
print(vector[2].toarray()[0])
print(df.loc[2, "Sentiment"])

[0. 0. 0. ... 0. 0. 0.]
effective but too-tepid biopic


In [None]:
# Sanity-check the feature matrix: every review is represented by one row whose
# length equals the size of the learned vocabulary.
vocabulary_size = len(vectorizer.vocabulary_)
print(vocabulary_size)
print(vector.shape)

# Look at a single review (row 1): still a sparse 1 x vocabulary_size matrix ...
sample_row = vector[1]
print(sample_row.shape)

# ... and its dense 1-D form, whose length matches the vocabulary size.
sample_dense = sample_row.toarray()[0]
print(sample_dense)
print(len(sample_dense))


5555
(10662, 5555)
(1, 5555)
[0. 0. 0. ... 0. 0. 0.]
5555


In [None]:
# Split the TF-IDF matrix and labels into 80% training rows and 20% test rows.
SEED = 0
# Kept so the global NumPy RNG is still seeded for any later code that relies on it.
np.random.seed(SEED)
# Passing random_state makes the shuffle reproducible on its own, instead of depending on
# whatever state the global NumPy RNG happens to be in when this line runs.
train_x, test_x, train_y, test_y = train_test_split(
    vector, df.loc[:, "Label"], train_size=0.8, random_state=SEED
)
print((train_x.shape, test_x.shape, train_y.shape, test_y.shape))

((8529, 5555), (2133, 5555), (8529,), (2133,))


### Each of the following cells trains an SVM model with a different kernel, evaluates it on the test split, and records the training and prediction times. The kernels used are:
## **1. Linear**

## **2. Polynomial**

## **3. Sigmoid**

## **4. Radial Basis Function**

In [None]:
# Linear-kernel SVM: fit on the training split, predict the test split,
# then print the metrics together with the training and prediction times.
classifier_linear = svm.SVC(kernel="linear")

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() follows the wall clock, which can be adjusted while the cell runs.
t0 = time.perf_counter()
# Model fitting
classifier_linear.fit(train_x, train_y)
t1 = time.perf_counter()
# Predict the held-out data with the fitted model
prediction_linear = classifier_linear.predict(test_x)
t2 = time.perf_counter()

print(prediction_linear)
# output_dict=True returns the per-class and averaged metrics as a nested dict instead of a text table.
report = classification_report(test_y, prediction_linear, output_dict=True)
# Rows are the true labels and columns are the predicted labels.
confusion_matrix_result = confusion_matrix(test_y, prediction_linear)
print(report)
print(confusion_matrix_result)
# t1 - t0 is the time spent in fit(); t2 - t1 is the time spent in predict().
print("Training time: " + str(t1 - t0))
print("Prediction time: " + str(t2 - t1))

[0 0 1 ... 1 0 1]
{'0': {'precision': 0.7915904936014625, 'recall': 0.7794779477947795, 'f1-score': 0.7854875283446712, 'support': 1111}, '1': {'precision': 0.7641963426371511, 'recall': 0.776908023483366, 'f1-score': 0.7704997573993208, 'support': 1022}, 'accuracy': 0.7782466010314112, 'macro avg': {'precision': 0.7778934181193068, 'recall': 0.7781929856390728, 'f1-score': 0.777993642871996, 'support': 2133}, 'weighted avg': {'precision': 0.7784649322861665, 'recall': 0.7782466010314112, 'f1-score': 0.7783063272634952, 'support': 2133}}
[[866 245]
 [228 794]]
Training time: 12.149487257003784
Prediction time: 2.3042943477630615


In [None]:
# Polynomial-kernel SVM (marked by the author as the best SVM model).
# Fit on the training split, predict the test split, then print metrics and timings.
classifier_poly = svm.SVC(kernel="poly")

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() follows the wall clock, which can be adjusted while the cell runs.
t0 = time.perf_counter()
classifier_poly.fit(train_x, train_y)
t1 = time.perf_counter()
prediction_poly = classifier_poly.predict(test_x)
t2 = time.perf_counter()

# output_dict=True returns the per-class and averaged metrics as a nested dict instead of a text table.
report = classification_report(test_y, prediction_poly, output_dict=True)
# Rows are the true labels and columns are the predicted labels.
confusion_matrix_result = confusion_matrix(test_y, prediction_poly)
print(report)
print(confusion_matrix_result)
# t1 - t0 is the time spent in fit(); t2 - t1 is the time spent in predict().
print("Training time: " + str(t1 - t0))
print("Prediction time: " + str(t2 - t1))

{'0': {'precision': 0.8304568527918782, 'recall': 0.7362736273627363, 'f1-score': 0.7805343511450382, 'support': 1111}, '1': {'precision': 0.7447735191637631, 'recall': 0.8365949119373777, 'f1-score': 0.7880184331797235, 'support': 1022}, 'accuracy': 0.784341303328645, 'macro avg': {'precision': 0.7876151859778207, 'recall': 0.786434269650057, 'f1-score': 0.7842763921623809, 'support': 2133}, 'weighted avg': {'precision': 0.7894027660746098, 'recall': 0.784341303328645, 'f1-score': 0.7841202544921776, 'support': 2133}}
[[818 293]
 [167 855]]
Training time: 28.10062003135681
Prediction time: 3.0129761695861816


In [None]:
# Sigmoid-kernel SVM: fit on the training split, predict the test split,
# then print the metrics together with the training and prediction times.
classifier_sigmoid = svm.SVC(kernel="sigmoid")

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() follows the wall clock, which can be adjusted while the cell runs.
t0 = time.perf_counter()
classifier_sigmoid.fit(train_x, train_y)
t1 = time.perf_counter()
prediction_sigmoid = classifier_sigmoid.predict(test_x)
t2 = time.perf_counter()

# output_dict=True returns the per-class and averaged metrics as a nested dict instead of a text table.
report = classification_report(test_y, prediction_sigmoid, output_dict=True)
# Rows are the true labels and columns are the predicted labels.
confusion_matrix_result = confusion_matrix(test_y, prediction_sigmoid)
print(report)
print(confusion_matrix_result)
# t1 - t0 is the time spent in fit(); t2 - t1 is the time spent in predict().
print("Training time: " + str(t1 - t0))
print("Prediction time: " + str(t2 - t1))

{'0': {'precision': 0.7946837763519706, 'recall': 0.7803780378037803, 'f1-score': 0.7874659400544958, 'support': 1111}, '1': {'precision': 0.7658349328214972, 'recall': 0.7808219178082192, 'f1-score': 0.7732558139534884, 'support': 1022}, 'accuracy': 0.7805907172995781, 'macro avg': {'precision': 0.7802593545867339, 'recall': 0.7805999778059998, 'f1-score': 0.780360877003992, 'support': 2133}, 'weighted avg': {'precision': 0.7808612174733285, 'recall': 0.7805907172995781, 'f1-score': 0.7806573376751101, 'support': 2133}}
[[867 244]
 [224 798]]
Training time: 11.578099012374878
Prediction time: 2.0863845348358154


In [None]:
# RBF-kernel SVM: fit on the training split, predict the test split,
# then print the metrics together with the training and prediction times.
# The model and its predictions get their own *_rbf names so they do not overwrite
# the linear-kernel classifier and predictions created in the earlier cell.
classifier_rbf = svm.SVC(kernel="rbf")

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() follows the wall clock, which can be adjusted while the cell runs.
t0 = time.perf_counter()
classifier_rbf.fit(train_x, train_y)
t1 = time.perf_counter()
prediction_rbf = classifier_rbf.predict(test_x)
t2 = time.perf_counter()

# output_dict=True returns the per-class and averaged metrics as a nested dict instead of a text table.
report = classification_report(test_y, prediction_rbf, output_dict=True)
# Rows are the true labels and columns are the predicted labels.
confusion_matrix_result = confusion_matrix(test_y, prediction_rbf)
print(report)
print(confusion_matrix_result)
# t1 - t0 is the time spent in fit(); t2 - t1 is the time spent in predict().
print("Training time: " + str(t1 - t0))
print("Prediction time: " + str(t2 - t1))

{'0': {'precision': 0.7983501374885427, 'recall': 0.783978397839784, 'f1-score': 0.7910990009082653, 'support': 1111}, '1': {'precision': 0.7696737044145874, 'recall': 0.7847358121330724, 'f1-score': 0.7771317829457365, 'support': 1022}, 'accuracy': 0.784341303328645, 'macro avg': {'precision': 0.784011920951565, 'recall': 0.7843571049864282, 'f1-score': 0.784115391927001, 'support': 2133}, 'weighted avg': {'precision': 0.7846101869017718, 'recall': 0.784341303328645, 'f1-score': 0.7844067848943391, 'support': 2133}}
[[871 240]
 [220 802]]
Training time: 15.199859380722046
Prediction time: 2.957329511642456
