In [1]:
import pandas as pd 
import numpy as np 
import re

In [2]:
df=pd.read_csv('amazonreviews.tsv',sep='\t')

In [3]:
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [4]:
df.shape

(10000, 2)

In [5]:
# checking duplicates in the data
df.duplicated().sum()

0

In [6]:
# preprocessing text before converting them into embeddings
from nltk.corpus import stopwords
stopwords=stopwords.words('English')
def clean_text(text):
    text=text.lower()
    text=re.sub(r"[^a-zA-Z0-9\s]","",text)
    text=text.strip()
    
    words=text.split()
    words=[w for w in words if w not in stopwords]
    text=" ".join(words)
    
    return text

In [7]:
df['cleaned_review']=df['review'].apply(clean_text)

In [9]:
df.head()

Unnamed: 0,label,review,cleaned_review
0,pos,Stuning even for the non-gamer: This sound tra...,stuning even nongamer sound track beautiful pa...
1,pos,The best soundtrack ever to anything.: I'm rea...,best soundtrack ever anything im reading lot r...
2,pos,Amazing!: This soundtrack is my favorite music...,amazing soundtrack favorite music time hands i...
3,pos,Excellent Soundtrack: I truly like this soundt...,excellent soundtrack truly like soundtrack enj...
4,pos,"Remember, Pull Your Jaw Off The Floor After He...",remember pull jaw floor hearing youve played g...


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC

In [20]:
df['label']=df['label'].map({'pos':1,'neg':0})

In [22]:
X=df['cleaned_review']

In [23]:
y=df['label']

In [25]:
tfidf=TfidfVectorizer(max_features=5000)

In [26]:
X_transformed=tfidf.fit_transform(X)

In [29]:
X_train,X_test,y_train,y_test=train_test_split(X_transformed,y,test_size=0.2,random_state=42)

In [31]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((8000, 5000), (2000, 5000), (8000,), (2000,))

In [32]:
lr=LogisticRegression()

In [33]:
lr.fit(X_train,y_train)

In [34]:
y_pred=lr.predict(X_test)

In [75]:
from sklearn.metrics import accuracy_score,classification_report

In [40]:
lr_accuracy=round(accuracy_score(y_test,y_pred),2)
print("Accuracy Score:",round(accuracy_score(y_test,y_pred),2))

Accuracy Score: 0.84


In [83]:
report=classification_report(y_test,y_pred,output_dict=True)
pd.DataFrame(report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.853944,0.845709,0.849806,1037.0
1,0.83556,0.844237,0.839876,963.0
accuracy,0.845,0.845,0.845,0.845
macro avg,0.844752,0.844973,0.844841,2000.0
weighted avg,0.845092,0.845,0.845025,2000.0


In [47]:
svc=SVC(kernel='linear')

In [48]:
svc.fit(X_train,y_train)

In [49]:
y_pred1=svc.predict(X_test)

In [51]:
svc_accuracy=round(accuracy_score(y_test,y_pred1),2)
print("Accuracy Score:",round(accuracy_score(y_test,y_pred1),2))

Accuracy Score: 0.85


In [84]:
report=classification_report(y_test,y_pred1,output_dict=True)
pd.DataFrame(report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.855212,0.854388,0.8548,1037.0
1,0.843361,0.844237,0.843799,963.0
accuracy,0.8495,0.8495,0.8495,0.8495
macro avg,0.849287,0.849312,0.849299,2000.0
weighted avg,0.849506,0.8495,0.849503,2000.0


In [55]:
data={'Algorithm':['Logistic Regression','Support Vector Classifier'],
'Accuracy Score':[lr_accuracy,svc_accuracy]}

pd.DataFrame(data).set_index('Algorithm')

Unnamed: 0_level_0,Accuracy Score
Algorithm,Unnamed: 1_level_1
Logistic Regression,0.84
Support Vector Classifier,0.85
