In [2]:
import spacy
import numpy as np
import pandas as pd

## Read the Data into dataframes

In [28]:
df_amazon = pd.read_csv(r'D:\Data-Science\\Gen_AI\\NLP\\NLP-Projects\\NLP-projects\\IMDB_AMAZON_YELP_Review_classification\\amazon_cells_labelled.txt',sep='\t',header=None)

In [29]:
df_amazon.rename(columns={0:'Review',1:'Sentiment'},inplace=True)
df_amazon.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [42]:
df_imdb = pd.read_csv(r'D:\Data-Science\Gen_AI\NLP\NLP-Projects\NLP-projects\IMDB_AMAZON_YELP_Review_classification\imdb_labelled.txt',sep='\t',header=None)

In [43]:
df_imdb.rename(columns={0:'Review',1:'Sentiment'},inplace=True)

In [44]:
df_imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [45]:
df_yelp = pd.read_csv(r'D:\Data-Science\\Gen_AI\\NLP\\NLP-Projects\\NLP-projects\\IMDB_AMAZON_YELP_Review_classification\\yelp_labelled.txt',sep='\t',header=None)
df_yelp.rename(columns={0:'Review',1:'Sentiment'},inplace=True)
df_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### merge all the dataframes into 1

In [46]:
print(f"rows in amazon = {len(df_amazon)}")
print(f"rows in imdb = {len(df_imdb)}")
print(f"rows in yelp = {len(df_yelp)}")

rows in amazon = 1000
rows in imdb = 748
rows in yelp = 1000


In [50]:
df=pd.concat([df_amazon,df_imdb],axis=0,ignore_index=True)
df=pd.concat([df,df_yelp],axis=0,ignore_index=True)
len(df)

2748

## EDA

### Check for NULL values

In [52]:
df.isna().sum()

Review       0
Sentiment    0
dtype: int64

### Check for Data imbalance

In [53]:
df['Sentiment'].value_counts()

Sentiment
1    1386
0    1362
Name: count, dtype: int64

Almost balanced dataset

## Data Cleaning

### 1. removal of punctuation marks

In [66]:
nlp=spacy.load('en_core_web_sm')

In [101]:
import string
punct = string.punctuation

In [110]:
def data_clean(sentence):
    sentence=nlp(sentence.lower())
    psentence=[]
    
    psentence = [i for i in sentence if i.is_stop==False ]
    psentence = [i for i in psentence if i.text not in punct]
    psentence = [ i.lemma_ for i in psentence]
    #psentence = " ".join(psentence)
    return psentence

In [115]:
print (data_clean("Hello all it's a beautiful day outside there!!"))

['hello', 'beautiful', 'day', 'outside']


## TFIDF Vectorizer to create vectors of the sentence

In [116]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [119]:
tfidf = TfidfVectorizer(tokenizer=data_clean)
model=LinearSVC()

## Train the model

### Split data into train and test

In [121]:
from sklearn.model_selection import train_test_split

In [140]:
X_train,X_test,y_train,y_test = train_test_split(df['Review'],df['Sentiment'],test_size=0.2)

In [141]:
clf=Pipeline([("Vectorizer",tfidf),("modeling",model)])

### Train the model with pipeline

In [142]:
clf.fit(X_train,y_train)



### Test the model on test dataset

In [143]:
y_pred = clf.predict(X_test)

### Evaluate the model performance

In [144]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [145]:
print(confusion_matrix(y_test,y_pred))

[[202  64]
 [ 54 230]]


In [146]:
print(accuracy_score(y_pred,y_test))

0.7854545454545454


In [147]:
print (classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.76      0.77       266
           1       0.78      0.81      0.80       284

    accuracy                           0.79       550
   macro avg       0.79      0.78      0.78       550
weighted avg       0.79      0.79      0.79       550



In [148]:
clf.predict(['Wow, I am learning Natural Language Processing in a fun faishon !!!'])

array([1], dtype=int64)

In [149]:
clf.predict(["it's hard to learn new things"])

array([0], dtype=int64)