# 1. Importing the datasets

### Importing libraries

In [1]:
# importing pandas as pd
import pandas as pd

### Reading the dataset

In [None]:
# reading the dataset
df = pd.read_excel('/content/test.xlsx')

### Operations on the dataset

In [None]:
# updating the column names
col_name = ['Reviews','Sentiment']
df.columns = col_name

In [None]:
df.head()

Unnamed: 0,Reviews,Sentiment
0,Who would have thought that a movie about a ma...,pos
1,After realizing what is going on around us ......,pos
2,I grew up watching the original Disney Cindere...,neg
3,David Mamet wrote the screenplay and made his ...,pos
4,"Admittedly, I didn't have high expectations of...",neg


before we feed any data to our model it must be converted into numeric form. Sentiment data field is not in numeric form 
hence we will use label encoder to convert it into numeric data.

In [None]:
import sklearn as sk
from sklearn import preprocessing

In [None]:
le = preprocessing.LabelEncoder()           # Instance of Label Encoder Created & is stored in le

In [None]:
df['Sentiment'] = le.fit_transform(df['Sentiment'])

In [None]:
df.head()

Unnamed: 0,Reviews,Sentiment
0,Who would have thought that a movie about a ma...,1
1,After realizing what is going on around us ......,1
2,I grew up watching the original Disney Cindere...,0
3,David Mamet wrote the screenplay and made his ...,1
4,"Admittedly, I didn't have high expectations of...",0


In [None]:
df.shape
# 25000 rows, 2 columns

(25000, 2)

In [None]:
# check for null values
df.isnull().sum()

Reviews      0
Sentiment    0
dtype: int64

In [None]:
# no null values in the data

In [None]:
# distribution of sentiments
df['Sentiment'].value_counts()
# 1 : pos  ;   0 : neg

1    12500
0    12500
Name: Sentiment, dtype: int64

# 2. Data Cleaning

### Installing and Importing required packages

In [None]:
# installing the package : preprocess_kgptalkie
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-q0l30ivx
  Running command git clone -q https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-q0l30ivx
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-py3-none-any.whl size=11754 sha256=b5880aa0d29c8244be4b2139d9d305312da6324ebcd994ae890aba2e14d0eedb
  Stored in directory: /tmp/pip-ephem-wheel-cache-oqlvosvr/wheels/0d/b3/29/bfe3deffda68980088d17b81331be6667e837ffb4a071bae82
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptalkie
Successfully installed preprocess-kgptalkie-0.1.3


In [None]:
# importing preprocess_kgptalkie
import preprocess_kgptalkie as ps

In [None]:
# importing regular expression (re)
import re

### Create a function to clean the data

In [None]:
# here we will remove urls, html_tags, punctuations etc
def get_clean(x):
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = ps.cont_exp(x)                                  # I'm --> i am
    x = ps.remove_emails(x)         
    x = ps.remove_urls(x)
    x = ps.remove_html_tags(x)
    x = ps.remove_accented_chars(x)
    x = ps.remove_special_chars(x)                      
    x = re.sub("(.)\\1{2,}", "\\1", x)                  # aweesooomeee --> awesome
 #   x = ps.spelling_correction(x).raw_sentences[0]      # godo --> good
    return x

### Cleaning the dataset

In [None]:
# calling the get_clean() function
df['Reviews'] = df['Reviews'].apply(lambda x : get_clean(x))

In [None]:
df.head()

Unnamed: 0,Reviews,Sentiment
0,who would have thought that a movie about a ma...,1
1,after realizing what is going on around us in ...,1
2,i grew up watching the original disney cindere...,0
3,david mamet wrote the screenplay and made his ...,1
4,admittedly i did not have high expectations of...,0


In [None]:
# example of get clean function
get_clean("hi, I'm manoj.<br> this movie is aweeesooooomeeee! contact us : 1ms@gmail.com , my website is : https://www.google.com")

'hi i am manoj this movie is awesome contact us my website is'

# 3. Vectorization Feature Engineering (TF-IDF)

In [None]:
# importing TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### vectorizing

In [None]:
tfidf = TfidfVectorizer(max_features = 10000)

# 4. Train the model

### Splitting the dataset into the Train and Test set

In [None]:
# importing train_test_split
from sklearn.model_selection import train_test_split

### shaping

In [None]:
# segregation of columns
x = df['Reviews']
y = df['Sentiment']

In [None]:
x = tfidf.fit_transform(x)

In [None]:
# training
x_test, x_train, y_test, y_train = train_test_split(x, y, test_size = 0.3, random_state = 0)
# we divide our dataset into 2 parts training data and testing data of x and y
# test size --> manually splits the data, here 30% of data used as test_data and 70% as train_data
# random_state --> to get same split every time

In [None]:
print(x_train.shape)
print(x_test.shape)

(7500, 10000)
(17500, 10000)


In [None]:
# splitting of data be like 5000 samples in training dataset(20%) and 20000 in test dataset(80%)

### SVM Model

In [None]:
# importing LinearSVC
from sklearn.svm import LinearSVC

fit to the data you provide, returning a "best fit" hyperplane that divides, or categorizes, your data.

In [None]:
# SVM model
clf = LinearSVC()

### Fit the x_train and y_train

In [None]:
clf.fit(x_train, y_train)
# train our model using given dataset

LinearSVC()

# 5. Predicting the test results

### Predict

In [None]:
# prediction
y_pred = clf.predict(x_test)

### classsification report

In [None]:
# importing classification_report
from sklearn.metrics import classification_report 

In [None]:
# classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      8737
           1       0.86      0.88      0.87      8763

    accuracy                           0.87     17500
   macro avg       0.87      0.87      0.87     17500
weighted avg       0.87      0.87      0.87     17500



In [None]:
# we are getting almost 87% accuracy

### confusion matrix

In [None]:
# importing confusion_matrix
from sklearn.metrics import confusion_matrix

In [None]:
# confusion matrix
confusion_matrix(y_test, y_pred)

array([[7531, 1206],
       [1063, 7700]])

### accuracy score

In [None]:
# importing accuracy score
from sklearn.metrics import accuracy_score

In [None]:
# accuracy score
accuracy_score(y_test, y_pred)

0.8703428571428572

# 6. Testing some examples


### prediction

In [None]:
x = 'not a good movie.'
x = get_clean(x)
vec = tfidf.transform([x])

In [None]:
clf.predict(vec)

array([0])

### vectorizer

In [None]:
tfidf = TfidfVectorizer()
doc1 = "manoj provide trainings to working professionals"
doc2 = "manoj provide trainings to students"

In [None]:
x = tfidf.fit_transform([doc1,doc2])

In [None]:
len(tfidf.vocabulary_)

7

In [None]:
tfidf.vocabulary_

{'manoj': 0,
 'professionals': 1,
 'provide': 2,
 'students': 3,
 'to': 4,
 'trainings': 5,
 'working': 6}

In [None]:
print(x)

  (0, 1)	0.49844627974580596
  (0, 6)	0.49844627974580596
  (0, 4)	0.35464863330313684
  (0, 5)	0.35464863330313684
  (0, 2)	0.35464863330313684
  (0, 0)	0.35464863330313684
  (1, 3)	0.5749618667993135
  (1, 4)	0.40909010368335985
  (1, 5)	0.40909010368335985
  (1, 2)	0.40909010368335985
  (1, 0)	0.40909010368335985
