In [None]:
## Perform Sentiment Analysis with spaCy

In [None]:
# Mount Google Drive into the Colab filesystem so that the dataset files
# under /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import spacy



In [None]:
# Load spaCy's "en_core_web_sm" pipeline; `nlp` is what the lemmatizer
# function below calls on each review.
nlp = spacy.load("en_core_web_sm")

In [None]:
cd '/content/drive/MyDrive/dataset/NLP'

/content/drive/MyDrive/dataset/NLP


In [None]:
ls

negative.txt  pos.txt  sentiment.pkl  transform.pkl


In [None]:
# Read the positive reviews, one review per line. The context manager closes
# the file even if reading fails, and keeping open + read in a single cell
# means the cell can be re-run on its own.
with open('/content/drive/MyDrive/dataset/NLP/pos.txt', encoding='utf8') as f:
  # strip() drops the trailing "\n" (and any other surrounding whitespace)
  lines = [line.strip() for line in f]

In [None]:
# lines

In [None]:
import pandas as pd

In [None]:
# One-column frame of positive reviews; the column is labelled 0 until it is
# renamed to 'review' further down.
pos_df = pd.DataFrame(lines)

In [None]:
# Preview the first rows of the raw positive reviews.
pos_df.head()

Unnamed: 0,0
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."
2,effective but too-tepid biopic
3,if you sometimes like to go to the movies to h...
4,"emerges as something rare , an issue movie tha..."


In [None]:
# Read the negative reviews, one review per line. The context manager closes
# the file even if reading fails.
with open('/content/drive/MyDrive/dataset/NLP/negative.txt', encoding='utf8') as n:
  # The loop variable is deliberately NOT called `lines`: that name holds the
  # list of positive reviews and must not be overwritten here.
  nlines = [nline.strip() for nline in n]

In [None]:
# nlines

In [None]:
# One-column frame of negative reviews (column labelled 0), followed by a
# preview of its first rows.
neg_df = pd.DataFrame(nlines)
neg_df.head()

Unnamed: 0,0
0,"simplistic , silly and tedious."
1,"it's so laddish and juvenile , only teenage bo..."
2,exploitative and largely devoid of the depth o...
3,[garbus] discards the potential for pathologic...
4,a visually flashy but narratively opaque and e...


In [None]:
# Rename the default column label 0 to 'review' in both frames.
# Reassigning (instead of inplace=True) avoids mutating frames that earlier
# cells have already displayed and keeps the step chainable.

pos_df = pos_df.rename(columns={0: 'review'})
neg_df = neg_df.rename(columns={0: 'review'})

In [None]:
# Label the classes: 1 for positive reviews, 0 for negative reviews.
pos_df = pos_df.assign(target=1)
neg_df = neg_df.assign(target=0)

In [None]:
# Turn all characters into lowercase.
# The vectorised `.str` accessor replaces a per-row Python lambda and
# propagates missing values instead of raising on them.

pos_df['review'] = pos_df['review'].str.lower()
neg_df['review'] = neg_df['review'].str.lower()

In [None]:
# Create Lemmatization Function

def lemmatizer(token):
  """Return the lemmatized form of a piece of text as a single string.

  Args:
    token: The text to process. Despite the name, callers pass a whole
      review string; it is handed to the module-level `nlp` pipeline.

  Returns:
    The lemmas of the kept tokens joined by single spaces. Punctuation,
    stop words and whitespace-only tokens are dropped.
  """
  doc = nlp(token)
  # Whitespace-only tokens are skipped so that they do not leak stray blanks
  # into the cleaned text.
  return " ".join(
      tok.lemma_
      for tok in doc
      if not (tok.is_punct or tok.is_stop or tok.is_space)
  )

In [None]:
# Run every review through the lemmatizer, overwriting the 'review' column.
pos_df['review'] = pos_df['review'].map(lemmatizer)
neg_df['review'] = neg_df['review'].map(lemmatizer)

In [None]:
# Preview the negative reviews after lemmatization.
neg_df.head()

Unnamed: 0,review,target
0,simplistic silly tedious,0
1,laddish juvenile teenage boy possibly find funny,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0


In [None]:
# Preview the positive reviews after lemmatization.
pos_df.head()

Unnamed: 0,review,target
0,rock destine 21st century new conan splash gre...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective tepid biopic,1
3,like movie fun wasabi good place start,1
4,emerge rare issue movie honest keenly observe ...,1


In [None]:
# Merge positive & negative reviews into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it both halves keep
# their own 0-based labels and the combined index contains duplicates.

review_df = pd.concat([pos_df, neg_df], axis=0, ignore_index=True)

In [None]:
# Display the combined frame (pandas truncates the middle rows).
review_df

Unnamed: 0,review,target
0,rock destine 21st century new conan splash gre...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective tepid biopic,1
3,like movie fun wasabi good place start,1
4,emerge rare issue movie honest keenly observe ...,1
...,...,...
5326,terrible movie people find,0
5327,definition time waster movie surely,0
5328,stand crocodile hunter hurried badly cobbled l...,0
5329,thing look like home video quickie,0


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the reviews for testing.
# random_state makes the split (and every metric computed from it)
# reproducible across runs; stratify keeps the positive/negative ratio the
# same in the training and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    review_df['review'].values,
    review_df['target'].values,
    test_size=0.2,
    random_state=42,
    stratify=review_df['target'].values,
)

In [None]:
# Re-package the split arrays as labelled frames for the following steps.
train_data = pd.DataFrame(dict(review=x_train, target=y_train))
test_data = pd.DataFrame(dict(review=x_test, target=y_test))

In [None]:
# Report (rows, columns) of the training frame and then the test frame.
print(*(frame.shape for frame in (train_data, test_data)))

(8529, 2) (2133, 2)


In [None]:
# Class balance of the training split.
train_data['target'].value_counts()

0    4277
1    4252
Name: target, dtype: int64

In [None]:
# Class balance of the test split.
test_data['target'].value_counts()

1    1079
0    1054
Name: target, dtype: int64

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit the TF-IDF vectorizer on the training reviews only and vectorize them
# in the same step; the test reviews are transformed later with this fitted
# vectorizer.
tfidf = TfidfVectorizer()
train_vector = tfidf.fit_transform(train_data['review'])

In [None]:
# Peek at the first few rows only. Calling .toarray() on the whole matrix
# would materialise every (review, term) cell as a dense float, which is
# needlessly memory-hungry just to display a preview.
train_vector[:5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Dimensions of the training feature matrix.
train_vector.shape

(8529, 13307)

In [None]:
# Vectorize the test reviews with the vectorizer fitted on the training set
# (transform only, the vectorizer is not re-fitted here).
test_vector= tfidf.transform(test_data['review'])

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report

In [None]:
# Train a support-vector classifier (constructed with no arguments) on the
# TF-IDF features of the training reviews.
classifier = svm.SVC()
classifier.fit(X=train_vector, y=train_data['target'])

In [None]:
# Predict on the training data itself, to measure how well the model fits it.
preds = classifier.predict(train_vector)

In [None]:
# Per-class precision / recall / F1 on the training set, returned as a dict.
classification_report(y_true=train_data['target'], y_pred=preds, output_dict=True)

{'0': {'precision': 0.9876629422718808,
  'recall': 0.9920505026888006,
  'f1-score': 0.989851860492243,
  'support': 4277},
 '1': {'precision': 0.9919678714859438,
  'recall': 0.9875352775164629,
  'f1-score': 0.9897466116676488,
  'support': 4252},
 'accuracy': 0.9897995075624341,
 'macro avg': {'precision': 0.9898154068789122,
  'recall': 0.9897928901026317,
  'f1-score': 0.9897992360799459,
  'support': 8529},
 'weighted avg': {'precision': 0.9898090976263415,
  'recall': 0.9897995075624341,
  'f1-score': 0.9897993903313596,
  'support': 8529}}

In [None]:
# Score the model on the held-out test reviews and report per-class
# precision / recall / F1 as a dict.
testpreds = classifier.predict(test_vector)
classification_report(y_true=test_data['target'], y_pred=testpreds, output_dict=True)

{'0': {'precision': 0.7533460803059273,
  'recall': 0.7476280834914611,
  'f1-score': 0.7504761904761905,
  'support': 1054},
 '1': {'precision': 0.7552897884084636,
  'recall': 0.7608897126969416,
  'f1-score': 0.758079409048938,
  'support': 1079},
 'accuracy': 0.7543366150961087,
 'macro avg': {'precision': 0.7543179343571955,
  'recall': 0.7542588980942013,
  'f1-score': 0.7542777997625643,
  'support': 2133},
 'weighted avg': {'precision': 0.7543293250516547,
  'recall': 0.7543366150961087,
  'f1-score': 0.7543223568334313,
  'support': 2133}}