# Movie Review System

Configuring kaggle API

In [1]:
!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

api_token = {"username":"janvichoudhary","key":""}

import json

with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle competitions download -c word2vec-nlp-tutorial

Downloading word2vec-nlp-tutorial.zip to /content
 64% 33.0M/51.7M [00:00<00:00, 51.4MB/s]
100% 51.7M/51.7M [00:00<00:00, 68.7MB/s]


In [4]:
!unzip 'word2vec-nlp-tutorial.zip' -d dataset

Archive:  word2vec-nlp-tutorial.zip
  inflating: dataset/labeledTrainData.tsv.zip  
  inflating: dataset/sampleSubmission.csv  
  inflating: dataset/testData.tsv.zip  
  inflating: dataset/unlabeledTrainData.tsv.zip  


Downloading libraries

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

Importing the data

In [6]:
train = pd.read_csv('/content/dataset/labeledTrainData.tsv.zip', sep='\t')
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [7]:
test = pd.read_csv('/content/dataset/testData.tsv.zip', sep='\t')
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


Exploring the data

In [8]:
train.shape

(25000, 3)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [30]:
train.select_dtypes('object').describe()

Unnamed: 0,review
count,25000
unique,24904
top,"When i got this movie free from my job, along ..."
freq,3


Checking NULL and Duplicated values

In [11]:
train.isnull().sum().sum()

0

In [12]:
train.duplicated().sum()

0

Dropping id column

In [13]:
train.drop('id', axis=1, inplace=True)

In [14]:
submission = pd.DataFrame({'id': test['id']})

In [15]:
test.drop('id', axis=1, inplace=True)

Splitting the training data

In [16]:
X = train['review'].copy()
y = train['sentiment'].copy()

In [17]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_prepared = tfidf_vectorizer.fit_transform(X)
test_prepared = tfidf_vectorizer.fit_transform(test['review'])

In [18]:
X_prepared.shape

(25000, 74538)

In [19]:
test_prepared.shape

(25000, 73511)

In [20]:
X_prepared = X_prepared[:, :test_prepared.shape[1]]

Logistic Regression


In [21]:
model = LogisticRegression()
model.fit(X_prepared, y)
y_predict = model.predict(test_prepared)

In [22]:
model.score(X_prepared, y)

0.937

Printing the F1 Score and Confusion matrix

In [29]:
from sklearn.metrics import f1_score, confusion_matrix
print('F1-score: {0}'.format(f1_score(y_predict, y)))
print('Confusion matrix:')
confusion_matrix(y_predict, y)

F1-score: 0.5116441275622893
Confusion matrix:


array([[5795, 5898],
       [6705, 6602]])

In [23]:
submission['sentiment'] = y_predict

In [24]:
submission.to_csv('submission.csv', index=False)

In [25]:
submission.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,1
2,5828_4,0
3,7186_2,1
4,12128_7,0
