Ideal steps:

- Text cleaning
- Convert text data into numerical data and do feature engineering
- Build and train the model
- Test and evaluate

# 1. Import packages and collect data

In [None]:
#import my text preprocessing package

# !pip uninstall preprocess_hungcuongthan
!pip install git+https://github.com/hungcuongthan/preprocess_hungcuongthan.git

import preprocess_hungcuongthan as pp

Collecting git+https://github.com/hungcuongthan/preprocess_hungcuongthan.git
  Cloning https://github.com/hungcuongthan/preprocess_hungcuongthan.git to /tmp/pip-req-build-zztwkwpy
  Running command git clone -q https://github.com/hungcuongthan/preprocess_hungcuongthan.git /tmp/pip-req-build-zztwkwpy


In [None]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the tab-separated reviews file. It has no header row, so pandas assigns
# default column labels here.
DATA_PATH = '/content/drive/MyDrive/My learnings/NLP basics/Sentiment analysis on IMDB Movie Reviews/imdb_reviews.txt'
df = pd.read_csv(DATA_PATH, sep='\t', header=None)

In [None]:
# Give the two columns meaningful names: the review text and its label.
column_names = ['reviews', 'sentiment']
df.columns = column_names

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,reviews,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


# 2. Preprocess the data

In [None]:
# Word-level normalisation, applied to every review in this order:
#   1. pp.get_contraction_expansion - expand contractions
#   2. pp.make_base                 - reduce words to their base form
#                                     (example: checking --> check)
for normalize in (pp.get_contraction_expansion, pp.make_base):
    df['reviews'] = df['reviews'].apply(normalize)

In [None]:
# Preview the reviews after contraction expansion and base-form conversion.
df.head(n=5)

Unnamed: 0,reviews,sentiment
0,"a very , very , very slow - move , aimless mov...",0
1,not sure who was more lose - the flat characte...,0
2,attempt artiness with black & white and clever...,0
3,very little music or anything to speak of .,0
4,the good scene in the movie was when Gerardo i...,1


In [None]:
# Cleaning steps, applied to every review in the order listed.
# pp.remove_common_words and pp.remove_rare_words are not applied.
# NOTE(review): remove_urls runs after remove_special_chars; if the latter
# strips ':' and '/', URLs may no longer be recognisable by the time
# remove_urls sees them - confirm against the package implementation.
cleaning_steps = [
    pp.remove_accented_chars,  # accented characters
    pp.remove_emails,          # e-mail addresses
    pp.remove_html_tags,       # HTML tags
    pp.remove_special_chars,   # special characters
    pp.remove_stopwords,       # stop words
    pp.remove_urls,            # URLs
]

for clean in cleaning_steps:
    df['reviews'] = df['reviews'].apply(clean)

In [10]:
# Preview the fully cleaned reviews that will be fed to the models.
df.head(n=5)

Unnamed: 0,reviews,sentiment
0,slow aimless movie thistressed drift young man,0
1,sure lose flat character audience nearly half ...,0
2,attempt artiness black white clever camera ang...,0
3,little music speak,0
4,good scene movie Gerardo try find song run head,1


# 3. Prepare the datasets for training

In [12]:
# Features are the cleaned review texts; the target is the sentiment label.
X, y = df['reviews'], df['sentiment']

In [13]:
# Hold out 20% of the reviews for testing. Stratifying on y keeps the class
# balance the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [14]:
# Shapes of the training and test splits, in that order.
tuple(split.shape for split in (X_train, X_test))

((598,), (150,))

# 4. Model building

### 4.1. Logistic Regression model

In [19]:
# TF-IDF features feeding a logistic regression fitted with the liblinear solver.
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='liblinear')),
])

# Grid-search space; keys follow the "<step name>__<parameter>" convention.
hyperparameters = dict(
    # vectoriser settings
    tfidf__max_df=(0.5, 1.0),
    tfidf__ngram_range=((1, 1), (1, 2)),
    tfidf__use_idf=(True, False),
    tfidf__analyzer=('word', 'char', 'char_wb'),
    # classifier settings
    clf__penalty=('l2', 'l1'),
    clf__C=(1, 2),
)

In [20]:
# Exhaustive search over `hyperparameters` using all available cores;
# cv=None selects scikit-learn's default cross-validation strategy.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=hyperparameters,
    n_jobs=-1,
    cv=None,
)

In [21]:
%%time
# Run the logistic-regression grid search on the training split only; the test
# split stays untouched. (%%time must remain the first line of the cell.)
clf.fit(X_train,y_train)

CPU times: user 1.31 s, sys: 48.2 ms, total: 1.35 s
Wall time: 14.3 s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [22]:
# Show the pipeline selected by the logistic-regression grid search.
clf.best_estimator_

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=2, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1

In [23]:
# Parameter combination that achieved the best cross-validation score.
clf.best_params_

{'clf__C': 2,
 'clf__penalty': 'l2',
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 0.5,
 'tfidf__ngram_range': (1, 1),
 'tfidf__use_idf': True}

In [25]:
# Best cross-validation score from the search (computed on the training split,
# not on the held-out test set).
clf.best_score_

0.750812324929972

In [26]:
# Evaluate the selected logistic-regression pipeline on the held-out test split.
y_pred = clf.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.70      0.75        73
           1       0.75      0.84      0.79        77

    accuracy                           0.77       150
   macro avg       0.78      0.77      0.77       150
weighted avg       0.78      0.77      0.77       150



We get an accuracy score of 77% on the test set with the logistic regression model.

### 4.2. SVM model

In [27]:
from sklearn.svm import LinearSVC

In [28]:
# TF-IDF features feeding a linear support-vector classifier.
pipe = Pipeline([
                 ('tfidf', TfidfVectorizer()),
                 ('clf', LinearSVC())
])

# Vectoriser settings shared by both halves of the search space.
tfidf_grid = {
    'tfidf__max_df': (0.5, 1.0),
    'tfidf__ngram_range': ((1,1),(1,2)),
    'tfidf__use_idf': (True,False),
    'tfidf__analyzer': ('word','char','char_wb'),
}

# LinearSVC does not support penalty='l1' together with dual=True (with its
# default squared-hinge loss). Searching 'l1' without also setting dual=False
# makes every l1 candidate fail to fit and be scored as NaN, so half of the
# grid is never actually evaluated. The grid is therefore split in two:
#   - l2 candidates keep the estimator's default `dual` setting (unchanged);
#   - l1 candidates are fitted with dual=False, the supported combination.
hyperparameters = [
    {**tfidf_grid, 'clf__penalty': ('l2',), 'clf__C': (1,2)},
    {**tfidf_grid, 'clf__penalty': ('l1',), 'clf__dual': (False,), 'clf__C': (1,2)},
]

In [29]:
# Grid search for the LinearSVC pipeline (rebinds `clf`, replacing the
# logistic-regression search). cv=None selects scikit-learn's default
# cross-validation strategy; n_jobs=-1 uses all available cores.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=hyperparameters,
    n_jobs=-1,
    cv=None,
)

In [30]:
%%time
# Run the LinearSVC grid search on the training split only; the test split
# stays untouched. (%%time must remain the first line of the cell.)
clf.fit(X_train,y_train)

CPU times: user 1.3 s, sys: 39.2 ms, total: 1.34 s
Wall time: 12.9 s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [31]:
# Show the pipeline selected by the LinearSVC grid search.
clf.best_estimator_

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=2, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                 

In [32]:
# Parameter combination that achieved the best cross-validation score.
clf.best_params_

{'clf__C': 2,
 'clf__penalty': 'l2',
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 0.5,
 'tfidf__ngram_range': (1, 2),
 'tfidf__use_idf': True}

In [33]:
# Best cross-validation score from the search (computed on the training split,
# not on the held-out test set).
clf.best_score_

0.7607843137254903

In [34]:
# Evaluate the selected LinearSVC pipeline on the held-out test split.
y_pred = clf.predict(X_test)
svm_report = classification_report(y_true=y_test, y_pred=y_pred)
print(svm_report)

              precision    recall  f1-score   support

           0       0.81      0.74      0.77        73
           1       0.77      0.83      0.80        77

    accuracy                           0.79       150
   macro avg       0.79      0.79      0.79       150
weighted avg       0.79      0.79      0.79       150



We get an accuracy score of 79% on the test set with the **SVC model.**

# 5. Model testing and saving

In [39]:
sample_reviews = ['This is a terrible movie','Good movie!']

# The model was trained on reviews that had been cleaned with the `pp` helpers
# (section 2). New text must go through the same steps, in the same order;
# otherwise the vectoriser sees tokens in a form it never saw during training.
preprocessing_steps = [
    pp.get_contraction_expansion,
    pp.make_base,
    pp.remove_accented_chars,
    pp.remove_emails,
    pp.remove_html_tags,
    pp.remove_special_chars,
    pp.remove_stopwords,
    pp.remove_urls,
]

sample_clean = pd.Series(sample_reviews)
for step in preprocessing_steps:
    sample_clean = sample_clean.apply(step)

# Predicted sentiment label for each sample review.
clf.predict(sample_clean)

array([0, 1])

In [40]:
# Save the model

import pickle as pkl

# Persist the fitted grid-search object currently bound to `clf`.
# The `with` block closes the file handle even if pickling raises; the
# original `open(...)` call left the handle open.
# Security: only unpickle this file from a trusted location, because
# `pickle.load` can execute arbitrary code.
with open('svc_model.pkl', 'wb') as model_file:
    pkl.dump(clf, model_file)