# Sentiment Analysis using Logistic Regression
### In this case study, we will use the IMDb movie-review dataset to build a model that classifies whether a review expresses a positive or a negative sentiment.

In [None]:
# Load the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
import os

# List every file available under the Kaggle input directory, one full path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# 1. Exploratory Data Analysis

In [None]:
# Read the movie-review CSV from the Kaggle input directory into a DataFrame

data_path = '/kaggle/input/imdb_movie_data.csv'
df = pd.read_csv(data_path)

In [None]:
# Quick overview of the raw data: sample rows, schema summary and per-column null flags.
banner = '*' * 10

print(banner + 'First 10 rows' + banner)
print(df.head(10))
print("")
print(banner + 'Information' + banner)
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()
print("")
print(banner + 'Null values' + banner)
# One boolean per column: True if that column contains at least one null.
print(df.isnull().any())
print("")


### We see that there are 50,000 rows and that no column contains null values. Next, let us check for duplicate reviews and confirm visually that nothing is missing.

In [None]:
# Count the rows whose review text repeats another row (True) versus the rest (False).
banner = '*' * 10
print(banner + 'Duplicate values' + banner)
duplicate_flags = df.duplicated(subset='review')
print(duplicate_flags.value_counts())

# Draw the null mask of the whole frame as a heatmap so any missing cell stands out.
null_mask = df.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)

### Missing values would appear as yellow lines in the plot. Since no yellow lines appear, there are no missing values in our data.
### Also there are 418 duplicate values, so let us drop these.

In [None]:
# Remove rows whose 'review' text duplicates another row.
# inplace=True mutates df directly instead of returning a new frame.
df.drop_duplicates(subset='review', inplace=True)

### We also need to check the distribution of the dependent variable to determine the scoring method to use.

In [None]:
# 'sentiment' is a class label, so draw one bar per class showing its row count.
# countplot replaces the deprecated sns.distplot(..., kde=False) histogram call.
sns.countplot(x='sentiment', data=df)

### We see that there is a 50-50 distribution of our sentiment classes. Because the classes are balanced, we can use accuracy as the scoring metric for our model.

# 2. Cleaning the Data

In [None]:
# Inspect the raw text of the first review

first_review = df.loc[0, 'review']
first_review

### We see that our data contains HTML tags, so we need to remove these.

In [None]:
def cleaner(text):
    """Normalise a raw review into lowercase, space-separated word tokens.

    HTML tags are stripped, text emoticons such as ``:)``, ``;-(`` or ``=D``
    are set aside, every run of non-word characters is collapsed into a
    single space, and the emoticons (with any ``-`` nose removed) are
    appended at the end as separate tokens.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    # Remove HTML tags. [^>]* stops at the first '>' so a later stray '>'
    # in the prose cannot extend the match and swallow real text.
    text = re.sub(r'<[^>]*>', '', text)

    # Temporarily store emoticons. The '-' nose is optional so that ':)'
    # is captured as well as ':-)'.
    emoticons = re.findall(r'[:;=]-*[)(pPD]+', text)

    # Lowercase the text and collapse every run of non-word characters
    # (punctuation, whitespace) into a single space.
    words = re.sub(r'\W+', ' ', text.lower()).strip()

    # Append each emoticon as its own token, without the nose, so that
    # ':-)' and ':)' map to the same feature and never fuse with a word.
    tokens = [words] + [emoticon.replace('-', '') for emoticon in emoticons]
    return ' '.join(tokens).strip()

In [None]:
# Sanity-check the cleaner on the first review

sample_review = df.loc[0, 'review']
cleaner(sample_review)

In [None]:
# Run the cleaner over every review and store the result back in the 'review' column

cleaned_reviews = df['review'].apply(cleaner)
df['review'] = cleaned_reviews
df.head(10)

# 3. Tokenization

### We tokenize the documents by splitting on whitespace, and use the Porter stemmer from the nltk library to build an optional stemmed variant.

In [None]:
porter = PorterStemmer()

def token_porter(text):
    """Split text on whitespace and return the Porter stem of each word, in order."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

# Plain whitespace tokenizer, used as the non-stemmed alternative
def token(text):
    """Return the whitespace-separated words of text as a list."""
    words = text.split()
    return words

# Both tokenizer functions are passed to GridSearchCV below as candidate values for the vectorizer's tokenizer

# 4. Transform into feature vectors and Data splitting
### We will use the TfidfVectorizer to transform the words into numbers and give weights to each word.

In [None]:
# English stop-word list from the nltk corpus; offered to the grid search as an option
stop = stopwords.words('english')

# lowercase=False: the cleaner has already lowercased the words, and this keeps
# the case of emoticons such as ':P' and ':D' intact.
tfidf = TfidfVectorizer(lowercase=False)

In [None]:
# Column 0 holds the inputs and column 1 the labels
# (presumably 'review' and 'sentiment' -- confirm against df.head()).
X = df.iloc[:,0].to_numpy()
y = df.iloc[:,1].to_numpy()

# 70/30 split, stratified on y so both parts keep the same class balance.
# random_state pins the shuffle so the split -- and therefore every score
# reported below -- is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=0)

# 5. Classification using LogisticRegression and GridSearchCV

In [None]:
# Search space shared by both grids: stop-word handling, tokenizer choice,
# and the regularisation parameter C of the L2-penalised classifier.
shared_grid = {'vect__stop_words': [stop, None],
               'vect__tokenizer': [token, token_porter],
               'clf__penalty': ['l2'],
               'clf__C': [1, 10, 100]}

# The first grid leaves the vectorizer's use_idf setting untouched;
# the second repeats the same search with use_idf switched off.
param_grid = [dict(shared_grid),
              {'vect__use_idf': [False], **shared_grid}]

# Chain the vectorizer and the classifier into one estimator so the grid
# search can tune the parameters of both steps together.
classifier = LogisticRegression(tol=0.01, random_state=0)
lr_tfidf = Pipeline(steps=[('vect', tfidf), ('clf', classifier)])

# 5-fold cross-validated grid search scored on accuracy, run in a single process
gs = GridSearchCV(estimator=lr_tfidf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=5,
                  n_jobs=1,
                  verbose=0)

In [None]:
# Run the grid search on the training split only; the test split stays unseen
gs.fit(X_train, y=y_train)

# 6. Model Accuracy

In [None]:
# Report the winning parameter combination and its mean cross-validated accuracy
best_params = gs.best_params_
best_cv_accuracy = gs.best_score_

print('Best parameter settings: %s' % best_params)
print('CV Accuracy:%.3f' % best_cv_accuracy)

In [None]:
# Take the pipeline that the grid search selected as best
clf = gs.best_estimator_

# Score it on the held-out test split
test_accuracy = clf.score(X_test, y_test)
print('Test Accuracy: %.3f' % test_accuracy)