In [4]:
import pandas as pd
import numpy as np

# IMPORTING DATA.
- Import the data required for **TRAINING** & **TESTING** the model.

In [5]:
# TRAINING dataset.
positive_training_reviews = pd.read_csv('./datasets/training/positive_reviews_training.csv', header = None)
negative_training_reviews = pd.read_csv('./datasets/training/negative_reviews_training.csv', header = None)

# TESTING datasets
positive_testing_reviews = pd.read_csv('./datasets/testing/positive_reviews_testing.csv', header = None)
negative_testing_reviews = pd.read_csv('./datasets/testing/negative_reviews_testing.csv', header = None)

- Merge both the **NEGATIVE** & **POSITIVE** reviews datasets into **ONE**.

In [6]:
# MERGE both NEGATIVE & POSITIVE reviews datasets into one.
training = pd.concat([positive_training_reviews, negative_training_reviews], axis = 0)
testing = pd.concat([positive_testing_reviews, negative_testing_reviews], axis = 0)

In [7]:
training.columns = ['review', 'sentiment']
testing.columns = ['review', 'sentiment']

- Reshuffle dataset to create a new dataset with newly sorted indices.

In [8]:
training = training.sample(frac = 1).reset_index(drop = True)
testing = testing.sample(frac = 1).reset_index(drop = True)

In [9]:
training.head()

Unnamed: 0,review,sentiment
0,"When you have a disembodied skull, an empty ma...",negative
1,This is a really funny (and sexy) movie - that...,positive
2,"Well, where to begin? I guess I can start with...",negative
3,This film is a joke and Quinton should be asha...,negative
4,Action & Adventure.Billie Clark is twenty year...,positive


In [10]:
testing.head()

Unnamed: 0,review,sentiment
0,I don't understand how this garbage got on the...,negative
1,I've been strangely attracted to this film sin...,positive
2,My original comment on this particular title w...,negative
3,Great. Another foreign film that thinks it's F...,negative
4,This was a highly-hyped movie prior to its rel...,negative


- Transform data in the **SENTIMENT** column of the **TESTING** dataset to **LOWERCASE**.


In [11]:
testing['sentiment'] = testing['sentiment'].apply(lambda x: x.lower())

In [12]:
testing.head()

Unnamed: 0,review,sentiment
0,I don't understand how this garbage got on the...,negative
1,I've been strangely attracted to this film sin...,positive
2,My original comment on this particular title w...,negative
3,Great. Another foreign film that thinks it's F...,negative
4,This was a highly-hyped movie prior to its rel...,negative


# DATA CLEANING.
- Clean the corpus in the dataset by removing unnecessary characters - **INTEGERS**, **PUNCTUATION** marks, **HTML** tags & **WHITESPACES**.

In [13]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
nltk.download('stopwords')
nltk.download('wordnet')


def parse_text(text):
  """
    - Method for parsing the corpus.
  """
  # 1. Transform to LOWERCASE.
  text = text.lower()

  # 2. Remove PUNCTUATION marks.
  text = re.sub(r"^/[a-zA-Z]/", '', text)
  text = text.replace('/', '')
  text = re.sub(r'[^\w\s]', '', text)

  # 3. Remove NUMBERS
  text = text.replace('\d+', '')

  # 4. Remove STOPWORDS
  stop_words = set(stopwords.words('english'))
  text = ' '.join(word for word in text.split() if word not in stop_words)

  # 5. LEMMATIZE the text
  lemmatizer = WordNetLemmatizer()
  text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())

  # Remove extra WHITESPACE
  text = text.strip()
  text = re.sub(r'\s+', ' ', text)

  return text

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:645)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:645)>


In [14]:
training['review'] = training['review'].apply(lambda x: parse_text(x))
testing['review'] = testing['review'].apply(lambda x: parse_text(x))

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/mac/nltk_data'
    - '/Users/mac/anaconda/nltk_data'
    - '/Users/mac/anaconda/share/nltk_data'
    - '/Users/mac/anaconda/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
training.head()

In [None]:
testing.head()

In [None]:
training.shape

In [None]:
testing.shape

## Merging Dataset
Combine both the TRAINING & TESTING dataframes into one dataframe for easier data wrangling.

In [None]:
df = pd.concat([testing, training], axis = 0)
df = df.sample(frac = 1).reset_index(drop = True)

In [None]:
df.head()

In [None]:
X = df.drop(['sentiment'], axis = 1)
y = df['sentiment']

In [None]:
X.shape

In [None]:
y.shape

# Split Data into TRAINING & TESTING.

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
y_test

## Vectorizing the Corpus.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train['review'])
X_test = vectorizer.transform(X_test['review'])

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# MODEL TRAINING

## Logistic Regression (90%)

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

print(classification_report(y_test, lr_predictions))
print(confusion_matrix(y_test, lr_predictions))

## Decision Tree Classifier (72%)

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_predictions = dt.predict(X_test)

print(classification_report(y_test, dt_predictions))
print(confusion_matrix(y_test, dt_predictions))

## Random Forest Classifier (86%)

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)

print(classification_report(y_test, rf_predictions))
print(confusion_matrix(y_test, rf_predictions))

## Naive Bayes (87%)

In [None]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)
nb_predictions = nb.predict(X_test)

print(classification_report(y_test, nb_predictions))
print(confusion_matrix(y_test, nb_predictions))

## Support Vector Machines ()

In [None]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)
svc_predictions = svc.predict(X_test)

print(classification_report(y_test, svc_predictions))
print(confusion_matrix(y_test, svc_predictions))

## K-Nearest Neighbors ()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_predictions = knn.predict(X_test)
print(confusion_matrix(y_test, knn_predictions))
print(confusion_matrix(y_test, knn_predictions))

## Linear SVC ()

In [None]:
from sklearn.svm import LinearSVC

lsvc = LinearSVC()
lsvc.fit(X_train, y_train)
lsvc_predictions = lsvc.predict(X_test)

print(classification_report(y_test, lsvc_predictions))
print(confusion_matrix(y_test, lsvc_predictions))

## XGBoost Classifier ()

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_predictions = xgb.predict(X_test)

print(classification_report(y_test, xgb_predictions))
print(confusion_matrix(y_test, xgb_predictions))

## Grid Search CV ()

In [None]:
from sklearn.model_selection import GridSearchCV


models = [lr, dt, rf, nb, svc, knn, lsvc, xgb]

for model in models:
  parameters = {'C': [0.1, 1, 10, 100]}
  gs = GridSearchCV(model, parameters, cv = 5, scoring = 'accuracy')
  gs.fit(X_train, y_train)

print(gs.best_params_)
print(gs.best_score_)