# Bag of Words Meets Bags of Popcorn

Notebook for optimized performance.

## Module Imports

In [1]:
import os
import re
from collections import Counter
from typing import *

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm import tqdm

## Configs

In [2]:
DATA_DIR = '../data/word2vec-nlp-tutorial'

In [3]:
nltk.data.path.append('/Users/chenjun/workspace/nltk_data')

## EDA

In [4]:
os.listdir(DATA_DIR)

['.DS_Store',
 'sampleSubmission.csv',
 'labeledTrainData.tsv',
 'testData.tsv',
 'unlabeledTrainData.tsv']

In [5]:
unlabeled_train_data = pd.read_csv(os.path.join(DATA_DIR, 'unlabeledTrainData.tsv'), header=0, delimiter='\t', quoting=3)

In [6]:
print(unlabeled_train_data.shape)
print(unlabeled_train_data.head())

(50000, 2)
          id                                             review
0   "9999_0"  "Watching Time Chasers, it obvious that it was...
1  "45057_0"  "I saw this film about 20 years ago and rememb...
2  "15561_0"  "Minor Spoilers<br /><br />In New York, Joan B...
3   "7161_0"  "I went to see this film with a great deal of ...
4  "43971_0"  "Yes, I agree with everyone on this site this ...


In [7]:
labeled_train_data = pd.read_csv(os.path.join(DATA_DIR, 'labeledTrainData.tsv'), header=0, delimiter='\t', quoting=3)

In [8]:
labeled_train_data = shuffle(labeled_train_data)
labeled_train_data.reset_index(inplace=True, drop=True)

In [9]:
print(Counter(labeled_train_data['sentiment'].tolist()))

Counter({1: 12500, 0: 12500})


In [10]:
print(labeled_train_data.shape)
print(labeled_train_data.head())

(25000, 3)
          id  sentiment                                             review
0    "681_9"          1  "Richard Chamberlain is David Burton, a tax la...
1  "11797_8"          1  "Powers Booth is hypnotic as cult leader jim j...
2   "7722_3"          0  "Extremely poor action film starring the ever ...
3   "5245_8"          1  "I thought this movie was LOL funny. It's a fu...
4   "7302_3"          0  "The first five minutes of this movie showed p...


## Data Preprocessing

In [11]:
STOP_WORDS = set(stopwords.words('english'))
print(f'Number of stop words loaded: {len(STOP_WORDS)}')

Number of stop words loaded: 179


In [12]:
stemmer = SnowballStemmer('english')

In [13]:
def clean_review(raw_review: str, remove_stop_words: bool = True, stem: bool = True) -> List[str]:
    """Data cleanning"""
    
    # Remove HTML markups
    cleaned_review = BeautifulSoup(raw_review).get_text()
    
    # Remove punctuations
    cleaned_review = re.sub('[^a-zA-Z]', ' ', cleaned_review)
    
    # Lower and split words
    words = cleaned_review.lower().split()
    
    # Remove stop wrods
    if remove_stop_words:
        words = [word for word in words if word not in STOP_WORDS]
        
    # Stem
    if stem:
        words = [stemmer.stem(word) for word in words]
    
    return words

In [14]:
print(clean_review(unlabeled_train_data['review'][0], remove_stop_words=False, stem=True))

['watch', 'time', 'chaser', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friend', 'mayb', 'they', 'were', 'sit', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'togeth', 'and', 'make', 'a', 'realli', 'bad', 'movi', 'or', 'someth', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'end', 'up', 'make', 'a', 'realli', 'bad', 'movi', 'dull', 'stori', 'bad', 'script', 'lame', 'act', 'poor', 'cinematographi', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corner', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevent', 'this', 'film', 's', 'releas', 'life', 's', 'like', 'that']


In [15]:
def preprocess(raw_df: pd.DataFrame, remove_stop_words: bool = True, stem: bool = True) -> List[List[str]]:
    """Data Preprocessing
    
    Map a textual review to a list of words.
    """
    data = []
    for idx in tqdm(range(raw_df.shape[0])):
        row = clean_review(raw_df['review'][idx], remove_stop_words=remove_stop_words, stem=stem)
        data.append(row)
        
    return data

## Train Doc2Vec Model

In [16]:
# Prepare training data
tagged_documents = []
docs = preprocess(unlabeled_train_data, remove_stop_words=False, stem=True)
for idx in tqdm(range(len(docs))):
    tagged_documents.append(TaggedDocument(docs[idx], [idx]))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [01:05<00:00, 761.28it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 321924.59it/s]


In [17]:
d2v_model = Doc2Vec(tagged_documents, vector_size=50, window=2, min_count=1, workers=6)

In [18]:
print(d2v_model.infer_vector(["all", "stuff"]).shape)

(50,)


In [19]:
# Save 
d2v_model_path = os.path.abspath(os.path.join(os.path.curdir, 'd2v_model'))  # This should be absolute path.
fname = get_tmpfile(d2v_model_path)
d2v_model.save(fname)
print(f'Trained Doc2Vec model saved to: {d2v_model_path}')

Trained Doc2Vec model saved to: /Users/chenjun/workspace/InitialJ/Kaggle/word2vec-nlp-tutorial/d2v_model


## Feature Engineering

In [20]:
print(labeled_train_data.shape), print(labeled_train_data.head())

(25000, 3)
          id  sentiment                                             review
0    "681_9"          1  "Richard Chamberlain is David Burton, a tax la...
1  "11797_8"          1  "Powers Booth is hypnotic as cult leader jim j...
2   "7722_3"          0  "Extremely poor action film starring the ever ...
3   "5245_8"          1  "I thought this movie was LOL funny. It's a fu...
4   "7302_3"          0  "The first five minutes of this movie showed p...


(None, None)

In [21]:
train_data_features = []
for row in labeled_train_data.itertuples():
    doc = clean_review(row.review, remove_stop_words=False, stem=True)
    train_data_features.append(d2v_model.infer_vector(doc))



In [22]:
print(len(train_data_features), len(train_data_features[0]))

25000 50


## Model

Various models.

### Random Forest

In [23]:
# Train-test Split
split_ratio = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    train_data_features, 
    labeled_train_data['sentiment'], 
    test_size=split_ratio, 
    random_state=2
)

In [24]:
# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)

# Fit the forest to the training set
forest = forest.fit(X_train, y_train)

In [25]:
forest.score(X_train, y_train)

1.0

In [26]:
forest.score(X_test, y_test)

0.8004

### Logistic Regression

In [27]:
logistic_regression_model = LogisticRegression(max_iter=10000, n_jobs=-1)
logistic_regression_model.fit(X_train, y_train)
print(logistic_regression_model.score(X_train, y_train))
print(logistic_regression_model.score(X_test, y_test))

0.8264
0.8210666666666666


## Evaluate

In [28]:
def evaluate(model, features, labels):
    predictions = model.predict(features)
    
    report = classification_report(predictions, labels)
    
    return report

In [29]:
print(evaluate(forest, X_test, y_test))

              precision    recall  f1-score   support

           0       0.79      0.80      0.80      3691
           1       0.81      0.80      0.80      3809

    accuracy                           0.80      7500
   macro avg       0.80      0.80      0.80      7500
weighted avg       0.80      0.80      0.80      7500



In [30]:
print(evaluate(logistic_regression_model, X_test, y_test))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82      3694
           1       0.83      0.82      0.82      3806

    accuracy                           0.82      7500
   macro avg       0.82      0.82      0.82      7500
weighted avg       0.82      0.82      0.82      7500



## Create Submission

In [31]:
def create_submission(sub_name: str):
    # Read the test data
    test = pd.read_csv(os.path.join(DATA_DIR, "testData.tsv"), header=0, delimiter="\t", quoting=3)

    # Create an empty list and append the clean reviews one by one
    num_reviews = len(test["review"])
    clean_test_reviews = [] 

    for i in tqdm(range(0,num_reviews)):
        cleaned_review = clean_review(test["review"][i])
        clean_test_reviews.append(cleaned_review)

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = list(map(lambda doc: d2v_model.infer_vector(doc), clean_test_reviews))

    # Use the random forest to make sentiment label predictions
    result = forest.predict(test_data_features)

    # Copy the results to a pandas dataframe with an "id" column and
    # a "sentiment" column
    output = pd.DataFrame(data={"id":test["id"], "sentiment":result})

    # Use pandas to write the comma-separated output file
    output.to_csv(sub_name, index=False, quoting=3)

In [32]:
# create_submission("Bag_of_Words_model_v3.csv")