# Text Preprocessing

## Imports

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import spacy

nlp = spacy.load('en_core_web_sm')

## Load Data

In [2]:
def load_data(path):
    return pd.read_csv(path, header = 0, delimiter = '\t', quoting = 3)

In [3]:
# Train Data
train_data = load_data(path = 'Data/Raw/labeledTrainData.tsv')
print('Train Data Shape:', train_data.shape)
print('Train Data Valeus:', train_data.columns.values)

# Test Data
test_data = load_data(path = 'Data/Raw/testData.tsv')
print('Test Data Shape:', test_data.shape)
print('Test Data Valeus:', test_data.columns.values)

Train Data Shape: (25000, 3)
Train Data Valeus: ['id' 'sentiment' 'review']
Test Data Shape: (25000, 2)
Test Data Valeus: ['id' 'review']


## Preprocess Data

In [4]:
# Get stopwords list    
stop_words = set(nlp.Defaults.stop_words)


def cleanup(review):
    
    # Remove Markups
    review =  BeautifulSoup(review).get_text()

    # Remove Numbers
    review = re.sub('[^a-zA-Z]', ' ', review)

    # Conver to lowercase
    review = review.lower()

    # Lemmatize
    review = [token.lemma_ for token in nlp(review)]

    # Remove stop words
    review = [word for word in review if not word in stop_words]

    # Rejoin the review words into one string
    review = ' '.join(review)

    return review

def process_data(data, data_type):
    
    # Create a new DataFrame
    cleaned_data = data.copy()    
    
    reviews = []
    for i, review in enumerate(data['review']):
        cleaned_review = cleanup(review)
        reviews.append(cleaned_review)

        if i % 100 == 0:
            print(f'Processing "{data_type}", {i} Review...')
            
    cleaned_data['review'] = reviews
    
    return cleaned_data

In [5]:
# Train Data
cleaned_train_data = process_data(train_data, data_type = 'Train Data')
cleaned_train_data.head(5)

# Test Data
cleaned_test_data = process_data(data = test_data, data_type = 'Test Data')
cleaned_test_data.head(5)

Processing "Train Data", 0 Review...
Processing "Train Data", 100 Review...
Processing "Train Data", 200 Review...
Processing "Train Data", 300 Review...
Processing "Train Data", 400 Review...
Processing "Train Data", 500 Review...
Processing "Train Data", 600 Review...
Processing "Train Data", 700 Review...
Processing "Train Data", 800 Review...
Processing "Train Data", 900 Review...
Processing "Train Data", 1000 Review...
Processing "Train Data", 1100 Review...
Processing "Train Data", 1200 Review...
Processing "Train Data", 1300 Review...
Processing "Train Data", 1400 Review...
Processing "Train Data", 1500 Review...
Processing "Train Data", 1600 Review...
Processing "Train Data", 1700 Review...
Processing "Train Data", 1800 Review...
Processing "Train Data", 1900 Review...
Processing "Train Data", 2000 Review...
Processing "Train Data", 2100 Review...
Processing "Train Data", 2200 Review...
Processing "Train Data", 2300 Review...
Processing "Train Data", 2400 Review...
Processing "

Unnamed: 0,id,review
0,"""12311_10""",naturally film s main theme mortality nost...
1,"""8348_2""",movie disaster disaster film great action ...
2,"""5828_4""",movie kid tonight child love point kid...
3,"""7186_2""",afraid dark leave I impression different scr...
4,"""12128_7""",accurate depiction small time mob life film ...


## Save cleaned data

In [6]:
cleaned_train_data.to_csv('Data/Processed/cleaned_labeledTrainData.csv', index = False)
cleaned_test_data.to_csv('Data/Processed/cleaned_testData.csv', index = False)

## Load the cleaned data

In [7]:
cleaned_train_data = pd.read_csv('Data/Processed/cleaned_labeledTrainData.csv')
cleaned_test_data = pd.read_csv('Data/Processed/cleaned_testData.csv')

cleaned_train_reviews = cleaned_train_data['review']
cleaned_test_reviews = cleaned_test_data['review']

## Vectorize Data

### Train Data

In [8]:
vectorizer = CountVectorizer(max_features = 5000)      # Top 5000 frequent words

train_data_features = vectorizer.fit_transform(cleaned_train_reviews)
train_data_features = train_data_features.toarray()

train_data_features.shape

(25000, 5000)

In [9]:
vectorizer.get_feature_names()

['abandon',
 'abc',
 'ability',
 'able',
 'abound',
 'abraham',
 'abrupt',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absorb',
 'absurd',
 'absurdity',
 'abuse',
 'abusive',
 'abysmal',
 'academy',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'access',
 'accident',
 'accidentally',
 'acclaim',
 'accompany',
 'accomplish',
 'accord',
 'account',
 'accuracy',
 'accurate',
 'accurately',
 'accuse',
 'ace',
 'achieve',
 'achievement',
 'acid',
 'acknowledge',
 'acquire',
 'act',
 'acting',
 'action',
 'active',
 'activity',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adapt',
 'adaptation',
 'add',
 'addict',
 'addiction',
 'addition',
 'additional',
 'address',
 'adequate',
 'admirable',
 'admire',
 'admirer',
 'admit',
 'admittedly',
 'adolescent',
 'adopt',
 'adorable',
 'adore',
 'adult',
 'advance',
 'advanced',
 'advantage',
 'adventure',
 'advertise',
 'advertising',
 'advice',
 'advise',
 'affair',
 'affect',
 'affection',
 'affleck',
 'afford',

### Test Data

In [10]:
test_data_features = vectorizer.transform(cleaned_test_reviews)

## Train the Model (Random Forest)

In [11]:
random_forest = RandomForestClassifier(n_estimators = 100)
random_forest = random_forest.fit(train_data_features, cleaned_train_data['sentiment'])

## Make Prediction

In [12]:
predictions = random_forest.predict(test_data_features)

## Save Predictions

In [18]:
cleaned_test_data['id'] = cleaned_test_data['id'].apply(lambda x: re.sub('"', "", x))

predictions_df = pd.DataFrame(data = {'id': cleaned_test_data['id'], 'sentiment': predictions})
predictions_df.to_csv('Data/Processed/Submission.csv', index = False)
predictions_df.head(10)

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,0
4,12128_7,1
5,2913_8,1
6,4396_1,0
7,395_2,1
8,10616_1,0
9,9074_9,1
