# Bag Of Words

## Imports

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import spacy

nlp = spacy.load('en_core_web_sm')

## Load Data

In [2]:
train_data = pd.read_csv('Data/Raw/labeledTrainData.tsv', header = 0, delimiter = '\t', quoting = 3)
print('Shape:', train_data.shape)
print('Valeus:', train_data.columns.values)

Shape: (25000, 3)
Valeus: ['id' 'sentiment' 'review']


## Cleaning Data & Text Processing

In [3]:
cleaned_data = train_data[['id', 'sentiment']].copy()

### Cleanup

In [4]:
def cleanup(review):
    
    # Remove Markups
    review =  BeautifulSoup(review).get_text()
    
    # Remove Numbers
    review = re.sub('[^a-zA-Z]', ' ', review)
    
    # Conver to lowercase
    review = review.lower()
    
    # Lemmatize
    review = [token.lemma_ for token in nlp(review)]
    
    return review

reviews = []
for i, review in enumerate(train_data['review']):
    reviews.append(cleanup(review))  
    
    if i % 100 == 0:
        print(f'Processing {i} Review...')

Processing 0 Review...
Processing 100 Review...
Processing 200 Review...
Processing 300 Review...
Processing 400 Review...
Processing 500 Review...
Processing 600 Review...
Processing 700 Review...
Processing 800 Review...
Processing 900 Review...
Processing 1000 Review...
Processing 1100 Review...
Processing 1200 Review...
Processing 1300 Review...
Processing 1400 Review...
Processing 1500 Review...
Processing 1600 Review...
Processing 1700 Review...
Processing 1800 Review...
Processing 1900 Review...
Processing 2000 Review...
Processing 2100 Review...
Processing 2200 Review...
Processing 2300 Review...
Processing 2400 Review...
Processing 2500 Review...
Processing 2600 Review...
Processing 2700 Review...
Processing 2800 Review...
Processing 2900 Review...
Processing 3000 Review...
Processing 3100 Review...
Processing 3200 Review...
Processing 3300 Review...
Processing 3400 Review...
Processing 3500 Review...
Processing 3600 Review...
Processing 3700 Review...
Processing 3800 Review..

### Remove Stopwords

In [7]:
stop_words = set(nlp.Defaults.stop_words)

for i, words in enumerate(reviews):
    reviews[i] = [word for word in words if not word in stop_words]

## Rejoin the words into paragraphs

In [8]:
reviews_grouped = []

for i, words in enumerate(reviews):
    reviews_grouped.append(' '.join(words))

In [12]:
cleaned_data['review'] = reviews_grouped

In [14]:
cleaned_data.head(10)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,stuff moment mj I ve start listen music wa...
1,"""2381_9""",1,classic war world timothy hine entertai...
2,"""7759_3""",0,film start manager nichola bell welcome ...
3,"""3630_4""",0,assume praise film greatest filmed opera...
4,"""9495_8""",1,superbly trashy wondrously unpretentious ...
5,"""8196_8""",1,I nt know people think bad movie pretty go...
6,"""7166_2""",0,movie good come way short cheesy special...
7,"""10633_1""",0,I watch video friend s house I m glad I wa...
8,"""319_1""",0,friend buy film grossly overprice des...
9,"""8713_10""",1,movie reference like mad max ii w...


## Save cleaned data

In [15]:
cleaned_data.to_csv('Data/Processed/cleaned_labeledTrainData.csv', index = False)