# Import

In [32]:
import pandas as pd
import re
import contractions
import torch 
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import collections

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
STOPWORDS = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jamesnguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading dataset

In [4]:
root = os.getcwd()
#yelp polarity has 2 labels, 
#yelp dataset has 5 labels, they have the same text content
train_datapath = os.path.join(root, 'dataset', 'yelp_polarity', 'train.csv') 
test_datapath = os.path.join(root,  'dataset', 'yelp_polarity', 'test.csv') 

assert os.path.exists(train_datapath),  f"train dataset path {train_datapath} not found"
assert os.path.exists(test_datapath),   f"test dataset path {test_datapath} not found"

sample_size = 0.1

In [38]:
train_df = pd.read_csv(train_datapath, names = ["review","text"])
train_df = train_df.sample(frac = sample_size)
print(train_df.shape)
train_df.reset_index(inplace=True, drop = True)
train_df.head()

(56000, 2)


Unnamed: 0,review,text
0,1,I feel violated and taken advantage of. Last n...
1,1,Wow. I hope that the one experience I had here...
2,2,My sons and i love the place. something about ...
3,1,"The ice cream is good, as far as that goes. T..."
4,2,The best bridal experience! I had been on the ...


# Cleaning

In [35]:
# Cleaning Text

def remove_urls(text):
    #if there's link in text, like www.something.com, https://www.something.com,
    # replace it with the <url> token
    pattern = re.compile(r'https?://\S+|www\.\S+')
    text = pattern.sub(' ', text)
    return text

def remove_digits(text):
    return re.sub("\d", ' ', text)

def remove_punctation(text):
    return re.sub(r'[^\w\s]',' ',text)

def expand_contraction(text):
    return contractions.fix(text)

def remove_stopwords(text):
    return ' '.join([word for word in text.split(' ') if word not in STOPWORDS])

def clean_text(text):
    '''
    extract feature and label from line and process the text
    @params:
        text: string, format: __label__2 some text.
    @return:
        feature: string
        label: int, 0: bad review, 1 good review
    '''
    #Each line has format: __label__2 some text.
    #The first part is label, the rest is text feature
    #lower case the features
    text = text.lower()
    #start cleaning

    #remove urls in text
    text = remove_urls(text)
    #remove digits
    text = remove_digits(text)
    # # #expand contractions
    text = expand_contraction(text)
    # # #remove punctuations
    text = remove_punctation(text)
    # # #remove stop words
    text = remove_stopwords(text)

    #after cleaning, there's a letter n that occur most frequently
    #this don't make sense so remove a standalone letter n
    text = ' '.join(t for t in text.split() if t != '' and t != 'n')
    return text.strip()

test_string = '''This is a test string. Here are some special characters: &,#,$. How about some punctuations? !@#$%^&*()_+=-`~{[]}|:;'<,>.?/"|https://www.example.com'''

clean_text(test_string)


'test string special characters punctuations _'

In [39]:
train_df['text'] = train_df['text'].apply(lambda s: clean_text(s))

In [40]:
train_df.head()

Unnamed: 0,review,text
0,1,feel violated taken advantage last night asked...
1,1,wow hope one experience unique norm frequent c...
2,2,sons love place something flavor ribs chicken ...
3,1,ice cream good far goes prices absurd take fam...
4,2,best bridal experience hunt weeks trying keep ...


# A little EDA

In [41]:
# wordcloud of common words
freq = collections.Counter()

for row in train_df.iterrows():
    label, text = row[1]
    freq.update(text.split())

print(freq.most_common(10))


[('food', 32954), ('place', 32283), ('good', 30693), ('would', 27667), ('like', 26556), ('get', 23788), ('one', 23597), ('time', 21650), ('great', 21280), ('service', 20692)]
