# Preprocessing

In [1]:
import re
from string import punctuation, digits, ascii_lowercase

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the Yelp academic review table from its CSV export (path is relative to the notebook).
review_csv_path = 'yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.csv'
reviews = pd.read_csv(review_csv_path)

In [18]:
# Draw a fixed-size random subset of the reviews to keep later steps tractable.
# The seed is pinned so that Restart & Run All reproduces the same sample
# (and therefore the same exported CSV) every time.
samp = reviews.sample(n=10000, random_state=42)

Define the lists of escape sequences, digits, and stopwords to use for parsing. Also define the type of stemming to be used.

In [30]:
# Control characters with code points 1 through 31 (includes tab, newline, carriage return).
escapes = ''.join(map(chr, range(1, 32)))
# Every character that should be stripped out of a review outright.
removeables = escapes + digits
# NLTK's English stop words plus each single lowercase letter.
stops = list(map(str, stopwords.words('english'))) + list(ascii_lowercase)
# English Snowball stemmer instance.
sno = nltk.stem.SnowballStemmer('english')

Create a function to process the text, and then use it to make a new column in the dataframe in which all of the text has been processed.

In [34]:
def parse_text(text, stem=True):
    ''' Clean a single review string for bag-of-words modelling.

        The steps, in order:
          1. decode byte strings as UTF-8 so the rest works on text;
          2. replace http links and punctuation with spaces;
          3. collapse every run of whitespace (newlines, tabs, ...) to one
             space, so deleting control characters cannot glue two words
             together;
          4. delete every character listed in ``removeables``
             (control characters and digits);
          5. lowercase each word and drop those found in ``stops``;
          6. optionally stem the surviving words with the snowball
             stemmer ``sno``.

        Parameters
        ----------
        text : str or bytes
            The raw review text.
        stem : bool, default True
            When true, each kept word is stemmed; otherwise words are only
            lowercased.

        Returns
        -------
        str
            The kept words joined by single spaces.
    '''
    # Accept raw bytes as well as text; this replaces the Python 2-only
    # ``str.decode`` call that fails on Python 3 strings.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    text = re.sub(r"http\S+", " ", text)
    text = re.sub('[%s]' % re.escape(punctuation), ' ', text)

    # Whitespace control characters are word separators, not noise: turn them
    # into plain spaces before the deletion step below removes them.
    text = re.sub(r'\s+', ' ', text)

    # Mapping ordinals to None deletes those characters; this form of
    # translate() works on text strings in both Python 2 and Python 3.
    text = text.translate(dict.fromkeys(map(ord, removeables)))

    # Build the lookup set once per call instead of once per word.
    stop_words = set(stops)
    kept = [word for word in (raw.lower() for raw in text.split())
            if word not in stop_words]
    if stem:
        kept = [sno.stem(word) for word in kept]
    return ' '.join(kept)

In [13]:
# Store the cleaned version of every sampled review in a new column;
# stemming is switched off for this pass.
samp['parsed_text'] = samp['text'].apply(lambda review: parse_text(review, stem=False))

# Bag of words, tf-idf vectorization
Create the bag-of-words representation. Find the counts of each word in each document and in the whole corpus. Then create the tf-idf representation. It is also worth considering binary count vectorizing, which is supposed to work better for smaller sample sets.

In [14]:
# Plain Python list of the cleaned review strings, one entry per sampled review.
corpus = samp['parsed_text'].tolist()

In [15]:
# Fit a default count vectorizer on the cleaned reviews and build the
# document-term count matrix in the same step.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(raw_documents=corpus)

In [16]:
# Vocabulary terms in column order of the count matrix, as a plain list.
# Newer scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out() (which returns an array), so prefer the new method
# and fall back to the old one on older installations.
if hasattr(count_vect, 'get_feature_names_out'):
    words = count_vect.get_feature_names_out().tolist()
else:
    words = count_vect.get_feature_names()

In [17]:
# Star ratings as a single-column 2-D array so they can be concatenated with
# the count matrix. Series.reshape no longer exists in pandas, so reshape the
# underlying NumPy array; -1 lets the row count follow the sample size instead
# of hard-coding it.
y = samp.stars.values.reshape(-1, 1)
# Dense copy of the document-term count matrix.
counts = X_counts.toarray()

In [19]:
# Put the star-rating column in front of the word-count columns.
# The rating column vector is named `y` in the cell that builds it; the
# previous reference to `stars` pointed at a name that is never defined.
df = pd.DataFrame(np.concatenate((y, counts), axis=1))

In [20]:
# Label the first column as the rating and the remaining ones with their vocabulary terms.
column_labels = ['star_rating'] + list(words)
df.columns = column_labels

In [164]:
# Persist the rating + word-count table as UTF-8 CSV, leaving out the row index.
df.to_csv('sample_data.csv', index=False, encoding='utf-8')