In [96]:
import pandas as pd
import json
import csv
import nltk
import re
import string

# Read the review export into a DataFrame, decoding it as ISO-8859-1
# instead of pandas' default codec, then preview the first rows.
data = pd.read_csv(
    'buffetpalace.csv',
    encoding="ISO-8859-1",
)
data.head()

Unnamed: 0,review_id,business_id,stars,text
0,5CUCpTaV-YR2DzYHjpgQyQ,HJisBzwgGpCZu0ZkO8LFzA,4,This might be the best Chinese buffet in Austi...
1,nPuZpbJA0a5qFLxIhENs3A,HJisBzwgGpCZu0ZkO8LFzA,3,This place is the epitome of average as far as...
2,GvCxu0pSpPrmvi1Hkms0uA,HJisBzwgGpCZu0ZkO8LFzA,4,"For about $12, this is the best Asian buffet i..."
3,YzuNzsuIL4hKMFfNn1oo6w,HJisBzwgGpCZu0ZkO8LFzA,4,Place is good for a buffet. They used to have ...
4,YhzM2rwEQIIVCANtvqgeMA,HJisBzwgGpCZu0ZkO8LFzA,5,This is hands down the best buffet in Austin. ...


In [97]:
#Lower casing the text column
# The 'text' column is overwritten in place, so the original casing is not
# kept in the frame after this cell runs.
# NOTE(review): presumably done so that later token comparisons are
# case-insensitive -- confirm.

data['text'] = data['text'].str.lower()
data.head()

Unnamed: 0,review_id,business_id,stars,text
0,5CUCpTaV-YR2DzYHjpgQyQ,HJisBzwgGpCZu0ZkO8LFzA,4,this might be the best chinese buffet in austi...
1,nPuZpbJA0a5qFLxIhENs3A,HJisBzwgGpCZu0ZkO8LFzA,3,this place is the epitome of average as far as...
2,GvCxu0pSpPrmvi1Hkms0uA,HJisBzwgGpCZu0ZkO8LFzA,4,"for about $12, this is the best asian buffet i..."
3,YzuNzsuIL4hKMFfNn1oo6w,HJisBzwgGpCZu0ZkO8LFzA,4,place is good for a buffet. they used to have ...
4,YhzM2rwEQIIVCANtvqgeMA,HJisBzwgGpCZu0ZkO8LFzA,5,this is hands down the best buffet in austin. ...


In [98]:
# string.punctuation (standard library) supplies the punctuation characters
# that the cleaning helpers in this notebook strip from the reviews.

#Stopword removal
#Stop words present in the library
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    # nltk raises LookupError when the stop-word corpus has not been installed
    # on this machine yet; download it once and retry so the notebook still
    # runs top to bottom on a fresh environment.
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

#Pre-processing the data by removing punctuation and stop words

def clean_text(text, stop_words=None):
    """Strip punctuation from a review, tokenize it and drop stop words.

    Parameters
    ----------
    text : str
        Review text. Any non-string value (for example the NaN pandas uses
        for a missing cell) is treated as an empty review.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the notebook-level ``stopwords``
        list is used.

    Returns
    -------
    list of str
        The remaining tokens in their original order. Empty strings are
        never included.
    """
    # Missing cells arrive as float NaN; iterating over them would raise.
    if not isinstance(text, str):
        return []

    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership checks instead of a list scan per token.
    blocked = set(stop_words)

    #removing punctuation
    # NOTE(review): punctuation is removed before the stop-word check, so
    # contractions reach the filter without their apostrophe (e.g. "dont",
    # "ive") and only match if the stop-word list holds those forms -- confirm
    # this is intended.
    text = text.translate(str.maketrans('', '', string.punctuation))

    #tokenizing
    # Raw string so that \W reaches the regex engine instead of being parsed
    # as an (invalid) string escape.
    tokens = re.split(r'\W+', text)

    #filtering out stopwords
    # re.split yields '' when the text is empty or starts/ends with a
    # separator; `if word` drops those empty tokens as well.
    return [word for word in tokens if word and word not in blocked]

In [99]:
# Run clean_text over every review and keep the resulting token lists in a
# new 'clean_text' column; the raw 'text' column is left as it is.
data['clean_text'] = data['text'].apply(clean_text)
data.head()

Unnamed: 0,review_id,business_id,stars,text,clean_text
0,5CUCpTaV-YR2DzYHjpgQyQ,HJisBzwgGpCZu0ZkO8LFzA,4,this might be the best chinese buffet in austi...,"[might, best, chinese, buffet, austin, texas, ..."
1,nPuZpbJA0a5qFLxIhENs3A,HJisBzwgGpCZu0ZkO8LFzA,3,this place is the epitome of average as far as...,"[place, epitome, average, far, quality, food, ..."
2,GvCxu0pSpPrmvi1Hkms0uA,HJisBzwgGpCZu0ZkO8LFzA,4,"for about $12, this is the best asian buffet i...","[12, best, asian, buffet, town, get, sushi, wh..."
3,YzuNzsuIL4hKMFfNn1oo6w,HJisBzwgGpCZu0ZkO8LFzA,4,place is good for a buffet. they used to have ...,"[place, good, buffet, used, lot, foods, nowada..."
4,YhzM2rwEQIIVCANtvqgeMA,HJisBzwgGpCZu0ZkO8LFzA,5,this is hands down the best buffet in austin. ...,"[hands, best, buffet, austin, ive, never, bad,..."


In [100]:
#stemming
# A single PorterStemmer instance is created here at notebook level; the
# stemming() helper defined right after reads it as the global `ps`.
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def stemming(tokenized_text):
    """Return a new list with ``ps.stem`` applied to each token, in order.

    ``tokenized_text`` is any iterable of tokens; the notebook-level
    stemmer ``ps`` does the actual stemming.
    """
    stems = []
    for token in tokenized_text:
        stems.append(ps.stem(token))
    return stems

# Replace each token list in 'clean_text' with its stemmed counterpart.
data['clean_text'] = data['clean_text'].apply(stemming)
data.head()

Unnamed: 0,review_id,business_id,stars,text,clean_text
0,5CUCpTaV-YR2DzYHjpgQyQ,HJisBzwgGpCZu0ZkO8LFzA,4,this might be the best chinese buffet in austi...,"[might, best, chines, buffet, austin, texa, cl..."
1,nPuZpbJA0a5qFLxIhENs3A,HJisBzwgGpCZu0ZkO8LFzA,3,this place is the epitome of average as far as...,"[place, epitom, averag, far, qualiti, food, go..."
2,GvCxu0pSpPrmvi1Hkms0uA,HJisBzwgGpCZu0ZkO8LFzA,4,"for about $12, this is the best asian buffet i...","[12, best, asian, buffet, town, get, sushi, wh..."
3,YzuNzsuIL4hKMFfNn1oo6w,HJisBzwgGpCZu0ZkO8LFzA,4,place is good for a buffet. they used to have ...,"[place, good, buffet, use, lot, food, nowaday,..."
4,YhzM2rwEQIIVCANtvqgeMA,HJisBzwgGpCZu0ZkO8LFzA,5,this is hands down the best buffet in austin. ...,"[hand, best, buffet, austin, ive, never, bad, ..."


In [101]:
#detokenizing column
from nltk.tokenize.treebank import TreebankWordDetokenizer  # not used in this cell


# Collapse each token list into a single ', '-separated string; the commas
# are stripped again by the punctuation pass later in this cell.
# Rows that are already strings are passed through unchanged: joining a
# string would put a separator between every character, so without this
# guard a second run of the cell would silently corrupt the column.
data['clean_text'] = data['clean_text'].apply(
    lambda tokens: ', '.join(tokens) if isinstance(tokens, (list, tuple)) else tokens
)
def remove_punctuation(text):
    """Return ``text`` rebuilt from only the items not found in ``string.punctuation``."""
    kept = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(kept)

# Strip every punctuation character (including the commas introduced by the
# join above) from the detokenized text.
data['clean_text'] = data['clean_text'].apply(remove_punctuation)
data.head()

Unnamed: 0,review_id,business_id,stars,text,clean_text
0,5CUCpTaV-YR2DzYHjpgQyQ,HJisBzwgGpCZu0ZkO8LFzA,4,this might be the best chinese buffet in austi...,might best chines buffet austin texa clean fri...
1,nPuZpbJA0a5qFLxIhENs3A,HJisBzwgGpCZu0ZkO8LFzA,3,this place is the epitome of average as far as...,place epitom averag far qualiti food goe sushi...
2,GvCxu0pSpPrmvi1Hkms0uA,HJisBzwgGpCZu0ZkO8LFzA,4,"for about $12, this is the best asian buffet i...",12 best asian buffet town get sushi whole spre...
3,YzuNzsuIL4hKMFfNn1oo6w,HJisBzwgGpCZu0ZkO8LFzA,4,place is good for a buffet. they used to have ...,place good buffet use lot food nowaday dont fi...
4,YhzM2rwEQIIVCANtvqgeMA,HJisBzwgGpCZu0ZkO8LFzA,5,this is hands down the best buffet in austin. ...,hand best buffet austin ive never bad meal eve...


In [102]:
# index=False: the frame already carries the source file's index as the
# 'Unnamed: 0' column, so also writing the RangeIndex would add a second,
# redundant unnamed index column to the export.
data.to_csv("cleaned_data.csv", index=False)