# Text Cleaning & Transformation

### Import statements

In [1]:
import os
import re
import string
import time

import pandas as pd
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords

### Extract stopword list and create a stemmer object

In [2]:
# English stopword list, stored as a set so the `not in sw` filter in
# text_preprocessing is a hash lookup rather than a list scan.
sw = set(stopwords.words("english"))
# Porter stemmer instance. NOTE(review): its only visible use, inside
# text_preprocessing, is commented out, so it is currently unused here.
stemmer = PorterStemmer()

### Read and display the data

In [3]:
# Load the raw review table; the path is relative to the notebook's working directory.
data = pd.read_csv('../input/yelp_review.csv')

In [4]:
print("Shape of data:",data.shape)

Shape of data: (5261668, 9)


In [5]:
data.head(20)

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0
5,HRPm3vEZ_F-33TYVT7Pebw,_4iMDXbXZ1p1ONG297YEAQ,8QWPlVQ6D-OExqXoaD2Z1g,5,2014-09-24,Cycle Pub Las Vegas was a blast! Got a groupon...,1,0,0
6,ymAUG8DZfQcFTBSOiaNN4w,u0LXt3Uea_GidxRW1xcsfg,9_CGhHMz8698M9-PkVf0CQ,4,2012-05-11,Who would have guess that you would be able to...,0,0,2
7,8UIishPUD92hXtScSga_gw,u0LXt3Uea_GidxRW1xcsfg,gkCorLgPyQLsptTHalL61g,4,2015-10-27,Always drove past this coffee house and wonder...,1,0,0
8,w41ZS9shepfO3uEyhXEWuQ,u0LXt3Uea_GidxRW1xcsfg,5r6-G9C4YLbC7Ziz57l3rQ,3,2013-02-09,"Not bad!! Love that there is a gluten-free, ve...",1,0,0
9,WF_QTN3p-thD74hqpp2j-Q,u0LXt3Uea_GidxRW1xcsfg,fDF_o2JPU8BR1Gya--jRIA,5,2016-04-06,Love this place!\n\nPeggy is great with dogs a...,3,0,0


In [6]:
data.dtypes

review_id      object
user_id        object
business_id    object
stars           int64
date           object
text           object
useful          int64
funny           int64
cool            int64
dtype: object

### Preprocess the review text

In [7]:
# Translation table mapping every ASCII punctuation character to one space.
# Built once here rather than on every call, since it never changes.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def text_preprocessing(text):
    '''Clean one review: strip punctuation, lowercase, and drop stopwords.

    Parameters
    ----------
    text : object
        Raw review text. Non-string values are converted with ``str()``.
        ``None`` and float NaN (a missing cell) produce an empty string
        instead of the literal tokens "none" / "nan".

    Returns
    -------
    str
        The lowercased tokens that are not in the module-level stopword
        set ``sw``, joined by single spaces.
    '''
    # A missing value carries no text; without this guard str() would turn it
    # into a bogus "nan"/"none" token. NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Replace each punctuation mark with a space (not delete it), so that
    # "gluten-free" becomes two tokens and "Lester's" becomes "Lester" + "s".
    text = str(text).translate(_PUNCT_TO_SPACE)

    # split() with no argument splits on any run of whitespace, so the tokens
    # are already free of spaces/newlines; lowercase each token exactly once.
    tokens = (word.lower() for word in text.split())
    kept = [token for token in tokens if token not in sw]

    # Stemming is not applied (stemmer.stem_sentence could be called on the
    # joined string if stems are wanted). Joining with a single space already
    # guarantees there are no repeated spaces, so no further cleanup is needed.
    return " ".join(kept)

In [8]:
# Clean the whole `text` column in place and report the wall-clock cost in minutes.
t_start = time.time()
data['text'] = data['text'].apply(text_preprocessing)
t_end = time.time()

elapsed_minutes = (t_end-t_start)/60
print("Time taken to process text data: {:.2f} mins".format(elapsed_minutes))

Time taken to process text data: 4.82 mins


In [9]:
data.head(10)

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,super simple place amazing nonetheless around ...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,small unassuming place changes menu every ofte...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,lester located beautiful neighborhood since 19...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,love coming yes place always needs floor swept...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,chocolate almond croissant amazing light butte...,0,0,0
5,HRPm3vEZ_F-33TYVT7Pebw,_4iMDXbXZ1p1ONG297YEAQ,8QWPlVQ6D-OExqXoaD2Z1g,5,2014-09-24,cycle pub las vegas blast got groupon rented b...,1,0,0
6,ymAUG8DZfQcFTBSOiaNN4w,u0LXt3Uea_GidxRW1xcsfg,9_CGhHMz8698M9-PkVf0CQ,4,2012-05-11,would guess would able get fairly decent vietn...,0,0,2
7,8UIishPUD92hXtScSga_gw,u0LXt3Uea_GidxRW1xcsfg,gkCorLgPyQLsptTHalL61g,4,2015-10-27,always drove past coffee house wondered bf fin...,1,0,0
8,w41ZS9shepfO3uEyhXEWuQ,u0LXt3Uea_GidxRW1xcsfg,5r6-G9C4YLbC7Ziz57l3rQ,3,2013-02-09,bad love gluten free vegan version cheese curd...,1,0,0
9,WF_QTN3p-thD74hqpp2j-Q,u0LXt3Uea_GidxRW1xcsfg,fDF_o2JPU8BR1Gya--jRIA,5,2016-04-06,love place peggy great dogs great job patience...,3,0,0


In [10]:
data.dtypes

review_id      object
user_id        object
business_id    object
stars           int64
date           object
text           object
useful          int64
funny           int64
cool            int64
dtype: object

### Extract the processed reviews and save them in chunks

In [11]:
# Select the `text` column (overwritten with the cleaned text in the cell above) as a Series.
reviews = data["text"]

In [12]:
reviews.head(10)

0    super simple place amazing nonetheless around ...
1    small unassuming place changes menu every ofte...
2    lester located beautiful neighborhood since 19...
3    love coming yes place always needs floor swept...
4    chocolate almond croissant amazing light butte...
5    cycle pub las vegas blast got groupon rented b...
6    would guess would able get fairly decent vietn...
7    always drove past coffee house wondered bf fin...
8    bad love gluten free vegan version cheese curd...
9    love place peggy great dogs great job patience...
Name: text, dtype: object

In [13]:
# Split the cleaned reviews into k consecutive folds and write each fold to
# its own header-less, index-less CSV under ../reviews/.
n = len(reviews)
k = 100
# Rows per fold; the last fold additionally takes the n % k leftover rows.
# (If n < k, step is 0: the first k-1 files are empty and the last holds everything.)
step = n//k

# Create the target folder up front so the first write cannot fail on a
# fresh checkout where ../reviews does not exist yet.
os.makedirs("../reviews", exist_ok=True)

# iterate through each fold
for i in range(k):

    # start and end positions of the fold (end is exclusive)
    start = step*i
    end = n if i == (k-1) else step*(i+1)

    # .iloc makes the slice explicitly positional, independent of the index labels
    reviews.iloc[start:end].to_csv("../reviews/yelp_review_processed-"+str(i)+".csv", index = False, header = False)

    # report progress only after the fold has actually been written
    print("\rCompleted saving {} folds".format(i+1), end="")

Completed saving 100 folds

In [14]:
# Write the full table (all columns, with `text` now holding the cleaned reviews)
# to a single CSV next to the raw input; the row index is dropped, the header row is kept.
data.to_csv("../input/yelp_review_processed.csv", index = False, header = True)