In [1]:
import os
import re
import string
import time

import nltk
import pandas as pd
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# English stop words as a set, so the membership test in text_preprocessing is O(1).
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded already — confirm.
sw = set(stopwords.words("english"))
# Single Porter stemmer instance shared by every text_preprocessing call.
stemmer = PorterStemmer()

In [3]:
# Directory holding the raw train.csv / test.csv files (relative to the notebook).
input_dir = "../input/yelp_review_polarity_csv/"

In [4]:
# The raw CSVs carry no header row, so the column names are supplied here;
# header=None spells out what passing `names` already implies.
column_names = ['sentiment', 'review']
train_data = pd.read_csv(input_dir + "train.csv", header=None, names=column_names)
test_data = pd.read_csv(input_dir + "test.csv", header=None, names=column_names)

In [5]:
# Quick load check: (rows, columns) of each split.
print("Shape of train data:",train_data.shape)
print("Shape of test data:",test_data.shape)

Shape of train data: (560000, 2)
Shape of test data: (38000, 2)


In [6]:
# Preview the first ten raw training rows.
train_data.head(10)

Unnamed: 0,sentiment,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...
5,1,Wing sauce is like water. Pretty much a lot of...
6,1,Owning a driving range inside the city limits ...
7,1,This place is absolute garbage... Half of the...
8,2,Before I finally made it over to this range I ...
9,2,I drove by yesterday to get a sneak peak. It ...


In [7]:
# Preview the first ten raw test rows.
test_data.head(10)

Unnamed: 0,sentiment,review
0,2,"Contrary to other reviews, I have zero complai..."
1,1,Last summer I had an appointment to get new ti...
2,2,"Friendly staff, same starbucks fair you get an..."
3,1,The food is good. Unfortunately the service is...
4,2,Even when we didn't have a car Filene's Baseme...
5,2,"Picture Billy Joel's \""Piano Man\"" DOUBLED mix..."
6,1,Mediocre service. COLD food! Our food waited s...
7,1,Ok! Let me tell you about my bad experience fi...
8,1,I used to love D&B when it first opened in the...
9,2,"Like any Barnes & Noble, it has a nice comfy c..."


In [8]:
# Binarise the label column: a value of 2 becomes 1, every other value becomes 0.
# The vectorised comparison replaces a per-row Python lambda.
# NOTE: this overwrites 'sentiment' in place, so running the cell a second time
# maps every label to 0 (no 2s remain) — reload the data before re-running.
train_data['sentiment'] = (train_data['sentiment'] == 2).astype('int64')
test_data['sentiment'] = (test_data['sentiment'] == 2).astype('int64')

In [9]:
# Confirm the recoded sentiment values on the first training rows.
train_data.head()

Unnamed: 0,sentiment,review
0,0,"Unfortunately, the frustration of being Dr. Go..."
1,1,Been going to Dr. Goldberg for over 10 years. ...
2,0,I don't know what Dr. Goldberg was like before...
3,0,I'm writing this review to give you a heads up...
4,1,All the food is great here. But the best thing...


In [10]:
# Confirm the recoded sentiment values on the first test rows.
test_data.head()

Unnamed: 0,sentiment,review
0,1,"Contrary to other reviews, I have zero complai..."
1,0,Last summer I had an appointment to get new ti...
2,1,"Friendly staff, same starbucks fair you get an..."
3,0,The food is good. Unfortunately the service is...
4,1,Even when we didn't have a car Filene's Baseme...


In [11]:
# Translation table mapping every ASCII punctuation character to a single space.
# Built once at definition time instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def text_preprocessing(text):
    '''
    Normalise one review into a string of stemmed, lowercase tokens.

    Steps, in order:
      1. replace each punctuation character with a space (so words joined by
         punctuation are split apart rather than glued together),
      2. lowercase the tokens and drop those found in the module-level
         stop-word set ``sw``,
      3. stem the remaining tokens with the module-level ``stemmer``,
      4. collapse any run of whitespace to a single space and trim the ends.

    Parameters
    ----------
    text : any
        The raw review. Non-string values are coerced with ``str()``.
        ``None`` and float NaN are treated as a missing review.

    Returns
    -------
    str
        The processed review; the empty string for a missing review.
    '''
    # A missing cell would otherwise be stringified into the bogus token
    # "none"/"nan". NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).translate(_PUNCT_TO_SPACE)

    # remove stop words (each token is lowercased once, then filtered)
    lowered = (word.lower() for word in text.split())
    kept = [word for word in lowered if word not in sw]

    # stemming
    text = stemmer.stem_sentence(" ".join(kept))

    # collapse whitespace runs of any length, not just two or three spaces
    return re.sub(r"\s+", " ", text).strip()

In [12]:
# Clean every training review and report the wall-clock cost in minutes.
# NOTE: the 'review' column is overwritten, so re-running this cell pushes
# already-processed text through text_preprocessing again.
t_start = time.time()
train_data['review'] = train_data['review'].apply(text_preprocessing)
t_end = time.time()
elapsed_mins = (t_end - t_start) / 60
print("Time taken to process the reviews of train data: {:.2f} mins".format(elapsed_mins))

Time taken to process the reviews of train data: 4.42 mins


In [13]:
# Clean every test review and report the wall-clock cost in minutes.
# NOTE: the 'review' column is overwritten, so re-running this cell pushes
# already-processed text through text_preprocessing again.
t_start = time.time()
test_data['review'] = test_data['review'].apply(text_preprocessing)
t_end = time.time()
elapsed_mins = (t_end - t_start) / 60
print("Time taken to process the reviews of test data: {:.2f} mins".format(elapsed_mins))

Time taken to process the reviews of test data: 0.29 mins


In [14]:
def find_length(text):
    """
    Count the whitespace-separated tokens in ``text``.

    The argument is converted with ``str()`` first, so a non-string value is
    measured by its string form.
    """
    tokens = str(text).split()
    return len(tokens)

In [15]:
# Token count of each processed training review.
train_data['length'] = train_data['review'].apply(find_length)

In [16]:
# Preview the processed training reviews alongside their token counts.
train_data.head(10)

Unnamed: 0,sentiment,review,length
0,0,unfortun frustrat dr goldberg patient repeat e...,54
1,1,go dr goldberg 10 year think on 1st patient st...,39
2,0,know dr goldberg like move arizona let tell st...,101
3,0,write review give head see doctor offic staff ...,96
4,1,food great best thing wing wing simpli fantast...,46
5,0,wing sauc like water pretti much lot butter ho...,36
6,0,own drive rang insid citi limit like licens pr...,115
7,0,place absolut garbag half tee avail includ gra...,49
8,1,final made rang heard thing peopl fine go work...,89
9,1,drove yesterdai get sneak peak open juli 14th ...,41


In [17]:
# Token count of each processed test review.
test_data['length'] = test_data['review'].apply(find_length)

In [18]:
# Preview the processed test reviews alongside their token counts.
test_data.head(10)

Unnamed: 0,sentiment,review,length
0,1,contrari review zero complaint servic price ge...,55
1,0,last summer appoint get new tire wait super lo...,34
2,1,friendli staff starbuck fair get anywher els s...,11
3,0,food good unfortun servic hit miss main issu s...,26
4,1,even car filen basement worth bu trip waterfro...,82
5,1,pictur billi joel piano man doubl mix beer row...,114
6,0,mediocr servic cold food food wait long lettuc...,26
7,0,ok let tell bad experi first went b last night...,164
8,0,us love b first open waterfront gone hill year...,83
9,1,like barn nobl nice comfi cafe larg select boo...,32


In [19]:
# All processed reviews, training rows first and test rows after them.
# Each split keeps its own row labels, so index values repeat across the two.
reviews = pd.concat([train_data['review'], test_data['review']])

In [20]:
# Write the combined reviews out as k consecutive chunks, one CSV per chunk.
chunk_dir = "../input/yelp_review_processed_chunks/"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(chunk_dir, exist_ok=True)

n = len(reviews)  # total number of reviews
k = 100           # number of folds
step = n//k       # rows per fold; the last fold also takes the n % k leftover rows

# iterate through each fold
for i in range(k):

    # start and end positions of the fold (the last fold runs to the end)
    start = step*i
    end = n if i == (k-1) else step*(i+1)

    # saving the fold to the designated folder; .iloc makes the slice
    # explicitly positional rather than relying on how an integer slice is
    # interpreted against the Series' own index labels
    reviews.iloc[start:end].to_csv(chunk_dir+"yelp_review_processed-"+str(i)+".csv",
                                   index = False, header = False)

    # report progress only once the file has actually been written
    print("\rCompleted saving {} folds".format(i+1), end="")

Completed saving 100 folds

In [21]:
# Persist the processed training frame with a header row and without the index.
train_data.to_csv(
    "../input/yelp_review_polarity_csv/train_data_processed.csv",
    header=True,
    index=False,
)

In [22]:
# Persist the processed test frame with a header row and without the index.
test_data.to_csv(
    "../input/yelp_review_polarity_csv/test_data_processed.csv",
    header=True,
    index=False,
)