# CLEAN AND PREPROCESS TEXT DATA

## 1. Import statement

**Import necessary packages and modules**

In [1]:
import os
import re
import string
import time

import nltk
import pandas as pd
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords

**Extract stopwords and create an object of PorterStemmer**

In [2]:
# English stopword list as a set, so the membership checks done per word in
# text_preprocessing are fast.
try:
    sw = set(stopwords.words("english"))
except LookupError:
    # The NLTK stopwords corpus is not installed on this machine:
    # download it once, then retry.
    nltk.download("stopwords")
    sw = set(stopwords.words("english"))

# Single shared stemmer instance (gensim's PorterStemmer) used for stemming.
stemmer = PorterStemmer()

## 2. Preprocess the text data

**Input directory**

In [3]:
# Directory (relative to the notebook) that train.csv and test.csv are read from;
# the processed train/test CSVs are written back here as well.
input_dir = "../input/amazon_review_polarity_csv/"

## 2.1 Load and display train and test data

**Load train and test data**

In [4]:
# The raw CSV files carry no header row, so the column names are supplied here.
column_names = ['sentiment', 'review_title', 'review_text']
train_path = input_dir + "train.csv"
test_path = input_dir + "test.csv"

train_data = pd.read_csv(train_path, names=column_names)
test_data = pd.read_csv(test_path, names=column_names)

In [5]:
# Report (rows, columns) for both splits.
train_shape, test_shape = train_data.shape, test_data.shape
print("Shape of train data:", train_shape)
print("Shape of test data:", test_shape)

Shape of train data: (3600000, 3)
Shape of test data: (400000, 3)


**Print heads of train and test data**

In [6]:
# Preview the first 10 rows of the raw training data.
train_data.head(10)

Unnamed: 0,sentiment,review_title,review_text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
5,2,an absolute masterpiece,I am quite sure any of you actually taking the...
6,1,Buyer beware,"This is a self-published book, and if you want..."
7,2,Glorious story,I loved Whisper of the wicked saints. The stor...
8,2,A FIVE STAR BOOK,I just finished reading Whisper of the Wicked ...
9,2,Whispers of the Wicked Saints,This was a easy to read book that made me want...


In [7]:
# Preview the first 10 rows of the raw test data.
test_data.head(10)

Unnamed: 0,sentiment,review_title,review_text
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...
5,1,DVD Player crapped out after one year,I also began having the incorrect disc problem...
6,1,Incorrect Disc,"I love the style of this, but after a couple y..."
7,1,DVD menu select problems,I cannot scroll through a DVD menu that is set...
8,2,Unique Weird Orientalia from the 1930's,"Exotic tales of the Orient from the 1930's. ""D..."
9,1,"Not an ""ultimate guide""","Firstly,I enjoyed the format and tone of the b..."


## 2.2 Preprocess the sentiment column

**Change the numerical labels of the sentiment column**

In [8]:
# Binarise the labels: the raw value 2 becomes 1, every other value becomes 0.
# NOTE: the column is overwritten in place, so running this cell a second time
# would turn every label into 0 (no value equals 2 any more).
train_is_two = train_data['sentiment'] == 2
test_is_two = test_data['sentiment'] == 2

train_data['sentiment'] = train_is_two.astype('int64')
test_data['sentiment'] = test_is_two.astype('int64')

In [9]:
# Check the relabelling on the training data: sentiment should now hold 0/1.
train_data.head(10)

Unnamed: 0,sentiment,review_title,review_text
0,1,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,1,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,1,Amazing!,This soundtrack is my favorite music of all ti...
3,1,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,1,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
5,1,an absolute masterpiece,I am quite sure any of you actually taking the...
6,0,Buyer beware,"This is a self-published book, and if you want..."
7,1,Glorious story,I loved Whisper of the wicked saints. The stor...
8,1,A FIVE STAR BOOK,I just finished reading Whisper of the Wicked ...
9,1,Whispers of the Wicked Saints,This was a easy to read book that made me want...


In [10]:
# Check the relabelling on the test data: sentiment should now hold 0/1.
test_data.head(10)

Unnamed: 0,sentiment,review_title,review_text
0,1,Great CD,My lovely Pat has one of the GREAT voices of h...
1,1,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,0,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,1,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,1,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...
5,0,DVD Player crapped out after one year,I also began having the incorrect disc problem...
6,0,Incorrect Disc,"I love the style of this, but after a couple y..."
7,0,DVD menu select problems,I cannot scroll through a DVD menu that is set...
8,1,Unique Weird Orientalia from the 1930's,"Exotic tales of the Orient from the 1930's. ""D..."
9,0,"Not an ""ultimate guide""","Firstly,I enjoyed the format and tone of the b..."


## 2.3 Preprocess review column

**Function to preprocess text**

In [11]:
# Translation table that maps every ASCII punctuation character to a space.
# Built once here instead of on every call (the function runs once per row).
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def text_preprocessing(text):
    '''
    Clean a single raw review string.

    Steps, in order:
      1. A missing value (None or float NaN) is returned as an empty string,
         rather than being stringified into the literal token "nan"/"none".
      2. Every punctuation character is replaced by a space (not deleted), so
         words joined by punctuation are split apart.
      3. Words are lower-cased and those found in the module-level stopword
         set `sw` are dropped.
      4. The remaining words are stemmed with the module-level `stemmer`.
      5. Any run of two or more spaces is collapsed to a single space.

    Parameters
    ----------
    text : object
        The raw value; anything that is not missing is converted with str().

    Returns
    -------
    str
        The cleaned, stemmed text ("" for missing input).
    '''
    # Missing cells arrive from pandas as float NaN; NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Punctuation -> spaces.
    text = str(text).translate(_PUNCT_TO_SPACE)

    # Lower-case once per word, then filter out stopwords.
    lowered = (word.lower() for word in text.split())
    text = " ".join(word for word in lowered if word not in sw)

    # Stemming.
    text = stemmer.stem_sentence(text)

    # Collapse runs of spaces of any length (the previous fixed-width
    # replacements left runs of five or more spaces only partly collapsed).
    return re.sub(r" {2,}", " ", text)

**Preprocess review title and review text columns of train and test data**

In [12]:
# Clean the training titles in place and report the wall-clock cost in minutes.
started_at = time.time()
train_data['review_title'] = train_data['review_title'].apply(text_preprocessing)
elapsed_mins = (time.time() - started_at) / 60
print("Time taken to process the review title of train data: {:.2f} mins".format(elapsed_mins))

Time taken to process the review title of train data: 1.60 mins


In [13]:
# Clean the training review bodies in place and report the wall-clock cost in minutes.
started_at = time.time()
train_data['review_text'] = train_data['review_text'].apply(text_preprocessing)
elapsed_mins = (time.time() - started_at) / 60
print("Time taken to process the review text of train data: {:.2f} mins".format(elapsed_mins))

Time taken to process the review text of train data: 16.26 mins


In [14]:
# Clean the test titles in place and report the wall-clock cost in minutes.
t_start = time.time()
test_data['review_title'] = test_data['review_title'].apply(text_preprocessing)
t_end = time.time()
# The message now names the test split (it previously said "train data").
print("Time taken to process the review title of test data: {:.2f} mins".format((t_end-t_start)/60))

Time taken to process the review title of train data: 0.17 mins


In [15]:
# Clean the test review bodies in place and report the wall-clock cost in minutes.
t_start = time.time()
test_data['review_text'] = test_data['review_text'].apply(text_preprocessing)
t_end = time.time()
# The message now names the test split (it previously said "train data").
print("Time taken to process the review text of test data: {:.2f} mins".format((t_end-t_start)/60))

Time taken to process the review text of train data: 1.78 mins


## 3. Save the processed data

**Concatenate the review title & review text columns of train and test data**

In [17]:
# Stack every processed title and review body (train + test) into one Series.
# The result is bound to `reviews` because that is the name the chunk-saving
# cells read; binding it to `text` left `reviews` undefined on a fresh kernel.
# ignore_index=True replaces the four repeated source indexes with one clean
# 0..n-1 index; the saved chunks are unaffected since they are written
# without the index.
reviews = pd.concat(
    (train_data["review_title"], test_data["review_title"],
     train_data["review_text"], test_data["review_text"]),
    ignore_index=True,
)

**Save the review column in csv chunks**

In [19]:
# Directory the processed-review CSV chunks are written to.
save_dir = "../input/amazon_review_polarity_csv_chunks/"
# to_csv does not create missing directories, so make sure the target exists
# before any chunk is written (no-op when it is already there).
os.makedirs(save_dir, exist_ok=True)

In [20]:
# number of folds and steps
n = len(reviews)
k = 100
step = n//k

In [25]:
# iterate through each fold
for i in range(k):

    # start and end indices of the fold
    if i != (k-1):
        start = (step*i)
        end = (step*(i+1)) 
    else:
        start = (step*i)
        end = len(reviews)
        
    print("\rCompleted saving {} folds".format(i+1), end="")
        
    # saving the fold to the designated folder
    reviews[start:end].to_csv(save_dir+"amazon_review_processed-"+str(i)+".csv",
                              index = False, header = False)

Completed saving 100 folds

**Save the processed train and test data to the designated directory**

In [22]:
# Write the processed training frame into input_dir, with a header row and without the index column.
train_data.to_csv(input_dir+"train_data_processed.csv", index = False, header = True)

In [23]:
# Write the processed test frame into input_dir, with a header row and without the index column.
test_data.to_csv(input_dir+"test_data_processed.csv", index = False, header = True)