In [1]:
import pandas as pd
import os
import shutil
from wordcloud import WordCloud
import nltk
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')

# Keep the English stop words in a set so membership checks during filtering are fast.
STOPWORDS = {word for word in stopwords.words("english")}
print('stopwords = ', STOPWORDS)

import matplotlib.pyplot as plt
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


stopwords =  {"should've", 'just', 'from', 'where', 'these', 'and', 'before', "wasn't", 'by', 'same', 'not', "couldn't", "hasn't", 'whom', 'been', 'against', 'up', 'again', 'himself', 'or', 'any', 'having', 'wouldn', "needn't", 'what', 'i', 'once', 'be', "you've", "aren't", 'me', 'through', 'my', 'needn', 'because', 'an', 'did', "don't", 'shan', 'of', 't', 'haven', 'then', "wouldn't", 'own', "doesn't", 'weren', 'hers', 'down', 'to', "you'd", 'is', "that'll", 'nor', 'those', 'if', 'ourselves', 'at', 'y', 'can', 'doesn', 'her', 'until', 'so', 'aren', 'have', 'a', 'no', "shan't", 'ain', 'our', "she's", 'into', 'm', 'hasn', 'didn', 'for', "hadn't", 'more', 'isn', 'too', 'here', 'now', 'yourselves', 'itself', 'out', "mustn't", 'yourself', 'do', 'other', 'during', 'both', 'has', 'themselves', 'while', "isn't", "won't", 'how', 'such', 'mustn', 'all', 'but', 's', 'him', 'don', 'was', 'under', 'shouldn', 'herself', 'won', 'your', 'mightn', 'ours', 'had', 'theirs', 'than', "mightn't", 'with', 's

In [2]:
# Locate the training CSV relative to the directory the notebook runs from,
# and stop immediately if it is missing.
root = os.getcwd()
data_path = os.path.join(
    root,
    'dataset',
    'amazon_review_full_csv',
    'train.csv',
)
assert os.path.exists(data_path), f'data path does not exist {data_path}'

In [3]:
# Column names are supplied explicitly, so the first line of the file is read as data.
# Note: 'review' holds the numeric score of each row (3, 5, 5, 4, 5 in the preview).
df = pd.read_csv(
    data_path,
    header=None,
    names=['review', 'title', 'body'],
)
print(df.shape)
df.head()

(3000000, 3)


Unnamed: 0,review,title,body
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...


### Cleaning the dataset

In [4]:
# Merge title and body into one 'text' column, separated by a single space.
# A missing title or body makes the merged value missing as well.
df['text'] = df['title'].add(" ").add(df['body'])

In [5]:
# The separate title/body columns are redundant once 'text' exists.
df.drop(columns=['title', 'body'], inplace=True)

In [6]:
# Preview the frame after merging the text columns.
df.head(n=5)

Unnamed: 0,review,text
0,3,more like funchuck Gave this to my dad for a g...
1,5,Inspiring I hope a lot of people hear this cd....
2,5,The best soundtrack ever to anything. I'm read...
3,4,Chrono Cross OST The music of Yasunori Misuda ...
4,5,Too good to be true Probably the greatest soun...


In [7]:
# Per-column missing counts before cleaning. Only a cell's last expression is
# rendered, so this value is computed but not shown.
df.isna().sum()
# Discard every row that has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)
print("Dropped NA")
# Per-column missing counts after cleaning (this is the value the cell displays).
df.isna().sum()

Dropped NA


review    0
text      0
dtype: int64

In [8]:
# Two kinds of link are recognised, case-insensitively:
#   1. anything starting with http://, https:// or www. up to the next whitespace;
#   2. a bare host name ending in a common top-level domain, plus an optional /path.
_URL_PATTERN = re.compile(
    r"(?:https?://|\bwww\.)\S+"
    r"|\b[\w-]+(?:\.[\w-]+)*\.(?:com|org|net|edu|gov)\b(?:/\S*)?",
    re.IGNORECASE,
)


def remove_urls(text):
    """Return ``text`` with web links removed.

    The previous pattern deleted every token ending in a dot plus three
    lowercase letters, so two sentences joined without a space (for example
    "fade.the") lost both words whenever the text also mentioned a link. It
    also stopped at the domain, leaving the URL path behind, and only matched
    lowercase text. This version removes the whole link, including its path,
    and leaves ordinary words untouched.

    Parameters
    ----------
    text : str
        Raw or lower-cased review text.

    Returns
    -------
    str
        The text with each link replaced by an empty string. Surrounding
        whitespace is left as is.
    """
    return _URL_PATTERN.sub("", text)

def remove_punct(text):
    """Delete every character that is neither a word character nor whitespace.

    Removed characters are not replaced, so words joined only by punctuation
    are merged together and whitespace runs are left unchanged.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

# Quick sanity check of both cleaning helpers, one result per line.
print(
    remove_urls('www.kaggle.com here'),
    remove_punct('I need, to remove? punctation ? 1 ! 2'),
    sep='\n',
)

 here
I need to remove punctation  1  2


In [24]:
# Copy the raw review strings and show the first five, followed by a blank line.
alltext = df['text'].values.copy()
print(alltext[:5])
print()

# Scratch state; none of these names is used again in the cells visible here.
i = 0
error_count = 0
sentence_list = list()
possible_words = set()

def process_text(text):
    """Turn one review into a space-separated string of content words.

    Steps: lower-case the text, remove links, drop stop words, strip
    punctuation, then run the result through ``nltk.word_tokenize``.

    Stop words are now matched before punctuation is stripped from a word.
    ``STOPWORDS`` contains apostrophe forms such as "don't" and "wasn't".
    Previously all punctuation was removed first, so those words became
    "dont" and "wasnt", never matched the list, and stayed in the output.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. This is an empty string
        when nothing survives the filtering.
    """
    # Lower-case first so link removal and stop-word lookup see one casing.
    text = remove_urls(text.lower())

    kept_words = []
    for raw_word in text.split():
        # Trim punctuation from the edges only, keeping inner apostrophes, so
        # that "don't," or '"wasn\'t"' can still be found in STOPWORDS.
        bare_word = re.sub(r"^[^\w\s]+|[^\w\s]+$", "", raw_word)
        if bare_word in STOPWORDS:
            continue
        word = remove_punct(raw_word)
        # Skip tokens that were pure punctuation, and stop words that only
        # match once inner punctuation is gone.
        if word and word not in STOPWORDS:
            kept_words.append(word)

    # NOTE(review): word_tokenize needs NLTK tokenizer data that the visible
    # cells never download explicitly - confirm it is installed on fresh machines.
    tokens = ' '.join(nltk.word_tokenize(' '.join(kept_words)))
    return tokens

# Register tqdm's progress_* methods on pandas objects; this must run before progress_apply.
tqdm.pandas()
# Clean every review with process_text and store the result in a new 'tokens' column.
df['tokens'] = df['text'].progress_apply(process_text)

['more like funchuck Gave this to my dad for a gag gift after directing "Nunsense," he got a reall kick out of it!'
 'Inspiring I hope a lot of people hear this cd. We need more strong and positive vibes like this. Great vocals, fresh tunes, cross-cultural happiness. Her blues is from the gut. The pop sounds are catchy and mature.'
 "The best soundtrack ever to anything. I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny."
 'Chrono Cross OST The music of Yasunori Misuda is without question my close second below the great Nobuo Uematsu.Chrono Cross OST is a wonderful creation

100%|██████████| 2999812/2999812 [09:29<00:00, 5269.56it/s]


In [25]:
# Preview the frame with the new 'tokens' column.
df.head(n=5)

Unnamed: 0,review,text,tokens
0,3,more like funchuck Gave this to my dad for a g...,"like,funchuck,gave,dad,gag,gift,directing,nuns..."
1,5,Inspiring I hope a lot of people hear this cd....,"inspiring,hope,lot,people,hear,cd,need,strong,..."
2,5,The best soundtrack ever to anything. I'm read...,"best,soundtrack,ever,anything,im,reading,lot,r..."
3,4,Chrono Cross OST The music of Yasunori Misuda ...,"chrono,cross,ost,music,yasunori,misuda,without..."
4,5,Too good to be true Probably the greatest soun...,"good,true,probably,greatest,soundtrack,history..."


In [26]:
# Save the processed frame inside the dataset folder.
path_to_dataset = os.path.join(root, 'dataset', 'processed_amazon_reviews.csv')
# The CSV is the output of this cell, so it does not have to exist beforehand.
# The earlier check asserted that it did, which made a first run fail before
# anything was written. Only the target directory is required, so create it if needed.
os.makedirs(os.path.dirname(path_to_dataset), exist_ok=True)
df.to_csv(path_to_dataset, index = False)