# Stage 1: Data Cleaning 
- Cleaning is needed to make the text easy for the machine to process.
- Cleaning involves fixing typos as well as expanding short forms (contractions) into their full words.
- Tokenization and lemmatization then help normalise the words and remove stopwords that are not useful for analysis.


### Importing Libraries

In [1]:
# Importing necessary libraries
import pickle
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the raw Amazon microwave reviews from the CSV file into a DataFrame
reviews_csv = 'AmazonMicrowave_Reviews.csv'
df = pd.read_csv(reviews_csv)
df

Unnamed: 0,id,profileName,text,date,title,rating,images,helpful
0,R1NC92SJUKH35C,Lynsie,\n My boyfriend bought this for our house and...,"Reviewed in the United States on November 20, ...","Great Echo integration, terrible microwave.",1,,3899
1,R2V7XGOVRJTM4S,RainNW,\n Although the Alexa integration seemed a li...,"Reviewed in the United States on November 16, ...",Very functional microwave at a great price,5,https://images-na.ssl-images-amazon.com/images...,1371
2,R37OK1E0L4DIZ4,Robert McMurrer,\n This is a good small microwave for someon...,"Reviewed in the United States on November 20, ...",Good Small Microwave,5,,916
3,R3GJSRATC2TI1B,Massachusetts,\n A microwave is the most used kitchen appli...,"Reviewed in the United States on November 18, ...",Is a voice control microwave worth it?,4,https://images-na.ssl-images-amazon.com/images...,838
4,RGOPGD4K3RGKU,J.N.,\n This microwave is not powerful and heats f...,"Reviewed in the United States on November 18, ...",Uneven heat,1,,753
...,...,...,...,...,...,...,...,...
4995,ROAM9F0NBIO7L,Travis Dufour,\n Alexa stopped working on the microwave a w...,"Reviewed in the United States on March 9, 2020",My Beloved Microwave,1,,0
4996,R3ISY6I8BODZ7E,H,\n I received this as a Christmas gift and ab...,"Reviewed in the United States on February 8, 2020",It’s great,5,,0
4997,R1D9G3NENJ1DQP,Tiffany Wilsbach,\n Awful this is the worst microwave\n,"Reviewed in the United States on April 6, 2021",Had to replace this microwave right away,1,,0
4998,R39SHPS3C2EX41,Angela cousins,\n The plastic is coming up on the key pad. I...,"Reviewed in the United States on January 27, 2020",Not practical for me,3,,0


### Basic Cleaning of Data

In [3]:
# Basic cleaning of dataset

# Full-word expansions for the apostrophe suffixes handled by clean_text.
_CONTRACTION_SUFFIXES = {
    'd': ' would',      # could also be "had"/"should"; "would" is the chosen reading
    's': ' is',         # also hits possessives (John's -> John is)
    're': ' are',
    'll': ' will',
    've': ' have',
    'm': ' am',
}


def clean_text(text):
    """Return a normalised, alphanumeric-only version of a review text.

    Steps, in order:
      1. Line breaks and non-breaking spaces become a single space.
      2. Text inside [ ] or < > and http/www references is dropped.
      3. Currency and percent signs are spelled out.
      4. All apostrophe variants (', ’, &#39;) are unified, then contractions
         are expanded (can't -> can not, it's -> it is, ...).
      5. Apostrophe-less typos (cant, dont, weve) are expanded, whole words only.
      6. Every remaining non-alphanumeric character becomes a space, runs of
         spaces are collapsed and the result is stripped.

    Args:
        text: The raw review text. ``None`` or NaN (how pandas represents a
            missing value) is treated as an empty review.

    Returns:
        The cleaned text as a ``str``; ``''`` for a missing value.
    """
    # Missing values arrive from pandas as NaN (a float); NaN != NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # These characters separate words, so replace them with a space, not ''
    text = text.replace('\n', ' ').replace('\xa0', ' ')

    text = re.sub(r'\[.*?\]', ' ', text)            # remove text within [ ]
    text = re.sub(r'<.*?>', ' ', text)              # remove text within < >
    text = re.sub(r'http\S+', ' ', text)            # remove website ref http
    text = re.sub(r'www\S+', ' ', text)             # remove website ref www

    text = text.replace('€', 'euros')               # replace special character with words
    text = text.replace('£', 'gbp')                 # replace special character with words
    text = text.replace('$', 'dollar')              # replace special character with words
    text = text.replace('%', 'percent')             # replace special character with words

    # Standardise the apostrophe. The curly ’ must survive until the
    # contractions are expanded, otherwise "it’s" silently becomes "its".
    text = text.replace("'", '’').replace('&#39;', '’')

    # Irregular negations first, keeping the case of the leading letters
    text = re.sub(r'\b(can)’t\b', r'\1 not', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(w)on’t\b', r'\1ill not', text, flags=re.IGNORECASE)
    # Regular negations (don't, doesn't, isn't, ...)
    text = re.sub(r'(?<=[A-Za-z])n’t\b', ' not', text, flags=re.IGNORECASE)
    # Suffix contractions; the apostrophe has to follow a letter and the suffix
    # has to end the word, so a quoted word such as 'super' is left alone.
    text = re.sub(
        r'(?<=[A-Za-z])’(d|s|re|ll|ve|m)\b',
        lambda match: _CONTRACTION_SUFFIXES[match.group(1).lower()],
        text,
        flags=re.IGNORECASE,
    )

    text = text.replace('’', ' ')                   # remove apostrophe (in general)
    text = text.replace('&quot;', ' ')              # remove quotation sign (in general)

    # Typos without apostrophe. Whole words only, so that e.g. "significant"
    # is not corrupted by the "cant" rule (note that cant is a proper word).
    # NOTE: "im" is deliberately not expanded.
    text = re.sub(r'\b(can)t\b', r'\1 not', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(do)nt\b', r'\1 not', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(we)ve\b', r'\1 have', text, flags=re.IGNORECASE)

    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)       # only alphanumeric left
    text = re.sub(r' {2,}', ' ', text)              # collapse any run of spaces
    return text.strip()


# Alias kept for callers that pass ``cleaned_text`` to ``Series.apply``.
cleaned_text = clean_text

In [4]:
# Apply the cleaning function to the dataset

# Produce a cleaned dataset named clean_df: run every review text through
# cleaned_text and keep the result as a one-column DataFrame
clean_df = df['text'].apply(cleaned_text).to_frame()

# Carry the essential columns, rating and title, over from the raw data
clean_df = clean_df.assign(rating=df['rating'], title=df['title'])

clean_df

Unnamed: 0,text,rating,title
0,My boyfriend bought this for our house and ou...,1,"Great Echo integration, terrible microwave."
1,Although the Alexa integration seemed a littl...,5,Very functional microwave at a great price
2,This is a good small microwave for someone in...,5,Good Small Microwave
3,A microwave is the most used kitchen applianc...,4,Is a voice control microwave worth it?
4,This microwave is not powerful and heats food...,1,Uneven heat
...,...,...,...
4995,Alexa stopped working on the microwave a week...,1,My Beloved Microwave
4996,I received this as a Christmas gift and absol...,5,It’s great
4997,Awful this is the worst microwave,1,Had to replace this microwave right away
4998,The plastic is coming up on the key pad I jus...,3,Not practical for me


In [5]:
# Persist both frames as pickles for the later stages:
# the raw corpus first, then the cleaned data
for frame, pickle_path in ((df, "corpus.pkl"), (clean_df, 'data_clean.pkl')):
    frame.to_pickle(pickle_path)