## Libraries

In [1]:
# numpy 
import numpy as np
# dataframe
import pandas as pd
# regular expression
import re
# parsing web contents
from bs4 import BeautifulSoup
# stopwords
from nltk.corpus import stopwords
# lemmatize, stemp
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer

## Data

In [2]:
# load data
df = pd.read_csv('../data/imdb_reviews.csv').head(100)

# first few rows
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Cleaning

### Lower case

In [3]:
# to lower case
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Parse HTML

In [4]:
# removing the html elements
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

df['review'] = df['review'].apply(strip_html)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### URLs

In [5]:
# to remove URLs
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

### Email ids

In [6]:
# to remove email address
def remove_email(text):
    url = re.compile(r'\S+@\S+')
    return url.sub(r'', text)

### Hashtags

In [7]:
# to remove hashtags
def remove_hashtags(text):
    url = re.compile(r'#\S+')
    return url.sub(r'', text)

In [8]:
remove_hashtags('#covid-19 #corona are trending')

'  are trending'

### Mentions

In [9]:
# to remove mentions
def remove_mentions(text):
    url = re.compile(r'@\S+')
    return url.sub(r'', text)

In [10]:
remove_mentions('this message is to @abc23 and @tysf')

'this message is to  and '

### Emoji

In [11]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [12]:
remove_emoji("Omg another Earthquake 😔😔")

'Omg another Earthquake '

### Remove non alpha-numeric (problem is, what if the message is not in english?!)

In [13]:
# remove non alpha numeric characters
def remove_non_alpha_num(text):
    return re.sub('[^a-zA-Z0-9]', ' ', text)

# df['review'] = df['review'].apply(remove_non_alpha_num)
# df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is...,positive


### Removing punctuation

In [15]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def remove_punct(text):
    table=str.maketrans('', '', string.punctuation)
    return text.translate(table)

df['review'] = df['review'].apply(remove_punct)
df.head()

### Remove words containing numbers

In [1]:
# Remove all words that contain numbers
# data['reviews'] = data['reviews'].apply(lambda x: re.sub('\w*\d+\w*', ' ', x))

### Remove numbers

In [None]:
# remove non alpha numeric characters
def remove_nom(text):
    return re.sub('\w*\d\w*', ' ', text)

df['review'] = df['review'].apply(remove_non_alpha_num)
df.head()

## Removing stopwords

In [14]:
# common stop words
common_sw = stopwords.words('english')

# case specific stopwords
specific_sw = []

# all stopwords
sw = common_sw + specific_sw

In [15]:
def remove_stopwords(text):
    text = [word for word in text.split() if word not in sw]
    return " ".join(text)

df['review'] = df['review'].apply(remove_stopwords)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake thinks zombie...,negative
4,petter mattei love time money visually stunnin...,positive


## Stem

In [16]:
# stemmer = SnowballStemmer("english")

# def stemming(text):    
#     text = [stemmer.stem(word) for word in text.split()]
#     return " ".join(text) 

In [17]:
# stemmer = PorterStemmer()

# def stemming(text):    
#     text = [stemmer.stem(word) for word in text.split()]
#     return " ".join(text) 

## Lemmatize

In [18]:
lemma = WordNetLemmatizer()

def lemmatize(text):
    text = [lemma.lemmatize(word) for word in text.split()]
    return " ".join(text)

df['review'] = df['review'].apply(lemmatize)
df.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode h...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake think zombie ...,negative
4,petter mattei love time money visually stunnin...,positive
