## Basics of Cleaning Text

In [6]:
import numpy as np
import pandas as pd

In [7]:
# Load the hotel-review CSV (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv("tripadvisor_hotel_reviews.csv")
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [8]:
# (rows, columns) of the loaded frame.
df.shape

(20491, 2)

In [9]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [10]:
# Count missing values per column (the recorded output shows 0 for both columns).
df.isnull().sum()

Review    0
Rating    0
dtype: int64

In [11]:
# Collapse the 1-5 rating into three classes:
#   1-2 -> 0, 3 -> 1, 4-5 -> 2.
RATING_TO_CLASS = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}

rating_classes = df["Rating"].map(RATING_TO_CLASS)

# Series.map turns every value that is missing from the dict into NaN. That
# happens for an unexpected rating, and also for the label 0 if this cell is
# re-run on data that was already remapped (1 and 2 would then silently become
# 0). Check *before* overwriting the column so the frame is never corrupted.
unmapped = rating_classes.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected values in 'Rating' (has this cell already been run?): "
        f"{df.loc[unmapped, 'Rating'].unique().tolist()}"
    )

df["Rating"] = rating_classes
df.sample(3)

Unnamed: 0,Review,Rating
10053,boutique style hotel near arc triomphe just ch...,2
1806,good value just returned stay san francisco st...,2
20296,"excellent value prime location, got fantastic ...",2


In [12]:
import re

# URL matcher (same expression as before), compiled once for the whole column.
URL_PATTERN = re.compile(
    'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# Any run of characters other than lower-case a-z (digits, punctuation, whitespace, ...).
NON_LETTERS = re.compile("[^a-z]+")


def clean_review(text):
    """Normalise one review to lower-case letters separated by single spaces.

    Steps:
      1. lower-case the text;
      2. replace every URL with a space;
      3. replace every run of non ``a-z`` characters with a single space.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces are not stripped.
    """
    text = text.lower()
    # Substitute with the URL pattern itself. Re-using the matched URL text as
    # a regex is unsafe: characters such as '.', '?', '+' or '(' are regex
    # metacharacters, and several URLs joined together match nothing.
    text = URL_PATTERN.sub(" ", text)
    # Each run of non-letters (which includes all whitespace) becomes exactly
    # one space, so no separate whitespace-collapsing pass is needed.
    return NON_LETTERS.sub(" ", text)


# Apply to the whole column by name rather than looping over rows and
# writing through positional indices.
df["Review"] = df["Review"].map(clean_review)

In [13]:
# Preview the frame after the cleaning step has rewritten the Review column.
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,2
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not experience hotel monaco seattle...,1
3,unique great stay wonderful time hotel monaco ...,2
4,great stay great stay went seahawk game awesom...,2


## Text Processing Using nltk

In [16]:
import nltk
# Fetch the NLTK resources used below: the stop-word corpus and the Punkt
# tokenizer data.
# NOTE(review): newer NLTK releases appear to look up 'punkt_tab' rather than
# 'punkt' for word_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ngs11\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ngs11\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [17]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# English stop words and the stemmer; both are read by text_processing below.
sw = stopwords.words('english')
ps = PorterStemmer()

def text_processing(clean_text: str) -> str:
    """Tokenise a cleaned review, drop stop words and stem what is left.

    Parameters
    ----------
    clean_text : str
        Review text that has already been cleaned.

    Returns
    -------
    str
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    # Build the lookup from the current contents of ``sw`` on every call so
    # later edits to ``sw`` are still honoured. A set makes each membership
    # test a hash lookup instead of a scan over the whole stop-word list.
    stop_words = frozenset(sw)
    tokens = word_tokenize(clean_text)
    stems = [ps.stem(token) for token in tokens if token not in stop_words]
    return " ".join(stems)

In [18]:
# Run every review through text_processing and store the result back in the same column.
df['Review'] = df['Review'].apply(text_processing)

In [19]:
# Display the processed frame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,Review,Rating
0,nice hotel expens park got good deal stay hote...,2
1,ok noth special charg diamond member hilton de...,0
2,nice room experi hotel monaco seattl good hote...,1
3,uniqu great stay wonder time hotel monaco loca...,2
4,great stay great stay went seahawk game awesom...,2
...,...,...
20486,best kept secret rd time stay charm star ca n ...,2
20487,great locat price view hotel great quick place...,2
20488,ok look nice modern outsid desk staff n partic...,0
20489,hotel theft ruin vacat hotel open sept guest w...,0
