# Data cleaning

## Import dependencies 

In [1]:
import pandas as pd
import string
import re

## Load dataset

In [2]:
# Read the raw website-classification dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='website_classification.csv')

In [3]:
# Preview the first rows of the freshly loaded dataset.
df.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


## Lower case cleaned_website_text

In [4]:
# Lowercase the text with pandas' vectorized string accessor instead of a
# per-row Python lambda; missing values are passed through rather than raising.
df['cleaned_website_text'] = df['cleaned_website_text'].str.lower()

In [5]:
# Preview the frame after lowercasing `cleaned_website_text`.
df.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


## Remove anything that is not a letter, number, or whitespace (because the data had many non-English or mis-encoded fragments such as ¥æœ¬èªž ç®€ä½“ä¸­æ–‡ ç¹é«”ä¸­æ–‡ polski ÎµÎ»Î»Î·Î½Î¹ÎºÎ¬)

In [6]:
# Keep only ASCII letters, digits and whitespace. The pattern is a raw string
# so that "\s" reaches the regex engine unchanged; inside a plain string
# literal it is an invalid escape sequence that newer Python versions warn about.
non_alnum_pattern = re.compile(r"[^a-zA-Z0-9\s]+")
df['cleaned_website_text'] = df['cleaned_website_text'].apply(
    lambda text: non_alnum_pattern.sub("", text)
)

## Replace runs of spaces with a single space

In [7]:
# Collapse every run of consecutive space characters into a single space.
space_run_pattern = re.compile(" +")
df['cleaned_website_text'] = df['cleaned_website_text'].apply(
    lambda text: space_run_pattern.sub(" ", text)
)

In [8]:
# Preview the frame after character filtering and space collapsing.
df.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


## Tokenize the text and remove any non-English word, based on the NLTK `words` corpus

In [9]:
import nltk

# Fetch the NLTK "words" corpus (the log below shows it is skipped when the
# package is already up to date).
nltk.download('words')

# Hold the vocabulary in a set so the membership tests in later cells are fast.
words = {entry for entry in nltk.corpus.words.words()}

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\muner\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [10]:
def _english_or_non_alpha_tokens(text):
    """Tokenize ``text`` and keep tokens found in ``words`` or not purely alphabetic.

    Alphabetic tokens are looked up in lowercase; tokens containing anything
    other than letters (e.g. digits) are kept without a vocabulary check.
    """
    kept_tokens = []
    for token in nltk.wordpunct_tokenize(text):
        if not token.isalpha() or token.lower() in words:
            kept_tokens.append(token)
    return kept_tokens


df['English_only_website_text'] = df['cleaned_website_text'].apply(_english_or_non_alpha_tokens)

In [11]:
# Preview the new `English_only_website_text` column, which holds token lists.
df.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category,English_only_website_text
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel,"[official, site, good, hotel, accommodation, b..."
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel,"[hotel, book, like, use, vacation, work, hard,..."
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel,"[hotel, book, like, previously, deal, predomin..."
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel,"[cheap, search, compare, find, cheap, flight, ..."
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel,"[bot, create, free, account, create, free, acc..."


# Remove punctuation

In [12]:
# Strip ASCII punctuation from every token and drop tokens that end up empty.
# re.escape keeps characters such as ']', '\', '^' and '-' literal inside the
# character class instead of relying on how string.punctuation happens to be
# ordered. The pattern is compiled once rather than rebuilt for every token.
punctuation_pattern = re.compile(f'[{re.escape(string.punctuation)}]+')
df['English_only_website_text'] = df['English_only_website_text'].apply(
    lambda tokens: [
        stripped
        for stripped in (punctuation_pattern.sub('', token) for token in tokens)
        if stripped  # tokens made only of punctuation (e.g. "...") become '' and are discarded
    ]
)
df.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category,English_only_website_text
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel,"[official, site, good, hotel, accommodation, b..."
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel,"[hotel, book, like, use, vacation, work, hard,..."
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel,"[hotel, book, like, previously, deal, predomin..."
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel,"[cheap, search, compare, find, cheap, flight, ..."
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel,"[bot, create, free, account, create, free, acc..."


# Remove stop words

## Import dependencies

In [13]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word lists (the log below shows it is skipped when the
# package is already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Remove stop words

In [14]:
# Build the English stop-word set once. Calling stopwords.words('english')
# inside the comprehension reloads the list and scans it linearly for every
# single token, which makes this cell needlessly slow.
english_stopwords = set(stopwords.words('english'))
df['English_only_website_text'] = df['English_only_website_text'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords]
)

In [15]:
# Preview the token lists after stop-word removal.
df.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category,English_only_website_text
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel,"[official, site, good, hotel, accommodation, b..."
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel,"[hotel, book, like, use, vacation, work, hard,..."
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel,"[hotel, book, like, previously, deal, predomin..."
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel,"[cheap, search, compare, find, cheap, flight, ..."
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel,"[bot, create, free, account, create, free, acc..."


# Lemmatization

## Import dependencies

In [16]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the WordNet data and the Open Multilingual Wordnet package, in the
# same order as before.
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\muner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\muner\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Lemmatizer

In [17]:
# One lemmatizer instance is shared across all rows.
wordnetlemmatizer = WordNetLemmatizer()

# Replace each token with its lemma, using the lemmatizer's default arguments.
df['English_only_website_text'] = df['English_only_website_text'].apply(
    lambda tokens: list(map(wordnetlemmatizer.lemmatize, tokens))
)

In [18]:
# Preview the token lists after lemmatization.
df.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category,English_only_website_text
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel,"[official, site, good, hotel, accommodation, b..."
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel,"[hotel, book, like, use, vacation, work, hard,..."
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel,"[hotel, book, like, previously, deal, predomin..."
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel,"[cheap, search, compare, find, cheap, flight, ..."
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel,"[bot, create, free, account, create, free, acc..."


# Join tokenized words

In [19]:
# Turn each token list back into one space-separated string.
token_separator = ' '
df['English_only_website_text'] = df['English_only_website_text'].apply(token_separator.join)

In [20]:
# Preview the column now that the tokens are joined back into strings.
df.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category,English_only_website_text
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel,official site good hotel accommodation big sav...
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel,hotel book like use vacation work hard year lo...
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel,hotel book like previously deal predominantly ...
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel,cheap search compare find cheap flight find co...
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel,bot create free account create free account si...


# Dropping unneeded columns

In [21]:
# Discard the columns that are not needed downstream; only `Category` and
# `English_only_website_text` remain afterwards.
columns_to_drop = ['Unnamed: 0', 'website_url', 'cleaned_website_text']
df.drop(columns=columns_to_drop, inplace=True)

In [22]:
# Confirm which columns remain after the drop.
df.head()

Unnamed: 0,Category,English_only_website_text
0,Travel,official site good hotel accommodation big sav...
1,Travel,hotel book like use vacation work hard year lo...
2,Travel,hotel book like previously deal predominantly ...
3,Travel,cheap search compare find cheap flight find co...
4,Travel,bot create free account create free account si...


# Convert category to labels 

In [23]:
from sklearn.preprocessing import LabelEncoder

# Encode the category names as integer labels. The fitted encoder stays bound
# to `le` so it remains available to any later cell.
le = LabelEncoder()
encoded_categories = le.fit_transform(df['Category'])
df['Category'] = encoded_categories

In [24]:
# Preview the frame now that `Category` holds integer labels.
df.head()

Unnamed: 0,Category,English_only_website_text
0,15,official site good hotel accommodation big sav...
1,15,hotel book like use vacation work hard year lo...
2,15,hotel book like previously deal predominantly ...
3,15,cheap search compare find cheap flight find co...
4,15,bot create free account create free account si...


In [25]:
# Summarize column dtypes and non-null counts before exporting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408 entries, 0 to 1407
Data columns (total 2 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Category                   1408 non-null   int32 
 1   English_only_website_text  1408 non-null   object
dtypes: int32(1), object(1)
memory usage: 16.6+ KB


In [26]:
# Write the preprocessed frame to disk without the row index.
df.to_csv(path_or_buf="Preprocessed_website_class_englishOnly.csv", index=False)