# Expedia Hotel Classification with PySpark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import numpy as np
from pyspark.sql.functions import count
import seaborn as sns
from pyspark.sql import functions as F
from sklearn import metrics

In [2]:
spark = SparkSession.builder.getOrCreate()

## Import data

In [3]:
train = spark.read.csv('../data/input/train.csv', header=True)
test = spark.read.csv('../data/input/test.csv', header=True)

In [4]:
train.printSchema()

root
 |-- date_time: string (nullable = true)
 |-- site_name: string (nullable = true)
 |-- posa_continent: string (nullable = true)
 |-- user_location_country: string (nullable = true)
 |-- user_location_region: string (nullable = true)
 |-- user_location_city: string (nullable = true)
 |-- orig_destination_distance: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- is_mobile: string (nullable = true)
 |-- is_package: string (nullable = true)
 |-- channel: string (nullable = true)
 |-- srch_ci: string (nullable = true)
 |-- srch_co: string (nullable = true)
 |-- srch_adults_cnt: string (nullable = true)
 |-- srch_children_cnt: string (nullable = true)
 |-- srch_rm_cnt: string (nullable = true)
 |-- srch_destination_id: string (nullable = true)
 |-- srch_destination_type_id: string (nullable = true)
 |-- is_booking: string (nullable = true)
 |-- cnt: string (nullable = true)
 |-- hotel_continent: string (nullable = true)
 |-- hotel_country: string (nullable = true)
 

In [7]:
train.count()

37670293

In [21]:
train.head(10)

[Row(date_time='2014-08-11 07:46:59', site_name='2', posa_continent='3', user_location_country='66', user_location_region='348', user_location_city='48862', orig_destination_distance='2234.2641', user_id='12', is_mobile='0', is_package='1', channel='9', srch_ci='2014-08-27', srch_co='2014-08-31', srch_adults_cnt='2', srch_children_cnt='0', srch_rm_cnt='1', srch_destination_id='8250', srch_destination_type_id='1', is_booking='0', cnt='3', hotel_continent='2', hotel_country='50', hotel_market='628', hotel_cluster='1'),
 Row(date_time='2014-08-11 08:22:12', site_name='2', posa_continent='3', user_location_country='66', user_location_region='348', user_location_city='48862', orig_destination_distance='2234.2641', user_id='12', is_mobile='0', is_package='1', channel='9', srch_ci='2014-08-29', srch_co='2014-09-02', srch_adults_cnt='2', srch_children_cnt='0', srch_rm_cnt='1', srch_destination_id='8250', srch_destination_type_id='1', is_booking='1', cnt='1', hotel_continent='2', hotel_country=

In [22]:
train.show(10)

+-------------------+---------+--------------+---------------------+--------------------+------------------+-------------------------+-------+---------+----------+-------+----------+----------+---------------+-----------------+-----------+-------------------+------------------------+----------+---+---------------+-------------+------------+-------------+
|          date_time|site_name|posa_continent|user_location_country|user_location_region|user_location_city|orig_destination_distance|user_id|is_mobile|is_package|channel|   srch_ci|   srch_co|srch_adults_cnt|srch_children_cnt|srch_rm_cnt|srch_destination_id|srch_destination_type_id|is_booking|cnt|hotel_continent|hotel_country|hotel_market|hotel_cluster|
+-------------------+---------+--------------+---------------------+--------------------+------------------+-------------------------+-------+---------+----------+-------+----------+----------+---------------+-----------------+-----------+-------------------+------------------------+--

In [17]:
# Save the first ten rows of the train data set into a pandas data fram trainp
trainp = pd.DataFrame(train.head(10), columns=train.columns)

In [18]:
trainp.head(10)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21
5,2014-08-09 18:13:12,2,3,66,442,35390,911.5142,93,0,0,...,0,1,14984,1,0,1,2,50,1457,92
6,2014-07-16 09:42:23,2,3,66,189,10067,,501,0,0,...,0,1,8267,1,0,2,2,50,675,41
7,2014-07-16 09:45:48,2,3,66,189,10067,,501,0,1,...,0,1,8267,1,0,1,2,50,675,41
8,2014-07-16 09:52:11,2,3,66,189,10067,,501,0,0,...,0,1,8267,1,0,1,2,50,675,69
9,2014-07-16 09:55:24,2,3,66,189,10067,,501,0,0,...,0,1,8267,1,0,1,2,50,675,70


**Describe**

In [23]:
train.describe()

DataFrame[summary: string, date_time: string, site_name: string, posa_continent: string, user_location_country: string, user_location_region: string, user_location_city: string, orig_destination_distance: string, user_id: string, is_mobile: string, is_package: string, channel: string, srch_ci: string, srch_co: string, srch_adults_cnt: string, srch_children_cnt: string, srch_rm_cnt: string, srch_destination_id: string, srch_destination_type_id: string, is_booking: string, cnt: string, hotel_continent: string, hotel_country: string, hotel_market: string, hotel_cluster: string]

## Preprocessing