# Libraries for this notebook

In [None]:
import pandas as pd
import seaborn as sns

# Show full cell contents when frames are displayed.  None means "no limit";
# the legacy sentinel -1 was deprecated in pandas 1.0 and is rejected by
# current releases.
pd.set_option('display.max_colwidth', None)

## goibibo dataframe preparation

In [None]:
# Load the raw goibibo sample export.
data_goibibo = pd.read_csv(
    r"C:\Users\kavya\Documents\4Th Sem\documents_masterProj\projectDataSet\goibibo_com-travel_sample.csv"
)

# Fold the three facility-like text fields into the single hotel_facilities column.
# NOTE(review): `+` propagates NaN, so a row missing any one of the three
# fields ends up with NaN here, and no separator is inserted between the
# parts -- confirm both are intended.
data_goibibo['hotel_facilities'] = (
    data_goibibo['additional_info']
    + data_goibibo['hotel_facilities']
    + data_goibibo['room_facilities']
)

# Columns that are dropped before the frames are combined.
ibibo_cols_remove = [
    'additional_info', 'room_facilities', 'area', 'country', 'crawl_date',
    'similar_hotel', 'uniq_id', 'locality', 'province', 'qts',
    'query_time_stamp', 'property_id', 'review_count_by_category',
    'point_of_interest', 'room_area', 'hotel_category',
    'guest_recommendation', 'hotel_brand',
]

data_goibibo = data_goibibo.drop(columns=ibibo_cols_remove)

print(data_goibibo.shape)
print(data_goibibo.dtypes)

data_goibibo.head(5)


## booking dataframe preparation

In [None]:
# Load the raw booking.com sample export.
data_booking = pd.read_csv(
    r"C:\Users\kavya\Documents\4Th Sem\documents_masterProj\projectDataSet\booking_com-travel_sample.csv"
)

# Columns that are dropped before the frames are combined.
booking_cols_remove = [
    'country', 'crawl_date', 'property_id', 'province', 'qts',
    'similar_hotel', 'zone', 'special_tag', 'uniq_id', 'locality',
    'hotel_brand',
]

data_booking = data_booking.drop(columns=booking_cols_remove)

print(data_booking.shape)
print(data_booking.dtypes)

data_booking.head(5)

## Listing the columns common to both dataframes

In [None]:
# Plain Python lists of the column names of each prepared frame.
ibibo_cols = data_goibibo.columns.tolist()
booking_cols = data_booking.columns.tolist()

def same_cols_list_func(ibibo_cols, booking_cols):
    """Return the column names that appear in both lists.

    The result follows the order of ``ibibo_cols`` and reports each name once.

    The previous condition (``i in ibibo_cols or i in booking_cols``) was
    evaluated over the concatenation of both lists, so it was true for every
    element and the function simply returned the two lists joined together.

    Args:
        ibibo_cols: column names of the first frame.
        booking_cols: column names of the second frame.

    Returns:
        list: names present in both inputs.
    """
    booking_lookup = set(booking_cols)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(name for name in ibibo_cols if name in booking_lookup))

# Build the list with same_cols_list_func; the bare name below is the
# notebook cell's displayed value.
same_cols_list = same_cols_list_func(ibibo_cols,booking_cols)

same_cols_list

## Listing different columns from dataframes

In [None]:
def diff_cols_list_func(ibibo_cols, booking_cols):
    """Return the names that are missing from at least one of the two lists.

    Names are reported in the order they occur in ``ibibo_cols`` followed by
    ``booking_cols``; a name found in both lists is left out.
    """
    unshared = []
    for name in ibibo_cols + booking_cols:
        in_both = name in ibibo_cols and name in booking_cols
        if not in_both:
            unshared.append(name)
    return unshared

# Build the list with diff_cols_list_func; the bare name below is the
# notebook cell's displayed value.
diff_cols_list = diff_cols_list_func(ibibo_cols,booking_cols)

diff_cols_list

# handling dtype issue

### identifying difference in dtypes between the columns of dataframes

In [None]:
# Print every column whose dtype differs between the goibibo and booking
# frames.  Columns that exist only in the goibibo frame are skipped:
# indexing data_booking with such a name would raise KeyError and abort the
# whole report.
for i_col in ibibo_cols:
    if i_col not in data_booking.columns:
        continue
    if data_goibibo[i_col].dtype != data_booking[i_col].dtype:
        print(data_goibibo[i_col].name, data_goibibo[i_col].dtype)
        print(data_booking[i_col].name, data_booking[i_col].dtype)
        print('--xx--')


### handling string to number conversion and regular expression errors in columns

In [None]:
# site_review_count: remove the commas from the text values, then cast to float.
# The pattern is a raw string; the former '\,' literal relied on an invalid
# escape sequence, which newer Python versions warn about.
print(data_booking['site_review_count'].dtype)
data_booking['site_review_count'] = (
    data_booking['site_review_count'].replace(r',', '', regex=True).astype(float)
)
print(data_booking['site_review_count'].dtype)

print('--xx--')

# hotel_star_rating: keep the first run of digits and convert it to a number.
# expand=False makes str.extract return a Series rather than a one-column
# DataFrame, so the assignment is a plain column-to-column one.
# NOTE(review): r'(\d+)' keeps only the leading integer part of a value such
# as "3.5" -- confirm half-star ratings do not occur in this column.
print(data_booking['hotel_star_rating'].dtype)
data_booking['hotel_star_rating'] = data_booking['hotel_star_rating'].str.extract(r'(\d+)', expand=False)
data_booking['hotel_star_rating'] = pd.to_numeric(data_booking['hotel_star_rating'])
print(data_booking['hotel_star_rating'].dtype)

### casting identified dissimilar dtypes of columns to common type in dataframes

In [None]:
# Target dtype for the columns whose dtypes differ between the two frames.
col_dataTypeFix = {
    'hotel_star_rating': float,
    'image_count': float,
    'room_count': float,
    'site_review_count': float,
}

# astype returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back; previously it was discarded and the casts
# never took effect.
data_goibibo = data_goibibo.astype(col_dataTypeFix)
data_booking = data_booking.astype(col_dataTypeFix)


## Merging / Appending two dataframes 
#### Only if columns of dataframes have been prepared well

In [None]:
# Stack the booking rows under the goibibo rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; sort=False keeps the column order as encountered.
data_goibibo_booking = pd.concat(
    [data_goibibo, data_booking], ignore_index=True, sort=False
)

print(data_goibibo_booking.shape)

data_goibibo_booking.columns.values.tolist()

# Cleartrip dataframe preparation

In [None]:
# Load the raw cleartrip sample export.
data_clearTrip = pd.read_csv(
    r"C:\Users\kavya\Documents\4Th Sem\documents_masterProj\projectDataSet\cleartrip_com-travel_sample.csv"
)

# Fold room_facilities into hotel_facilities.
# NOTE(review): `+` propagates NaN, so a row missing either field ends up
# with NaN here, and no separator is inserted between the two parts --
# confirm both are intended.
data_clearTrip['hotel_facilities'] = (
    data_clearTrip['hotel_facilities'] + data_clearTrip['room_facilities']
)

# Rename the tad_* review columns to the site_* names.
data_clearTrip = data_clearTrip.rename(
    columns={
        'tad_review_count': 'site_review_count',
        'tad_review_rating': 'site_review_rating',
        'tad_stay_review_rating': 'site_stay_review_rating',
    }
)

# Columns that are dropped before the frames are combined.
clearTrip_cols_remove = [
    'area', 'country', 'cleartrip_seller_rating', 'crawl_date', 'image_urls',
    'landmark', 'locality', 'property_id', 'province', 'qts', 'room_area',
    'room_facilities', 'similar_hotel', 'tripadvisor_seller_rating',
    'uniq_id',
]

data_clearTrip = data_clearTrip.drop(columns=clearTrip_cols_remove)

print(data_clearTrip.shape)
print(data_clearTrip.columns.values.tolist())

### Handling dtypes issue cleartrip  


In [None]:
ibibo_booking_cols = data_goibibo_booking.columns.values.tolist()
clear_cols = data_clearTrip.columns.values.tolist()

# Print every column whose dtype differs between the merged goibibo/booking
# frame and the cleartrip frame.  Columns absent from the cleartrip frame are
# skipped: indexing data_clearTrip with such a name would raise KeyError.
for i_b_col in ibibo_booking_cols:
    if i_b_col not in clear_cols:
        continue
    if data_goibibo_booking[i_b_col].dtype != data_clearTrip[i_b_col].dtype:
        print(data_goibibo_booking[i_b_col].name, data_goibibo_booking[i_b_col].dtype)
        print(data_clearTrip[i_b_col].name, data_clearTrip[i_b_col].dtype)
        print('--xx--')

In [None]:
# hotel_star_rating: keep the first run of digits and convert it to a number.
print(data_clearTrip['hotel_star_rating'].dtype)
# Raw-string pattern; expand=False makes str.extract return a Series.
data_clearTrip['hotel_star_rating'] = data_clearTrip['hotel_star_rating'].str.extract(r'(\d+)', expand=False)
# Convert the cleartrip column itself.  The previous line passed
# data_booking['hotel_star_rating'] to pd.to_numeric, which overwrote the
# cleartrip ratings with index-aligned values from the booking frame.
data_clearTrip['hotel_star_rating'] = pd.to_numeric(data_clearTrip['hotel_star_rating'])
print(data_clearTrip['hotel_star_rating'].dtype)


#### making the dtypes similar in cleartrip

In [None]:
# Target dtype for the cleartrip columns that are cast before merging.
clearTrip_dataTypeFix = {
    'hotel_star_rating': float,
    'image_count': float,
    'room_count': float,
}

# astype returns a new DataFrame; assign it back so the cast actually applies
# (the result used to be discarded).
data_clearTrip = data_clearTrip.astype(clearTrip_dataTypeFix)

## merging data_clearTrip and ibibo_booking_data dataframes

In [None]:
# Stack the cleartrip rows under the merged goibibo/booking rows with a fresh
# 0..n-1 index.  pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in 2.0.
data_goibibo_booking_clear = pd.concat(
    [data_goibibo_booking, data_clearTrip], ignore_index=True, sort=False
)

print(data_goibibo_booking_clear.shape)

data_goibibo_booking_clear.columns.values.tolist()

data_goibibo_booking_clear

print('-- xx Available values xx--')
print(data_goibibo_booking_clear.count())

print('-- xx NA values xx--')
print(data_goibibo_booking_clear.isna().sum())

# isnull() is an alias of isna(), so this report repeats the one above.
print('-- xx null values xx--')
print(data_goibibo_booking_clear.isnull().sum())

### taking care of NAN and null values

In [None]:
# Keep only the rows that have both a facilities string and a stay-review
# rating string.
data_goibibo_booking_clear = data_goibibo_booking_clear.dropna(
    subset=['hotel_facilities', 'site_stay_review_rating'],
)

# Per-column counts after the drop.
print('-- xx Available values xx--')
print(data_goibibo_booking_clear.count())

print('-- xx NA values xx--')
print(data_goibibo_booking_clear.isna().sum())

# isnull() is an alias of isna(); the numbers match the report above.
print('-- xx null values xx--')
print(data_goibibo_booking_clear.isnull().sum())

data_goibibo_booking_clear.shape

#### creating those extra columns for SSRR

In [None]:
# Insert the SSRR1..SSRR6 columns at positions 1..6.  They start as 0.0
# rather than 0 so the columns are float from the outset: the values written
# into them later are floats, and assigning a float into an integer column
# is an incompatible-dtype write that newer pandas versions warn about.
for ssrr_position, ssrr_column in enumerate(
    ('SSRR1', 'SSRR2', 'SSRR3', 'SSRR4', 'SSRR5', 'SSRR6'), start=1
):
    data_goibibo_booking_clear.insert(ssrr_position, ssrr_column, 0.0)

def extractNumericValue(indiSSRR):
    """Return the rescaled rating held in a split ``[label, value]`` pair.

    The value is converted with ``float`` and mapped to ``0.5 * (1 + value)``.
    0 is returned when the pair does not have exactly two items or when the
    value part is empty.
    """
    if len(indiSSRR) != 2:
        return 0

    raw_score = indiSSRR[1]
    if len(raw_score) == 0:
        return 0

    return 0.5 * (1 + float(raw_score))

def extractNumericValue2(indiSSRR):
    """Return the rating held in a split ``[label, value]`` pair, unscaled.

    The value is converted with ``float``.  0 is returned when the pair does
    not have exactly two items or when the value part is empty.
    """
    if len(indiSSRR) != 2:
        return 0

    raw_score = indiSSRR[1]
    if len(raw_score) == 0:
        return 0

    return float(raw_score)

def manageSSRR_revised_2():
    """Spread each row's '|'-separated stay ratings over SSRR1..SSRR6.

    Operates in place on the module-level ``data_goibibo_booking_clear``.
    For every row, ``site_stay_review_rating`` is split on ``|`` and the
    first six entries are written to ``SSRR1`` .. ``SSRR6`` in order:

    * rows whose ``sitename`` is ``'http://www.booking.com/'`` use ``:`` as
      the label/value separator and are rescaled by ``extractNumericValue``;
    * every other row uses ``::`` and is stored unscaled through
      ``extractNumericValue2``.

    The non-booking branch previously also called ``extractNumericValue``,
    so those ratings were rescaled although ``manageSSRR_revised`` rescales
    booking.com rows only, and ``extractNumericValue2`` was never used.

    Columns for which a row has no entry are left untouched.
    """
    target_columns = ('SSRR1', 'SSRR2', 'SSRR3', 'SSRR4', 'SSRR5', 'SSRR6')

    for ind in data_goibibo_booking_clear.index:
        valSSRR = data_goibibo_booking_clear.loc[ind, 'site_stay_review_rating']
        valSiteName = data_goibibo_booking_clear.loc[ind, 'sitename']

        if valSiteName == 'http://www.booking.com/':
            separator, parse_rating = ":", extractNumericValue
        else:
            separator, parse_rating = "::", extractNumericValue2

        # zip() stops at the shorter input, so entries beyond the sixth are
        # ignored and missing entries leave their column as it was.
        for column, entry in zip(target_columns, valSSRR.split("|")):
            data_goibibo_booking_clear.loc[ind, column] = parse_rating(entry.split(separator))
                                   
# Fill the SSRR1..SSRR6 columns in place; the bare name below is the
# notebook cell's displayed value.
manageSSRR_revised_2()
    
data_goibibo_booking_clear

### feature engineering : site_stay_review_rating 

In [None]:
def manageSSRR_revised():
    """Replace each row's '|'-separated stay ratings with their average.

    Operates in place on the module-level ``data_goibibo_booking_clear``.
    For every row, ``site_stay_review_rating`` is split on ``|``:

    * rows whose ``sitename`` is ``'http://www.booking.com/'`` use ``:`` as
      the label/value separator and each rating is rescaled to
      ``0.5 * (1 + rating)`` before being summed;
    * every other row uses ``::`` and ratings are summed as they are.

    The sum is divided by the total number of entries (entries with an empty
    value count towards the divisor) and rounded to two decimals; a sum that
    is not positive yields 0.

    The averages are collected first and written back with one whole-column
    assignment.  The earlier row-by-row ``.loc`` writes stored floats into
    the string column one cell at a time, which left it with object dtype,
    is an incompatible-dtype write on newer pandas, and left the column
    half-converted if a row failed to parse.
    """
    averages = []
    rows = zip(
        data_goibibo_booking_clear['site_stay_review_rating'],
        data_goibibo_booking_clear['sitename'],
    )

    for valSSRR, valSiteName in rows:
        is_booking = valSiteName == 'http://www.booking.com/'
        separator = ":" if is_booking else "::"
        parts = valSSRR.split("|")

        total = 0
        for part in parts:
            fields = part.split(separator)
            if len(fields) != 2:
                # NOTE(review): a malformed entry discards the sum gathered
                # so far while later entries still accumulate; kept as-is,
                # confirm this is the intended handling.
                total = 0
            elif fields[1]:
                rating = float(fields[1])
                total += 0.5 * (1 + rating) if is_booking else rating

        averages.append(round(total / len(parts), 2) if total > 0 else 0)

    data_goibibo_booking_clear['site_stay_review_rating'] = averages
                                   
# Overwrite site_stay_review_rating with the per-row average; the bare name
# below is the notebook cell's displayed value.
manageSSRR_revised()

data_goibibo_booking_clear


#### Finding the unique values in the state column (there are 36 in total: 29 states + 7 union territories)

In [None]:
# Show the dtype of the state column.
print(data_goibibo_booking_clear.state.dtypes)

def fixStateNames():
    """Lower-case the ``state`` column and unify spelling variants in place.

    Operates on the module-level ``data_goibibo_booking_clear``.  The number
    and list of unique values are printed before and after the clean-up.

    All substitutions are substring replacements, as before.  The
    'andaman and nicobar' rule now matches only when ' islands' does not
    already follow; the plain substring version turned an already-correct
    'andaman and nicobar islands' into 'andaman and nicobar islands islands'.
    """
    states = data_goibibo_booking_clear['state']

    unique_before = states.unique().tolist()
    print(len(unique_before))
    print(unique_before)

    # str() is applied to every value, so a missing state becomes the text 'nan'.
    states = states.apply(lambda value: str(value).lower())

    # Literal (non-regex) substring substitutions.
    literal_fixes = (
        ('delhi ncr', 'delhi'),
        ('jammu & kashmir', 'jammu and kashmir'),
        ('jammuandkashmir', 'jammu and kashmir'),
        ('andhrapradesh', 'andhra pradesh'),
        ('tamilnadu', 'tamil nadu'),
        ('himachalpradesh', 'himachal pradesh'),
        ('madhyapradesh', 'madhya pradesh'),
        ('uttarpradesh', 'uttar pradesh'),
        ('uttarakhand', 'uttaranchal'),
        ('westbengal', 'west bengal'),
        ('damananddiu', 'daman and diu'),
        ('andamanandnicobarislands', 'andaman and nicobar islands'),
        ('arunachalpradesh', 'arunachal pradesh'),
    )
    for old_spelling, new_spelling in literal_fixes:
        states = states.str.replace(old_spelling, new_spelling, regex=False)

    # Append ' islands' only where it is missing (negative lookahead).
    states = states.str.replace(
        r'andaman and nicobar(?! islands)',
        'andaman and nicobar islands',
        regex=True,
    )

    data_goibibo_booking_clear['state'] = states

    unique_after = data_goibibo_booking_clear['state'].unique().tolist()
    print(len(unique_after))
    print(unique_after)

# Normalise the state column in place (prints the unique values before and after).
fixStateNames()
    

#### Adding an extra column to store the hotel facilities count

In [None]:
# New column at position 10, initialised to 0 for every row.
data_goibibo_booking_clear.insert(10,'hotel_facilities_count',0)


#### counting the number of facilities : for visualisation

In [None]:
def manageHotelFacilities():
    """Store the number of '|'-separated entries of ``hotel_facilities``.

    Operates in place on the module-level ``data_goibibo_booking_clear`` and
    writes the per-row count to ``hotel_facilities_count``.  A value without
    any ``|`` (including an empty string) counts as one entry.
    """
    facility_text = data_goibibo_booking_clear['hotel_facilities']
    data_goibibo_booking_clear['hotel_facilities_count'] = facility_text.map(
        lambda text: len(text.split("|"))
    )

# Fill hotel_facilities_count in place; the bare name below is the notebook
# cell's displayed value.
manageHotelFacilities()

data_goibibo_booking_clear

#### finding duplicates based on address 

In [None]:
# Rows whose address repeats an earlier row's address, ordered by address.
# duplicated() uses its default keep='first', so the first occurrence of each
# address is not part of this selection.
data_goibibo_booking_clear_propertyDup = (
    data_goibibo_booking_clear
    .loc[data_goibibo_booking_clear.duplicated(subset=['address'])]
    .sort_values(by='address')
)

# Narrow the view to the columns needed to inspect the repeats.
data_goibibo_booking_clear_propertyDup1 = data_goibibo_booking_clear_propertyDup.loc[
    :, ['address', 'property_type', 'sitename', 'pageurl']
]

# Temporarily raise the row limit so the whole listing is shown.
with pd.option_context("display.max_rows", 3000):
    display(data_goibibo_booking_clear_propertyDup1)

#### Cleaning the property_type column and replacing its numeric codes with text labels

In [None]:
# Numeric property-type codes and the label each one is replaced with.
# NOTE(review): the keys are strings, so codes stored as integers would not
# match -- confirm the column holds them as text.
propertyType_values_to_replace = {
    '204': 'hotel', '208': 'hotel', '219': 'hotel', '216': 'hotel',
    '218': 'hotel', '223': 'hotel', '203': 'hotel', '225': 'hotel',
    '222': 'homestay', '201': 'apartments', '212': 'bunglows',
    '215': 'houseboat', '224': 'cottages', '220': 'cottages',
    '221': 'lodges', '213': 'villas', '206': 'resort', '231': 'resort',
}
data_goibibo_booking_clear = data_goibibo_booking_clear.replace({"property_type": propertyType_values_to_replace})

# str() is applied to every value, so a missing type becomes the text 'nan'.
data_goibibo_booking_clear['property_type'] = data_goibibo_booking_clear['property_type'].apply(lambda x: str(x).lower())

# Ordered literal substring substitutions; later rules see the output of
# earlier ones.  'bungalow' is normalised BEFORE the plural 'bunglows' is
# collapsed -- in the previous order 'bungalows' ended up as 'bunglows'.
# 'boat' -> 'houseboat' also hits values that already read 'houseboat'; the
# rule right after it repairs the resulting 'househouseboat'.
propertyType_spelling_fixes = [
    ('bed_and_breakfast', 'bed and breakfast'),
    ('serviced apartments', 'service apartment'),
    ('bnb', 'bed and breakfast'),
    ('guest_house', 'guest house'),
    ('farm_holiday', 'farm holiday'),
    ('holiday_home', 'holiday home'),
    ('villas', 'villa'),
    ('bungalow', 'bunglow'),
    ('bunglows', 'bunglow'),
    ('lodges', 'lodge'),
    ('boat', 'houseboat'),
    ('househouseboat', 'houseboat'),
]
for old_label, new_label in propertyType_spelling_fixes:
    # regex=False: the rules are plain text, independent of the pandas default.
    data_goibibo_booking_clear['property_type'] = data_goibibo_booking_clear['property_type'].str.replace(
        old_label, new_label, regex=False
    )

propertyTypeList2 = data_goibibo_booking_clear["property_type"].unique().tolist()
print(len(propertyTypeList2))
print(propertyTypeList2)