# FIT5120 - Industry Experience Studio Project  S1 2022

### Project Name: HOTEL REVIEW ASSISTANT
### Task Name: Data Preprocessing - Iteration 3 - Landlord Reliability


Team information
- Team Name: AntiFake
- Team Number: TA 36

Date: 16/05/2022

Version: 1.0

Programming Language: Python 3.8 and Jupyter notebook

Python Libraries used:
- pandas (For data manipulation and analysis)

## Table of Contents

* [1. Import Library](#sec_1)
* [2. Circular Gauge Chart](#sec_2)
* [3. Bar Chart](#sec_3)

### 1. Import Library

In [1]:
import pandas as pd

### 2. Circular Gauge Chart

In [2]:
# Extract the data from dataset
df_all = pd.read_csv('./listings.csv')
df_all.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,neighbourhood_group_cleansed,latitude,longitude,accommodates,bathrooms,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
count,17834.0,17834.0,17834.0,17829.0,17829.0,0.0,17834.0,17834.0,17834.0,0.0,...,13244.0,13250.0,13244.0,13244.0,0.0,17834.0,17834.0,17834.0,17834.0,13621.0
mean,29038740.0,20211210000000.0,115203100.0,10.981715,10.981715,,-37.826927,145.016131,3.372379,,...,4.80577,4.81722,4.8281,4.679695,,8.648985,6.800493,1.709375,0.054054,0.887215
std,14718030.0,0.0,111446900.0,38.331455,38.331455,,0.072105,0.145288,2.244491,,...,0.443381,0.442884,0.37237,0.49064,,21.690211,17.546721,10.681517,0.457337,1.248908
min,9835.0,20211210000000.0,9082.0,0.0,0.0,,-38.22411,144.54922,1.0,,...,0.0,1.0,1.0,0.0,,1.0,0.0,0.0,0.0,0.01
25%,17500000.0,20211210000000.0,23830820.0,1.0,1.0,,-37.85503,144.957552,2.0,,...,4.8,4.83,4.8,4.61,,1.0,0.0,0.0,0.0,0.11
50%,29600130.0,20211210000000.0,72213760.0,1.0,1.0,,-37.819365,144.97969,2.0,,...,4.95,4.97,4.94,4.8,,1.0,1.0,0.0,0.0,0.43
75%,41351610.0,20211210000000.0,181501200.0,4.0,4.0,,-37.800975,145.030265,4.0,,...,5.0,5.0,5.0,5.0,,4.0,3.0,1.0,0.0,1.2
max,53713100.0,20211210000000.0,434709800.0,457.0,457.0,,-37.4823,145.83784,16.0,,...,5.0,5.0,5.0,5.0,,151.0,133.0,116.0,8.0,32.25


In [3]:
# Find the unique category for host response time
df_all['host_response_time'].unique()

array([nan, 'within an hour', 'within a few hours', 'within a day',
       'a few days or more'], dtype=object)

In [4]:
# Impute the new value for host response time
df_all['host_response_time'] = df_all['host_response_time'].fillna('Not Available')
# Reset index
df_all = df_all.reset_index()
# Display the result
df_all['host_response_time']

0             Not Available
1             Not Available
2            within an hour
3            within an hour
4             Not Available
                ...        
17829        within an hour
17830        within an hour
17831    within a few hours
17832        within an hour
17833         Not Available
Name: host_response_time, Length: 17834, dtype: object

In [5]:
# Define the category for host response time
def label_race (row):
    if row['host_response_time'] == 'within an hour' :
        return 87.5
    if row['host_response_time'] == 'within a few hours' :
        return 62.5
    if row['host_response_time'] == 'within a day' :
        return 37.5
    if row['host_response_time'] == 'a few days or more' :
        return 12.5
    else:
        return 0

In [6]:
# The response time will be converted into the radical scale value.
df_all['Gauge_response_Time'] = df_all.apply (lambda row: label_race(row), axis=1)
# Display the result
df_all['Gauge_response_Time']

0         0.0
1         0.0
2        87.5
3        87.5
4         0.0
         ... 
17829    87.5
17830    87.5
17831    62.5
17832    87.5
17833     0.0
Name: Gauge_response_Time, Length: 17834, dtype: float64

In [7]:
# Calculate the averahe response Time for each landlord
new6 = df_all.copy()
new6 = new6[(new6[['Gauge_response_Time']] != 0).all(axis=1)]
new6 = new6.groupby('host_id')['Gauge_response_Time'].mean()
# Display the result
new6

host_id
9082         87.5
17308        87.5
26687        87.5
112497       37.5
117431       62.5
             ... 
432223297    87.5
432268990    37.5
432681418    87.5
433098114    37.5
433225384    87.5
Name: Gauge_response_Time, Length: 4464, dtype: float64

In [8]:
# Mege the average into main datafram 
df_all22 = df_all.merge(new6, on='host_id', how='left')
df_all22['Gauge_response_Time_y'] = df_all22['Gauge_response_Time_y'].fillna(0)
# Subset the main dataframe
df_all23 = df_all22[['name', 'host_response_time', 'Gauge_response_Time_y']]
df_all23 = df_all23.reset_index()

In [9]:
# Generate the output file
df_all23.to_csv('Gauge_chart.csv', index=False)

### 3. Bar Chart

In [10]:
# Remove Unesscary character
df_all['host_response_rate'] = df_all['host_response_rate'].str[:-1]
df_all['host_acceptance_rate'] = df_all['host_acceptance_rate'].str[:-1]

# Impute '0' into null value
df_all['host_response_rate'] = df_all['host_response_rate'].fillna(0)
df_all['host_acceptance_rate'] = df_all['host_acceptance_rate'].fillna(0)
df_all['review_scores_accuracy'] = df_all['review_scores_accuracy'].fillna(0)

# Converting Data type
df_all['host_acceptance_rate'] = df_all['host_acceptance_rate'].astype('float')
df_all['host_response_rate'] = df_all['host_response_rate'].astype('float')
df_all['review_scores_accuracy'] = df_all['review_scores_accuracy'].astype('float')

In [11]:
# Generate the average peroformance for each landloard
new2 = df_all.groupby('host_id')['review_scores_accuracy','host_acceptance_rate', 
                                 'host_response_rate'].mean()
new2

  new2 = df_all.groupby('host_id')['review_scores_accuracy','host_acceptance_rate',


Unnamed: 0_level_0,review_scores_accuracy,host_acceptance_rate,host_response_rate
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9082,4.83,0.0,100.0
17308,4.93,0.0,100.0
18785,4.89,0.0,0.0
26687,4.97,96.0,100.0
33057,4.33,0.0,0.0
...,...,...,...
433871001,0.00,100.0,0.0
434070273,5.00,100.0,0.0
434283907,0.00,100.0,0.0
434681527,0.00,100.0,0.0


In [12]:
# Mege the result into dataframe
df_all333 = df_all.merge(new2, on='host_id', how='left')
df_all333 = df_all333.dropna(subset=['name', 'host_id'])
# Validate the dataframe 
df_all333.shape

(17832, 79)

In [13]:
# Rename and cpnvert the accuracy into percentage format
df_new = df_all333
df_new['review_scores_accuracy_x'] = df_new['review_scores_accuracy_x'] *20
df_new['review_scores_accuracy_y'] = df_new['review_scores_accuracy_y'] *20

In [14]:
# Rouund the figures in 2 decimal format
df_new = df_new.round(2)
# Reset Indext
df_new = df_new.reset_index()
# Drop duplicate landlord ID 
df_new = df_new.drop_duplicates(subset=['host_id'])
df_new22 = df_new.copy()
df_new22 = df_new22[(df_new22[['review_scores_accuracy_y','host_acceptance_rate_y',
                              'host_response_rate_y']] != 0).all(axis=1)]

In [15]:
# Calculate the average perforamance for all landlords
df22223 = df22223.round(2)
df22223 = df_new22[["review_scores_accuracy_y","host_acceptance_rate_y",
                   'host_response_rate_y']].mean()
# Descriptiopn
df22223

NameError: name 'df22223' is not defined

In [None]:
# Assign new value in new column
df_all333 = df_all333.round(2)
df_all333['bar_chart'] = 1

In [None]:
# Encapsulated in a required format for bar chart creation in JavaScript.
for ind in df_all333.index:
    new_list = []
    Reliability =  ['Accuracy of Description', df_all333['review_scores_accuracy_y'][ind],96.35]
    Acceptance = ['Acceptance Rate', df_all333['host_acceptance_rate_y'][ind], 87.56]
    Response = ['Response Rate', df_all333['host_response_rate_y'][ind], 96.89]
    new_list.append(Reliability)
    new_list.append(Acceptance)
    new_list.append(Response)
    print(new_list)
    df_all333['bar_chart'][ind] = new_list

In [None]:
# Generate the output file
df_all333_out = df_all333[['name', 'bar_chart']]
df_all333_out = df_all333_out.round(2)
df_all333_out.to_csv('bar_chart.csv', index=False)