In [10]:
import pandas as pd

In [11]:
listings_original = pd.read_csv('./Resources/listings.csv')
cols = listings_original.columns.tolist()
col_dict = {}
for i in range(len(cols)):
    col_dict[i] = cols[i]
# col_dict

In [12]:
# column 3 used to get most recent entry for each host (see below)
hosts_df = listings_original.iloc[:, 16:34].merge(listings_original.iloc[:, [3, 16]], on='host_id')

# if true, then we need to cull redundant host IDs before creating table
len(hosts_df['host_id']) > len(hosts_df['host_id'].unique())

True

In [13]:
# look at redundant host data to determine best course of how to consolidate columns
idx = 1 # index var for quick switching
redundant_host_id = hosts_df['host_id'].value_counts().keys()[idx]
hosts_df[hosts_df['host_id'] == redundant_host_id].head(3)

Unnamed: 0,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,last_scraped
7754,4962900,https://www.airbnb.com/users/show/4962900,Jordan,2013-02-04,"Spokane, Washington, United States",Stay Alfred was created based on the idea of o...,within an hour,99%,100%,f,https://a1.muscache.com/ac/users/4962900/profi...,https://a1.muscache.com/ac/users/4962900/profi...,Central Business District,169.0,169.0,"['email', 'phone', 'linkedin', 'reviews', 'jum...",t,t,2016-01-04
7755,4962900,https://www.airbnb.com/users/show/4962900,Jordan,2013-02-04,"Spokane, Washington, United States",Stay Alfred was created based on the idea of o...,within an hour,99%,100%,f,https://a1.muscache.com/ac/users/4962900/profi...,https://a1.muscache.com/ac/users/4962900/profi...,Central Business District,169.0,169.0,"['email', 'phone', 'linkedin', 'reviews', 'jum...",t,t,2016-01-04
7756,4962900,https://www.airbnb.com/users/show/4962900,Jordan,2013-02-04,"Spokane, Washington, United States",Stay Alfred was created based on the idea of o...,within an hour,99%,100%,f,https://a1.muscache.com/ac/users/4962900/profi...,https://a1.muscache.com/ac/users/4962900/profi...,Central Business District,169.0,169.0,"['email', 'phone', 'linkedin', 'reviews', 'jum...",t,t,2016-01-04


In [14]:
# observed little variation between redundant host rows, okay to cull all but most recent
sorted_hosts_df = hosts_df.sort_values('last_scraped', ascending=False).drop_duplicates('host_id')
# make same redundancy check
len(sorted_hosts_df['host_id']) == len(sorted_hosts_df['host_id'].unique())

True

In [15]:
# remove extraneous columns; axis specifies to drop on col name, inplace modifies orignal DF
sorted_hosts_df.drop(['host_url', 'host_name', 'host_about', 'host_thumbnail_url', 'host_picture_url', 'last_scraped'], axis=1, inplace=True)
# remaining columns might be used to perform analysis on host qualities vs listing/host ratings
# name removed for data ethics
sorted_hosts_df.columns.to_list()

['host_id',
 'host_since',
 'host_location',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified']

In [16]:
sorted_hosts_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2751 entries, 0 to 16327
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   host_id                    2751 non-null   int64  
 1   host_since                 2749 non-null   object 
 2   host_location              2743 non-null   object 
 3   host_response_time         2247 non-null   object 
 4   host_response_rate         2247 non-null   object 
 5   host_acceptance_rate       2020 non-null   object 
 6   host_is_superhost          2749 non-null   object 
 7   host_neighbourhood         2472 non-null   object 
 8   host_listings_count        2749 non-null   float64
 9   host_total_listings_count  2749 non-null   float64
 10  host_verifications         2749 non-null   object 
 11  host_has_profile_pic       2749 non-null   object 
 12  host_identity_verified     2749 non-null   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 30

In [17]:
# TODO - reformat certain column data types: 
# host_since => days since host joined (int)
sorted_hosts_df['host_since'] =  pd.to_datetime(sorted_hosts_df['host_since'])

# host_response_rate, host_acceptance_rate => float
# sorted_hosts_df['host_response_rate'] = sorted_hosts_df['host_response_rate'].astype('float64')
sorted_hosts_df['host_response_rate'] = sorted_hosts_df['host_response_rate'].astype(str)
sorted_hosts_df['host_response_rate'] = pd.to_numeric(sorted_hosts_df['host_response_rate'].str.replace("%", ""), errors='coerce')

sorted_hosts_df['host_acceptance_rate'] = sorted_hosts_df['host_acceptance_rate'].astype(str)
sorted_hosts_df['host_acceptance_rate'] = pd.to_numeric(sorted_hosts_df['host_acceptance_rate'].str.replace("%", ""), errors='coerce')

# host_is_superhost, host_has_profile_pic => boolean
sorted_hosts_df['host_is_superhost'] = sorted_hosts_df['host_is_superhost'].astype('bool')
sorted_hosts_df['host_has_profile_pic'] = sorted_hosts_df['host_has_profile_pic'].astype('bool')

# condense listings_count and total_listings_count - DELETE total_listings_count
del sorted_hosts_df['host_total_listings_count']

# Filling missing values for numeric columns
sorted_hosts_df['host_response_rate'].fillna(-1.0, inplace=True)
sorted_hosts_df['host_acceptance_rate'].fillna(-1.0, inplace=True)

# Filling missing values for categorical columns
sorted_hosts_df['host_response_time'].fillna('Unknown', inplace=True)

# host_verifications => host number types verified (int)
sorted_hosts_df.to_csv('./Resources/hosts.csv', index=False)

In [18]:
sorted_hosts_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2751 entries, 0 to 16327
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   host_id                 2751 non-null   int64         
 1   host_since              2749 non-null   datetime64[ns]
 2   host_location           2743 non-null   object        
 3   host_response_time      2751 non-null   object        
 4   host_response_rate      2247 non-null   float64       
 5   host_acceptance_rate    2751 non-null   float64       
 6   host_is_superhost       2751 non-null   bool          
 7   host_neighbourhood      2472 non-null   object        
 8   host_listings_count     2749 non-null   float64       
 9   host_verifications      2749 non-null   object        
 10  host_has_profile_pic    2751 non-null   bool          
 11  host_identity_verified  2749 non-null   object        
dtypes: bool(2), datetime64[ns](1), float64(3), int64(1),