**Import Libraries**

In [1]:
import pandas as pd
import re

In [2]:
pd.set_option('display.max_columns', None)

**Function to clean Listings information**

In [3]:
def listings(x):

    #change titles to lower case 
    lower = []
    for i in x.columns:
        if ' ' in i:
            i = i.replace(' ', '_')
            lower.append(i.lower())
        else:    
            lower.append(i.lower())
    x.columns = lower
    
    
    #drop unneccessary columns
    x = x.drop(columns=['name','description','property_type','neighborhood_overview','host_neighbourhood','listing_url','scrape_id','last_scraped','picture_url','host_url','host_location','host_since','host_about','host_response_time','host_thumbnail_url','host_picture_url','host_verifications','host_has_profile_pic','minimum_minimum_nights','maximum_minimum_nights','minimum_maximum_nights','maximum_maximum_nights','minimum_nights_avg_ntm','maximum_nights_avg_ntm','calendar_updated','has_availability','availability_30','availability_60','availability_90','availability_365','calendar_last_scraped','number_of_reviews_ltm','number_of_reviews_l30d','first_review','last_review','calculated_host_listings_count','calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms'])

    
    #host_is_superhost, host_identity_verified, instant bookable - to upper
    x['host_is_superhost'] = x['host_is_superhost'].str.upper()
    x['host_identity_verified'] = x['host_identity_verified'].str.upper()
    x['instant_bookable'] = x['instant_bookable'].str.upper()
    
    
    #Strip percentage signs and create decimal point to measure between 0 and 1 
    x['host_response_rate'] = x['host_response_rate'].str.strip('%')
    x['host_response_rate'] = x['host_response_rate'].astype(float)
    x['host_response_rate'] = x['host_response_rate'].fillna(0)
    x['host_response_rate'] = x['host_response_rate']/100
    x['host_acceptance_rate'] = x['host_acceptance_rate'].str.strip('%')
    x['host_acceptance_rate'] = x['host_acceptance_rate'].astype(float)
    x['host_acceptance_rate'] = x['host_acceptance_rate'].fillna(0)
    x['host_acceptance_rate'] = x['host_acceptance_rate']/100

    
    #fill na in host_name to Unknown
    x['host_name'] = x['host_name'].fillna('Unknown')
    x['host_response_rate'] = x['host_response_rate'].fillna(0)
    x['host_acceptance_rate'] = x['host_acceptance_rate'].fillna(0)
    x['host_is_superhost'] = x['host_is_superhost'].fillna('F')
    x['host_identity_verified'] = x['host_identity_verified'].fillna('F')


    #Fill the number of beds with the number of bedrooms & vice versa
    x["beds"] = x["beds"].fillna(2)
    x["beds"] = x['beds'].astype(int)
    x["bedrooms"] = x["bedrooms"].fillna(1)
    x["bedrooms"] = x['bedrooms'].astype(int)

    
    #host_listings_count, host_total_listings_count to int
    x['host_listings_count'] = x['host_listings_count'].fillna(0)
    x['host_total_listings_count'] = x['host_total_listings_count'].fillna(0)
    x['host_listings_count'] = x['host_listings_count'].astype(int)
    x['host_total_listings_count'] = x['host_total_listings_count'].astype(int)


    #rename column 'neighbourhood' to 'city', fill with 'madrid'
    x = x.rename(columns={'neighbourhood':'city'})

    for i in x['city']:
        x['city'] = 'Madrid'


    #room type values: Entire place, Private room, Shared room, Hotel room
    x['room_type'] = x['room_type'].str.replace("Entire home/apt","Entire place")

    
    #bathrooms_text - split at space and take first part fill nans and find any other irregular values
    x['bathrooms_text'] = x['bathrooms_text'].fillna('0 baths')
    x['bathrooms'] = x['bathrooms_text'].str.split(" ", n = 0, expand = True)[0]

    
    #replace any non-numerica value with a 1 (there are only alpha values for existing bathrooms)
    baths = []
    for i in x['bathrooms']:
        if i.isnumeric() == False:
            baths.append(1)
        else:
            baths.append(i)
    x['bathrooms'] = baths

    
    #Drop Bathrooms_text, no longer needed, change bathrooms 
    x = x.drop(columns=['bathrooms_text'])
    x['bathrooms'] = x['bathrooms'].astype(int)



    #price - convert to euros
    x['price'] = x['price'].str.lstrip('$')
    x['price'] = x['price'].replace(',', '', regex=True)
    x['price'] = x['price'].astype(float)
    x['price'] = x['price']*0.99
    x = x.rename(columns={'price':'price_€'})


    #Review part - fill na with 0
    def fillnil(y):
        for i in y:
            y = y.fillna(0)
        return y


    x['number_of_reviews'] = fillnil(x['number_of_reviews'])
    x['review_scores_rating'] = fillnil(x['review_scores_rating'])
    x['review_scores_accuracy'] = fillnil(x['review_scores_accuracy'])
    x['review_scores_cleanliness'] = fillnil(x['review_scores_cleanliness'])
    x['review_scores_checkin'] = fillnil(x['review_scores_checkin'])
    x['review_scores_communication'] = fillnil(x['review_scores_communication'])
    x['review_scores_location'] = fillnil(x['review_scores_location'])
    x['review_scores_value'] = fillnil(x['review_scores_value'])

    #License - change to boolean T F

    license = []
    for i in x['license']:
        if i is None:
            license.append('F')
        else:
            license.append('T') 
    x['license'] = license 

    # reviews per month - fill 0    
    x['reviews_per_month'] = fillnil(x['reviews_per_month'])
    
    
    
    #loop using Regex to extract values and insert them into a list, finally putting them into their own columns and scoring them

    lift = []
    heating = []
    aircon = []
    exterior = []
    garden = []
    pool = []
    terrace = []
    balcony = []
    storage = []

    for i in x['amenities']:
            if re.findall('Elevator|Lift', i):
                lift.append(1)
            else:
                lift.append(0)
    x['lift'] = lift

    for i in x['amenities']:
            if re.findall('heating', i):
                heating.append(1)
            else:
                heating.append(0)
    x['heating'] = heating

    for i in x['amenities']:
            if re.findall('ondition|ircon', i):
                aircon.append(1)
            else:
                aircon.append(0)
    x['aircon'] = aircon

    for i in x['amenities']:
            if re.findall('exterior|Exterior|Outside|outside', i):
                exterior.append(1)
            else:
                exterior.append(0)
    x['exterior'] = exterior

    for i in x['amenities']:
            if re.findall('garden|Garden', i):
                garden.append(1)
            else:
                garden.append(0)
    x['garden'] = garden

    for i in x['amenities']:
            if re.findall('pool|Pool', i):
                pool.append(1)
            else:
                pool.append(0)
    x['pool'] = pool

    for i in x['amenities']:
            if re.findall('errace', i):
                terrace.append(1)
            else:
                terrace.append(0)
    x['terrace'] = terrace

    for i in x['amenities']:
            if re.findall('alcony', i):
                balcony.append(1)
            else:
                balcony.append(0)
    x['balcony'] = balcony

    for i in x['amenities']:
            if re.findall('torage', i):
                storage.append(1)
            else:
                storage.append(0)
    x['storage'] = storage

    
    #Final amenity score, finding a total for each lodging depending on their amenities
    x['amenity_score'] = x['lift'] + x['heating'] + x['aircon'] + x['exterior'] + x['garden'] + x['pool'] + x['terrace'] + x['balcony'] + x['storage']
    
    
    #Reorder columns
    x = x[['id','host_id','host_name','host_response_rate','host_acceptance_rate','host_is_superhost','host_listings_count','host_total_listings_count','host_identity_verified','city','neighbourhood_cleansed','neighbourhood_group_cleansed','latitude','longitude','price_€','room_type','accommodates','bathrooms','bedrooms','beds','lift','heating','aircon','exterior','garden','pool','terrace','balcony','storage','amenity_score','minimum_nights','maximum_nights','number_of_reviews','review_scores_rating','review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value','license','instant_bookable','reviews_per_month']]
    
    return x


**Import Dataframes, see the shape and Dataframe contents before the clean...**

In [4]:
Sep21 = pd.read_csv('raw_data/AirbnbData/10Sep21/listings.csv')
Dec21 = pd.read_csv('raw_data/AirbnbData/8Dec21/listings.csv')
Mar22 = pd.read_csv('raw_data/AirbnbData/10Mar22/listings.csv')
Jun22 = pd.read_csv('raw_data/AirbnbData/7Jun22/listings.csv')
print('Sep 21 shape:',Sep21.shape)
print('Dec 21 shape:',Dec21.shape)
print('March 22 shape:',Mar22.shape)
print('Jun 22 shape:',Jun22.shape)
Sep21.head(2)

Sep 21 shape: (18909, 74)
Dec 21 shape: (17831, 74)
March 22 shape: (18310, 74)
Jun 22 shape: (19446, 74)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,6369,https://www.airbnb.com/rooms/6369,20210910193531,2021-09-11,"Rooftop terrace room , ensuite bathroom",Excellent connection with the AIRPORT and EXHI...,,https://a0.muscache.com/pictures/683224/4cc318...,13660,https://www.airbnb.com/users/show/13660,Simon,2009-04-16,"Madrid, Community of Madrid, Spain","Gay couple, heterofriendly, enjoy having guest...",within a few hours,100%,83%,t,https://a0.muscache.com/im/pictures/user/1c793...,https://a0.muscache.com/im/pictures/user/1c793...,Hispanoamérica,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,t,,Hispanoamérica,Chamartín,40.45724,-3.67688,Private room in rental unit,Private room,2,,1 shared bath,1.0,1.0,"[""Wifi"", ""Shampoo"", ""Extra pillows and blanket...",$60.00,1,1125,1.0,1.0,1125.0,1125.0,1.0,1125.0,,t,30,60,90,180,2021-09-11,80,4,0,2016-03-31,2019-05-14,4.87,4.91,4.81,4.8,4.89,4.77,4.85,,f,2,0,2,0,1.21
1,21853,https://www.airbnb.com/rooms/21853,20210910193531,2021-09-11,Bright and airy room,We have a quiet and sunny room with a good vie...,We live in a leafy neighbourhood with plenty o...,https://a0.muscache.com/pictures/68483181/87bc...,83531,https://www.airbnb.com/users/show/83531,Abdel,2010-02-21,"Madrid, Madrid, Spain",EN-ES-FR\r\nEN\r\nHi everybody: I'm Abdel. I'm...,,,,f,https://a0.muscache.com/im/users/83531/profile...,https://a0.muscache.com/im/users/83531/profile...,Aluche,2.0,2.0,"['email', 'phone', 'reviews', 'manual_offline'...",t,t,"Madrid, Spain",Cármenes,Latina,40.40381,-3.7413,Private room in rental unit,Private room,1,,1 bath,1.0,1.0,"[""Free parking on premises"", ""Shampoo"", ""Pocke...",$31.00,4,40,4.0,4.0,40.0,40.0,4.0,40.0,,t,29,59,89,364,2021-09-11,33,0,0,2014-10-10,2018-05-29,4.58,4.72,4.56,4.75,4.82,4.21,4.67,,f,2,0,2,0,0.39


**...and after the clean**

In [5]:
s21 = listings(Sep21)
d21 = listings(Dec21)
m22 = listings(Mar22)
j22 = listings(Jun22)
print(s21.shape)
print(d21.shape)
print(m22.shape)
print(j22.shape)
s21.head(2)

(18909, 43)
(17831, 43)
(18310, 43)
(19446, 43)


Unnamed: 0,id,host_id,host_name,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_identity_verified,city,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,price_€,room_type,accommodates,bathrooms,bedrooms,beds,lift,heating,aircon,exterior,garden,pool,terrace,balcony,storage,amenity_score,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,reviews_per_month
0,6369,13660,Simon,1.0,0.83,T,1,1,T,Madrid,Hispanoamérica,Chamartín,40.45724,-3.67688,59.4,Private room,2,1,1,1,1,0,1,0,0,0,0,0,0,2,1,1125,80,4.87,4.91,4.81,4.8,4.89,4.77,4.85,T,F,1.21
1,21853,83531,Abdel,0.0,0.0,F,2,2,T,Madrid,Cármenes,Latina,40.40381,-3.7413,30.69,Private room,1,1,1,1,1,0,1,0,0,0,0,0,0,2,4,40,33,4.58,4.72,4.56,4.75,4.82,4.21,4.67,T,F,0.39


**Concatenate the tables into one.**

In [6]:
year = pd.concat([s21,d21,m22,j22], axis = 0, ignore_index = True)
print(year.shape)
year.head(2)

(74496, 43)


Unnamed: 0,id,host_id,host_name,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_identity_verified,city,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,price_€,room_type,accommodates,bathrooms,bedrooms,beds,lift,heating,aircon,exterior,garden,pool,terrace,balcony,storage,amenity_score,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,reviews_per_month
0,6369,13660,Simon,1.0,0.83,T,1,1,T,Madrid,Hispanoamérica,Chamartín,40.45724,-3.67688,59.4,Private room,2,1,1,1,1,0,1,0,0,0,0,0,0,2,1,1125,80,4.87,4.91,4.81,4.8,4.89,4.77,4.85,T,F,1.21
1,21853,83531,Abdel,0.0,0.0,F,2,2,T,Madrid,Cármenes,Latina,40.40381,-3.7413,30.69,Private room,1,1,1,1,1,0,1,0,0,0,0,0,0,2,4,40,33,4.58,4.72,4.56,4.75,4.82,4.21,4.67,T,F,0.39


*There are over 8157 duplicates after the concat. We can remove them on the 'host_id' column* 

In [7]:
year.duplicated().sum()

8157

In [8]:
HostListings = year.drop_duplicates(subset=['host_id'], keep='last')
HostListings

Unnamed: 0,id,host_id,host_name,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_identity_verified,city,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,price_€,room_type,accommodates,bathrooms,bedrooms,beds,lift,heating,aircon,exterior,garden,pool,terrace,balcony,storage,amenity_score,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,reviews_per_month
22,75609,401552,Mercedes,1.00,0.88,F,2,2,F,Madrid,Embajadores,Centro,40.40892,-3.69697,62.37,Entire place,3,1,1,2,1,0,1,0,0,0,0,0,0,2,3,365,56,4.76,4.85,4.76,4.96,4.94,4.81,4.70,T,T,0.64
51,199016,967721,Santi,0.75,0.14,F,2,2,T,Madrid,Universidad,Centro,40.42570,-3.70383,27.72,Private room,1,1,1,1,1,0,0,0,0,0,0,0,0,1,2,365,25,4.46,4.70,4.70,4.74,4.63,4.96,4.54,T,F,0.34
83,318826,1636378,Francisco Tomas,1.00,1.00,T,1,1,F,Madrid,Aluche,Latina,40.39577,-3.75985,99.00,Entire place,4,1,3,3,1,0,1,0,0,0,0,1,1,4,5,1125,16,4.93,5.00,5.00,4.94,5.00,4.50,4.81,T,F,0.15
90,339813,1713273,Antonio,0.00,0.00,F,2,2,T,Madrid,Cortes,Centro,40.41204,-3.69750,79.20,Entire place,6,1,2,3,0,0,0,0,0,0,0,0,0,0,2,25,2,0.00,5.00,4.00,4.00,4.00,5.00,4.00,T,F,0.02
96,345233,1095831,Rosalind & Tino,0.00,0.00,F,5,5,T,Madrid,Universidad,Centro,40.42504,-3.70400,76.23,Private room,2,1,1,0,0,0,1,0,0,0,0,0,0,1,3,1125,4,5.00,5.00,5.00,5.00,5.00,5.00,5.00,T,F,0.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74490,53601180,71818704,María MG,1.00,0.00,F,2,2,T,Madrid,Casa de Campo,Moncloa - Aravaca,40.41847,-3.79295,48.51,Private room,1,1,1,1,0,0,0,0,0,0,0,0,0,0,7,60,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,T,F,0.00
74491,604066064140955430,14857326,Lupe,1.00,1.00,F,1,1,T,Madrid,Aravaca,Moncloa - Aravaca,40.43442,-3.80449,32.67,Private room,1,1,1,1,0,0,0,0,0,1,0,0,0,1,7,180,1,5.00,5.00,5.00,5.00,5.00,5.00,5.00,T,T,0.64
74493,588983931844544540,9547334,Ivan,1.00,1.00,T,13,13,T,Madrid,El Goloso,Fuencarral - El Pardo,40.54140,-3.65173,102.96,Entire place,6,1,1,5,0,0,1,0,0,1,0,0,0,2,1,365,10,4.30,4.40,4.20,4.10,4.30,3.80,4.10,T,T,4.11
74494,38980107,298641465,Martha,0.67,0.67,F,0,0,F,Madrid,Valdefuentes,Hortaleza,40.52269,-3.63849,59.40,Private room,1,2,1,1,1,0,1,0,0,1,0,0,0,3,2,7,1,5.00,5.00,5.00,5.00,5.00,5.00,5.00,T,F,1.00


**Export the listings table to CSV file**

In [9]:
#HostListings.to_csv('HostListings.csv', index = False)

**Create and Export Df with Geographical Coordinates**

n.b - I had to do a small manual intervention on the latlong coordinates, as the 'rent' tables did not contain all of the same neighbourhoods as the listings table, as they were either not there, spelled differently, or contained extra names. 

This manual work included: 

- creation of latitude/longitude coordinates 
- adding extra values to the list. 

In [10]:
latlon = HostListings[['neighbourhood_cleansed','neighbourhood_group_cleansed','latitude','longitude']]
latlon['neighbourhood_cleansed'] = latlon['neighbourhood_cleansed'].drop_duplicates()
latlon = latlon.rename(columns={'neighbourhood_cleansed':'neighbourhood','neighbourhood_group_cleansed':'district'})
latlon['neighbourhood'] = latlon['neighbourhood'].str.strip()
latlon = latlon.dropna()
latlon

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latlon['neighbourhood_cleansed'] = latlon['neighbourhood_cleansed'].drop_duplicates()


Unnamed: 0,neighbourhood,district,latitude,longitude
22,Embajadores,Centro,40.408920,-3.696970
51,Universidad,Centro,40.425700,-3.703830
83,Aluche,Latina,40.395770,-3.759850
90,Cortes,Centro,40.412040,-3.697500
116,Entrevías,Puente de Vallecas,40.379000,-3.674230
...,...,...,...,...
32102,Fuentelareina,Fuencarral - El Pardo,40.481430,-3.737570
35913,Aeropuerto,Barajas,40.498534,-3.544844
54048,El Plantío,Moncloa - Aravaca,40.473900,-3.835620
66546,Horcajo,Moratalaz,40.406110,-3.630100


In [11]:
#DO NOT EXPORT 
#latlon.to_csv('latlong.csv', index = False)