In [69]:
# Data manipulation
import pandas as pd # data manipulation and dataframes
from pandas_geojson import read_geojson, write_geojson

In [70]:
# Pandas display settings: never truncate the column list or the cell contents
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [71]:
# Load the listings of each city. The first CSV column is read as the index
# and then moved straight back into a regular column by reset_index, leaving
# every frame with a fresh default index.
madrid_df = pd.read_csv('../data/airbnb/madrid/listings.csv', index_col=0).reset_index(drop=False)
porto_df = pd.read_csv('../data/airbnb/porto/listings.csv', index_col=0).reset_index(drop=False)
lisbon_df = pd.read_csv('../data/airbnb/lisbon/listings.csv', index_col=0).reset_index(drop=False)

### Cleaning the data for loading into kepler.gl

In [72]:
def get_description_score_facilities(df_name_col):
    """Split listing names into description, score and facilities.

    Each name is split on the middle dot (``'·'``). The first part is the
    description, an optional second part containing ``'★'`` is the score, and
    every remaining part is one facility.

    Parameters
    ----------
    df_name_col : pandas.Series or iterable
        Listing names. Objects exposing ``tolist()`` are converted with it;
        any other iterable is consumed as-is.

    Returns
    -------
    list
        ``[description, score, facilities]``, three lists aligned with the
        input order:

        * ``description`` -- first part, capitalised, trailing spaces removed.
        * ``score`` -- the ``'★'`` part stripped of surrounding spaces, or
          ``'★Unknown'`` when the name carries no score.
        * ``facilities`` -- list of the remaining parts, each stripped of
          surrounding spaces (empty list when there are none).

    Notes
    -----
    Names without any separator, names that end right after the score, and
    non-string values (e.g. NaN for a missing name) are handled without
    raising; a non-string value yields an empty description.
    """
    names = df_name_col.tolist() if hasattr(df_name_col, 'tolist') else list(df_name_col)

    description = []
    score = []
    facilities = []

    for name in names:

        # A missing name is not a string and cannot be split: treat it as empty
        parts = name.split('·') if isinstance(name, str) else ['']

        description.append(parts[0].capitalize().rstrip(' '))
        remaining = parts[1:]

        # The score is optional; guard against names that stop after the description
        if remaining and '★' in remaining[0]:
            score.append(remaining[0].strip(' '))
            remaining = remaining[1:]
        else:
            score.append('★Unknown')

        facilities.append([part.strip(' ') for part in remaining])

    return [description, score, facilities]

In [73]:
# Splits column name into description, score and facilities

for city_df in (madrid_df, porto_df, lisbon_df):

    # Parse the names once per frame and unpack the three aligned lists,
    # instead of re-running the parser for every output column
    description, score, facilities = get_description_score_facilities(city_df['name'])

    city_df['description'] = description
    city_df['score'] = score
    # Each listing's facilities are stored as one comma-separated string
    city_df['facilities'] = [', '.join(map(str, items)) for items in facilities]

In [74]:
# Columns removed from every city's listings
unused_columns = [
    'id', 'host_id', 'name', 'reviews_per_month',
    'calculated_host_listings_count', 'number_of_reviews_ltm', 'license',
]

for listings in (madrid_df, porto_df, lisbon_df):
    listings.drop(columns=unused_columns, inplace=True)

# 'neighbourhood_group' is exposed as 'district' from here on
district_rename = {'neighbourhood_group': 'district'}

madrid_df = madrid_df.rename(columns=district_rename)
porto_df = porto_df.rename(columns=district_rename)
lisbon_df = lisbon_df.rename(columns=district_rename)

In [75]:
# Column layout shared by the three city frames; selecting with this list
# also discards any column that is not named in it
col_order = [
    'description', 'score', 'facilities',
    'district', 'neighbourhood',
    'room_type', 'price', 'minimum_nights',
    'host_name', 'number_of_reviews', 'last_review',
    'latitude', 'longitude',
]

madrid_df, porto_df, lisbon_df = (
    listings[col_order] for listings in (madrid_df, porto_df, lisbon_df)
)

In [76]:
# Turn snake_case headers into display labels ('host_name' -> 'Host name')
for listings in (madrid_df, porto_df, lisbon_df):
    columnas_nuevas = {
        col: col.capitalize().replace('_', ' ')
        for col in listings.columns
    }
    listings.rename(columns=columnas_nuevas, inplace=True)

In [77]:
# Write each city's frame to its own comma-separated file, without the index

csv_exports = (
    (madrid_df, '../data/kepler/airbnb_madrid.csv'),
    (porto_df, '../data/kepler/airbnb_porto.csv'),
    (lisbon_df, '../data/kepler/airbnb_lisbon.csv'),
)

for listings, csv_path in csv_exports:
    listings.to_csv(csv_path, sep=',', index=False)

In [78]:
def clean_save_geojson(city):
    """Rename the neighbourhood properties of a city's GeoJSON and export it.

    Reads ``../data/airbnb/{city}/neighbourhoods.geojson``, renames the
    ``neighbourhood`` and ``neighbourhood_group`` properties of every feature
    to ``Neighbourhood`` and ``District``, and writes the result to
    ``../data/kepler/{city}_neighbourhoods.geojson``.

    Parameters
    ----------
    city : str
        Name of the city folder under ``../data/airbnb``; also used as the
        prefix of the output file name.

    Raises
    ------
    KeyError
        If a feature lacks the ``neighbourhood`` or ``neighbourhood_group``
        property.
    """
    path_to_file = f'../data/airbnb/{city}/neighbourhoods.geojson'
    geo_json = read_geojson(path_to_file)

    # Walk the feature list itself. Looping over range(len(geo_json)) is wrong
    # when geo_json is a dict-like GeoJSON object (assumed here -- confirm
    # against pandas_geojson): its length is the number of top-level keys,
    # not the number of features, so most features would be left untouched.
    for feature in geo_json['features']:

        properties = feature['properties']

        # pop + assign renames the key while keeping the stored value
        properties['Neighbourhood'] = properties.pop('neighbourhood')
        properties['District'] = properties.pop('neighbourhood_group')

    write_geojson(geo_json, filename=f'../data/kepler/{city}_neighbourhoods.geojson')

In [79]:
# Export the cleaned neighbourhood boundaries of every city
cities = ['madrid', 'porto', 'lisbon']

for city in cities:
    clean_save_geojson(city)