In [1]:
# Data manipulation
import pandas as pd # data manipulation and dataframes
from pandas_geojson import read_geojson, write_geojson

In [2]:
# Pandas display configuration (attribute-style access to the same options)
pd.options.display.max_columns = None   # never truncate the list of columns
pd.options.display.max_colwidth = None  # never truncate the content of a cell

In [3]:
def read_csv(city):
    """Load the Airbnb listings of ``city`` into a DataFrame.

    The file ``../data/airbnb/<city>/listings.csv`` is read with its first
    column as the index; that index is then moved back into a regular column,
    so the returned frame has a fresh positional index.
    """
    listings_path = f'../data/airbnb/{city}/listings.csv'
    listings = pd.read_csv(listings_path, index_col=0)

    # Non-in-place reset: the former index becomes the first column again.
    return listings.reset_index(drop=False)

In [4]:
def _split_listing_name(raw_name):
    """Split one listing name into ``(description, score, facilities)``."""
    # A missing name (e.g. NaN) is not a string; treat it as empty text
    # instead of failing on ``.split``.
    parts = raw_name.split('·') if isinstance(raw_name, str) else ['']

    description = parts[0].capitalize().rstrip(' ')
    remaining = parts[1:]

    # The score, when present, is the segment right after the description.
    # ``remaining`` may be empty when the name contains no '·' separator.
    if remaining and '★' in remaining[0]:
        score = remaining[0].strip(' ')
        remaining = remaining[1:]
    else:
        score = '★Unknown'

    # Whatever is left are the facilities; an empty list is a valid result.
    facilities = [segment.strip(' ') for segment in remaining]

    return description, score, facilities


def get_description_score_facilities(df_name_col):
    """Parse the '·'-separated listing names into three parallel lists.

    Each name is expected to look like
    ``'<description> · ★<score> · <facility> · <facility> ...'``; the score
    segment and the facilities are optional.

    Parameters
    ----------
    df_name_col : pandas.Series
        Column holding the raw listing names (anything exposing ``tolist()``).

    Returns
    -------
    list
        ``[description, score, facilities]`` where

        * ``description`` is a list of str (capitalized, trailing spaces removed),
        * ``score`` is a list of str (the segment containing '★', or
          ``'★Unknown'`` when the name has none),
        * ``facilities`` is a list of lists of str (possibly empty lists).

    Notes
    -----
    Names without a separator, names without facilities and missing
    (non-string) names no longer raise; they yield ``'★Unknown'`` and/or an
    empty facilities list, and a missing name yields an empty description.
    """
    description = []
    score = []
    facilities = []

    for raw_name in df_name_col.tolist():
        name_description, name_score, name_facilities = _split_listing_name(raw_name)
        description.append(name_description)
        score.append(name_score)
        facilities.append(name_facilities)

    return [description, score, facilities]

In [5]:
# Splits column name into description, score and facilities

def split_name_col(df_city):
    """Add ``description``, ``score`` and ``facilities`` columns parsed from ``name``.

    The frame is modified in place and nothing is returned. ``facilities`` is
    stored as one comma-separated string per row.
    """
    # Parse the name column once instead of once per derived column.
    description, score, facilities = get_description_score_facilities(df_city['name'])

    df_city['description'] = description
    df_city['score'] = score
    df_city['facilities'] = [', '.join(map(str, items)) for items in facilities]

In [6]:
def drop_rename(df_city):
    """Return a copy of ``df_city`` without the unused columns and with a ``district`` column.

    ``neighbourhood_group`` is renamed to ``district`` and its text values are
    normalised to "Capitalized lowercase" form. Missing (non-string) district
    values are left untouched instead of raising.

    The input frame is not modified; callers must use the returned frame.

    Raises
    ------
    KeyError
        If one of the columns to drop, or ``neighbourhood_group``, is absent.
    """
    unused_cols = ['id', 'host_id', 'name', 'reviews_per_month',
                   'calculated_host_listings_count', 'number_of_reviews_ltm', 'license']

    # drop/rename both return new frames, so the caller's frame stays intact.
    df_renamed = (
        df_city
        .drop(unused_cols, axis=1)
        .rename(columns={'neighbourhood_group': 'district'})
    )

    # Only strings are normalised; NaN and other non-string values pass through.
    df_renamed['district'] = df_renamed['district'].apply(
        lambda value: value.lower().capitalize() if isinstance(value, str) else value
    )

    return df_renamed

In [7]:
def new_col_order(df_city):
    """Return ``df_city`` restricted to the export columns, in export order.

    Columns not listed below are left out of the result; a listed column that
    is absent from ``df_city`` raises ``KeyError``.
    """
    ordered_cols = (
        'description', 'score', 'facilities',
        'district', 'neighbourhood',
        'room_type', 'price', 'minimum_nights',
        'host_name', 'number_of_reviews',
        'last_review', 'latitude', 'longitude',
    )

    return df_city[list(ordered_cols)]

In [8]:
def col_names_kepler(df_city):
    """Rename the columns of ``df_city`` in place to a human-readable form.

    Every column label is capitalized and its underscores are replaced by
    spaces (``'host_name'`` becomes ``'Host name'``). Nothing is returned.
    """
    def _pretty(label):
        return label.capitalize().replace('_', ' ')

    df_city.rename(columns=_pretty, inplace=True)

In [9]:
# Saves df to csv

def save_csv(df_city, city):
    """Write ``df_city`` to ``../data/kepler/airbnb_<city>.csv``.

    The file is comma-separated and the frame's index is not written.
    """
    target_path = f'../data/kepler/airbnb_{city}.csv'
    df_city.to_csv(target_path, sep=',', index=False)

In [10]:
def clean_save_geojson(city):
    """Rename the neighbourhood properties of a city's GeoJSON and save a copy.

    Reads ``../data/airbnb/<city>/neighbourhoods.geojson``, renames, for every
    feature, the property ``neighbourhood`` to ``Neighbourhood`` and
    ``neighbourhood_group`` to ``District``, and writes the result to
    ``../data/kepler/neighbourhood_<city>.geojson``.

    Raises
    ------
    KeyError
        If a feature lacks one of the two properties being renamed.
    """
    source_path = f'../data/airbnb/{city}/neighbourhoods.geojson'
    geo_json = read_geojson(source_path)

    # Walk the features themselves. Looping over range(len(geo_json)) would
    # use the size of the top-level mapping (its keys) rather than the number
    # of features, leaving most features untouched.
    for feature in geo_json['features']:
        properties = feature['properties']
        properties['Neighbourhood'] = properties.pop('neighbourhood')
        properties['District'] = properties.pop('neighbourhood_group')

    write_geojson(geo_json, filename=f'../data/kepler/neighbourhood_{city}.geojson')

In [11]:
cities = ['madrid', 'porto', 'lisbon']

for city in cities:
    # Listings: load, derive the name-based columns (in place), then trim,
    # rename and reorder the columns for the export.
    df_city = read_csv(city)
    split_name_col(df_city)
    df_city = new_col_order(drop_rename(df_city))

    # Human-readable headers (in place), then write the CSV.
    col_names_kepler(df_city)
    save_csv(df_city, city)

    # Neighbourhood polygons for the same city.
    clean_save_geojson(city)