In [16]:
# Data manipulation
import pandas as pd

# Geojson
from pandas_geojson import read_geojson, write_geojson

# Datetime
from datetime import datetime

In [17]:
def read_csv(city):
    """Load the raw listings CSV of one city into a DataFrame.

    Parameters
    ----------
    city : str
        Name interpolated into ``../raw/airbnb/{city}/listings.csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents, decoded as UTF-8.
    """
    listings_path = f'../raw/airbnb/{city}/listings.csv'
    return pd.read_csv(listings_path, encoding='utf-8')

In [18]:
def get_description_score_facilities(df_name_col):
    """Split listing names into description, score and facilities.

    Every name is split on the middle dot (``·``). The first segment becomes
    the description (capitalized, trailing spaces removed). If the next
    segment contains ``★`` it becomes the score, otherwise the score is
    ``'★Unknown'``. All remaining segments, stripped of surrounding spaces,
    are the facilities.

    Names with no ``·`` separator, names with nothing after the score, and
    missing names (``None``/``NaN``) are handled without raising: they yield
    an empty facilities list, and a missing name yields an empty description.

    Parameters
    ----------
    df_name_col : pandas.Series
        Column of listing names (any object exposing ``tolist()`` works).

    Returns
    -------
    list
        ``[description, score, facilities]`` where ``description`` and
        ``score`` are lists of str and ``facilities`` is a list of lists of
        str, all aligned with the input order.
    """
    description = []
    score = []
    facilities = []

    for raw_name in df_name_col.tolist():
        # Missing values are floats/None and have no .split(); treat them as
        # empty text so one bad row cannot abort the whole column.
        text = '' if pd.isna(raw_name) else str(raw_name)
        segments = text.split('·')

        description.append(segments[0].capitalize().rstrip(' '))
        remaining = segments[1:]

        # The score segment is optional; only consume it when it is present.
        if remaining and '★' in remaining[0]:
            score.append(remaining[0].strip(' '))
            remaining = remaining[1:]
        else:
            score.append('★Unknown')

        facilities.append([segment.strip(' ') for segment in remaining])

    return [description, score, facilities]

In [19]:
# Splits column name into description, score and facilities

def split_name_col(df_city):
    """Add ``description``, ``score`` and ``facilities`` columns in place.

    The ``name`` column is parsed with ``get_description_score_facilities``
    and the three results are written back onto ``df_city``. Facilities are
    stored as a single comma-separated string per row.

    Parameters
    ----------
    df_city : pandas.DataFrame
        Frame holding a ``name`` column; it is modified in place.

    Returns
    -------
    None
    """
    # Parse the names a single time and reuse the three result lists instead
    # of re-parsing the whole column once per output column.
    description, score, facilities = get_description_score_facilities(df_city['name'])

    df_city['description'] = description
    df_city['score'] = score
    df_city['facilities'] = [', '.join(map(str, items)) for items in facilities]

In [20]:
def drop_rename_timestamp(df_city):
    """Drop unused columns, rename the district column and parse review dates.

    The input frame is not modified; a cleaned copy is returned.

    Parameters
    ----------
    df_city : pandas.DataFrame
        Listings frame. Must contain ``last_review``; the columns listed in
        ``unused_cols`` are removed when present.

    Returns
    -------
    pandas.DataFrame
        New frame without the unused columns, with ``neighbourhood_group``
        renamed to ``district`` and ``last_review`` converted to datetime
        (values that do not match ``%Y-%m-%d`` become ``NaT``).
    """
    unused_cols = ['id', 'host_id', 'name', 'reviews_per_month',
                   'calculated_host_listings_count', 'number_of_reviews_ltm',
                   'license']

    # drop() without inplace returns a new frame, so the caller's frame is
    # left untouched; errors='ignore' skips any listed column that a given
    # export does not contain instead of raising KeyError.
    df_clean = (
        df_city
        .drop(columns=unused_cols, errors='ignore')
        .rename(columns={'neighbourhood_group': 'district'})
    )

    # errors='coerce' turns missing or malformed dates into NaT.
    df_clean['last_review'] = pd.to_datetime(df_clean['last_review'], format='%Y-%m-%d', errors='coerce')

    return df_clean

In [21]:
def new_col_order(df_city):
    """Return the frame reduced to the export columns, in export order.

    Parameters
    ----------
    df_city : pandas.DataFrame
        Frame containing at least the thirteen columns named below.

    Returns
    -------
    pandas.DataFrame
        Selection of ``df_city`` holding only those columns, ordered as
        listed; any other column is left out.
    """
    listing_cols = ['description', 'score', 'facilities']
    area_cols = ['district', 'neighbourhood']
    stay_cols = ['room_type', 'price', 'minimum_nights']
    host_review_cols = ['host_name', 'number_of_reviews', 'last_review']
    coord_cols = ['latitude', 'longitude']

    # Selecting with a list both filters and reorders the columns.
    return df_city[listing_cols + area_cols + stay_cols + host_review_cols + coord_cols]

In [22]:
def col_names_kepler(df_city):
    """Rename every column in place to a capitalized, space-separated label.

    Each column name is capitalized and its underscores are replaced by
    spaces (``'room_type'`` becomes ``'Room type'``).

    Parameters
    ----------
    df_city : pandas.DataFrame
        Frame whose column labels are strings; it is modified in place.

    Returns
    -------
    None
    """
    def _as_label(col):
        # Capitalize first, then swap underscores for spaces.
        return col.capitalize().replace('_', ' ')

    df_city.rename(columns=_as_label, inplace=True)

In [23]:
# Saves df to csv

def save_csv(df_city, city):
    """Write the frame to ``../raw/eda-result/airbnb_{city}.csv``.

    Parameters
    ----------
    df_city : pandas.DataFrame
        Frame to export; the index is not written.
    city : str
        Name interpolated into the output file name.

    Returns
    -------
    None
    """
    output_path = f'../raw/eda-result/airbnb_{city}.csv'
    df_city.to_csv(output_path, sep=',', index=False)

In [24]:
def clean_save_geojson(city):
    """Rename the neighbourhood properties of a city's GeoJSON and save it.

    Reads ``../raw/airbnb/{city}/neighbourhoods.geojson``, and in every
    feature's ``properties`` replaces the key ``neighbourhood`` with
    ``Neighbourhood`` and ``neighbourhood_group`` with ``District``. The
    result is written to ``../raw/eda-result/neighbourhood_{city}.geojson``.

    Parameters
    ----------
    city : str
        Name interpolated into the input and output paths.

    Returns
    -------
    None
    """
    source_path = f'../raw/airbnb/{city}/neighbourhoods.geojson'
    target_path = f'../raw/eda-result/neighbourhood_{city}.geojson'

    geo_json = read_geojson(source_path)

    for feature in geo_json['features']:
        props = feature['properties']
        # pop() removes the old key and hands back its value, which is then
        # stored under the new key.
        props['Neighbourhood'] = props.pop('neighbourhood')
        props['District'] = props.pop('neighbourhood_group')

    write_geojson(geo_json, filename=target_path)

In [25]:
# Cities to process; each name is interpolated into the file paths built by
# read_csv, save_csv and clean_save_geojson.
cities = ['madrid', 'porto', 'lisbon']

for city in cities:
    # Full cleaning pipeline for one city. Order matters: split_name_col and
    # col_names_kepler modify df_city in place (their return value is not
    # used), while drop_rename_timestamp and new_col_order return a frame
    # that is rebound to df_city.
    df_city = read_csv(city)
    split_name_col(df_city)
    df_city = drop_rename_timestamp(df_city)
    df_city = new_col_order(df_city)
    # Column labels are renamed last, after new_col_order has selected the
    # columns by their snake_case names.
    col_names_kepler(df_city)
    save_csv(df_city, city)
    # Independent of df_city: converts the city's neighbourhoods GeoJSON.
    clean_save_geojson(city)