# Aula 5: Funções e estrutura de dados

In [None]:
import pandas as pd

# load dataset
# Only the first 20 records are used in this lesson, so let the parser stop
# after 20 rows instead of reading the whole file and discarding the rest.
data = pd.read_csv('kc_house_data.csv', nrows=20)

In [None]:
# data dimensions: report how many rows and columns were loaded
n_rows, n_cols = data.shape
print(f'Number of rows: {n_rows}')
print(f'Number of columns: {n_cols}')

In [None]:
# data types: display the dtype of every column of `data`
# (as the last expression of the cell, the result is rendered by the notebook)
data.dtypes

In [None]:
# convert object to date: parse the 'date' column into datetime values
parsed_dates = pd.to_datetime(data['date'])
data['date'] = parsed_dates

In [None]:
# descriptive statistics
import numpy as np

# render floats with five decimal places in notebook output
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# keep only the int64/float64 columns for the summary
num_attributes = data.select_dtypes(include=['int64', 'float64'])

# central tendency -> mean, median
media = num_attributes.apply(np.mean).to_frame()
mediana = num_attributes.apply(np.median).to_frame()

# dispersion -> std, min, max
std = num_attributes.apply(np.std).to_frame()
min_ = num_attributes.apply(np.min).to_frame()
max_ = num_attributes.apply(np.max).to_frame()

# one row per attribute, one column per statistic
summary_parts = [max_, min_, media, mediana, std]
df1 = pd.concat(summary_parts, axis=1).reset_index()
df1.columns = ['attributes', 'max', 'min', 'media', 'mediana', 'desvio']

In [None]:
# data dimensions of the statistics summary table
for axis_label, axis_size in zip(('rows', 'columns'), df1.shape):
    print('Number of {}: {}'.format(axis_label, axis_size))

In [None]:
# create a new column called 'dormitory_type':
#   'bedrooms' equal to 1   -> 'studio'
#   'bedrooms' equal to 2   -> 'apartament'
#   any other value         -> 'house'
# Computed for the whole column at once: this avoids the row-by-row `.loc`
# loop, which is slow and only works when the index is exactly 0..n-1.
bedrooms = data['bedrooms']
data['dormitory_type'] = np.select(
    [bedrooms == 1, bedrooms == 2],
    ['studio', 'apartament'],
    default='house',
)

In [None]:
# data dimensions after adding the new column
row_count = data.shape[0]
column_count = data.shape[1]
print('Number of rows: {}'.format(row_count))
print('Number of columns: {}'.format(column_count))

In [None]:
# application example 1: define the price levels
#   price <= 321950             -> level 0
#   321950 < price <= 450000    -> level 1
#   450000 < price <= 645000    -> level 2
#   anything else               -> level 3
# np.select picks the first matching condition, so the upper bounds are
# listed in ascending order. The result is an integer column (no 'NA'
# placeholder, which would have forced an object dtype), and no assumption
# is made about the index being 0..n-1.
price = data['price']
data['level'] = np.select(
    [price <= 321950, price <= 450000, price <= 645000],
    [0, 1, 2],
    default=3,
)

In [None]:
from geopy.geocoders import Nominatim

# initialize API
geolocator = Nominatim(user_agent='geopiExercises')

# create the output columns, filled with a placeholder
data['road'] = 'NA'
data['house_number'] = 'NA'

total = len(data)
for position, idx in enumerate(data.index):
    print('Loop {}/{}'.format(position, total))

    query = str(data.loc[idx, 'lat']) + ',' + str(data.loc[idx, 'long'])
    response = geolocator.reverse(query)

    # reverse() may return None when nothing is found; keep the placeholders
    if response is None:
        continue

    # the raw payload may lack an 'address' entry; treat that as "no fields"
    address = response.raw.get('address', {})

    if 'house_number' in address:
        data.loc[idx, 'house_number'] = address['house_number']

    if 'road' in address:
        data.loc[idx, 'road'] = address['road']

In [None]:
import plotly.express as px

In [None]:
# map: plot each house, coloured by price level and sized by price
map_columns = ['id', 'lat', 'long', 'price', 'level']
houses = data[map_columns].copy()

fig = px.scatter_mapbox(
    houses,
    lat='lat',
    lon='long',
    color='level',
    size='price',
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=15,
    zoom=10,
)

# base map style, figure height and zero margins set in one layout update
fig.update_layout(
    mapbox_style='open-street-map',
    height=600,
    margin={'r': 0, 'l': 0, 'b': 0, 't': 0},
)
fig.show()

# 2.0 Refazendo o código

In [None]:
# ETL - Extração, Transformação e Load

# Extração

In [None]:
# Libraries
# ------------------------
import pandas as pd
from geopy.geocoders import Nominatim
import plotly.express as px
import numpy as np

# ------------------------
# Functions
def show_dtypes(data):
    """Print the dtype of every column of ``data``.

    Args:
        data: object exposing a ``dtypes`` attribute (e.g. a DataFrame).

    Returns:
        None.
    """
    column_types = data.dtypes
    print(column_types)

def show_dimensions(data):
    """Print the number of rows and columns of ``data``.

    Args:
        data: object exposing a two-element ``shape`` (e.g. a DataFrame).

    Returns:
        None.
    """
    shape = data.shape
    print(f'Number of Rows: {shape[0]}')
    print(f'Number of Columns: {shape[1]}')


def collect_geodata(data, cols):
    """Reverse-geocode the first 20 rows of ``data`` and attach address fields.

    For each row, the 'lat' and 'long' values are sent to the Nominatim
    reverse-geocoding service. Every name in ``cols`` becomes a column of the
    result, holding the address field of the same name when the service
    returns it and the placeholder 'NA' otherwise.

    Args:
        data: DataFrame with 'lat' and 'long' columns.
        cols: address field names to collect (e.g. ['road', 'house_number']);
            each is used both as the output column name and as the key looked
            up in the returned address.

    Returns:
        A new DataFrame with at most the first 20 rows of ``data`` plus one
        column per entry of ``cols``. The input frame is not modified.
    """
    geolocator = Nominatim(user_agent='geopiExercises')

    # Work on an independent copy so the assignments below neither write into
    # a slice of the caller's frame nor trigger chained-assignment warnings.
    data = data.head(20).copy()

    for col in cols:
        data[col] = 'NA'

    # Write by position so the function also works when the index is not
    # the default 0..n-1 range (e.g. a filtered frame).
    col_positions = {col: data.columns.get_loc(col) for col in cols}
    total = len(data)

    for position, (lat, lon) in enumerate(zip(data['lat'], data['long'])):
        print('Loop {}/{}'.format(position, total))

        query = str(lat) + ',' + str(lon)
        response = geolocator.reverse(query)

        # reverse() may return None when nothing is found; keep the 'NA's.
        if response is None:
            continue

        address = response.raw.get('address', {})

        # Each field is stored in the column with the same name. The previous
        # code wrote cols[0] into 'house_number' and cols[1] into 'road',
        # which swapped the values for cols == ['road', 'house_number'].
        for col in cols:
            if col in address:
                data.iat[position, col_positions[col]] = address[col]

    return data

def data_collect(path):
    """Load the CSV at ``path`` and print a short extraction report.

    Args:
        path: location of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame.
    """
    # load dataset
    raw = pd.read_csv(path)

    # 1.1 Extraction analysis: dimensions first, then column dtypes
    for report in (show_dimensions, show_dtypes):
        report(raw)

    return raw

def data_transform(data):
    """Clean and enrich the housing data, then geocode its first 20 rows.

    Steps:
      1. Parse the 'date' column into datetimes.
      2. Build a descriptive-statistics table for the int64/float64 columns
         and print its dimensions.
      3. Add 'dormitory_type' (from 'bedrooms') and 'level' (from 'price').
      4. Reverse-geocode the first 20 rows via ``collect_geodata``.

    Note: steps 1 and 3 modify ``data`` in place.

    Args:
        data: DataFrame with at least 'date', 'bedrooms', 'price', 'lat'
            and 'long' columns.

    Returns:
        The DataFrame returned by ``collect_geodata`` for the first 20 rows,
        including the 'dormitory_type' and 'level' columns.
    """
    # convert object to date
    data['date'] = pd.to_datetime(data['date'])

    # descriptive statistics
    num_attributes = data.select_dtypes(include=['int64', 'float64'])

    pd.set_option('display.float_format', lambda x: '%.5f' % x)

    # central tendency -> mean, median
    media = pd.DataFrame(num_attributes.apply(np.mean))
    mediana = pd.DataFrame(num_attributes.apply(np.median))

    # dispersion -> std, min, max
    std = pd.DataFrame(num_attributes.apply(np.std))
    min_ = pd.DataFrame(num_attributes.apply(np.min))
    max_ = pd.DataFrame(num_attributes.apply(np.max))

    stats = pd.concat([max_, min_, media, mediana, std], axis=1).reset_index()
    stats.columns = ['attributes', 'max', 'min', 'media', 'mediana', 'desvio']

    show_dimensions(stats)

    # dormitory type: 1 bedroom -> studio, 2 -> apartament, otherwise house
    bedrooms = data['bedrooms']
    data['dormitory_type'] = np.select(
        [bedrooms == 1, bedrooms == 2],
        ['studio', 'apartament'],
        default='house',
    )

    # price level: np.select takes the first matching condition, so the
    # upper bounds are listed in ascending order; everything else is level 3.
    # This used to sit after an early `return` and never ran, leaving the
    # result without the 'level' column that the map step needs.
    price = data['price']
    data['level'] = np.select(
        [price <= 321950, price <= 450000, price <= 645000],
        [0, 1, 2],
        default=3,
    )

    show_dimensions(data)

    # geodata is collected only for the first 20 rows
    cols = ['road', 'house_number']
    geodata = collect_geodata(data.head(20), cols)

    show_dimensions(geodata)

    return geodata

def data_load(data):
    """Show the houses on an interactive map.

    Each point is placed by 'lat'/'long', coloured by 'level' and sized by
    'price'.

    Args:
        data: DataFrame with 'id', 'lat', 'long', 'price' and 'level' columns.

    Returns:
        None.
    """
    map_columns = ['id', 'lat', 'long', 'price', 'level']
    houses = data[map_columns].copy()

    fig = px.scatter_mapbox(
        houses,
        lat='lat',
        lon='long',
        color='level',
        size='price',
        color_continuous_scale=px.colors.cyclical.IceFire,
        size_max=15,
        zoom=10,
    )

    # base map style, figure height and zero margins set in one layout update
    fig.update_layout(
        mapbox_style='open-street-map',
        height=600,
        margin={'r': 0, 'l': 0, 'b': 0, 't': 0},
    )
    fig.show()


if __name__ == '__main__':
    # ETL pipeline: each stage feeds the next
    source_csv = 'kc_house_data.csv'

    data_raw = data_collect(source_csv)            # Extract
    data_processing = data_transform(data_raw)     # Transform
    data_load(data_processing)                     # Load