# 0.0. IMPORTS

## 0.1. Libraries

In [2]:
import pandas as pd
import numpy as np

## 0.2. Functions

### 0.2.1. Get Attributes

In [160]:
def get_numerical_attributes(data):
    return data.select_dtypes(include=['int64', 'float64'])

In [159]:
def get_categorical_attributes(data):
    return data.select_dtypes(exclude=['int64', 'flaot64', 'datetime64[ns]'])

### 0.2.2. Get Descriptive Info

In [156]:
def get_descriptive_info(data):
    num_attributes = get_numerical_attributes(data)
    
    # Central Tendency - mean, median
    df_mean = pd.DataFrame(num_attributes.apply(np.mean)).T
    df_median = pd.DataFrame(num_attributes.apply(np.median)).T
    
    # Dipersion - std, min, max, range, skew, kurtosis
    df_std = pd.DataFrame(num_attributes.apply(np.std)).T
    df_min = pd.DataFrame(num_attributes.apply(min)).T
    df_max = pd.DataFrame(num_attributes.apply(max)).T
    df_range = pd.DataFrame(num_attributes.apply(lambda x: x.max() - x.min())).T
    df_skew = pd.DataFrame(num_attributes.apply(lambda x: x.skew())).T
    df_kurtosis = pd.DataFrame(num_attributes.apply(lambda x: x.kurtosis())).T
    
    # min, max, range, median, std, skew, kurtosis
    df_descriptive_info = pd.concat([df_min, df_max, df_range,
                                     df_mean, df_median, df_std,
                                     df_skew, df_kurtosis]).T.reset_index()
    
    df_descriptive_info.columns = ['attributes', 'min', 'max',
                                   'range', 'mean', 'median',
                                   'std', 'skew', 'kurtosis']
    
    return df_descriptive_info

### 0.2.3. Analyze Outliers

In [168]:
def find_and_analyze_outliers(data):
    num_attributes = get_numerical_attributes(data)
    
    Q1 = num_attributes.quantile(0.25)
    Q3 = num_attributes.quantile(0.75)
    IQR = Q3 - Q1
    
    inf_lim = Q1 - 1.5 * IQR
    sup_lim = Q3 + 1.5 * IQR
    
    outliers = ((num_attributes < inf_lim) | (num_attributes > sup_lim))
    
    outliers_count = outliers.sum()
    outliers_percentage = (outliers_count / len(num_attributes)) * 100
    
    df_outliers = data[outliers.any(axis=1)]
    
    df_outliers_analyze = pd.DataFrame({
        'Outliers Count' : outliers_count,
        'Outliers Percentage' : outliers_percentage
    })
    
    df_outliers_analyze = df_outliers_analyze[df_outliers_analyze['Outliers Count'] > 0]
    
    df_outliers_analyze = df_outliers_analyze.sort_values(by='Outliers Count', ascending = False)
    
    return df_outliers, df_outliers_analyze

## 0.3. Loading Data

In [7]:
df_raw = pd.read_csv("../data/raw/teste_indicium_precificacao.csv")

# 1.0. DESCRIPTION DATA

In [8]:
df1 = df_raw.copy()

In [10]:
df1.sample(5).T

Unnamed: 0,7592,16277,37696,26019,42236
id,5697556,13112601,29881973,20763877,32789261
nome,Magical rowhouse and garden in Williamsburg,Beautiful light and lots of space,Small Full Sized Bedroom in Historic Manhattan,Charming 1BR in Lively Bed-Stuy neighborhood,Spacious 2 bedroom apt in quiet area
host_id,946943,13769025,223715716,35645777,246653349
host_name,Annie,Alberto,James,Yomi,Juan
bairro_group,Brooklyn,Brooklyn,Manhattan,Brooklyn,Bronx
bairro,Williamsburg,Bedford-Stuyvesant,Harlem,Bedford-Stuyvesant,Baychester
latitude,40.71212,40.69563,40.82424,40.68251,40.8694
longitude,-73.94965,-73.95149,-73.93753,-73.94102,-73.84171
room_type,Entire home/apt,Private room,Private room,Entire home/apt,Entire home/apt
price,155,43,35,175,101


## 1.1. Data Dimension

In [11]:
print(f'Number of Rows: {df1.shape[0]}')
print(f'Number of Features: {df1.shape[1]}')

Number of Rows: 48894
Number of Features: 16


## 1.2. Data Types

In [14]:
df1.dtypes

id                                        int64
nome                                     object
host_id                                   int64
host_name                                object
bairro_group                             object
bairro                                   object
latitude                                float64
longitude                               float64
room_type                                object
price                                     int64
minimo_noites                             int64
numero_de_reviews                         int64
ultima_review                    datetime64[ns]
reviews_por_mes                         float64
calculado_host_listings_count             int64
disponibilidade_365                       int64
dtype: object

## 1.3. Check Duplicated

In [64]:
df1.duplicated().sum()

0

## 1.4. Check NA

In [15]:
df1.isna().sum()

id                                   0
nome                                16
host_id                              0
host_name                           21
bairro_group                         0
bairro                               0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimo_noites                        0
numero_de_reviews                    0
ultima_review                    10052
reviews_por_mes                  10052
calculado_host_listings_count        0
disponibilidade_365                  0
dtype: int64

### 1.4.1. nome

Cada row nulo receberá, respectivamente, a junção dos dados em host_name + bairro_group + bairro no seu campo.

In [98]:
aux1 = df1[df1['nome'].isna()]

aux1[['host_name', 'bairro_group', 'bairro']].groupby('host_name').count()

Unnamed: 0_level_0,bairro_group,bairro
host_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Andrea,1,1
Anna,1,1
Carolina,1,1
Huei-Yin,1,1
Jeff,1,1
Jesse,1,1
Jonathan,1,1
Josh,1,1
Juliette,1,1
Kathleen,1,1


### 1.4.2. host_name
Cada row nulo receberá, respectivamente, o dado do row nome.

In [108]:
aux2 = df1[df1['host_name'].isna()]

aux2['nome'].duplicated().any()

False

### 1.4.3. ultima_review e reviews_por_mes

**ultima_review** receberá a data 2010-01-01 

**reviews_por_mes** receberá o valor 0

1. a data mais antiga na ultima_review de dados é 2011-03-28
2. reviews_por_mes não contem valores iguais a 0 
3. ambos os dados nulos são respectivamente correspondente entre si o que pode sinalizar:
    1. nunca foram alugados
    2. foram alugados mas não houve registros

In [109]:
df1['ultima_review'].isna().equals(df1['reviews_por_mes'].isna())

True

In [145]:
df1.sort_values(by='ultima_review', ascending=True).T

Unnamed: 0,316,162,329,124,142,122,19,607,354,1097,...,48884,48885,48886,48887,48888,48889,48890,48891,48892,48893
id,74860,40039,81739,28396,32363,27883,7801,229874,98330,464231,...,36482809,36483010,36483152,36484087,36484363,36484665,36485057,36485431,36485609,36487245
nome,"Sunlit and Cozy Williamsburg/Greenpoint, Brooklyn",Luxurious Condo in DUBMO with View,Loft w/ Terrace @ Box House Hotel,Modern Apt with Spectacular Views,Fully Furnished Basement Apartment,East Village Sanctuary,Sweet and Spacious Brooklyn Loft,Oversized Studio in Park Slope,LOVELY APARTMENT IN THE HEART OF NY,Large Room w/ Private Entrance,...,Stunning Bedroom NYC! Walking to Central Park!!,Comfy 1 Bedroom in Midtown East,Garden Jewel Apartment in Williamsburg New York,"Spacious Room w/ Private Rooftop, Central loca...",QUIT PRIVATE HOUSE,Charming one bedroom - newly renovated rowhouse,Affordable room in Bushwick/East Williamsburg,Sunny Studio at Historical Neighborhood,43rd St. Time Square-cozy single bed,Trendy duplex in the very heart of Hell's Kitchen
host_id,394752,171851,417504,6197784,140025,120223,21207,507304,31374,1530310,...,131529729,274311461,208514239,274321313,107716952,8232441,6570630,23492952,30985759,68119814
host_name,Allison,Henry,The Box House Hotel,Jo,Fredah,Jen,Chaya,Derrick,Shon,Jacques,...,Kendall,Scott,Melki,Kat,Michael,Sabrina,Marisol,Ilgar & Aysel,Taz,Christophe
bairro_group,Brooklyn,Brooklyn,Brooklyn,Brooklyn,Queens,Manhattan,Brooklyn,Brooklyn,Manhattan,Brooklyn,...,Manhattan,Manhattan,Brooklyn,Manhattan,Queens,Brooklyn,Brooklyn,Manhattan,Manhattan,Manhattan
bairro,Greenpoint,DUMBO,Greenpoint,Williamsburg,Flushing,East Village,Williamsburg,Sunset Park,Kips Bay,Bedford-Stuyvesant,...,East Harlem,Midtown,Williamsburg,Hell's Kitchen,Jamaica,Bedford-Stuyvesant,Bushwick,Harlem,Hell's Kitchen,Hell's Kitchen
latitude,40.72488,40.70207,40.73842,40.71923,40.74028,40.72245,40.71842,40.66293,40.73877,40.68905,...,40.79633,40.75561,40.71232,40.76392,40.69137,40.67853,40.70184,40.81475,40.75751,40.76404
longitude,-73.95018,-73.98571,-73.95312,-73.96468,-73.83168,-73.98527,-73.95718,-73.99833,-73.97707,-73.9541,...,-73.93605,-73.96723,-73.9422,-73.99183,-73.80844,-73.94995,-73.93317,-73.94867,-73.99112,-73.98933
room_type,Private room,Private room,Private room,Private room,Private room,Entire home/apt,Entire home/apt,Entire home/apt,Entire home/apt,Private room,...,Private room,Entire home/apt,Entire home/apt,Private room,Private room,Private room,Private room,Entire home/apt,Shared room,Private room
price,55,250,249,90,140,100,299,200,125,65,...,75,200,170,125,65,70,40,115,55,90


In [135]:
df1[df1['reviews_por_mes'] == 0].count()

id                               0
nome                             0
host_id                          0
host_name                        0
bairro_group                     0
bairro                           0
latitude                         0
longitude                        0
room_type                        0
price                            0
minimo_noites                    0
numero_de_reviews                0
ultima_review                    0
reviews_por_mes                  0
calculado_host_listings_count    0
disponibilidade_365              0
dtype: int64

## 1.5. Tratamento de nulos

In [153]:
# nome
df1['nome'] = df1.apply(lambda x: f"{x['host_name']}  {x['bairro_group']} {x['bairro']}" if pd.isna(x['nome']) else x['nome'], axis=1)

# host_name
df1['host_name'] = df1.apply(lambda x: x['nome'] if pd.isna(x['host_name']) else x['host_name'], axis=1)

# ultima_review
df1['ultima_review'] = df1.apply(lambda x: '2010-01-01' if pd.isna(x['ultima_review']) else x['ultima_review'], axis=1)
df1['ultima_review'] = pd.to_datetime(df1['ultima_review'])

# reviews_por_mes
df1['reviews_por_mes'] = df1.apply(lambda x: 0 if pd.isna(x['reviews_por_mes']) else x['reviews_por_mes'], axis=1)

## 1.6. Data Descriptive Info

In [166]:
get_descriptive_info(df1)

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,id,2595.0,36487240.0,36484650.0,19017530.0,19677430.0,10982770.0,-0.090265,-1.227738
1,host_id,2438.0,274321300.0,274318900.0,67621390.0,30795530.0,78610370.0,1.206189,0.169041
2,latitude,40.49979,40.91306,0.41327,40.72895,40.72308,0.05452883,0.237157,0.148937
3,longitude,-74.24442,-73.71299,0.53143,-73.95217,-73.95568,0.04615665,1.284179,5.021498
4,price,0.0,10000.0,10000.0,152.7208,106.0,240.1542,19.118743,585.660822
5,minimo_noites,1.0,1250.0,1249.0,7.030085,3.0,20.51053,21.827092,854.05664
6,numero_de_reviews,0.0,629.0,629.0,23.27476,5.0,44.55054,3.690589,19.529325
7,reviews_por_mes,0.0,58.5,58.5,1.090928,0.37,1.597278,3.30069,43.531087
8,calculado_host_listings_count,1.0,327.0,326.0,7.144005,1.0,32.95252,7.933091,67.549426
9,disponibilidade_365,0.0,365.0,365.0,112.7762,45.0,131.6173,0.763459,-0.99743


## 1.7. Find and Analyze Outliers

In [170]:
df_outliers, df_outliers_analyze = find_and_analyze_outliers(df1)

In [172]:
df_outliers_analyze

Unnamed: 0,Outliers Count,Outliers Percentage
calculado_host_listings_count,7080,14.480304
minimo_noites,6607,13.512905
numero_de_reviews,6021,12.314394
reviews_por_mes,3312,6.773837
price,2972,6.078455
longitude,2832,5.792122
host_id,1526,3.121037
latitude,425,0.869227


## 1.8. Saved actual data

In [178]:
path = '..\data\processed\df1_description_data.csv'
df1.to_csv(path, index=False)

# 2.0. FEATURE ENGINEERING

In [179]:
df2 = df1.copy()

In [183]:
df2.head().T

Unnamed: 0,0,1,2,3,4
id,2595,3647,3831,5022,5099
nome,Skylit Midtown Castle,THE VILLAGE OF HARLEM....NEW YORK !,Cozy Entire Floor of Brownstone,Entire Apt: Spacious Studio/Loft by central park,Large Cozy 1 BR Apartment In Midtown East
host_id,2845,4632,4869,7192,7322
host_name,Jennifer,Elisabeth,LisaRoxanne,Laura,Chris
bairro_group,Manhattan,Manhattan,Brooklyn,Manhattan,Manhattan
bairro,Midtown,Harlem,Clinton Hill,East Harlem,Murray Hill
latitude,40.75362,40.80902,40.68514,40.79851,40.74767
longitude,-73.98377,-73.9419,-73.95976,-73.94399,-73.975
room_type,Entire home/apt,Private room,Entire home/apt,Entire home/apt,Entire home/apt
price,225,150,89,80,200
