#### Import libraries and packages

In [2]:
import os
from getpass import getpass

import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns

#### Set connection with postgres database

In [43]:
# Connection settings. Each value can be overridden with the usual libpq
# environment variable, so nothing sensitive has to be stored in the notebook.
host = os.environ.get('PGHOST', 'postgresfib.fib.upc.edu')
dbname = os.environ.get('PGDATABASE', 'ADSDBjordi.cluet')
user = os.environ.get('PGUSER', 'jordi.cluet')
port = int(os.environ.get('PGPORT', 6433))
sslmode = os.environ.get('PGSSLMODE', 'require')

# The password is never hard-coded: read it from PGPASSWORD, otherwise prompt
# for it interactively (getpass does not echo the typed value).
# NOTE(review): a password was previously committed in this cell; it should be
# treated as compromised and rotated.
pwd = os.environ.get('PGPASSWORD') or getpass('Password for {}@{}: '.format(user, host))

# Keyword arguments avoid the quoting problems of a hand-built DSN string
# (e.g. a password containing spaces or quotes) and let sslmode, which was
# defined but never used before, actually reach the driver.
conn = psycopg2.connect(
    host=host,
    port=port,
    dbname=dbname,
    user=user,
    password=pwd,
    sslmode=sslmode,
)
cursor = conn.cursor()

#### Select whole table as dataframe

In [44]:
# Load every row and column of the formatted-zone table through the open
# connection, then preview the first rows of the resulting DataFrame.
sql = "SELECT * from formatted_zone.zenodo_fotocasa_2020_21_12_06;"
df = pd.read_sql_query(sql=sql, con=conn)
df.head(n=5)

Unnamed: 0,id,address,bathrooms,building_subtype,building_type,conservation_state,extraction_date,discount,floor_elevator,is_new_construction,link,price,real_estate,real_estate_id,rooms,sq_meters,neighbourhood,neighbourhood_mean_price
0,0,"Carrer Roger de Lluria, Dreta de l'Eixample",3,Flat,Flat,0,2020-10-28,0,1,False,/es/alquiler/vivienda/barcelona-capital/calefa...,5000.0,SOMOS REAL ESTATE SERVICES,9202765680731,3,208.0,la dreta de l'eixample,1307.616842
1,1,Sant Antoni,1,Flat,Flat,1,2020-10-28,0,1,False,/es/alquiler/vivienda/barcelona-capital/aire-a...,1250.0,TECNOCASA RIERA ALTA MAR,9202751363773,3,93.0,sant antoni,994.909158
2,2,"Carrer Valencia, La Nova Esquerra de l'Eixample",2,Flat,Flat,0,2020-10-28,0,1,False,/es/alquiler/vivienda/barcelona-capital/calefa...,1600.0,SOMOS REAL ESTATE SERVICES,9202765680731,4,129.0,la nova esquerra de l'eixample,1053.832393
3,3,"Carrer Muntaner, Sant Gervasi- Galvany",2,Flat,Flat,0,2020-10-28,0,1,False,/es/alquiler/vivienda/barcelona-capital/aire-a...,3200.0,SOMOS REAL ESTATE SERVICES,9202765680731,2,133.0,sant gervasi - galvany,1397.273005
4,4,"Rambla del Raval, El Raval",1,Flat,Flat,2,2020-10-28,0,1,False,/es/alquiler/vivienda/barcelona-capital/ascens...,1000.0,ESTUDI JOAN MIRO.SL.,9202762439813,3,67.0,el raval,850.112343


#### Remove unneeded columns

In [45]:
# Discard the two columns considered not useful: extraction date and listing link.
df = df.drop(columns=['extraction_date', 'link'])

#### Check data types

In [46]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

id                            int64
address                      object
bathrooms                     int64
building_subtype             object
building_type                object
conservation_state            int64
discount                      int64
floor_elevator                int64
is_new_construction            bool
price                       float64
real_estate                  object
real_estate_id               object
rooms                         int64
sq_meters                   float64
neighbourhood                object
neighbourhood_mean_price    float64
dtype: object

#### Correct some data types

In [54]:
# Cast all mistyped columns in one pass; columns that are not listed keep
# their current dtype.
df = df.astype({
    # identifiers: stored as generic objects rather than numbers
    'id': "object",
    'real_estate_id': "object",
    # free text
    'address': "string",
    'real_estate': "string",
    # variables with a limited set of levels
    'building_subtype': "category",
    'building_type': "category",
    'conservation_state': "category",
    'neighbourhood': "category",
    # 0/1 flag
    'floor_elevator': "bool",
})

In [55]:
# Show the column dtypes again to confirm the conversions took effect.
df.dtypes

id                            object
address                       string
bathrooms                      int64
building_subtype            category
building_type               category
conservation_state          category
discount                       int64
floor_elevator                  bool
is_new_construction             bool
price                        float64
real_estate                   string
real_estate_id                object
rooms                          int64
sq_meters                    float64
neighbourhood               category
neighbourhood_mean_price     float64
dtype: object

#### Summary of numerical variables

In [56]:
# With no arguments, describe() summarises only the numeric columns
# (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,bathrooms,discount,price,rooms,sq_meters,neighbourhood_mean_price
count,21856.0,21856.0,21848.0,21856.0,21856.0,21852.0
mean,1.521596,63.394674,1537.024075,2.417094,89.233895,1058.909173
std,0.806908,220.567465,1861.089098,1.192245,67.510855,207.084508
min,0.0,0.0,380.0,0.0,0.0,419.427143
25%,1.0,0.0,920.0,2.0,59.0,895.897039
50%,1.0,0.0,1150.0,2.0,75.0,1004.723491
75%,2.0,75.0,1500.0,3.0,98.0,1162.32163
max,11.0,7627.0,29000.0,22.0,1450.0,1860.55898


#### Summary of categorical variables

In [58]:
# Restrict the summary to categorical and boolean columns
# (count, number of unique levels, most frequent level and its frequency).
df.describe(include = ['category', 'bool'])

Unnamed: 0,building_subtype,building_type,conservation_state,floor_elevator,is_new_construction,neighbourhood
count,21856,21856,21856,21856,21856,21856
unique,10,1,6,2,2,67
top,Flat,Flat,0,True,False,la dreta de l'eixample
freq,17512,21856,11982,15716,21672,1826


#### Check levels of categorical variables

##### building_type
We remove it since it only has 1 unique value.

In [32]:
# building_type has a single level, so it carries no information; drop it.
df = df.drop(columns=['building_type'])

##### building_subtype
There are 10 different [sub]types of building. Most of them are flats.

In [65]:
# Row count per building_subtype level, most frequent first.
df['building_subtype'].value_counts()

Flat                     17512
Apartment                 2258
Attic                      922
Duplex                     388
Loft                       282
Study                      204
House_Chalet               160
GroundFloorWithGarden       82
SemidetachedHouse           28
SemiDetached                20
Name: building_subtype, dtype: int64

##### conservation_state
There are 6 different conservation states in the Fotocasa classification. We re-encode the numeric codes with the labels displayed on the Fotocasa website. Surprisingly, most of the homes are labelled as new constructions.

In [63]:
# Row count per conservation_state code, most frequent first.
df['conservation_state'].value_counts()

0    11982
1     3718
2     3108
3     2898
8      118
4       32
Name: conservation_state, dtype: int64

In [64]:
# Replace the numeric conservation codes with readable labels.
# conservation_state is a categorical column, so its categories are renamed
# directly: Series.replace on categorical data is deprecated in recent pandas,
# and rename_categories is the documented replacement. Codes missing from the
# mapping are passed through unchanged, so re-running the cell is harmless.
df['conservation_state'] = df['conservation_state'].cat.rename_categories({
    0: 'New construction',
    1: 'Nearly new',
    2: 'Very good',
    3: 'Good',
    4: 'To renovate',
    8: 'Renovated',
})
# Row count per label, most frequent first.
df['conservation_state'].value_counts()

New construction    11982
Nearly new           3718
Very good            3108
Good                 2898
Renovated             118
To renovate            32
Name: conservation_state, dtype: int64

##### floor_elevator
Most of the homes have an elevator.

In [66]:
# Row count for each value of the boolean floor_elevator flag.
df['floor_elevator'].value_counts()

True     15716
False     6140
Name: floor_elevator, dtype: int64

##### is_new_construction
Almost no house is newly constructed according to this variable. This contradicts what the variable 'conservation_state' indicates.

In [67]:
# Row count for each value of the boolean is_new_construction flag.
df['is_new_construction'].value_counts()

False    21672
True       184
Name: is_new_construction, dtype: int64

##### neighbourhood
The database contains homes in 67 of the 73 neighbourhoods of Barcelona.

In [38]:
# Number of distinct neighbourhood values present in the data
# (dropna=False so a missing value, if any, is counted as the original did).
df['neighbourhood'].nunique(dropna=False)

67