# **Data preprocessing and features creation**

In [193]:
import pandas as pd
import numpy as np
from tqdm import tqdm

- For this class we are going to use Properati´s Open Data for properties in Bogotá. You can get acces to this data [here](https://www.properati.com.co/data).

In [247]:
df = pd.read_csv('co_properties.csv.gz', compression='gzip', header=0,    sep=',', quotechar='"', error_bad_lines=False)

In [249]:
df.shape[1]

25

In [195]:
df.columns

Index(['id', 'ad_type', 'start_date', 'end_date', 'created_on', 'lat', 'lon',
       'l1', 'l2', 'l3', 'l4', 'l5', 'l6', 'rooms', 'bedrooms', 'bathrooms',
       'surface_total', 'surface_covered', 'price', 'currency', 'price_period',
       'title', 'description', 'property_type', 'operation_type'],
      dtype='object')

- First, we are going to make a data frame that contains only data from properties in Bogotá.

In [196]:
df

Unnamed: 0,id,ad_type,start_date,end_date,created_on,lat,lon,l1,l2,l3,...,bathrooms,surface_total,surface_covered,price,currency,price_period,title,description,property_type,operation_type
0,TwS+KTQ1LigejcnVcR8wQA==,Propiedad,2020-10-10,2021-01-09,2020-10-10,6.274566,-75.642269,Colombia,Antioquia,Medellín,...,,,,150000000.0,COP,,MARAVILLOSO LOTE EN VENTA _ SECTOR SAN CRISTOB...,"COD.20071 Maravilloso lote en venta, hermosa y...",Otro,Venta
1,exe9suxTarpGHR3XC7E5DQ==,Propiedad,2020-10-10,2020-10-30,2020-10-10,4.592129,-74.123863,Colombia,Cundinamarca,Bogotá D.C,...,,,,11000000.0,COP,,Local comercial esquinero - Centro Mayor,Excelente local esquinero\n\nCon doble vitrina...,Otro,Arriendo
2,O7bfpWBSwJOh+1j2tNIT7Q==,Propiedad,2020-10-10,2020-11-13,2020-10-10,10.985000,-74.795000,Colombia,Atlántico,Barranquilla,...,,250.0,250.0,900000000.0,COP,,Bodega En Arriendo/venta En Barranquilla Bosto...,"BODEGA EN BUEN ESTADO, SE ENCUENTRA UBICADA EN...",Otro,Venta
3,9aGw3lg9cblCh3D/BookWg==,Propiedad,2020-10-10,2020-11-12,2020-10-10,3.366000,-76.541000,Colombia,Valle del Cauca,Cali,...,,77.0,77.0,550000000.0,COP,,Consultorio En Venta En Cali Ciudad Jardn Cod....,Vendo consultorio o oficina en edifico exclusi...,Oficina,Venta
4,6rglCGhfwQeesD0zNLy/pw==,Propiedad,2020-10-10,2021-02-12,2020-10-10,6.229068,-75.536363,Colombia,Antioquia,Medellín,...,,,,125000000.0,COP,,Venta Apartamento en Buenos Aires Bosque Verde,En el momento de invertir en la compra de un i...,Apartamento,Venta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,5GJ0XVQ7+rn95XRI3tSB+Q==,Propiedad,2020-07-23,2020-08-01,2020-07-23,6.198000,-75.565000,Colombia,Antioquia,Medellín,...,2.0,200.0,200.0,12500000.0,COP,,Local En Arriendo En Medellin El Poblado Cod. ...,Jairo Ochoa Propiedad Raz ofrece para arrendam...,Local comercial,Arriendo
999996,VEkggjq5UPdxX5osneJGHQ==,Propiedad,2020-07-23,2021-08-09,2020-07-23,4.600097,-74.075562,Colombia,Cundinamarca,Bogotá D.C,...,2.0,,,2800000.0,COP,,"LOCAL EN ARRIENDO, BOGOTA-LA CANDELARIA",Local para arriendo en el sector de la candela...,Local comercial,Arriendo
999997,YXGr5K1neic8fuwfTPpYBQ==,Propiedad,2020-07-23,2020-08-01,2020-07-23,3.420000,-76.546000,Colombia,Valle del Cauca,Cali,...,3.0,,400.0,9500000.0,COP,,Local En Arriendo En Cali Urbanizacin Tequenda...,LOCAL CALI TEQUENDAMA ALQUILER 400 m consta de...,Local comercial,Arriendo
999998,/qBt6Y8IqDeiKi3DC6XBLw==,Propiedad,2020-07-23,2021-03-16,2020-07-23,,,Colombia,Cundinamarca,Bogotá D.C,...,3.0,102.0,254.0,10000000.0,COP,,Local En Arriendo En Bogota Prado Veraniego Co...,Arrienda Local Comercial de 254 M2. Ubicado en...,Local comercial,Arriendo


In [197]:
df=df[df['l3']=='Bogotá D.C']

In [198]:
df=df.drop(columns=['l1', 'l2', 'l3'])

- Let´s check the currencies in which prices are reported to have consistency in porpeties prices

In [199]:
df.groupby('currency').agg({'id':'count'})

Unnamed: 0_level_0,id
currency,Unnamed: 1_level_1
ARS,2
COP,166941
USD,4


In [200]:
df=df[df['currency']=='COP']

In [205]:
df

Unnamed: 0,id,ad_type,start_date,end_date,created_on,lat,lon,l4,l5,l6,...,bathrooms,surface_total,surface_covered,price,currency,price_period,title,description,property_type,operation_type
1,exe9suxTarpGHR3XC7E5DQ==,Propiedad,2020-10-10,2020-10-30,2020-10-10,4.592129,-74.123863,Zona Sur,Antonio Nariño,,...,,,,11000000.0,COP,,Local comercial esquinero - Centro Mayor,Excelente local esquinero\n\nCon doble vitrina...,Otro,Arriendo
189,6dVpYVaQbKbsG52zeifVZg==,Propiedad,2020-10-10,2020-11-12,2020-10-10,4.704000,-74.047000,Zona Norte,Usaquén,,...,,500.0,400.0,12000000.0,COP,,Casa En Arriendo En Bogota Santa Brbara Cod. A...,CASA EXCELENTE UBICACION CERCA A UNICENTROUSO ...,Casa,Arriendo
190,sCNiDi252cmfBpOM9yeu1A==,Propiedad,2020-10-10,2020-11-13,2020-10-10,4.685000,-74.031000,Zona Norte,Usaquén,,...,,,210.0,6200000.0,COP,,Casa En Arriendo En Bogota Santa Ana Oriental ...,INF: FLOR JIMENEZ 64154776748 SUPER EXCLUSIVO ...,Casa,Arriendo
191,z9uvWccPYAiKGp/KPXSqKQ==,Propiedad,2020-10-10,2021-07-01,2020-10-10,4.691062,-74.038856,Zona Norte,Usaquén,Santa Ana,...,,,,7500000.0,COP,,"OFICINA EN ARRIENDO, BOGOTA-SANTA ANA","Edificio inteligente, excelente seguridad, ár...",Oficina,Arriendo
192,ouliuIq3I6hNLU6T+Z+B0A==,Propiedad,2020-10-10,2020-11-12,2020-10-10,4.699000,-74.054000,Zona Norte,Usaquén,,...,,,43.0,450000000.0,COP,,Oficina En Venta En Bogota Santa Barbara Occid...,PADS te trae oficina para la venta modular y a...,Oficina,Venta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999985,kv5OAhtSoqnU+7uTtV1W+g==,Propiedad,2020-07-23,2020-08-01,2020-07-23,4.666000,-74.054000,Zona Chapinero,Chapinero,El Retiro,...,1.0,,24.0,650000000.0,COP,,Local En Venta En Bogota El Retiro Cod. VINH2705,"Local remodelado, ubicado en Centro comercia...",Local comercial,Venta
999986,K4NSO+KjpX1jBHNwiUTlqA==,Propiedad,2020-07-23,2020-08-01,2020-07-23,,,Zona Noroccidental,Suba,Salitre,...,1.0,30.0,28.0,3000000.0,COP,,Local En Arriendo En Bogota Salitre Cod. ARYG1...,EXCELENTE LOCAL COMERCIAL MUY BIEN UBICADO SOB...,Local comercial,Arriendo
999987,ZARXoLFNyoYE4EIiSblC9g==,Propiedad,2020-07-23,2020-08-01,2020-07-23,4.741000,-74.046000,Zona Norte,Usaquén,,...,1.0,62.0,62.0,2100000.0,COP,,Local En Arriendo En Bogota Calle 161 # 21 - 3...,"LOCAL SOBRE VIA PRINCIPAL, CON SALIDA PARA CUA...",Local comercial,Arriendo
999996,VEkggjq5UPdxX5osneJGHQ==,Propiedad,2020-07-23,2021-08-09,2020-07-23,4.600097,-74.075562,Zona Centro,La Candelaria,,...,2.0,,,2800000.0,COP,,"LOCAL EN ARRIENDO, BOGOTA-LA CANDELARIA",Local para arriendo en el sector de la candela...,Local comercial,Arriendo


-What can we do with time data?

In [206]:
df['start_date']=pd.to_datetime(df['start_date'])
df['end_date']=pd.to_datetime(df['end_date'],errors='coerce')

In [209]:
date_variables=['start_date','end_date']
for i in date_variables:
    df[str(i)+'_year']=df[i].dt.year.astype(str)
    df[str(i)+'_month']=df[i].dt.month.astype(str)
    
    

In [212]:
df.columns

Index(['id', 'ad_type', 'start_date', 'end_date', 'created_on', 'lat', 'lon',
       'l4', 'l5', 'l6', 'rooms', 'bedrooms', 'bathrooms', 'surface_total',
       'surface_covered', 'price', 'currency', 'price_period', 'title',
       'description', 'property_type', 'operation_type', 'start_date_year',
       'start_date_month', 'end_date_year', 'end_date_month'],
      dtype='object')

In [216]:
df=df.drop(columns=[ 'ad_type', 'start_date', 'end_date', 'created_on'])

- Our data is missing information that is realy important to predict the price of a property. Let´s try extract this information from the ad description

In [217]:
descriptions=df['description'].to_list()

In [221]:
elevator=[]
for i in tqdm(descriptions):
    try:
        if ('ascensor' in i) | ('Ascensor' in i) | ('ASCENSOR' in i) | ('ELEVADOR' in i) | ('Elevador' in i) | ('elevador' in i):
            elevator.append(1)
        else:
            elevator.append(0)
    except:
        elevator.append(0)
        

100%|██████████| 166941/166941 [00:00<00:00, 422153.37it/s]


In [222]:
df['elevator']=elevator

In [223]:
df['elevator'] = df['elevator'].replace(0,np.nan)

In [224]:
df.groupby('elevator').agg({'id':'count'})

Unnamed: 0_level_0,id
elevator,Unnamed: 1_level_1
1.0,30554


- Can you think of a way to extract the number of garajes of a property from the description?

In [225]:
df=df.drop(columns=['title', 'description','currency','price_period'])

- Know we are going to save non continuos variables as catageorical data

In [226]:
df.columns

Index(['id', 'lat', 'lon', 'l4', 'l5', 'l6', 'rooms', 'bedrooms', 'bathrooms',
       'surface_total', 'surface_covered', 'price', 'property_type',
       'operation_type', 'start_date_year', 'start_date_month',
       'end_date_year', 'end_date_month', 'elevator'],
      dtype='object')

In [227]:
dummies= pd.get_dummies(df[[ 'l4', 'l5', 'l6','property_type', 'operation_type', 'start_date_year',
       'start_date_month', 'end_date_year', 'end_date_month']])

In [228]:
dummies

Unnamed: 0,l4_Zona Centro,l4_Zona Chapinero,l4_Zona Noroccidental,l4_Zona Norte,l4_Zona Occidental,l4_Zona Sur,l4_Zona Suroccidental,l5_Antonio Nariño,l5_Barrios Unidos,l5_Bosa,...,end_date_month_12.0,end_date_month_2.0,end_date_month_3.0,end_date_month_4.0,end_date_month_5.0,end_date_month_6.0,end_date_month_7.0,end_date_month_8.0,end_date_month_9.0,end_date_month_nan
1,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
189,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
191,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
192,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999985,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
999986,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
999987,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
999996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [229]:
df=df.drop(columns=[ 'l4', 'l5', 'l6','property_type', 'operation_type', 'start_date_year',
       'start_date_month', 'end_date_year', 'end_date_month','id'])

In [230]:
data = pd.concat([df, dummies], axis=1)

In [231]:
data

Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,elevator,l4_Zona Centro,...,end_date_month_12.0,end_date_month_2.0,end_date_month_3.0,end_date_month_4.0,end_date_month_5.0,end_date_month_6.0,end_date_month_7.0,end_date_month_8.0,end_date_month_9.0,end_date_month_nan
1,4.592129,-74.123863,,,,,,11000000.0,,0,...,0,0,0,0,0,0,0,0,0,0
189,4.704000,-74.047000,,,,500.0,400.0,12000000.0,,0,...,0,0,0,0,0,0,0,0,0,0
190,4.685000,-74.031000,4.0,4.0,,,210.0,6200000.0,,0,...,0,0,0,0,0,0,0,0,0,0
191,4.691062,-74.038856,,0.0,,,,7500000.0,,0,...,0,0,0,0,0,0,1,0,0,0
192,4.699000,-74.054000,,,,,43.0,450000000.0,,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999985,4.666000,-74.054000,,,1.0,,24.0,650000000.0,,0,...,0,0,0,0,0,0,0,1,0,0
999986,,,,,1.0,30.0,28.0,3000000.0,,0,...,0,0,0,0,0,0,0,1,0,0
999987,4.741000,-74.046000,,,1.0,62.0,62.0,2100000.0,,0,...,0,0,0,0,0,0,0,1,0,0
999996,4.600097,-74.075562,,0.0,2.0,,,2800000.0,,1,...,0,0,0,0,0,0,0,1,0,0


## **Missing values and data imputation**

In [232]:
data.isnull().sum()[:12]

lat                       12392
lon                       12392
rooms                    121931
bedrooms                  74204
bathrooms                 39723
surface_total            156451
surface_covered          144742
price                         0
elevator                 136387
l4_Zona Centro                0
l4_Zona Chapinero             0
l4_Zona Noroccidental         0
dtype: int64

In [233]:
data=data.drop(columns=['rooms'])

- As we can see, our data set has a lot of missing values. Deleting all missing values might not be the best strategy because we will be droping a lot of valuable information. For this reason, it could be usefull to use imputation methods to avoid loosing data. Take into acount:
  1. If you are developing a Machine Learning project, dont impute the train data and the test data at once. If you do this your train data would have information about the test data distribution so your out of sample  error would not be a good measurment of your model acurracy
  2. Dont impute your dependant variable with your features. If you do this, your data would have information about the variable its trying to predict so your prediction error wont be a good measurment of yout model acurracy

**"Manual" imputing**

- Example: In Colombia, every property with more that 5 storeys must have an elevator. Having this information, if we had a variable in our data set that gave us information about the ammount of floors of the building in which a give property is located, we could impute the elevator variable as follows:


In [78]:
for i in data.index:
    if (data['number_of_floors'][i]>5):
        data['elevator']=1

KeyError: 'number_of_floors'

**simple imputation methods**

- Another way you could impute missing values of a given variable is by replacing the missing values with the mean or the median of this variable. 

In [234]:
data1=data[['bedrooms','surface_total']]

In [236]:
a=data1['bedrooms'].mean()

In [237]:
data1['bedrooms'] = data1['bedrooms'].replace(np.nan,a)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['bedrooms'] = data1['bedrooms'].replace(np.nan,a)


In [239]:
data1['surface_total'] = data1['surface_total'].replace(np.nan,data1['surface_total'].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['surface_total'] = data1['surface_total'].replace(np.nan,data1['surface_total'].median())


In [240]:
data1.isnull().sum()

bedrooms         0
surface_total    0
dtype: int64

**Iterative imputer**

In [241]:
X=data.drop(columns=['price'])


In [189]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [190]:
imputer = IterativeImputer()

imputer.fit(X)

new_X = imputer.transform(X)



- Other nice imputations methods you could check out are MICE (Multiple Imputation with Chained Equations) and KNN imputation

In [243]:
columnas=data.columns

In [244]:
columnas

Index(['lat', 'lon', 'bedrooms', 'bathrooms', 'surface_total',
       'surface_covered', 'price', 'elevator', 'l4_Zona Centro',
       'l4_Zona Chapinero',
       ...
       'end_date_month_12.0', 'end_date_month_2.0', 'end_date_month_3.0',
       'end_date_month_4.0', 'end_date_month_5.0', 'end_date_month_6.0',
       'end_date_month_7.0', 'end_date_month_8.0', 'end_date_month_9.0',
       'end_date_month_nan'],
      dtype='object', length=239)

In [246]:
pd.DataFrame(new_X,columns=columnas)

ValueError: Shape of passed values is (166941, 238), indices imply (166941, 239)