#### Import libraries and packages

In [1]:
import os
import pickle
from difflib import SequenceMatcher
from getpass import getpass

import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras as extras

#### Load final model from pickle file

In [34]:
# Serialized bundle read below; it is unpacked into the fitted model and the
# column order that is later used to arrange the prediction row.
pkl_filename = 'final_model.pkl'

# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open(pkl_filename, mode='rb') as file:
    model_bundle = pickle.load(file)

pickle_model, feature_order = model_bundle

#### Ask user for input

In [3]:
def _parse_bool(text):
    """Translate a typed answer into True/False; return None if it is not recognised.

    `bool("False")` is True (any non-empty string is truthy), so the answer has
    to be interpreted explicitly.
    """
    answer = text.strip().lower()
    if answer in ('true', 't', 'yes', 'y', '1'):
        return True
    if answer in ('false', 'f', 'no', 'n', '0'):
        return False
    return None


def _ask_choice(label, options):
    """Ask for one value out of `options`; typing 'list' prints them and asks again.

    Raises AssertionError when the final answer is not one of `options`.
    """
    answer = input(f"{label} (write 'list' for available options): ").strip()
    if answer == 'list':
        print(options)
        answer = input(f"{label}: ").strip()
    assert answer in options, f"{label} is not valid."
    return answer


print("Hi, please introduce the characteristics of the flat whose price you want to predict.")

# bathrooms
bathrooms = int(input("Number of bathrooms [0,20]: "))
assert bathrooms in range(0,21), "Number of bathrooms is not valid."

# building_subtype
building_subtypes = ['Flat', 'Apartment', 'Attic', 'Duplex', 'Loft', 'Study', 'House_Chalet', 'GroundFloorWithGarden', 'SemidetachedHouse', 'SemiDetached']
building_subtype = _ask_choice("Building subtype", building_subtypes)

# conservation_state
conservation_states = ['New construction', 'Nearly new', 'Very good', 'Good', 'Renovated', 'To renovate']
conservation_state = _ask_choice("Conservation state", conservation_states)

# floor_elevator
floor_elevator = _parse_bool(input("Elevator (True/False): "))
assert floor_elevator is not None, "Elevator is not valid."

# rooms
rooms = int(input("Number of rooms [0,20]: "))
assert rooms in range(0,21), "Number of rooms is not valid."

# sq_meters (bounds now match the range announced in the prompt)
sq_meters = int(input("Squared meters [15,1000]: "))
assert sq_meters in range(15,1001), "Squared meters are not valid."

# neighbourhood
neighbourhoods = ['el Raval', 'el Barri Gòtic', 'la Barceloneta', 'Sant Pere, Santa Caterina i la Ribera', 'el Fort Pienc', 'la Sagrada Família', "la Dreta de l'Eixample", "l'Antiga Esquerra de l'Eixample", "la Nova Esquerra de l'Eixample", 'Sant Antoni', 'el Poble Sec', 'la Marina del Prat Vermell', 'la Marina de Port', 'la Font de la Guatlla', 'Hostafrancs', 'la Bordeta', 'Sants - Badal', 'Sants', 'les Corts', 'la Maternitat i Sant Ramon', 'Pedralbes', 'Vallvidrera, el Tibidabo i les Planes', 'Sarrià', 'les Tres Torres', 'Sant Gervasi - la Bonanova', 'Sant Gervasi - Galvany', 'el Putxet i el Farró', 'Vallcarca i els Penitents', 'el Coll', 'la Salut', 'la Vila de Gràcia', "el Camp d'en Grassot i Gràcia Nova", 'el Baix Guinardó', 'Can Baró', 'el Guinardó', "la Font d'en Fargues", 'el Carmel', 'la Teixonera', 'Sant Genís dels Agudells', 'Montbau', "la Vall d'Hebron", 'la Clota', 'Horta', 'Vilapicina i la Torre Llobeta', 'Porta', 'el Turó de la Peira', 'Can Peguera', 'la Guineueta', 'Canyelles', 'les Roquetes', 'Verdun', 'la Prosperitat', 'la Trinitat Nova', 'Torre Baró', 'Ciutat Meridiana', 'Vallbona', 'la Trinitat Vella', 'Baró de Viver', 'el Bon Pastor', 'Sant Andreu', 'la Sagrera', 'el Congrés i els Indians', 'Navas', "el Camp de l'Arpa del Clot", 'el Clot', 'el Parc i la Llacuna del Poblenou', 'la Vila Olímpica del Poblenou', 'el Poblenou', 'Diagonal Mar i el Front Marítim del Poblenou', 'el Besòs i el Maresme', 'Provençals del Poblenou', 'Sant Martí de Provençals', 'la Verneda i la Pau']
neighbourhood = _ask_choice("Neighbourhood", neighbourhoods)

Hi, please introduce the characteristics of the flat whose price you want to predict.


Number of bathrooms [0,20]:  5
Building subtype (write 'list' for available options):  Flat
Conservation state (write 'list' for available options):  Good
Elevator (True/False):  True
Number of rooms [0,20]:  5
Squared meters [15,1000]:  100
Neighbourhood (write 'list' for available options):  Navas


##### Create new dataframe with single row to predict

In [4]:
# Answers collected from the user, in the column order of the resulting frame.
flat_features = (
    ('bathrooms', bathrooms),
    ('building_subtype', building_subtype),
    ('conservation_state', conservation_state),
    ('floor_elevator', floor_elevator),
    ('rooms', rooms),
    ('sq_meters', sq_meters),
    ('neighbourhood', neighbourhood),
)

# Each value is wrapped in a one-element list so the frame has exactly one row.
d = {column: [value] for column, value in flat_features}
df = pd.DataFrame(d)

#### Load tables from trusted zone

In [5]:
# Connection settings. Each one can be overridden with the corresponding libpq
# environment variable; the fallbacks are the values previously hard-coded here.
host = os.environ.get('PGHOST', 'postgresfib.fib.upc.edu')
dbname = os.environ.get('PGDATABASE', 'ADSDBjordi.cluet')
user = os.environ.get('PGUSER', 'jordi.cluet')
port = int(os.environ.get('PGPORT', 6433))
sslmode = os.environ.get('PGSSLMODE', 'require')

# The password must not live in the notebook: take it from the environment or
# ask for it interactively (getpass does not echo the typed characters).
pwd = os.environ.get('PGPASSWORD') or getpass("Database password for {}: ".format(user))

# Keyword arguments avoid the quoting problems of a hand-built DSN string, and
# `sslmode` (declared but never passed before) is now actually applied.
conn = psycopg2.connect(
    host=host,
    port=port,
    dbname=dbname,
    user=user,
    password=pwd,
    sslmode=sslmode,
)
cursor = conn.cursor()

In [6]:
# Read the three trusted-zone tables used to enrich the flat: crime figures,
# neighbourhoods (barris) and districts.
crime = pd.read_sql_query(sql="SELECT * from trusted_zone.ajunt_crime_2020_21_12_06;", con=conn)
barris = pd.read_sql_query(sql="SELECT * from trusted_zone.AJUNT_BARRIS_2017_21_12_06;", con=conn)
districts = pd.read_sql_query(sql="SELECT * from trusted_zone.ajunt_districtes_2021_21_12_24;", con=conn)

In [7]:
def similar(a, b):
    """Return the difflib similarity ratio of `a` and `b` (0.0 = nothing in common, 1.0 = identical)."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def entity(df1, df2, col1, col2):
    """Rewrite `df1[col1]` so every value becomes its most similar value in `df2[col2]`.

    Similarity is measured with `similar`. When several candidates tie, the
    first one wins; when no candidate has a positive score (or there are no
    candidates at all) the value is mapped to the string 'None'.

    `df1` is modified in place.

    Returns:
        (df1, matching) where `matching` maps each original value of
        `df1[col1]` to the value that replaced it.
    """
    source_names = df1[col1].unique()
    candidates = df2[col2].unique()

    matching = {}
    for source in source_names:
        scores = [similar(source, candidate) for candidate in candidates]
        top_score = max(scores, default=0)
        if top_score > 0:
            # list.index returns the first occurrence, i.e. the earliest best candidate.
            matching[source] = candidates[scores.index(top_score)]
        else:
            matching[source] = 'None'

    for source, replacement in matching.items():
        rows = df1[col1] == source
        df1.loc[rows, col1] = replacement
    return df1, matching

In [8]:
# Rewrite the 'districte' column of `crime` and `districts` with the closest
# 'nom_districte' value found in `barris`, so the tables can be joined on it.
# matching2 / matching3 keep the original -> replaced name mapping for inspection.
crime, matching2 = entity(crime,barris,'districte','nom_districte')
districts, matching3 = entity(districts,barris,'districte','nom_districte')

In [9]:
# Use the same column names as the other frames so they can be merged directly.
barris_column_names = {'nom_barri': 'neighbourhood', 'nom_districte': 'districte'}
barris = barris.rename(columns=barris_column_names)

In [10]:
# Attach district-level and crime columns to every neighbourhood via the shared 'districte' key.
merged = (
    barris
    .merge(districts, on='districte')
    .merge(crime, on='districte')
)

In [11]:
# Remove the two code columns from the merged table.
merged = merged.drop(columns=['codi_barri', 'codi_districte'])

In [12]:
# Display the merged neighbourhood / district / crime table.
merged

Unnamed: 0,districte,neighbourhood,superficie,poblacio,furt,estafes,danys,rob_viol_intim,rob_en_vehicle,rob_força,...,agressio_sex,conviv_veinal,vigilancia_poli,molesties_espai_pub,contra_prop_priv,incendis,estupefaents,agressions,proves_alcohol,proves_droga
0,Ciutat Vella,el Raval,420.5,107858,11827,1209,1110,2174,915,866,...,43,7645,10703,6353,4089,444,1117,601,3520,278
1,Ciutat Vella,el Barri Gòtic,420.5,107858,11827,1209,1110,2174,915,866,...,43,7645,10703,6353,4089,444,1117,601,3520,278
2,Ciutat Vella,la Barceloneta,420.5,107858,11827,1209,1110,2174,915,866,...,43,7645,10703,6353,4089,444,1117,601,3520,278
3,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",420.5,107858,11827,1209,1110,2174,915,866,...,43,7645,10703,6353,4089,444,1117,601,3520,278
4,Eixample,el Fort Pienc,746.4,270331,14157,3390,1714,1761,1484,1543,...,32,11474,4721,4445,2178,793,451,477,3770,326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Sant Martí,Diagonal Mar i el Front Marítim del Poblenou,1043.7,241263,7023,2378,1813,1592,1493,913,...,23,6456,4827,5721,1570,696,325,392,5857,416
69,Sant Martí,el Besòs i el Maresme,1043.7,241263,7023,2378,1813,1592,1493,913,...,23,6456,4827,5721,1570,696,325,392,5857,416
70,Sant Martí,Provençals del Poblenou,1043.7,241263,7023,2378,1813,1592,1493,913,...,23,6456,4827,5721,1570,696,325,392,5857,416
71,Sant Martí,Sant Martí de Provençals,1043.7,241263,7023,2378,1813,1592,1493,913,...,23,6456,4827,5721,1570,696,325,392,5857,416


#### Prediction

##### Augment data

In [13]:
# Enrich the single-row frame with the district and crime columns of its neighbourhood.
dfm = pd.merge(df, merged, on='neighbourhood')

# The merge is an inner join: a neighbourhood with no match in `merged` yields an
# empty frame, which would otherwise only surface later as an obscure KeyError.
assert not dfm.empty, (
    f"No district/crime data found for neighbourhood '{df['neighbourhood'].iloc[0]}'."
)
dfm

Unnamed: 0,bathrooms,building_subtype,conservation_state,floor_elevator,rooms,sq_meters,neighbourhood,districte,superficie,poblacio,...,agressio_sex,conviv_veinal,vigilancia_poli,molesties_espai_pub,contra_prop_priv,incendis,estupefaents,agressions,proves_alcohol,proves_droga
0,5,Flat,Good,True,5,100,Navas,Sant Andreu,659.2,151960,...,16,4020,3284,2456,781,366,199,264,1630,125


##### Add categories that are not in the row to be predicted

In [14]:
# Declare every known building subtype as a category, so one-hot encoding later
# produces a column for each of them and not only for the one in this row.
subtype_column = dfm['building_subtype'].astype('category')
unseen_subtypes = set(building_subtypes) - {subtype_column[0]}
dfm['building_subtype'] = subtype_column.cat.add_categories(list(unseen_subtypes))

In [15]:
# Declare every known neighbourhood as a category, so one-hot encoding later
# produces a column for each of them and not only for the one in this row.
neighbourhood_column = dfm['neighbourhood'].astype('category')
unseen_neighbourhoods = set(neighbourhoods) - {neighbourhood_column[0]}
dfm['neighbourhood'] = neighbourhood_column.cat.add_categories(list(unseen_neighbourhoods))

In [16]:
# Declare every known conservation state as a category, so one-hot encoding later
# produces a column for each of them and not only for the one in this row.
state_column = dfm['conservation_state'].astype('category')
unseen_states = set(conservation_states) - {state_column[0]}
dfm['conservation_state'] = state_column.cat.add_categories(list(unseen_states))

In [17]:
# All district names that may appear as one-hot columns.
districtes = ['Ciutat Vella', 'Eixample', 'Sants-Montjuïc', 'Les Corts', 'Sarrià-Sant Gervasi', 'Gràcia', 'Horta-Guinardó', 'Nou Barris', 'Sant Andreu', 'Sant Martí']

# Declare every district as a category, so one-hot encoding later produces a
# column for each of them and not only for the one in this row.
districte_column = dfm['districte'].astype('category')
unseen_districtes = set(districtes) - {districte_column[0]}
dfm['districte'] = districte_column.cat.add_categories(list(unseen_districtes))

##### One-hot encoding

In [18]:
# Categorical column -> prefix used for its indicator columns.
encoded_columns = {
    'building_subtype': 'bs',
    'conservation_state': 'cs',
    'districte': 'd',
    'neighbourhood': 'n',
}

# One indicator frame per categorical column (one column per declared category).
ohe_bs, ohe_cs, ohe_d, ohe_n = [
    pd.get_dummies(dfm[column], prefix=prefix)
    for column, prefix in encoded_columns.items()
]

# Append the indicator columns and discard the categorical columns they replace.
dfmoh = pd.concat([dfm, ohe_bs, ohe_cs, ohe_d, ohe_n], axis=1)
dfmoh = dfmoh.drop(columns=list(encoded_columns))
dfmoh.columns

Index(['bathrooms', 'floor_elevator', 'rooms', 'sq_meters', 'superficie',
       'poblacio', 'furt', 'estafes', 'danys', 'rob_viol_intim',
       ...
       'n_la Teixonera', 'n_el Congrés i els Indians', 'n_el Raval',
       'n_Vallcarca i els Penitents', 'n_Sants - Badal',
       'n_el Camp de l'Arpa del Clot', 'n_el Poble Sec',
       'n_la Marina del Prat Vermell', 'n_la Nova Esquerra de l'Eixample',
       'n_Hostafrancs'],
      dtype='object', length=129)

##### Execute prediction

In [32]:
# The model can only be applied if the prepared row has exactly the features it
# expects (order is handled separately). On failure, say which names differ.
expected_features = set(feature_order)
available_features = set(dfmoh.columns)
assert expected_features == available_features, (
    "The features do not coincide. "
    f"Missing from the input: {sorted(map(str, expected_features - available_features))}; "
    f"not expected by the model: {sorted(map(str, available_features - expected_features))}"
)

In [30]:
# Arrange the columns in the order stored alongside the model.
dfmoh = dfmoh.loc[:, feature_order]

In [31]:
# The frame holds a single row, so the first prediction is the one of interest.
predicted_price = pickle_model.predict(dfmoh)[0]
print(f"Predicted price is {round(predicted_price, 2)}€.")

Predicted price is 1425.25€.
