# Predicting Price with Neighborhood

In [1]:
pwd

'C:\\Users\\admin\\Desktop\\Personal_stuff\\PROJECTS\\Proj2_Price model'

## Library importing

In [2]:
#pip install category_encoders

In [3]:
from glob import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline

## Data Wrangling

In [4]:
#Loading the dataset with a function
def wrangle (filepath):
    #Load the CSV file into file path-local function applied
    df = pd.read_csv(filepath, encoding='ISO-8859-1')
    #To display 'Capital Federal' only 
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal")
    #To display 'Apartment' only 
    mask_apt = df["property_type"] == "apartment" 
    #To display price below'400,000'
    mask_price = df["price_aprox_usd"] < 400_000
    #Subset
    df = df[ mask_ba & mask_apt & mask_price]
    
    #Remove outliers by 'surface_area_m2' -----removing the 10th and 90th quantiles
    low = df["surface_covered_in_m2"].quantile(0.1)
    high = df["surface_covered_in_m2"].quantile(0.9)
    
    mask_area = df["surface_covered_in_m2"].between(low, high)
    
    df = df[mask_area]
    
    #split the lat-lon column
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df.drop(columns="lat-lon", inplace=True)
    
    #split place_with_parent_names
    df["neighbourhood"] = df["place_with_parent_names"].str.split("|", expand=True)[3]
    df.drop(columns="place_with_parent_names", inplace=True)
    
    print('low = ', low, ';high = ', high)
    print('df.shape = ', df.shape)
    
    return df

In [5]:
#list of all files to be imported
files = glob("C:\\Users\\admin\\Desktop\\Personal_stuff\\PROJECTS\\Proj2_Price model/buenos-aires-real-estate-*.csv")
files

['C:\\Users\\admin\\Desktop\\Personal_stuff\\PROJECTS\\Proj2_Price model\\buenos-aires-real-estate-1.csv',
 'C:\\Users\\admin\\Desktop\\Personal_stuff\\PROJECTS\\Proj2_Price model\\buenos-aires-real-estate-2.csv',
 'C:\\Users\\admin\\Desktop\\Personal_stuff\\PROJECTS\\Proj2_Price model\\buenos-aires-real-estate-3.csv',
 'C:\\Users\\admin\\Desktop\\Personal_stuff\\PROJECTS\\Proj2_Price model\\buenos-aires-real-estate-4.csv',
 'C:\\Users\\admin\\Desktop\\Personal_stuff\\PROJECTS\\Proj2_Price model\\buenos-aires-real-estate-5.csv']

In [6]:
#loading all the dataframes
frames = []
for file in files:
    df = wrangle(file)
    frames.append(df)

low =  31.0 ;high =  100.0
df.shape =  (1343, 18)
low =  31.0 ;high =  100.0
df.shape =  (1315, 18)
low =  31.0 ;high =  101.70000000000005
df.shape =  (1288, 18)
low =  30.0 ;high =  98.0
df.shape =  (1305, 18)
low =  30.0 ;high =  100.0
df.shape =  (1331, 18)


In [7]:
frames[0].head()

Unnamed: 0.1,Unnamed: 0,operation,property_type,price,currency,price_aprox_local_currency,price_aprox_usd,surface_total_in_m2,surface_covered_in_m2,price_usd_per_m2,price_per_m2,floor,rooms,expenses,properati_url,lat,lon,neighbourhood
4,5,sell,apartment,129000.0,USD,1955949.6,129000.0,76.0,70.0,1697.368421,1842.857143,,,,http://chacarita.properati.com.ar/10qlv_venta_...,-34.584651,-58.454693,Chacarita
9,10,sell,apartment,87000.0,USD,1319128.8,87000.0,48.0,42.0,1812.5,2071.428571,,,,http://villa-luro.properati.com.ar/12m82_venta...,-34.638979,-58.500115,Villa Luro
29,30,sell,apartment,118000.0,USD,1789163.2,118000.0,,54.0,,2185.185185,,2.0,,http://caballito.properati.com.ar/11wqh_venta_...,-34.615847,-58.459957,Caballito
40,41,sell,apartment,57000.0,USD,864256.8,57000.0,42.0,42.0,1357.142857,1357.142857,5.0,2.0,364.0,http://constitucion.properati.com.ar/k2f0_vent...,-34.625222,-58.382382,Constitución
41,42,sell,apartment,90000.0,USD,1364616.0,90000.0,57.0,50.0,1578.947368,1800.0,,3.0,450.0,http://once.properati.com.ar/suwa_venta_depart...,-34.61061,-58.412511,Once


In [8]:
#concatenate the dataframes
df = pd.concat(frames, ignore_index=True)
df.head()

Unnamed: 0.1,Unnamed: 0,operation,property_type,price,currency,price_aprox_local_currency,price_aprox_usd,surface_total_in_m2,surface_covered_in_m2,price_usd_per_m2,price_per_m2,floor,rooms,expenses,properati_url,lat,lon,neighbourhood
0,5,sell,apartment,129000.0,USD,1955949.6,129000.0,76.0,70.0,1697.368421,1842.857143,,,,http://chacarita.properati.com.ar/10qlv_venta_...,-34.584651,-58.454693,Chacarita
1,10,sell,apartment,87000.0,USD,1319128.8,87000.0,48.0,42.0,1812.5,2071.428571,,,,http://villa-luro.properati.com.ar/12m82_venta...,-34.638979,-58.500115,Villa Luro
2,30,sell,apartment,118000.0,USD,1789163.2,118000.0,,54.0,,2185.185185,,2.0,,http://caballito.properati.com.ar/11wqh_venta_...,-34.615847,-58.459957,Caballito
3,41,sell,apartment,57000.0,USD,864256.8,57000.0,42.0,42.0,1357.142857,1357.142857,5.0,2.0,364.0,http://constitucion.properati.com.ar/k2f0_vent...,-34.625222,-58.382382,Constitución
4,42,sell,apartment,90000.0,USD,1364616.0,90000.0,57.0,50.0,1578.947368,1800.0,,3.0,450.0,http://once.properati.com.ar/suwa_venta_depart...,-34.61061,-58.412511,Once


In [9]:
df.shape

(6582, 18)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6582 entries, 0 to 6581
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  6582 non-null   int64  
 1   operation                   6582 non-null   object 
 2   property_type               6582 non-null   object 
 3   price                       6582 non-null   float64
 4   currency                    6582 non-null   object 
 5   price_aprox_local_currency  6582 non-null   float64
 6   price_aprox_usd             6582 non-null   float64
 7   surface_total_in_m2         4752 non-null   float64
 8   surface_covered_in_m2       6582 non-null   float64
 9   price_usd_per_m2            4536 non-null   float64
 10  price_per_m2                6582 non-null   float64
 11  floor                       1900 non-null   float64
 12  rooms                       5286 non-null   float64
 13  expenses                    1739 

In [11]:
#create a copy of the dataframe
dfc = df.copy()
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6582 entries, 0 to 6581
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  6582 non-null   int64  
 1   operation                   6582 non-null   object 
 2   property_type               6582 non-null   object 
 3   price                       6582 non-null   float64
 4   currency                    6582 non-null   object 
 5   price_aprox_local_currency  6582 non-null   float64
 6   price_aprox_usd             6582 non-null   float64
 7   surface_total_in_m2         4752 non-null   float64
 8   surface_covered_in_m2       6582 non-null   float64
 9   price_usd_per_m2            4536 non-null   float64
 10  price_per_m2                6582 non-null   float64
 11  floor                       1900 non-null   float64
 12  rooms                       5286 non-null   float64
 13  expenses                    1739 

In [12]:
#seperating column for neighbourhood to be subset
#df["neighbourhood"] = df["place_with_parent_names"].str.split("|", expand=True)[3]
    #df.drop(columns="place_with_parent_names", inplace=True)

## Model Building & Testing 

In [13]:
features = ["neighbourhood"]
target = "price_aprox_usd"
X_train = df[features]
y_train = df[target]

In [14]:
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)
print("Training mean:", y_mean)
print("The Baseline Mae:", mean_absolute_error(y_train, y_pred_baseline))

Training mean: 132383.83701458524
The Baseline Mae: 44860.10834274133


In [15]:
#One hot encoding - Instantiate
ohe = OneHotEncoder(use_cat_names=True)
#fit 
ohe.fit(X_train)
#Transform
XT_train = ohe.transform(X_train)
print(XT_train.shape)
print(XT_train.head)

(6582, 57)
<bound method NDFrame.head of       neighbourhood_Chacarita  neighbourhood_Villa Luro  \
0                           1                         0   
1                           0                         1   
2                           0                         0   
3                           0                         0   
4                           0                         0   
...                       ...                       ...   
6577                        0                         0   
6578                        0                         0   
6579                        0                         0   
6580                        0                         0   
6581                        0                         0   

      neighbourhood_Caballito  neighbourhood_Constitución  neighbourhood_Once  \
0                           0                           0                   0   
1                           0                           0                   0   
2      

In [17]:
#Building the model with the pipeline
model = make_pipeline(
        OneHotEncoder(use_cat_names=True),
        LinearRegression()
)

model.fit(X_train, y_train)

In [19]:
#Mean_absolute_error
y_pred_baseline = model.predict(X_train)
MAE = mean_absolute_error(y_train, y_pred_baseline)
print("Training MAE:", MAE)

Training MAE: 39346.49551959891


## Generalisation

In [21]:
X_test = pd.read_csv("C:\\Users\\admin\\Desktop\\Personal_stuff\\PROJECTS\\Proj2_Price model/buenos-aires-test-features.csv", encoding= "ISO-8859-2")
