In [1]:
# Show the notebook's current working directory (IPython automagic for %pwd)
pwd

'C:\\Users\\admin\\Desktop\\Personal_stuff\\PROJECTS\\Proj2_Price model'

In [2]:
#importing libraries
import numpy as np #mathematical operations
import pandas as pd
import plotly.express as px #for visualisation
import plotly.graph_objects as go #for visualisation
from sklearn.impute import SimpleImputer #for imputing missing values
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline #take diff components of model and put it together 
from sklearn.utils.validation import check_is_fitted

In [3]:
#Loading the dataset with a function
def wrangle(filepath):
    """Load a real-estate CSV and return the cleaned subset of listings.

    Steps, in order:
      1. Read the CSV at ``filepath`` using ISO-8859-1 encoding.
      2. Keep rows whose ``place_with_parent_names`` contains
         "Capital Federal", whose ``property_type`` is "apartment" and whose
         ``price_aprox_usd`` is below 400,000.
      3. Keep rows whose ``surface_covered_in_m2`` lies between the 0.1 and
         0.9 quantiles of that subset (both bounds inclusive).
      4. Split the "lat-lon" text column into float ``lat`` and ``lon``
         columns and drop the original column.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame; the original row index is preserved.

    Side effects
    ------------
    Prints the two quantile bounds and the shape of the returned frame.
    """
    # Load the CSV file
    df = pd.read_csv(filepath, encoding='ISO-8859-1')

    # Rows located in 'Capital Federal'. The text is matched literally, and
    # rows with a missing place name are explicitly treated as non-matching
    # so the mask is always strictly boolean.
    mask_ba = df["place_with_parent_names"].str.contains(
        "Capital Federal", regex=False, na=False
    )
    # Apartments only
    mask_apt = df["property_type"] == "apartment"
    # Price below 400,000 USD
    mask_price = df["price_aprox_usd"] < 400_000
    # Subset
    df = df[mask_ba & mask_apt & mask_price]

    # Remove outliers by 'surface_covered_in_m2': keep only rows between the
    # 10th and 90th percentiles of the subset (bounds inclusive).
    low = df["surface_covered_in_m2"].quantile(0.1)
    high = df["surface_covered_in_m2"].quantile(0.9)
    mask_area = df["surface_covered_in_m2"].between(low, high)

    # Take an explicit copy: the frame is modified below, and assigning new
    # columns to a boolean-indexed slice raises SettingWithCopyWarning.
    df = df[mask_area].copy()

    # "lat-lon" holds "<lat>,<lon>" text; split it into two float columns.
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df = df.drop(columns="lat-lon")

    print('low = ', low, ';high = ', high)
    print('df.shape = ', df.shape)

    return df

In [7]:
# Load and clean the first dataset with wrangle(). The path is relative to the
# notebook's working directory, so the notebook is not tied to one machine's
# absolute folder layout.
frame1 = wrangle("buenos-aires-real-estate-1.csv")
frame1.info()
# Keep a separate copy of the cleaned frame
frame1cpy = frame1.copy()

low =  31.0 ;high =  100.0
df.shape =  (1343, 18)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1343 entries, 4 to 8604
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  1343 non-null   int64  
 1   operation                   1343 non-null   object 
 2   property_type               1343 non-null   object 
 3   place_with_parent_names     1343 non-null   object 
 4   price                       1343 non-null   float64
 5   currency                    1343 non-null   object 
 6   price_aprox_local_currency  1343 non-null   float64
 7   price_aprox_usd             1343 non-null   float64
 8   surface_total_in_m2         965 non-null    float64
 9   surface_covered_in_m2       1343 non-null   float64
 10  price_usd_per_m2            927 non-null    float64
 11  price_per_m2                1343 non-null   float64
 12  floor                       379 non-null

In [8]:
# Load and clean the second dataset with wrangle(). The path is relative to
# the notebook's working directory, so the notebook is not tied to one
# machine's absolute folder layout.
frame2 = wrangle("buenos-aires-real-estate-2.csv")
frame2.info()
# Keep a separate copy of the cleaned frame
frame2cpy = frame2.copy()

low =  31.0 ;high =  100.0
df.shape =  (1315, 18)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1315 entries, 2 to 8585
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  1315 non-null   int64  
 1   operation                   1315 non-null   object 
 2   property_type               1315 non-null   object 
 3   place_with_parent_names     1315 non-null   object 
 4   price                       1315 non-null   float64
 5   currency                    1315 non-null   object 
 6   price_aprox_local_currency  1315 non-null   float64
 7   price_aprox_usd             1315 non-null   float64
 8   surface_total_in_m2         933 non-null    float64
 9   surface_covered_in_m2       1315 non-null   float64
 10  price_usd_per_m2            891 non-null    float64
 11  price_per_m2                1315 non-null   float64
 12  floor                       390 non-null

In [10]:
# Stack the two cleaned frames row-wise into one frame; ignore_index=True
# discards the original row labels and renumbers the rows from 0.
df = pd.concat(objs=(frame1, frame2), ignore_index=True)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2658 entries, 0 to 2657
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  2658 non-null   int64  
 1   operation                   2658 non-null   object 
 2   property_type               2658 non-null   object 
 3   place_with_parent_names     2658 non-null   object 
 4   price                       2658 non-null   float64
 5   currency                    2658 non-null   object 
 6   price_aprox_local_currency  2658 non-null   float64
 7   price_aprox_usd             2658 non-null   float64
 8   surface_total_in_m2         1898 non-null   float64
 9   surface_covered_in_m2       2658 non-null   float64
 10  price_usd_per_m2            1818 non-null   float64
 11  price_per_m2                2658 non-null   float64
 12  floor                       769 non-null    float64
 13  rooms                       2137 

Unnamed: 0.1,Unnamed: 0,operation,property_type,place_with_parent_names,price,currency,price_aprox_local_currency,price_aprox_usd,surface_total_in_m2,surface_covered_in_m2,price_usd_per_m2,price_per_m2,floor,rooms,expenses,properati_url,lat,lon
0,5,sell,apartment,|Argentina|Capital Federal|Chacarita|,129000.0,USD,1955949.6,129000.0,76.0,70.0,1697.368421,1842.857143,,,,http://chacarita.properati.com.ar/10qlv_venta_...,-34.584651,-58.454693
1,10,sell,apartment,|Argentina|Capital Federal|Villa Luro|,87000.0,USD,1319128.8,87000.0,48.0,42.0,1812.5,2071.428571,,,,http://villa-luro.properati.com.ar/12m82_venta...,-34.638979,-58.500115
2,30,sell,apartment,|Argentina|Capital Federal|Caballito|,118000.0,USD,1789163.2,118000.0,,54.0,,2185.185185,,2.0,,http://caballito.properati.com.ar/11wqh_venta_...,-34.615847,-58.459957
3,41,sell,apartment,|Argentina|Capital Federal|Constitución|,57000.0,USD,864256.8,57000.0,42.0,42.0,1357.142857,1357.142857,5.0,2.0,364.0,http://constitucion.properati.com.ar/k2f0_vent...,-34.625222,-58.382382
4,42,sell,apartment,|Argentina|Capital Federal|Once|,90000.0,USD,1364616.0,90000.0,57.0,50.0,1578.947368,1800.0,,3.0,450.0,http://once.properati.com.ar/suwa_venta_depart...,-34.61061,-58.412511
