In [17]:
import numpy as np
import pandas as pd
import boto3
import os
import awswrangler as wr

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [20]:
print(f"Pandas version is {pd.__version__}")
print(f"Scikit-learn version is {sklearn.__version__}")

Pandas version is 1.5.1
Scikit-learn version is 1.2.2


In [2]:
path1 = 's3://test-bucket-vlad-godel/data/olx_house_price_Q122.csv'

df = wr.s3.read_csv([path1], encoding='utf-8')

In [3]:
df.head(10)

Unnamed: 0,offer_title,price,price_per_meter,offer_type,floor,area,rooms,offer_type_of_building,market,city_name,voivodeship,month,year,population,longitude,latitude
0,Kawalerka na sprzedaĹĽ,240000.0,8888.89,Private,1.0,27.0,1,Housing Block,primary,BolesĹ‚awiec,Lower Silesia,January,2022,39603,15.565105,51.263033
1,Nowoczesna kawalerka z winda plus garaĹĽ podzi...,250000.0,7142.86,Private,1.0,35.0,1,Housing Block,primary,Jelcz-Laskowice,Lower Silesia,January,2022,15828,17.349964,51.039831
2,Nowa kawalerka z Balkonem/Legnicka/Magnolia,259000.0,10360.0,Estate Agency,2.0,25.0,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.032669,51.108978
3,Kawalerka z balkonem/klucze I kwartaĹ‚ 2022/60...,269000.0,10275.02,Private,3.0,26.18,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.032669,51.108978
4,40 tys. taniej od dewelopera/Kawalerka/Magnoli...,258000.0,9923.08,Estate Agency,3.0,26.0,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.032669,51.108978
5,Mieszkanie na start,255000.0,11283.19,Private,3.0,22.6,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.032669,51.108978
6,Okazja! Na sprzedaĹĽ nowa kawalerka przy Odrze...,416120.0,12975.37,Estate Agency,5.0,32.07,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.032669,51.108978
7,"Bulwary Staromiejskie, kawalerka z balkonem wi...",275900.0,14913.51,Estate Agency,4.0,18.5,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.032669,51.108978
8,B Urban - Kawalerki inwestycyjne pod wynajem,201000.0,8040.0,Estate Agency,,,1,,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.032669,51.108978
9,Sprzedam punkt w Pasazu ZieliĹ„skiego,1000.0,100.0,Private,0.0,10.0,1,Other,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.032669,51.108978


In [4]:
df.shape

(62818, 16)

In [5]:
df.dtypes

offer_title                object
price                     float64
price_per_meter           float64
offer_type                 object
floor                     float64
area                      float64
rooms                       int64
offer_type_of_building     object
market                     object
city_name                  object
voivodeship                object
month                      object
year                        int64
population                  int64
longitude                 float64
latitude                  float64
dtype: object

In [6]:
df.isna().sum()

offer_title                  0
price                        0
price_per_meter              0
offer_type                   0
floor                     1487
area                      1487
rooms                        0
offer_type_of_building    1487
market                       0
city_name                    0
voivodeship                  0
month                        0
year                         0
population                   0
longitude                    0
latitude                     0
dtype: int64

In [7]:
categorical_features = ['offer_type', 'offer_type_of_building',
                        'market', 'voivodeship', 'month']

numeric_features = ['floor', 'area', 'rooms', 'longitude', 'latitude']

In [8]:
categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='infrequent_if_exist', min_frequency=2000))
            ])

numeric_transformer = Pipeline(steps=[
        ('imputer', IterativeImputer(initial_strategy='mean', max_iter=5, random_state=42, verbose=0)),
        ('scaler' , StandardScaler())
            ])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ])

y = df["price"]
X_train, X_test, y_train, y_test= train_test_split(df, y, test_size=0.2, random_state=42)

X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()

In [9]:
feature_names

array(['num__floor', 'num__area', 'num__rooms', 'num__longitude',
       'num__latitude', 'cat__offer_type_Private',
       'cat__offer_type_of_building_Housing Block',
       'cat__offer_type_of_building_Tenement',
       'cat__offer_type_of_building_infrequent_sklearn',
       'cat__market_primary', 'cat__voivodeship_Kuyavia-Pomerania',
       'cat__voivodeship_Lesser Poland', 'cat__voivodeship_Lodzkie',
       'cat__voivodeship_Lower Silesia', 'cat__voivodeship_Lublin',
       'cat__voivodeship_Masovia', 'cat__voivodeship_Pomerania',
       'cat__voivodeship_Silesia', 'cat__voivodeship_Warmia-Masuria',
       'cat__voivodeship_West Pomerania',
       'cat__voivodeship_infrequent_sklearn', 'cat__month_January',
       'cat__month_March'], dtype=object)

In [10]:
X_train_transformed.shape, X_test_transformed.shape

((50254, 23), (12564, 23))

In [11]:
estimator = LinearRegression()
cv_results = cross_validate(estimator, X_train_transformed, y_train, cv=5, n_jobs=-1,
            scoring=('neg_mean_absolute_percentage_error', 'neg_root_mean_squared_error'),
            return_train_score=True
            )

In [12]:
pd.DataFrame(cv_results)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_percentage_error,train_neg_mean_absolute_percentage_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
0,0.045767,0.0,-0.658261,-19.437666,-5028840.0,-186530.8
1,0.054599,0.0,-17.588477,-14.564083,-197952.2,-2518851.0
2,0.041067,0.007999,-55.155057,-5.161016,-184246.5,-2519114.0
3,0.049586,0.008417,-1.600374,-18.471697,-195365.3,-2518902.0
4,0.049586,0.008417,-0.810955,-18.620937,-208181.7,-2518613.0
