In [2]:
import matplotlib.pyplot as plt
import numpy
import pandas

import seaborn
# Select seaborn's 'talk' plotting context for the notebook.
seaborn.set_context('talk')

In [3]:
# Remote location of the melb_data CSV used by every cell below.
melb_url = 'https://cs.famaf.unc.edu.ar/~mteruel/datasets/diplodatos/melb_data.csv'
melb_df = pandas.read_csv(melb_url)
# Preview the first three rows of the loaded frame.
melb_df.head(3)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


In [4]:
# List the column labels available in the loaded frame.
melb_df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

## StandardScaler

In [5]:
from sklearn.preprocessing import StandardScaler

# Double brackets keep Price as a one-column DataFrame (2-D) rather than a
# Series, which is the shape the scaler is fitted on here.
price_frame = melb_df[["Price"]]
scaler = StandardScaler()
scaler.fit(price_frame)

In [6]:
# Apply the fitted scaler to Price and store the result as a new column.
melb_df["scaled_Price"] = scaler.transform(melb_df[["Price"]])

In [7]:
# Display the standardized prices.
melb_df["scaled_Price"]

0        0.632448
1       -0.063640
2        0.608984
3       -0.353025
4        0.820157
           ...   
13575    0.264851
13576   -0.069897
13577    0.147533
13578    2.227975
13579    0.327421
Name: scaled_Price, Length: 13580, dtype: float64

In [8]:
# Sanity check: the scaled column should have mean ~0 and std ~1.
# pandas' std() uses the sample estimator (ddof=1), so with 13580 rows the
# value lands at sqrt(n / (n - 1)), slightly above 1, rather than exactly 1.
scaled_price = melb_df["scaled_Price"]
scaled_price.mean(), scaled_price.std()

(1.4441074747407044e-16, 1.0000368208848183)

## Ordinal Encoding

In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Inspect the raw string-valued SellerG column before encoding it.
melb_df["SellerG"]

0          Biggin
1          Biggin
2          Biggin
3          Biggin
4          Nelson
           ...   
13575       Barry
13576    Williams
13577       Raine
13578     Sweeney
13579     Village
Name: SellerG, Length: 13580, dtype: object

In [12]:
# Fit an ordinal encoder on SellerG (passed as a one-column frame) so it
# learns the distinct seller names present in the data.
oe = OrdinalEncoder()
oe.fit(melb_df[["SellerG"]])

In [13]:
# Learned category values: a list with one array per fitted column.
oe.categories_

[array(['@Realty', 'ASL', "Abercromby's", 'Ace', 'Alexkarbon', 'Allens',
        'Anderson', 'Appleby', 'Aquire', 'Area', 'Ascend', 'Ash', 'Asset',
        'Assisi', 'Australian', 'Barlow', 'Barry', 'Bayside', 'Bekdon',
        'Beller', 'Bells', 'Besser', 'Better', 'Biggin', 'Blue',
        'Boutique', 'Bowman', 'Brace', 'Brad', 'Buckingham', 'Bullen',
        'Burnham', 'Buxton', 'Buxton/Advantage', 'Buxton/Find', 'C21',
        'CASTRAN', 'Caine', 'Calder', 'Carter', 'Castran', 'Cayzer',
        'Century', 'Chambers', 'Changing', 'Charlton', 'Chisholm',
        'Christopher', 'Clairmont', 'Collins', 'Community', 'Compton',
        'Conquest', 'Considine', 'Coventry', 'Craig', 'Crane', "D'Aprano",
        'Daniel', 'Darras', 'Darren', 'David', 'Del', 'Dingle', 'Direct',
        'Dixon', 'Domain', 'Douglas', 'Edward', 'Elite', 'Eview', 'FN',
        'First', 'Fletchers', 'Fletchers/One', 'Follett', 'Frank', 'Free',
        'GL', 'Galldon', 'Gardiner', 'Garvey', 'Gary', 'Geoff', 'Grant

In [15]:
# Replace each SellerG value with its numeric code from the fitted encoder.
melb_df["encoded_SellerG"] = oe.transform(melb_df[["SellerG"]])

In [27]:
# Round-trip check: map the numeric codes back to the original seller names.
oe.inverse_transform(melb_df[["encoded_SellerG"]])

array([['Biggin'],
       ['Biggin'],
       ['Biggin'],
       ...,
       ['Raine'],
       ['Sweeney'],
       ['Village']], dtype=object)

## Discretizers

In [17]:
from sklearn.preprocessing import KBinsDiscretizer

In [20]:
# Discretizer with three bins for its single input column; encode='ordinal'
# makes transform() return the bin index instead of a one-hot vector.
kbe = KBinsDiscretizer(n_bins=[3], encode='ordinal')

In [21]:
# Learn the bin boundaries from the Price column.
kbe.fit(melb_df[["Price"]])

In [22]:
# One array of boundaries per input column; three bins give four edges.
kbe.bin_edges_

array([array([  85000.,  730000., 1180000., 9000000.])], dtype=object)

In [23]:
# Store the index of the bin each price falls into.
melb_df["discretized_Price"] = kbe.transform(melb_df[["Price"]])

In [31]:
# Compare each bin index with the raw price it was derived from.
melb_df[["discretized_Price", "Price"]]

Unnamed: 0,discretized_Price,Price
0,2.0,1480000.0
1,1.0,1035000.0
2,2.0,1465000.0
3,1.0,850000.0
4,2.0,1600000.0
...,...,...
13575,2.0,1245000.0
13576,1.0,1031000.0
13577,1.0,1170000.0
13578,2.0,2500000.0


## Polynomial Features

In [36]:
from sklearn.preprocessing import PolynomialFeatures

# Expand Price and Distance into all polynomial terms up to degree 2.
poly_inputs = melb_df[["Price", "Distance"]]
poly = PolynomialFeatures(degree=2)
poly.fit(poly_inputs)

In [38]:
# Generate the polynomial feature matrix for the same two columns.
poly_features = poly.transform(melb_df[["Price", "Distance"]])

In [39]:
# (rows, generated features): two inputs at degree 2 expand to six columns.
poly_features.shape

(13580, 6)

## Pipelines

In [46]:
# Binary target: 1 when the sale price exceeds one million, otherwise 0.
# The boolean mask is cast explicitly instead of being mapped through
# replace({True: 1, False: 0}): the cast always produces an int64 column,
# while replace() on a boolean Series depends on implicit dtype downcasting
# that newer pandas releases deprecate.
melb_df["label"] = (melb_df["Price"] > 1000000).astype("int64")

In [49]:
# Check the class balance of the binary target.
melb_df["label"].value_counts()

0    7837
1    5743
Name: label, dtype: int64

In [50]:
# Column list again, now including the derived columns added above.
melb_df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount', 'scaled_Price',
       'encoded_SellerG', 'discretized_Price', 'label'],
      dtype='object')

In [78]:
# Model inputs: two numeric columns (Rooms, Distance) and two categorical
# ones (Type, Method); the target is the binary label column.
feature_columns = ["Rooms", "Type", "Method", "Distance"]
X = melb_df[feature_columns]
y = melb_df["label"]

In [79]:
# Display the feature frame.
X

Unnamed: 0,Rooms,Type,Method,Distance
0,2,h,S,2.5
1,2,h,S,2.5
2,3,h,SP,2.5
3,3,h,PI,2.5
4,4,h,VB,2.5
...,...,...,...,...
13575,4,h,S,16.7
13576,3,h,SP,6.8
13577,3,h,S,6.8
13578,4,h,PI,6.8


In [80]:
# Display the target series.
y

0        1
1        1
2        1
3        0
4        1
        ..
13575    1
13576    1
13577    1
13578    1
13579    1
Name: label, Length: 13580, dtype: int64

In [81]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [103]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

# Numeric inputs shared by the scaling and polynomial branches.
numeric_columns = ["Rooms", "Distance"]

# Each branch receives only the columns listed for it; the branch outputs are
# concatenated column-wise in the order declared here.
col_transformer = ColumnTransformer([
    # handle_unknown="ignore": a category that was not present when fitting is
    # encoded as all zeros at transform time instead of raising an error, so
    # held-out or future data with a rare Type/Method value still transforms.
    ("categ", OneHotEncoder(handle_unknown="ignore"), ["Type", "Method"]),
    ("scale", StandardScaler(), numeric_columns),
    # NOTE(review): this branch expands the raw, unscaled columns, so its
    # outputs sit on a much larger scale than the other branches (see the
    # transformed arrays displayed below). Confirm this is intended before
    # feeding the result to a scale-sensitive step such as PCA.
    ("poly", PolynomialFeatures(2), numeric_columns)
])

# Fit on the training split only, so no statistics are learned from test rows.
col_transformer.fit(X_train)

In [104]:
# Display the training features; row labels are the shuffled original indices.
X_train

Unnamed: 0,Rooms,Type,Method,Distance
12167,1,u,S,5.0
6524,2,h,SA,8.0
8413,3,h,S,12.6
2919,3,u,SP,13.0
6043,3,h,S,13.3
...,...,...,...,...
13123,3,h,SP,5.2
3264,3,h,S,10.5
9845,4,h,PI,6.7
10799,3,h,S,12.0


In [105]:
# Preprocessed training matrix produced by the fitted column transformer.
col_transformer.transform(X_train)


array([[  0.  ,   0.  ,   1.  , ...,   1.  ,   5.  ,  25.  ],
       [  1.  ,   0.  ,   0.  , ...,   4.  ,  16.  ,  64.  ],
       [  1.  ,   0.  ,   0.  , ...,   9.  ,  37.8 , 158.76],
       ...,
       [  1.  ,   0.  ,   0.  , ...,  16.  ,  26.8 ,  44.89],
       [  1.  ,   0.  ,   0.  , ...,   9.  ,  36.  , 144.  ],
       [  1.  ,   0.  ,   0.  , ...,  16.  ,  25.6 ,  40.96]])

In [106]:
# Apply the same fitted transformer to the held-out rows (no refitting).
col_transformer.transform(X_test)

array([[  1.  ,   0.  ,   0.  , ...,  16.  ,  32.  ,  64.  ],
       [  1.  ,   0.  ,   0.  , ...,   4.  ,  13.2 ,  43.56],
       [  1.  ,   0.  ,   0.  , ...,   9.  ,  31.5 , 110.25],
       ...,
       [  1.  ,   0.  ,   0.  , ...,   9.  ,  32.4 , 116.64],
       [  1.  ,   0.  ,   0.  , ...,  16.  ,  24.8 ,  38.44],
       [  1.  ,   0.  ,   0.  , ...,   4.  ,   3.2 ,   2.56]])

In [107]:
from sklearn.linear_model import LogisticRegression

# End-to-end model: column preprocessing, then projection onto three principal
# components, then a logistic-regression classifier.
pipe = Pipeline(steps=[
    ("preprocessor", col_transformer),
    ("pca", PCA(n_components=3)),
    ("classifier", LogisticRegression()),
])
# Bare expression so the notebook renders the pipeline.
pipe

In [108]:
# Fit every pipeline step in sequence on the training split.
pipe.fit(X_train, y_train)

In [109]:
# Predict labels for the held-out rows with the fitted pipeline.
y_pred = pipe.predict(X_test)

In [118]:
from sklearn.metrics import classification_report
# The report is a preformatted string, so print() is used to keep its layout.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.87      0.80      1585
           1       0.76      0.58      0.66      1131

    accuracy                           0.75      2716
   macro avg       0.75      0.73      0.73      2716
weighted avg       0.75      0.75      0.74      2716

