In [360]:
import pandas as pd
import numpy as np
import math

In [361]:
#MeanImputer
class MeanImputer():
    def __init__(self, copy=True):
        self.copy = copy
    def __is_numpy(self, X):
        #X : pandas.DataFrame или numpy.ndarray
        #Is it numpy or not
        return isinstance(X, np.ndarray)
    def fit(self, X, y=None):
        self._encoder_dict = {}
        is_np = self.__is_numpy(X)
        #reshape from 1D to 2D
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        #amount of columns
        ncols = X.shape[1]
        
        if is_np:
            for col in range(ncols):
                self._encoder_dict[col] = np.nanmean(X[:, col])
        else:
            for col in X.columns:
                self._encoder_dict[col] = X[col].mean()
                
        return self
    
    def transform(self, X):
        if self.copy:
            X = X.copy()
        is_np = self.__is_numpy(X)
        
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        ncols = X.shape[1]
        
        if is_np:
            for col in range(ncols):
                X[:, col] = np.nan_to_num(
                    X[:, col],
                nan=self._encoder_dict[col])
        else:
            for col in X.columns:
                X[col] = np.where(X[col].isnull(),
                                    self._encoder_dict[col],
                                    X[col])
        return X    

In [362]:
toy_train = pd.DataFrame(
{'Balance': [8.3, np.NaN, 10.2, 3.1],
'Age': [23, 29, 36, np.NaN]})
toy_train

Unnamed: 0,Balance,Age
0,8.3,23.0
1,,29.0
2,10.2,36.0
3,3.1,


In [363]:
toy_test = pd.DataFrame(
{'Balance': [10.4, np.NaN, 22.5, 1.1],
'Age': [13, 19, 66, np.NaN]})
toy_test

Unnamed: 0,Balance,Age
0,10.4,13.0
1,,19.0
2,22.5,66.0
3,1.1,


In [364]:
#Using mean()
'''
for col in toy_train.columns:
    toy_train[col].fillna(toy_train[col].mean(), inplace=True)
    toy_test[col].fillna(toy_train[col].mean(), inplace=True)
print('обучающий датафрейм')
print(toy_train)
print('')
print('тестовый датафрейм')
print(toy_test)
'''

"\nfor col in toy_train.columns:\n    toy_train[col].fillna(toy_train[col].mean(), inplace=True)\n    toy_test[col].fillna(toy_train[col].mean(), inplace=True)\nprint('обучающий датафрейм')\nprint(toy_train)\nprint('')\nprint('тестовый датафрейм')\nprint(toy_test)\n"

In [365]:
imp = MeanImputer()
imp.fit(toy_train)
toy_train = imp.transform(toy_train)
toy_train

Unnamed: 0,Balance,Age
0,8.3,23.0
1,7.2,29.0
2,10.2,36.0
3,3.1,29.333333


In [366]:
toy_test = imp.transform(toy_test)
toy_test

Unnamed: 0,Balance,Age
0,10.4,13.0
1,7.2,19.0
2,22.5,66.0
3,1.1,29.333333


In [367]:
toy_train = pd.DataFrame(
{'Balance': [8.3, np.NaN, 10.2, 3.1],
'Age': [23, 29, 36, np.NaN]})
# создаем экземпляр класса, отключив копирование
imp = MeanImputer(copy=False)
# обучаем модель
imp.fit(toy_train[['Age']])
# применяем модель
toy_train['Age'] = imp.transform(toy_train[['Age']])
toy_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = np.where(X[col].isnull(),


Unnamed: 0,Balance,Age
0,8.3,23.0
1,,29.0
2,10.2,36.0
3,3.1,29.333333


In [368]:
np_toy_train = np.array(pd.DataFrame(
{'Balance': [8.3, np.NaN, 10.2, 3.1],
'Age': [23, 29, 36, np.NaN]}))
np_toy_train

np_toy_test = np.array(pd.DataFrame(
{'Balance': [10.4, np.NaN, 22.5, 1.1],
'Age': [13, 19, 66, np.NaN]}))
np_toy_test

array([[10.4, 13. ],
       [ nan, 19. ],
       [22.5, 66. ],
       [ 1.1,  nan]])

In [369]:
imp.fit(np_toy_train)
np_toy_train = imp.transform(np_toy_train)
np_toy_train

array([[ 8.3       , 23.        ],
       [ 7.2       , 29.        ],
       [10.2       , 36.        ],
       [ 3.1       , 29.33333333]])

In [370]:
np_toy_test = imp.transform(np_toy_test)
np_toy_test

array([[10.4       , 13.        ],
       [ 7.2       , 19.        ],
       [22.5       , 66.        ],
       [ 1.1       , 29.33333333]])

In [371]:
#KNN Model code;

class KNN_Estimator():
    def _euclidean_distance(self, x1, x2):
        distance = 0
        for i in range(len(x1)):
            distance += pow((x1[i] - x2[i]), 2)
        return math.sqrt(distance)
    
    def _vote(self, neighbor_labels):
        counts = np.bincount(neighbor_labels.astype('int'))
        return counts.argmax()
    
    def __init__(self, k=5, task='classification'):
        self.k = k
        self.task = task
        self.k_nearest_neighbors_ = []
        
    def fit(self, X, y):
        self.X_memorized = X
        self.y_memorized = y
        
    def predict(self, X):
        y_pred = np.empty(X.shape[0])
        if self.task == 'classification':
            for i, test_sample in enumerate(X):
                idx = np.argsort([self._euclidean_distance(
                    test_sample, x) for x in self.X_memorized])[:self.k]
                k_nearest_neighbors = np.array(
                    [self.y_memorized[i] for i in idx])
                self.k_nearest_neighbors_.append(k_nearest_neighbors)
                y_pred[i] = self._vote(self.k_nearest_neighbors_[i])
        if self.task == 'regression':
            for i, test_sample in enumerate(X):
                idx = np.argsort([self._euclidean_distance(
                    test_sample, x) for x in self.X_memorized])[:self.k]
                k_nearest_neighbors = np.array(
                    [self.y_memorized[i] for i in idx])
                self.k_nearest_neighbors_.append(k_nearest_neighbors)
                y_pred[i] = np.mean(self.k_nearest_neighbors_[i])
        return y_pred    
    

In [372]:
#Classification
X_trn = np.array([[0.1, 0.2, 0.3],
                                [0.7, 0.5, 0.2],
                                [0.1, 0.2, 0.2],
                                [0.9, 0.7, 3.5],
                                [0.2, 0.4, 1.4],
                                [0.4, 0.1, 0.5]])

y_trn = np.array([1, 0, 1, 0, 0, 1])

X_tst = np.array([[0.1, 0.7, 1.1],
                                [0.5, 0.3, 2.8],
                                [0.1, 0.1, 0.2],
                                [0.9, 0.7, 1.5]])

In [373]:
knn = KNN_Estimator(k=3, task='classification')
knn.fit(X_trn, y_trn)

pred = knn.predict(X_tst)
pred

array([1., 0., 1., 0.])

In [374]:
knn.k_nearest_neighbors_

[array([0, 1, 1]), array([0, 0, 1]), array([1, 1, 1]), array([0, 1, 0])]

In [375]:
y_trn = np.array([1.2, 0.5, 1.4, 2.2, 3.5, 5.9])

In [376]:
knn = KNN_Estimator(k=3, task='regression')
knn.fit(X_trn, y_trn)

In [377]:
pred = knn.predict(X_tst)
pred

array([3.53333333, 3.86666667, 2.83333333, 3.3       ])

In [378]:
knn.k_nearest_neighbors_

[array([3.5, 5.9, 1.2]),
 array([2.2, 3.5, 5.9]),
 array([1.4, 1.2, 5.9]),
 array([3.5, 5.9, 0.5])]

In [379]:
a = [2, 4, 7, 9, 14, 20, 21, 22]
b = [3, 5, 8, 10, 14, 20, 21, 30]

def seven(a,b):
    for i in range(len(a)):
            if a[i] % 7 == 0 and b[i] % 7 == 0:
                print(a[i])
            else:
                pass

In [380]:
seven(a,b)

14
21


In [381]:
def find(lst1, lst2):
    lst1 = [i for i in lst1 if i % 7 == 0]
    lst2 = [i for i in lst2 if i % 7 == 0]
    return set(lst1) & set(lst2)

find(a,b)

{14, 21}

In [382]:
a = ["a", "b", "c", "d", "e", "f"]
b = [1, 0, 9, 3, 2, 0]

In [383]:
def get_sorted(lst1, lst2):
    lst_tmp = [x for x in zip(lst1, lst2)]
    lst_tmp.sort(key=lambda x: x[1])
    return [x[0] for x in lst_tmp]

In [384]:
get_sorted(a,b)

['b', 'f', 'a', 'e', 'd', 'c']

In [385]:
z = zip(a,b)
z = list(z)
z

[('a', 1), ('b', 0), ('c', 9), ('d', 3), ('e', 2), ('f', 0)]

In [386]:
#Pipeline

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [387]:
os.getcwd()

'f:\\KULIKOV\\ML\\MLCourseAI\\Data Analytics 2023 V2\\ML DA may 2023'

In [388]:
#Loaddataset
from sklearn.datasets import fetch_covtype
import pandas as pd
#https://scikit-learn.ru/7-2-real-world-datasets/
#https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_outlier_detection_bench.html#sphx-glr-auto-examples-miscellaneous-plot-outlier-detection-bench-py

X, y = fetch_covtype(return_X_y=True, as_frame=True)
s = (y == 2) + (y == 4)
X = X.loc[s]
y = y.loc[s]
y = (y != 2).astype(np.int32)

#X, _, y, _ = train_test_split(X, y, train_size=0.05, stratify=y, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [389]:
#Scaling
standardscaler = StandardScaler()
standardscaler.fit(X_train)
# Transform ONLY to train and test, not to X. Because in X - other meanings.
X_train_standardscaled = standardscaler.transform(X_train)  
X_test_standardscaled = standardscaler.transform(X_test)

In [390]:
#TFitting
logreg = LogisticRegression(solver='lbfgs', max_iter=400)
logreg.fit(X_train_standardscaled, y_train)
print("Score train: {:.3f}".format(
logreg.score(X_train_standardscaled, y_train)))
print("Score test: {:.3f}".format(
logreg.score(X_train_standardscaled, y_train)))

Score train: 1.000
Score test: 1.000


In [391]:
#Getting score
train_score = logreg.score(X_train_standardscaled, y_train)
test_score = logreg.score(X_test_standardscaled, y_test)
print(f"Score train: {train_score:.3f}")
print(f"Score test: {test_score:.3f}")

Score train: 1.000
Score test: 1.000


In [392]:
print("Score train: %.3f" % train_score)
print("Score test: %.3f" % test_score)

Score train: 1.000
Score test: 1.000


In [393]:
#Predictiong
logreg_pred = logreg.predict(X_test_standardscaled)
logreg_pred

array([0, 0, 0, ..., 0, 0, 0])

In [394]:
#Predicting probability
logreg_pred = logreg.predict_proba(X_test_standardscaled)
logreg_pred

array([[9.99999997e-01, 3.43359067e-09],
       [9.99999998e-01, 2.17678902e-09],
       [1.00000000e+00, 1.21786506e-12],
       ...,
       [1.00000000e+00, 2.28650412e-10],
       [1.00000000e+00, 3.19741413e-12],
       [1.00000000e+00, 6.61051203e-19]])

In [395]:
#Constant
intercept = np.round(logreg.intercept_, 3)
intercept

array([-25.609])

In [396]:
#Coef
coef = np.round(logreg.coef_, 3)
coef

array([[-6.863e+00, -4.450e-01,  4.950e-01, -4.370e-01, -3.320e-01,
         4.905e+00,  9.090e-01,  1.671e+00, -1.179e+00,  1.724e+00,
        -2.480e-01,  1.000e-03, -2.350e-01,  1.721e+00,  2.110e-01,
        -4.000e-03,  5.290e-01,  1.100e-02,  1.590e-01,  1.800e-01,
        -1.000e-03, -1.000e-03, -1.000e-03, -1.590e-01,  5.040e-01,
        -1.900e-02, -1.000e-03,  1.890e-01,  0.000e+00, -6.400e-02,
         2.500e-02, -5.600e-02, -0.000e+00, -3.900e-02,  0.000e+00,
         1.000e-03, -0.000e+00, -1.100e-02,  0.000e+00, -4.000e-03,
         1.000e-03,  0.000e+00, -8.100e-02, -2.860e-01,  1.000e-03,
         2.000e-03,  1.000e-03,  0.000e+00,  0.000e+00,  0.000e+00,
         0.000e+00,  1.000e-03,  0.000e+00,  0.000e+00]])

In [397]:
#Classes
classes = np.round(logreg.classes_, 3)
classes

array([0, 1])

In [398]:
logreg.coef_[0]

array([-6.86335327e+00, -4.44872949e-01,  4.95230289e-01, -4.37066814e-01,
       -3.32491131e-01,  4.90456158e+00,  9.09293036e-01,  1.67094079e+00,
       -1.17893584e+00,  1.72418089e+00, -2.47727420e-01,  1.30181700e-03,
       -2.34978485e-01,  1.72075737e+00,  2.11016259e-01, -3.53628838e-03,
        5.28561367e-01,  1.07845255e-02,  1.59423234e-01,  1.79662757e-01,
       -9.17802709e-04, -1.34119874e-03, -7.05269441e-04, -1.59042417e-01,
        5.03849394e-01, -1.86944028e-02, -9.44020346e-04,  1.89178845e-01,
        0.00000000e+00, -6.44847844e-02,  2.51200087e-02, -5.58369297e-02,
       -2.29150022e-04, -3.91221385e-02,  6.21205535e-05,  8.62002215e-04,
       -2.76901069e-04, -1.08280955e-02,  2.09934128e-04, -4.11126506e-03,
        6.19675404e-04,  2.48454983e-04, -8.14092955e-02, -2.86473120e-01,
        5.78602864e-04,  1.72477402e-03,  5.40896563e-04,  3.61707208e-04,
        8.77095533e-05,  1.56332368e-04,  0.00000000e+00,  6.12474037e-04,
        4.17828904e-04,  

In [399]:
#Coef for features
for i,feature in zip(logreg.coef_[0], X_train.columns):
    print(feature, i)

Elevation -6.863353273500244
Aspect -0.44487294882555245
Slope 0.4952302886105166
Horizontal_Distance_To_Hydrology -0.4370668139499791
Vertical_Distance_To_Hydrology -0.3324911313472247
Horizontal_Distance_To_Roadways 4.904561581176035
Hillshade_9am 0.9092930360668068
Hillshade_Noon 1.6709407882924503
Hillshade_3pm -1.1789358445753517
Horizontal_Distance_To_Fire_Points 1.7241808871037037
Wilderness_Area_0 -0.2477274195599548
Wilderness_Area_1 0.0013018169964458474
Wilderness_Area_2 -0.23497848521813966
Wilderness_Area_3 1.7207573721353782
Soil_Type_0 0.2110162589754352
Soil_Type_1 -0.003536288377687285
Soil_Type_2 0.5285613673827958
Soil_Type_3 0.010784525517597626
Soil_Type_4 0.1594232336686379
Soil_Type_5 0.1796627570199406
Soil_Type_6 -0.0009178027089207292
Soil_Type_7 -0.0013411987397579893
Soil_Type_8 -0.0007052694408819774
Soil_Type_9 -0.1590424166118983
Soil_Type_10 0.5038493940734217
Soil_Type_11 -0.018694402803804383
Soil_Type_12 -0.000944020346388549
Soil_Type_13 0.1891788445

In [400]:
#Bootstrap and out-of-bag
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor

In [401]:
X

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2579.0,132.0,6.0,300.0,-15.0,67.0,230.0,237.0,140.0,6031.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,2886.0,151.0,11.0,371.0,26.0,5253.0,234.0,240.0,136.0,4051.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,2742.0,134.0,22.0,150.0,69.0,3215.0,248.0,224.0,92.0,6091.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576882,2617.0,29.0,13.0,390.0,128.0,2081.0,215.0,211.0,130.0,592.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
576883,2614.0,21.0,13.0,379.0,125.0,2051.0,211.0,212.0,135.0,618.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
576884,2612.0,17.0,13.0,371.0,123.0,2021.0,208.0,211.0,138.0,644.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
576885,2610.0,16.0,14.0,365.0,110.0,1991.0,208.0,211.0,138.0,671.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [402]:
#Toy regression

var_lst = ['Aspect', 'Slope',
'Horizontal_Distance_To_Hydrology', 'Elevation']
toy_data = X[var_lst].head(10)
toy_labels = toy_data.pop('Elevation')

In [403]:
toy_data.shape

(10, 3)

In [404]:
#Bootstrap - values can be repeated
sample_indices = np.arange(toy_data.shape[0])
rng = np.random.RandomState(42)
bootstrap_indices = rng.choice(sample_indices,
                               size=sample_indices.shape[0],
                               replace=True)
display(bootstrap_indices.shape)
toy_data_boot = toy_data.iloc[bootstrap_indices]
toy_labels_boot = toy_labels.iloc[bootstrap_indices]
display(toy_data_boot)
display(toy_labels_boot)

(10,)

Unnamed: 0,Aspect,Slope,Horizontal_Distance_To_Hydrology
27,148.0,16.0,323.0
11,151.0,11.0,371.0
28,135.0,1.0,212.0
12,134.0,22.0,150.0
27,148.0,16.0,323.0
41,346.0,2.0,0.0
5,132.0,6.0,300.0
27,148.0,16.0,323.0
28,135.0,1.0,212.0
12,134.0,22.0,150.0


27    2962.0
11    2886.0
28    2811.0
12    2742.0
27    2962.0
41    2570.0
5     2579.0
27    2962.0
28    2811.0
12    2742.0
Name: Elevation, dtype: float64

In [405]:
toy_labels_boot.nunique()

6

In [406]:
#Out-of-bag:
toy_data_out_boot = toy_data[~toy_data.index.isin(
toy_data_boot.index)]
toy_labels_out_boot = toy_labels[~toy_labels.index.isin(
toy_data_boot.index)]
display(toy_data_out_boot)
display(toy_labels_out_boot)

Unnamed: 0,Aspect,Slope,Horizontal_Distance_To_Hydrology
2,139.0,9.0,268.0
3,155.0,18.0,242.0
21,209.0,17.0,216.0
35,45.0,19.0,242.0


2     2804.0
3     2785.0
21    2880.0
35    2900.0
Name: Elevation, dtype: float64

In [407]:
#bootstrap and out-of-bag function
def generate_bootstrap(rng, X, y, verbose=True):
        sample_indices = np.arange(X.shape[0])
        bootstrap_indices = rng.choice(sample_indices,
                                       size=sample_indices.shape[0],
                                       replace=True)
        X_boot = X.iloc[bootstrap_indices]
        y_boot = y.iloc[bootstrap_indices]
        X_out_boot = X[~X.index.isin(X_boot.index)]
        y_out_boot = y[~y.index.isin(X_boot.index)]
        if verbose:
            print(f"{i}-iteration")
            print(f"indexes in bootstrap: {X_boot.index.tolist()}")
            print(f"indexes in out-of-bag: {X_out_boot.index.tolist()}\n")
        return X_boot, y_boot, X_out_boot, y_out_boot

In [408]:
rng = np.random.RandomState(42)
standardscaler = StandardScaler()
tree = DecisionTreeRegressor(random_state=42)
test_score_lst = []
for i in range(1, 4):
    X_boot, y_boot, X_out_boot, y_out_boot = generate_bootstrap(
    rng, toy_data, toy_labels)
    tree.fit(X_boot, y_boot)
    test_score = tree.score(X_out_boot, y_out_boot)
    test_score_lst.append(test_score)

1-iteration
indexes in bootstrap: [27, 11, 28, 12, 27, 41, 5, 27, 28, 12]
indexes in out-of-bag: [2, 3, 21, 35]

2-iteration
indexes in bootstrap: [11, 28, 28, 5, 21, 12, 3, 28, 21, 3]
indexes in out-of-bag: [2, 27, 35, 41]

3-iteration
indexes in bootstrap: [12, 2, 41, 21, 35, 2, 41, 5, 27, 11]
indexes in out-of-bag: [3, 28]



In [409]:
print(test_score_lst)

[-9.155198273820488, -0.9437390247197082, -184.94378698224853]


In [410]:
mean_r2 = sum(test_score_lst) / len(test_score_lst)
print("Mean Value R2: %.3f" % mean_r2)

Mean Value R2: -65.014


In [411]:
#R2 in list
import statistics
mean_r2 = statistics.fmean(test_score_lst)
print("Mean Value R2: %.3f" % mean_r2)

Mean Value R2: -65.014


In [412]:

rng = np.random.RandomState(42)

standardscaler = StandardScaler()

logreg = LogisticRegression(solver='lbfgs', max_iter=200)

test_score_lst = []

for i in range(15):
    X_boot, y_boot, X_out_boot, y_out_boot = generate_bootstrap(
        rng, X_train, y_train, verbose=False)
    standardscaler.fit(X_boot)
    X_boot_scaled = standardscaler.transform(X_boot)
    X_out_boot_scaled = standardscaler.transform(X_out_boot)
    logreg.fit(X_boot_scaled, y_boot)
   #     X_out_boot_scaled)
    test_score = logreg.score(
        X_out_boot_scaled, y_out_boot)
    test_score_lst.append(test_score)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [413]:
mean_acc = statistics.fmean(test_score_lst)
print("Mean_acc: %.3f" % mean_acc)

Mean_acc: 1.000


In [414]:
test_score_lst

[0.9994562188175479,
 0.9995115730062681,
 0.999687784218171,
 0.9996344880802501,
 0.9995365068978679,
 0.9996464797541674,
 0.9995660134264596,
 0.9994980124004504,
 0.9996871982483102,
 0.9994707127734651,
 0.9996194049123952,
 0.9995658250003392,
 0.9996463932107497,
 0.9996208684819845,
 0.9994968998150767]

In [415]:
#bias_variance

from mlxtend.data import boston_housing_data
boston_housing_data()
X, y = boston_housing_data()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=123)

#generating bootstrap samples based on a training sample
def _draw_bootstrap_sample(rng, X, y):
    sample_indices = np.arange(X.shape[0])
    bootstrap_indices = rng.choice(sample_indices,
                                    size=sample_indices.shape[0],
                                    replace=True)
    return X[bootstrap_indices], y[bootstrap_indices]

In [416]:
#function that calculates the average expected value of the function of losses, averaged bias, averaged variance

def bias_variance_decomp(estimator, X_train, y_train, X_test, y_test,
                        num_rounds=200, random_seed=None):
#Author: Sebastian Raschka https://github.com/rasbt

        rng = np.random.RandomState(random_seed)
        all_pred = np.zeros((num_rounds, y_test.shape[0]), dtype=int)

        for i in range(num_rounds):
            X_boot, y_boot = _draw_bootstrap_sample(rng, X_train, y_train)
            pred = estimator.fit(X_boot, y_boot).predict(X_test)
            all_pred[i] = pred
            

        avg_expected_loss = np.apply_along_axis(
            lambda x:
            ((x - y_test) ** 2).mean(),
            axis=1,
            arr=all_pred).mean()

        main_predictions = np.mean(all_pred, axis=0)
        avg_bias = np.sum((main_predictions - y_test) ** 2) / y_test.size
        avg_var = np.sum((main_predictions - all_pred) ** 2) / all_pred.size
        return avg_expected_loss, avg_bias, avg_var

In [417]:
#max depth tree
tree = DecisionTreeRegressor(random_state=123)
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    tree, X_train, y_train, X_test, y_test,
    random_seed=123)

ttl = "avg_expected_loss: %.3f"
print(ttl % avg_expected_loss)
print("avg_bias: %.3f" % avg_bias)
print("avg_var: %.3f" % avg_var)

avg_expected_loss: 31.756
avg_bias: 13.856
avg_var: 17.900


In [418]:
#depth=1 tree
tree2 = DecisionTreeRegressor(random_state=123, max_depth=1)
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    tree2, X_train, y_train, X_test, y_test,
    random_seed=123)

print(ttl % avg_expected_loss)
print("avg_bias: %.3f" % avg_bias)
print("avg_var: %.3f" % avg_var)

avg_expected_loss: 53.199
avg_bias: 41.990
avg_var: 11.209


In [419]:
#what is bias_variance_decomp
X_train = np.array([[29.1, 19000.28, 15],
                    [67.3, 48800.81, 45],
                    [77.9, 89800.55, 188]])
X_test = np.array([[11.9, 89900.28, 199],
                    [37.8, 10600.82, 95],
                    [77.2, 99700.22, 87]])
y_train = np.array([22.6, 89.5, 17.3])
y_test = np.array([12.4, 96.9, 107.9])

rng = np.random.RandomState(123)

all_pred = np.zeros((3, y_test.shape[0]), dtype=int)
all_pred

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [420]:
for i in range(3):
    X_boot, y_boot = _draw_bootstrap_sample(rng, X_train, y_train)
    pred = tree.fit(X_boot, y_boot).predict(X_test)
    all_pred[i] = pred
    
display(all_pred)
y_test

array([[17, 89, 89],
       [17, 22, 22],
       [17, 89, 89]])

array([ 12.4,  96.9, 107.9])

In [421]:
mse_first_iter = (((17 - 12.4)**2) + ((89 - 96.9)**2) +
                 ((89 - 107.9)**2)) / 3
mse_first_iter

146.92666666666676

In [424]:
mse = np.apply_along_axis(
    lambda x:
    ((x - y_test)**2).mean(),
    axis=1,
    arr=all_pred)
mse

array([ 146.92666667, 4336.66      ,  146.92666667])

In [425]:
avg_expected_loss = np.apply_along_axis(
    lambda x:
    ((x - y_test)**2).mean(),
    axis=1,
    arr=all_pred).mean()
avg_expected_loss

1543.504444444445

In [426]:
main_predictions = np.mean(all_pred, axis=0)
main_predictions

array([17.        , 66.66666667, 66.66666667])

In [427]:
avg_bias = np.sum((main_predictions - y_test)**2) / y_test.size
avg_bias

878.4674074074074

In [428]:
avg_var = np.sum((main_predictions - all_pred)**2) / all_pred.size
avg_var

665.037037037037

In [430]:
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    tree, X_train, y_train, X_test, y_test, num_rounds=3,
    random_seed=123)

print(ttl % avg_expected_loss)
print("avg_bias: %.3f" % avg_bias)
print("avg_var: %.3f" % avg_var)

avg_expected_loss: 1543.504
avg_bias: 878.467
avg_var: 665.037


In [434]:
#MissingIndicator SimpleImputer
from sklearn.impute import MissingIndicator, SimpleImputer

data = pd.read_csv('./HPData.csv')

data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [437]:
train, test, y_train, y_test = train_test_split(
    data.drop('SalePrice', axis=1),
    data['SalePrice'],
    test_size=.3,
    random_state=100)

In [463]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1022 entries, 210 to 792
Data columns (total 81 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              1022 non-null   int64  
 1   MSSubClass      1022 non-null   int64  
 2   MSZoning        1022 non-null   object 
 3   LotFrontage     845 non-null    float64
 4   LotArea         1022 non-null   int64  
 5   Street          1022 non-null   object 
 6   Alley           64 non-null     object 
 7   LotShape        1022 non-null   object 
 8   LandContour     1022 non-null   object 
 9   Utilities       1022 non-null   object 
 10  LotConfig       1022 non-null   object 
 11  LandSlope       1022 non-null   object 
 12  Neighborhood    1022 non-null   object 
 13  Condition1      1022 non-null   object 
 14  Condition2      1022 non-null   object 
 15  BldgType        1022 non-null   object 
 16  HouseStyle      1022 non-null   object 
 17  OverallQual     1022 non-null   int64

In [464]:
train['LotFrontage'].isnull().sum()

177

In [469]:
train.isnull().sum()

Id                        0
MSSubClass                0
MSZoning                  0
LotFrontage             177
LotArea                   0
                       ... 
YrSold                    0
SaleType                  0
SaleCondition             0
miss_ind_Alley            0
miss_ind_LotFrontage      0
Length: 82, dtype: int64

In [470]:
miss_ind = MissingIndicator()
miss_ind.fit(train[['LotFrontage']])

In [467]:
train['miss_ind_LotFrontage'] = miss_ind.transform(train[['LotFrontage']])
test['miss_ind_LotFrontage'] = miss_ind.transform(test[['LotFrontage']])

In [477]:
train[train["miss_ind_LotFrontage"] == True]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,miss_ind_Alley,miss_ind_LotFrontage
342,343,90,RL,,8544,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2006,WD,Normal,True,True
308,309,30,RL,,12342,Pave,,IR1,Lvl,AllPub,...,,,,0,3,2009,WD,Normal,True,True
452,453,60,RL,,9303,Pave,,IR1,Lvl,AllPub,...,,,,0,7,2007,WD,Normal,True,True
405,406,20,RL,,9991,Pave,,IR1,Lvl,AllPub,...,,GdWo,,0,6,2009,WD,Normal,True,True
1443,1444,30,RL,,8854,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2009,WD,Normal,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944,945,20,RL,,14375,Pave,,IR1,Lvl,NoSeWa,...,,,,0,1,2009,COD,Abnorml,True,True
1262,1263,50,RL,,11250,Pave,,Reg,Lvl,AllPub,...,,,,0,11,2009,WD,Normal,True,True
967,968,20,RL,,7390,Pave,,IR1,Lvl,AllPub,...,,,,0,7,2008,WD,Normal,True,True
612,613,60,RL,,11885,Pave,,Reg,Lvl,AllPub,...,,,,0,11,2009,WD,Normal,True,True


In [479]:
train["LotFrontage"].median()

68.0

In [482]:
#For numerical
simp = SimpleImputer(strategy='median')
simp.fit(train[['LotFrontage']])

test['LotFrontage'] = simp.transform(test[['LotFrontage']])
test['LotFrontage'] = simp.transform(test[['LotFrontage']])

0

In [484]:
train['LotFrontage'].isnull().sum()

0

In [485]:
test['LotFrontage'].isnull().sum()

0

In [505]:
train[train['LotFrontage'] == 69]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,miss_ind_Alley,miss_ind_LotFrontage
719,720,20,RL,69.0,9920,Pave,Grvl,IR1,Lvl,AllPub,...,,,,0,5,2006,WD,Normal,True,False
1214,1215,85,RL,69.0,10205,Pave,Grvl,IR1,Lvl,AllPub,...,,,,0,5,2006,WD,Normal,True,False
71,72,20,RL,69.0,7599,Pave,Grvl,Reg,Lvl,AllPub,...,,,,0,6,2007,WD,Normal,True,False
94,95,60,RL,69.0,9337,Pave,Grvl,IR1,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,True,False
129,130,20,RL,69.0,8973,Pave,Grvl,Reg,Lvl,AllPub,...,,MnWw,,0,7,2006,WD,Abnorml,True,False
246,247,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,,,,0,4,2006,WD,Normal,False,False
397,398,60,RL,69.0,7590,Pave,Grvl,Reg,Lvl,AllPub,...,,,,0,7,2007,WD,Normal,True,False
128,129,60,RL,69.0,7590,Pave,Grvl,Reg,Lvl,AllPub,...,,,,0,7,2006,WD,Normal,True,False
556,557,20,RL,69.0,14850,Pave,Grvl,IR1,Lvl,AllPub,...,,MnWw,,0,5,2006,WD,Normal,True,False


In [489]:
train['Alley']

210      NaN
318      NaN
239      NaN
986      NaN
1416     NaN
        ... 
802      NaN
53       NaN
350      NaN
79      Grvl
792      NaN
Name: Alley, Length: 1022, dtype: object

In [502]:
#For catagorical
cat_cols = ['Alley']
simp2 = SimpleImputer(strategy='most_frequent')
simp2.fit(train[cat_cols])

test[cat_cols] = simp2.transform(test[cat_cols])
train[cat_cols] = simp2.transform(train[cat_cols])

In [503]:
train['Alley'].isnull().sum()

0

In [None]:
#Dummy with OneHotEncoder