In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

sns.set_style("darkgrid")

from sklearn.datasets import load_boston

boston = load_boston()
x = boston.data
y = boston.target
columns = boston.feature_names

df = pd.DataFrame(boston.data)
df.columns = columns
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [83]:
df.shape

(506, 13)

In [84]:
z = np.abs(stats.zscore(df))
print(z)

[[0.41978194 0.28482986 1.2879095  ... 1.45900038 0.44105193 1.0755623 ]
 [0.41733926 0.48772236 0.59338101 ... 0.30309415 0.44105193 0.49243937]
 [0.41734159 0.48772236 0.59338101 ... 0.30309415 0.39642699 1.2087274 ]
 ...
 [0.41344658 0.48772236 0.11573841 ... 1.17646583 0.44105193 0.98304761]
 [0.40776407 0.48772236 0.11573841 ... 1.17646583 0.4032249  0.86530163]
 [0.41500016 0.48772236 0.11573841 ... 1.17646583 0.44105193 0.66905833]]


In [85]:
threshold = 3
print(np.where(z > threshold))

(array([ 55,  56,  57, 102, 141, 142, 152, 154, 155, 160, 162, 163, 199,
       200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 216, 218, 219,
       220, 221, 222, 225, 234, 236, 256, 257, 262, 269, 273, 274, 276,
       277, 282, 283, 283, 284, 347, 351, 352, 353, 353, 354, 355, 356,
       357, 358, 363, 364, 364, 365, 367, 369, 370, 372, 373, 374, 374,
       380, 398, 404, 405, 406, 410, 410, 411, 412, 412, 414, 414, 415,
       416, 418, 418, 419, 423, 424, 425, 426, 427, 427, 429, 431, 436,
       437, 438, 445, 450, 454, 455, 456, 457, 466]), array([ 1,  1,  1, 11, 12,  3,  3,  3,  3,  3,  3,  3,  1,  1,  1,  1,  1,
        1,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5,  3,  3,  1,  5,
        5,  3,  3,  3,  3,  3,  3,  1,  3,  1,  1,  7,  7,  1,  7,  7,  7,
        3,  3,  3,  3,  3,  5,  5,  5,  3,  3,  3, 12,  5, 12,  0,  0,  0,
        0,  5,  0, 11, 11, 11, 12,  0, 12, 11, 11,  0, 11, 11, 11, 11, 11,
       11,  0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]

In [86]:
df_without_outliers = df[(z<threshold).all(axis=1)]
df_without_outliers.shape

(415, 13)

In [87]:
# Which datas i lost? I m going to find their index number and throw from  in  Target data.
lost_index_df = df[~df.index.isin(df_without_outliers.index)]
lost_index_df.head()
# ~ ---------->  False to True and Trues to False

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
55,0.01311,90.0,1.22,0.0,0.403,7.249,21.9,8.6966,5.0,226.0,17.9,395.93,4.81
56,0.02055,85.0,0.74,0.0,0.41,6.383,35.7,9.1876,2.0,313.0,17.3,396.9,5.77
57,0.01432,100.0,1.32,0.0,0.411,6.816,40.5,8.3248,5.0,256.0,15.1,392.9,3.95
102,0.22876,0.0,8.56,0.0,0.52,6.405,85.4,2.7147,5.0,384.0,20.9,70.8,10.63
141,1.62864,0.0,21.89,0.0,0.624,5.019,100.0,1.4394,4.0,437.0,21.2,396.9,34.41


In [88]:
index = lost_index_df.index.values.tolist()
index

[55,
 56,
 57,
 102,
 141,
 142,
 152,
 154,
 155,
 160,
 162,
 163,
 199,
 200,
 201,
 202,
 203,
 204,
 208,
 209,
 210,
 211,
 212,
 216,
 218,
 219,
 220,
 221,
 222,
 225,
 234,
 236,
 256,
 257,
 262,
 269,
 273,
 274,
 276,
 277,
 282,
 283,
 284,
 347,
 351,
 352,
 353,
 354,
 355,
 356,
 357,
 358,
 363,
 364,
 365,
 367,
 369,
 370,
 372,
 373,
 374,
 380,
 398,
 404,
 405,
 406,
 410,
 411,
 412,
 414,
 415,
 416,
 418,
 419,
 423,
 424,
 425,
 426,
 427,
 429,
 431,
 436,
 437,
 438,
 445,
 450,
 454,
 455,
 456,
 457,
 466]

In [89]:
y # Target Values

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [90]:
# I leave the error to understand why we added reversed
y_copy = y.copy()
for i in index:
    print(i)
    y_copy = np.delete(y_copy, i)
    print(y_copy.shape)

55
(505,)
56
(504,)
57
(503,)
102
(502,)
141
(501,)
142
(500,)
152
(499,)
154
(498,)
155
(497,)
160
(496,)
162
(495,)
163
(494,)
199
(493,)
200
(492,)
201
(491,)
202
(490,)
203
(489,)
204
(488,)
208
(487,)
209
(486,)
210
(485,)
211
(484,)
212
(483,)
216
(482,)
218
(481,)
219
(480,)
220
(479,)
221
(478,)
222
(477,)
225
(476,)
234
(475,)
236
(474,)
256
(473,)
257
(472,)
262
(471,)
269
(470,)
273
(469,)
274
(468,)
276
(467,)
277
(466,)
282
(465,)
283
(464,)
284
(463,)
347
(462,)
351
(461,)
352
(460,)
353
(459,)
354
(458,)
355
(457,)
356
(456,)
357
(455,)
358
(454,)
363
(453,)
364
(452,)
365
(451,)
367
(450,)
369
(449,)
370
(448,)
372
(447,)
373
(446,)
374
(445,)
380
(444,)
398
(443,)
404
(442,)
405
(441,)
406
(440,)
410
(439,)
411
(438,)
412
(437,)
414
(436,)
415
(435,)
416
(434,)
418
(433,)
419
(432,)
423
(431,)
424
(430,)
425
(429,)
426
(428,)
427
(427,)
429


IndexError: index 429 is out of bounds for axis 0 with size 427

In [101]:
y_copy = y.copy()
for i in reversed(index):
    print(i)
    y_copy = np.delete(y_copy, i)
    print(y_copy.shape)

466
(505,)
457
(504,)
456
(503,)
455
(502,)
454
(501,)
450
(500,)
445
(499,)
438
(498,)
437
(497,)
436
(496,)
431
(495,)
429
(494,)
427
(493,)
426
(492,)
425
(491,)
424
(490,)
423
(489,)
419
(488,)
418
(487,)
416
(486,)
415
(485,)
414
(484,)
412
(483,)
411
(482,)
410
(481,)
406
(480,)
405
(479,)
404
(478,)
398
(477,)
380
(476,)
374
(475,)
373
(474,)
372
(473,)
370
(472,)
369
(471,)
367
(470,)
365
(469,)
364
(468,)
363
(467,)
358
(466,)
357
(465,)
356
(464,)
355
(463,)
354
(462,)
353
(461,)
352
(460,)
351
(459,)
347
(458,)
284
(457,)
283
(456,)
282
(455,)
277
(454,)
276
(453,)
274
(452,)
273
(451,)
269
(450,)
262
(449,)
257
(448,)
256
(447,)
236
(446,)
234
(445,)
225
(444,)
222
(443,)
221
(442,)
220
(441,)
219
(440,)
218
(439,)
216
(438,)
212
(437,)
211
(436,)
210
(435,)
209
(434,)
208
(433,)
204
(432,)
203
(431,)
202
(430,)
201
(429,)
200
(428,)
199
(427,)
163
(426,)
162
(425,)
160
(424,)
155
(423,)
154
(422,)
152
(421,)
142
(420,)
141
(419,)
102
(418,)
57
(417,)
56
(416,)
55
(415,)


In [102]:
y_copy.shape

(415,)

In [103]:
# Original  
df = sm.add_constant(df)
reg = sm.OLS(endog=y, exog=df, missing="drop")
results = reg.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Sat, 30 Jan 2021   Prob (F-statistic):          6.72e-135
Time:                        23:09:31   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.4595      5.103      7.144      0.0

In [104]:
# without outliers
df_without_outliers = sm.add_constant(df_without_outliers)
reg = sm.OLS(endog=y_copy, exog=df_without_outliers, missing="drop")
results = reg.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.744
Model:                            OLS   Adj. R-squared:                  0.736
Method:                 Least Squares   F-statistic:                     97.37
Date:                Sat, 30 Jan 2021   Prob (F-statistic):          7.59e-111
Time:                        23:09:35   Log-Likelihood:                -1172.6
No. Observations:                 415   AIC:                             2371.
Df Residuals:                     402   BIC:                             2424.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         25.9707      5.492      4.729      0.0