In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seed NumPy's global random generator so the synthetic sample drawn below
# is the same on every run.
SEED = 42
np.random.seed(SEED)

In [3]:
# Base sample: 50 draws from a normal distribution with mean 50 and
# standard deviation 5.
N_POINTS = 50
data = np.random.normal(loc=50, scale=5, size=N_POINTS)

In [4]:
# Extreme values to inject into the sample: 100, 110, 120, 130.
outliers = list(range(100, 131, 10))

In [5]:
# Append the injected extreme values after the normally distributed points,
# giving one flat array (the 50 draws followed by the 4 extremes).
data_with_outliers = np.concatenate((data, outliers), axis=0)

In [6]:
# Wrap the combined sample in a single-column DataFrame named 'values'.
df = pd.DataFrame({'values': data_with_outliers})


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sample.
summary = df.describe()
print(summary)

           values
count   54.000000
mean    53.770954
std     18.307407
min     40.201649
25%     46.548451
50%     49.089898
75%     52.655493
max    130.000000


In [8]:
# Standardize the data: z = (value - mean) / std, used below to detect outliers.
# The mean and std are taken over the full column, injected extremes included
# (pandas' Series.std() is the sample standard deviation, ddof=1). That
# inflates std, so the value 100 only reaches z ~ 2.53.
values = df['values']
m, std = values.mean(), values.std()
df['zscore'] = values.sub(m).div(std)
df['zscore']


0    -0.070320
1    -0.243742
2    -0.029087
3     0.209980
4    -0.269930
5    -0.269926
6     0.225325
7     0.003617
8    -0.334199
9    -0.057799
10   -0.332545
11   -0.333177
12   -0.139897
13   -0.728522
14   -0.677078
15   -0.359548
16   -0.482598
17   -0.120155
18   -0.453973
19   -0.591699
20    0.194309
21   -0.267642
22   -0.187537
23   -0.595098
24   -0.354658
25   -0.175685
26   -0.520332
27   -0.103371
28   -0.370022
29   -0.285645
30   -0.370314
31    0.299902
32   -0.209666
33   -0.494855
34    0.018668
35   -0.539409
36   -0.148936
37   -0.741192
38   -0.568725
39   -0.152214
40   -0.004295
41   -0.159177
42   -0.237565
43   -0.288215
44   -0.609784
45   -0.402579
46   -0.331786
47    0.082735
48   -0.112133
49   -0.687490
50    2.525155
51    3.071382
52    3.617609
53    4.163836
Name: zscore, dtype: float64

In [9]:
# Flag outliers: a value is an outlier when its z-score is above 3 or below -3.
# Each selection keeps the original 'values' entries (with their row labels).
zscores = df['zscore']
pos_outliers = df.loc[zscores.gt(3), 'values']
neg_outliers = df.loc[zscores.lt(-3), 'values']


In [10]:
# Combine the high-side and low-side outliers into one Series
# (positive ones first), then show their values as a plain list.
all_outliers = pd.concat((pos_outliers, neg_outliers))
outlier_values = all_outliers.to_list()

print("All outlier values:\n", outlier_values)


All outlier values:
 [110.0, 120.0, 130.0]


In [11]:
# Natural-log transform of the raw values, stored as a new column.
# The log compresses the large values: 130 maps to about 4.87 while the
# bulk of the sample sits near 3.9.
log_values = np.log(df['values'])
df['log_values'] = log_values
df['log_values']

0     3.960500
1     3.898100
2     3.974781
3     4.053786
4     3.888329
5     3.888331
6     4.058649
7     3.985964
8     3.863938
9     3.964858
10    3.864573
11    3.864331
12    3.935931
13    3.699661
14    3.722687
15    3.854152
16    3.805236
17    3.942964
18    3.816830
19    3.759768
20    4.048793
21    3.889187
22    3.918753
23    3.758318
24    3.856047
25    3.923054
26    3.789743
27    3.948904
28    3.850080
29    3.882420
30    3.849966
31    4.081958
32    3.910672
33    3.800230
34    3.991069
35    3.781818
36    3.932694
37    3.693908
38    3.769516
39    3.931518
40    3.983270
41    3.929015
42    3.900391
43    3.881450
44    3.752028
45    3.837316
46    3.864864
47    4.012513
48    3.945808
49    3.718069
50    4.605170
51    4.700480
52    4.787492
53    4.867534
Name: log_values, dtype: float64

In [12]:
# Remove outliers based on z-scores.
#
# Rows are dropped with the same boolean mask that defines an outlier
# (|z| > 3), rather than rebuilding the outlier Series and matching rows
# back by float value with isin(). The rows kept are the same.
#
# NOTE: the z-scores come from a mean/std that include the extreme values,
# so 100 (z ~ 2.53) is below the cutoff and stays in the result (max = 100).
# The 'zscore' column in the result is also still relative to the full sample.
is_outlier = df['zscore'].abs() > 3

# Keep every unflagged row and renumber the index from 0.
df_no_outliers = df.loc[~is_outlier].reset_index(drop=True)

print(df_no_outliers.describe())


           values     zscore  log_values
count   51.000000  51.000000   51.000000
mean    49.875128  -0.212801    3.898861
std      8.521304   0.465457    0.138447
min     40.201649  -0.741192    3.693908
25%     45.930329  -0.428276    3.827073
50%     48.829315  -0.269926    3.888331
75%     51.798291  -0.107752    3.947356
max    100.000000   2.525155    4.605170
