In [3]:
import numpy as np
import pandas as pd
df=pd.read_csv('../../data/raw/outliers_homework.csv')
print(df)

           date  daily_return  daily_return_2
0    2022-01-03      0.001263        0.003834
1    2022-01-04     -0.020046       -0.009506
2    2022-01-05      0.004739       -0.000535
3    2022-01-06      0.009953        0.012539
4    2022-01-07      0.008872        0.009840
..          ...           ...             ...
110  2022-06-06     -0.010598       -0.001576
111  2022-06-07     -0.004461        0.002448
112  2022-06-08     -0.011624       -0.013451
113  2022-06-09     -0.006220       -0.002045
114  2022-06-10      0.006093        0.004984

[115 rows x 3 columns]


In [8]:
def detect_outliers_iqr(series,k=1.5):
    q1=series.quantile(0.25)
    q3=series.quantile(0.75)
    iqr=q3-q1
    lower=q1-k*iqr
    upper=q3+k*iqr
    return (series<lower)|(series>upper)
a=detect_outliers_iqr(df['daily_return'])
print(a)
a.sum()

0      False
1      False
2      False
3      False
4      False
       ...  
110    False
111    False
112    False
113    False
114    False
Name: daily_return, Length: 115, dtype: bool


np.int64(9)

In [7]:
def detect_outliers_zscore(series,threshold=3):
    mu=series.mean()
    sigma=series.std()
    z=(series-mu)/(sigma if sigma!=0 else 1)
    return z.abs()>threshold
b=detect_outliers_zscore(df['daily_return'])
print(b)
b.sum()

0      False
1      False
2      False
3      False
4      False
       ...  
110    False
111    False
112    False
113    False
114    False
Name: daily_return, Length: 115, dtype: bool


np.int64(5)

In [9]:
def winsorize_series(series,lower=0.05,upper=0.95):
    lo=series.quantile(lower)
    hi=series.quantile(upper)
    return series.clip(lower=lo,upper=hi)


In [11]:
df['daily_return_qr_outliers']=a
print(df)

           date  daily_return  daily_return_2  daily_return_qr_outliers
0    2022-01-03      0.001263        0.003834                     False
1    2022-01-04     -0.020046       -0.009506                     False
2    2022-01-05      0.004739       -0.000535                     False
3    2022-01-06      0.009953        0.012539                     False
4    2022-01-07      0.008872        0.009840                     False
..          ...           ...             ...                       ...
110  2022-06-06     -0.010598       -0.001576                     False
111  2022-06-07     -0.004461        0.002448                     False
112  2022-06-08     -0.011624       -0.013451                     False
113  2022-06-09     -0.006220       -0.002045                     False
114  2022-06-10      0.006093        0.004984                     False

[115 rows x 4 columns]


In [13]:
summ_all=df['daily_return'].describe()[['mean','50%','std']].rename({'50%':'median'})
summ_qr_filtered=df.loc[~df['daily_return_qr_outliers'],'daily_return'].describe()[['mean','50%','std']].rename({'50%':'median'})
summ_winsorized= winsorize_series(df['daily_return']).describe()[['mean','50%','std']].rename({'50%':'median'})
comp=pd.concat(
    {'all':summ_all,
     'filtered_iqr':summ_qr_filtered,
     'winsorized':summ_winsorized
        
    },axis=1
)
comp

Unnamed: 0,all,filtered_iqr,winsorized
mean,-0.001434,-3.9e-05,-0.000251
median,-0.000187,-0.0001,-0.000187
std,0.040579,0.009443,0.010623


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,r2_score
X_all=df[['daily_return']].to_numpy()
y_all=df[['daily_return_2']].to_numpy()
X_filtered= df.loc[~df['daily_return_qr_outliers'], ['daily_return']].to_numpy()
y_filtered=df.loc[~df['daily_return_qr_outliers'], ['daily_return_2']].to_numpy()
model_all=LinearRegression().fit(X_all,y_all)
model_filtered=LinearRegression().fit(X_filtered,y_filtered)
mae_all=mean_absolute_error(y_all,model_all.predict(X_all))
mae_filtered=mean_absolute_error(y_filtered,model_all.predict(X_filtered))
results = pd.DataFrame({
        'slope': [model_all.coef_[0], model_filtered.coef_[0]],
        'intercept': [model_all.intercept_, model_filtered.intercept_],
        'r2': [model_all.score(X_all, y_all), model_filtered.score(X_filtered, y_filtered)],
        'mae': [mae_all, mae_filtered]
    }, index=['all', 'filtered_iqr'])
print(results)

                             slope                intercept        r2  \
all           [0.6058687478180388]  [0.0002006627418449236]  0.961859   
filtered_iqr  [0.5896788395551325]  [-4.88346503029787e-05]  0.573566   

                   mae  
all           0.003951  
filtered_iqr  0.003847  


I choose iqr filter to move outliers. Honestly, this choice is only at my willingness. I can not find a reason for it since the raw data set is created at random. I think both iqr filter and z-score filter are ok. The assumption behind iqr filter is that the outliers are abnormal and the stock return should follow normal distribution. The results of iqr filter are shown in the table above: it lowers the standard deviation, which is expected; however, it is amazing that it lower the R square of the linearregression. If the assumption is wrong, the iqr filter will cause several important data points loss and significantly influence the linearregression model.