In [1]:
import sys
sys.path.append('')

from read_csv import *
from missing_values import *

In [2]:
df = read_file('df_master.csv', verbosity = 0)

In [3]:
check_nulls(df, verbosity = 0)

Number of nulls in impl_volatility is : 12763
Percentage of nulls in impl_volatility is : 8.902638077035755
There are too many distinct values to fill for column impl_volatility, avoid filling missing values with mean, median or mode. Drop the missing values if possible
No nulls present in ['cp_flag', 'strike_price', 'best_bid', 'best_offer', 'expiration_date', 'date', 'time_to_expiry', 'PRC', 'SOFR']


In [4]:
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV,GridSearchCV

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from lightgbm import LGBMRegressor

from sklearn.metrics import accuracy_score,classification_report,r2_score,mean_squared_error,mean_absolute_error

In [5]:
import warnings
warnings.filterwarnings('ignore')

from sklearn import metrics

data = []

def model(model, name, X_train, X_test, y_train, y_test):
    
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    
    score = round(r2_score(y_test, y_pred), 4)
    print(f'R2 score {score}')
    
    cv_score = round(cross_val_score(model, X_train, y_train, scoring = 'r2', cv = 5).mean(), 4)
    print(f'CrossValidation score {cv_score}')
    
    mse = round(mean_squared_error(y_test, y_pred), 3)
    print(f'MSE: {mse}')

    data.append([name, score, cv_score, mse])

In [6]:
def fill_missing_values_regression(X, target_col):
    y = target_col.dropna()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 42)

    xgb = XGBRegressor(n_jobs = -1)
    print(model(xgb, 'XGBoost', X_train, X_test, y_train, y_test))

    lr = LinearRegression()
    print(model(lr, 'Linear Regression', X_train, X_test, y_train, y_test))

    dtr = DecisionTreeRegressor()
    print(model(dtr, 'Decision Tree Regressor', X_train, X_test, y_train, y_test))

    ada = AdaBoostRegressor()
    print(model(ada, 'Ada Boost', X_train, X_test, y_train, y_test))

    gbr = GradientBoostingRegressor()
    print(model(gbr, 'Gradient Boosting Regression', X_train, X_test, y_train, y_test))

    rf = RandomForestRegressor(n_jobs = -1)
    print(model(rf, 'Random Forest Regressor', X_train, X_test, y_train, y_test))

    lgbm = LGBMRegressor(n_jobs = 4)
    print(model(lgbm, 'Light GBM regressor', X_train, X_test, y_train, y_test))

    df_scores = pd.DataFrame(data, columns = ['model', 'r2', 'cv', 'mse']).sort_values(by='r2', ascending=False)
    display(df_scores)
    
    model.
    

def fill_missing_values(df, column_name, method = 'mean', verbosity = 1, inplace = False):

    if (column_name == None):
        raise Exception("Column name is required")
    
    if (method == 'mean' or method == 'median' or method == 'mode'):
        return fill_missing_values_descriptive(df, column_name, method, verbosity, inplace)
    
    if (method == 'interpolate'):
        return fill_missing_values_interpolation(df, column_name, inplace)
    
    if (method == 'ML'):      
        
        if(len(df[column_name].value_counts()) <= 10):
            column_names_not_imputation = get_uncorelated_columns_for_modelling(df, column_name)
            column_names_for_imputation = [i for i in df.columns if i not in column_names_not_imputation]
            model_df = drop_nulls_imputation(df[column_names_for_imputation], column_names_for_imputation)
            display(model_df)
            df[column_name] = fill_missing_values_classification(model_df, df[column_name])
        else:
            column_names_for_imputation = get_corelated_columns_for_modelling(df, column_name)
            model_df = drop_nulls_imputation(df[column_names_for_imputation], column_names_for_imputation)
            display(model_df)
            df[column_name] = fill_missing_values_regression(model_df, df[column_name])
            

In [7]:
fill_missing_values(df, column_name = 'impl_volatility', verbosity = 1, method = 'ML', inplace = False)

Unnamed: 0,strike_price,best_bid,best_offer,impl_volatility,time_to_expiry,PRC
7,1.0,0.00,0.02,2.552783,256,729.77
8,1.0,726.75,730.80,1.903996,382,729.77
9,1.0,0.01,0.02,2.170186,382,729.77
11,1.0,0.02,0.03,1.941874,529,729.77
19,1.0,0.00,0.01,2.414317,255,735.11
...,...,...,...,...,...,...
143357,1700.0,1037.25,1045.50,0.781574,595,793.53
143358,1700.0,170.95,178.85,0.792564,721,793.53
143359,1700.0,1064.35,1071.90,0.773554,721,793.53
143360,1700.0,185.00,190.80,0.791577,777,793.53


R2 score 1.0
CrossValidation score 1.0
MSE: 0.0
None
R2 score 1.0
CrossValidation score 1.0
MSE: 0.0
None
R2 score 1.0
CrossValidation score 1.0
MSE: 0.0
None
R2 score 0.9953
CrossValidation score 0.994
MSE: 0.001
None
R2 score 0.9999
CrossValidation score 0.9999
MSE: 0.0
None
R2 score 1.0
CrossValidation score 1.0
MSE: 0.0
None
R2 score 0.9996
CrossValidation score 0.9996
MSE: 0.0
None


Unnamed: 0,model,r2,cv,mse
0,XGBoost,1.0,1.0,0.0
1,Linear Regression,1.0,1.0,0.0
2,Decision Tree Regressor,1.0,1.0,0.0
5,Random Forest Regressor,1.0,1.0,0.0
4,Gradient Boosting Regression,0.9999,0.9999,0.0
6,Light GBM regressor,0.9996,0.9996,0.0
3,Ada Boost,0.9953,0.994,0.001


In [8]:
df['impl_volatility'].isnull().sum()

143362