In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ogd-goi-test/X_Test_Data_Input.csv
/kaggle/input/ogd-goi-test/Y_Test_Data_Target.csv
/kaggle/input/ogd-goi/Y_Train_Data_Target.csv
/kaggle/input/ogd-goi/X_Train_Data_Input.csv
/kaggle/input/full-cleaned/full_cleaned.csv


In [2]:
import warnings
warnings.filterwarnings('ignore')
from scipy.stats.mstats import winsorize

In [3]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score , accuracy_score

# Data Read

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

def file_read_and_col_verification(path_x: str):
    xcolumns = ["ID", "Column0", "Column1", "Column2", "Column3", "Column4", "Column5", "Column6", "Column7", 
                "Column8", "Column9", "Column10", "Column11", "Column12", "Column13", "Column14", "Column15", 
                "Column16", "Column17", "Column18", "Column19", "Column20", "Column21"]
    try:
        xdf = pd.read_csv(path_x, header=0)
    except Exception as e:
        print(e)
        return 'file read error'
    
    cols = list(xdf.columns)
    if cols != xcolumns:
        return "columns mismatch, check if this is the correct file"
    xdf.drop(['Column9','Column14'], inplace=True, axis=1)  # almost 50% null
    xdf.drop(['Column4', 'Column11', 'Column12', 'Column13'], axis=1, inplace=True)  # highly correlated    
    return xdf

In [5]:
def df_x_cleaner(path: str, cols_list: list) -> pd.DataFrame:
    xdf = file_read_and_col_verification(path)
    ID = xdf['ID']
    xdf.drop(['ID'], axis=1, inplace=True)
    xdf['Column0'] = xdf['Column0'].fillna(1)
#     print('before imputing')
#     print(xdf.isnull().sum())
    imputed_df_sample = xdf.sample(n=100000, random_state=42)
    # Create a list of boolean conditions
    conditions = [imputed_df_sample[col].notnull() for col in cols_list]
    # Combine all conditions using logical AND
    combined_condition = np.logical_and.reduce(conditions)
    # Apply the combined condition to filter the DataFrame
    df_not_missing = imputed_df_sample[combined_condition]
    for i in range(len(cols_list)):
        print(f"####################pass{i}#######################")
        
        try:
            # Separate feature columns and target column
            X_train_sub = df_not_missing.drop(cols_list, axis=1)
            y_train_sub = df_not_missing[cols_list[i]]
            
            # Fit the model
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train_sub, y_train_sub.values.ravel())
            
            # Predict missing values
            X_full_missing = xdf[xdf[cols_list[i]].isnull()].drop(cols_list, axis=1)
            
            predicted_values = model.predict(X_full_missing)
            
        except Exception as e:
            print(e)
            return None
        
        # Prepare for merging
        xdf.reset_index(inplace=True)
        Temp_col = xdf[xdf[cols_list[i]].isnull()][['index', cols_list[i]]].reset_index(drop=True)
        Temp_col['predicted'] = np.nan  # Placeholder for predicted values

        prd = pd.DataFrame(predicted_values, columns=['predicted'])
        prd.reset_index(drop=True, inplace=True)  # Ensure indices match
        
        # Join Temp_col and prd on the row index
        Temp_col['predicted'] = prd['predicted']
        
        xdf = xdf.set_index('index').combine_first(Temp_col.set_index('index')).reset_index()
        xdf[cols_list[i]] = xdf[cols_list[i]].fillna(xdf['predicted'])
        xdf.drop(['predicted'], axis=1, inplace=True)
        
        # Rearrange xdf to match the desired order
        desired_order = ["Column0", "Column1", "Column2", "Column3", "Column5", "Column6", "Column7", 
                         "Column8", "Column10", "Column15","Column16", "Column17", "Column18", "Column19", 
                         "Column20", "Column21"]
        xdf = xdf[desired_order]
    xdf['ID'] = ID   
    print(xdf.isnull().sum())    
    return xdf


In [6]:
try:
    dfx= pd.read_csv('/kaggle/input/full-cleaned/full_cleaned.csv',header=0)
    
except:
    dfx = df_x_cleaner('/kaggle/input/ogd-goi/X_Train_Data_Input.csv', ['Column3', "Column5", "Column6", "Column8", "Column15"])
    dfx.to_csv('full_cleaned.csv')
yfx = pd.read_csv('/kaggle/input/ogd-goi/Y_Train_Data_Target.csv',header=0).drop('ID',axis=1)

In [7]:
dfx['Column0']= dfx['Column0'].astype(int)
dfx['Column2_winsorized'] = dfx['Column2_winsorized'].astype(int)
dfx['Column16']= dfx['Column16'].astype(int)
dfx['Column18']= dfx['Column18'].astype(int)

KeyError: 'Column2_winsorized'

In [None]:
dfx.drop('ID',axis=1,inplace=True)

In [None]:
x = dfx.copy(deep=True)
# dfx.skew()

# outlier detection

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="whitegrid")
num_columns = 6

for i in range(0, len(x.columns), num_columns):

    cols = x.columns[i:i + num_columns]

    plt.figure(figsize=(30, 6))
    sns.set(style="whitegrid")
    for j, column in enumerate(cols, 1):
        plt.subplot(1, num_columns, j)
        sns.boxplot(y=x[column])
        plt.title(column)
    
    plt.tight_layout()
    plt.show()


## - column0, column2, column5, column6, column7, 
## - column8,column10,  column15, column16,column17, 
## - column18, column19, column20, column21

except for column2 others have significant outliers

## columns19, 20 21 needs to balanced, outlier removal wont work on them

## Column1, Column2_winsorised, Column3, Column5_winsorised, Column6_winsorised, Column7_winsorised, Column8_winsorised ✅

In [None]:
x['Column2_winsorized'] = winsorize(x['Column2'], limits=[0.05, 0.05]) #already good
x['Column6_winsorized'] = winsorize(x['Column6'], limits=[0, 0.23]) # best possible
x['Column7_winsorized'] = winsorize(x['Column7'], limits=[0.22, 0.22]) #done works good for now
x['Column8_winsorized'] = winsorize(x['Column8'], limits=[0, 0.05]) # best possible
x['Column5_winsorized'] = winsorize(x['Column5'], limits=[0.075, 0.1]) # best i could do
x.drop(['Column2','Column5','Column6','Column7','Column8'],axis=1,inplace = True)

# x['Column0_winsorized'] = winsorize(x['Column0'], limits=[0.02, 0.02]) not working well
# x['Column10_winsorized'] = winsorize(x['Column10'], limits=[0, 0.2350]) # not working good, use anotehr technique
# x['Column15_winsorized'] = winsorize(x['Column15'], limits=[0.05, 0.05]) *1000000 # not working good, use anotehr technique
# x['Column18_winsorized'] = winsorize(x['Column18'], ) # not working good, use anotehr technique


# x['Column16_winsorized'] = winsorize(x['Column16'], limits=[0, 0.01]) not possible
# x['Column17_winsorized'] = winsorize(x['Column17'], limits=[0, 0.05]) not possible
# x['Column19_winsorized'] = winsorize(x['Column19'], limits=[0.05, 0.05]) not possible
# x['Column20_winsorized'] = winsorize(x['Column20'], limits=[0.05, 0.05]) not possible
# x['Column21_winsorized'] = winsorize(x['Column21'], limits=[0.05, 0.05]) not possible

for i in range(0, len(x.columns), num_columns):
    cols = x.columns[i:i + num_columns]
    plt.figure(figsize=(20, 5))
    sns.set(style="whitegrid")
    for j, column in enumerate(cols, 1):
        plt.subplot(1, num_columns, j)
        sns.boxplot(y=x[column])
        plt.title(column)
    plt.tight_layout()
    plt.show()

### tried robust scalar, no good

In [None]:

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
# x['Column0_scaled'] = scaler.fit_transform(x[['Column0']])
# x['Column5_scaled'] = scaler.fit_transform(x[['Column5']]) # best i could do
# x['Column10_scaled'] = scaler.fit_transform(x[['Column10']])
# x['Column15_scaled'] = scaler.fit_transform(x[['Column15']])
# x['Column18_scaled'] = scaler.fit_transform(x[['Column18']])

# log_cols = x[['Column0_scaled','Column5_scaled','Column10_scaled','Column15_scaled','Column18_scaled']]
# # x['Column16_winsorized'] = winsorize(x['Column16'], limits=[0, 0.01]) not possible
# # x['Column17_winsorized'] = winsorize(x['Column17'], limits=[0, 0.05]) not possible
# # x['Column19_winsorized'] = winsorize(x['Column19'], limits=[0.05, 0.05]) not possible
# # x['Column20_winsorized'] = winsorize(x['Column20'], limits=[0.05, 0.05]) not possible
# # x['Column21_winsorized'] = winsorize(x['Column21'], limits=[0.05, 0.05]) not possible

# for i in range(0, len(log_cols.columns), num_columns):
#     cols = log_cols.columns[i:i + num_columns]
#     plt.figure(figsize=(20, 5))
#     sns.set(style="whitegrid")
#     for j, column in enumerate(cols, 1):
#         plt.subplot(1, num_columns, j)
#         sns.boxplot(y=log_cols[column])
#         plt.title(column)
#     plt.tight_layout()
#     plt.show()

In [None]:
x.columns

In [None]:
# x.drop(['Column0', 'Column1', 'Column2', 'Column3', 'Column5', 'Column6',
#        'Column7', 'Column8', 'Column15', 'Column16', 'Column17',
#        'Column18', 'Column19', 'Column20', 'Column21',            'Column0_scaled', 'Column5_scaled'])

In [None]:
# x['Column2_winsorized'] = x['Column2_winsorized'].astype(int)
# x['Column0']= x['Column0'].astype(int)
# x['Column16']= x['Column16'].astype(int)
# x['Column18']= x['Column18'].astype(int)

In [None]:
x['Column3'].sum()

In [None]:
from imblearn.over_sampling import SMOTENC
from collections import Counter

# Assuming you have the data in a pandas DataFrame 'data'
cate = ['Column0','Column10','Column16','Column17','Column18','Column19','Column20','Column21']
# to be continued

smote_nc = SMOTENC(categorical_features=cate, random_state=42)

# Apply SMOTEnc
X_resampled, y_resampled = smote_nc.fit_resample(x, yfx)

# Convert back to DataFrame for convenience (optional)


In [None]:
y_resampled.shape #value_counts()

In [None]:
df_resampled = pd.DataFrame(X_resampled, columns=x.columns)
df_resampled['Target'] = y_resampled

# print(f"Original class distribution: {np.bincount(yfx['target'])}")
# print(f"Resampled class distribution: {np.bincount(y_resampled)}")
df_resampled.head()

In [None]:
x.skew()

In [None]:
df_resampled.skew() #[cate].nunique()

In [None]:
# sns.heatmap(dfx.corr())
sns.heatmap(x.corr())

In [None]:
dfx[["Column19"]].value_counts()

In [None]:
# x['Column0_log'] = np.log1p(x['Column0'])  # log1p is equivalent to log(1 + x), which handles zeroes

# # Check the transformed distribution
# plt.figure(figsize=(10, 6))
# sns.histplot(x['Column0_log'], bins=30, kde=True)
# plt.title('Log-Transformed Distribution of Column0')
# plt.xlabel('Log(Column0 + 1)')
# plt.ylabel('Frequency')
# plt.show()

In [None]:
# freq = x['Column0'].value_counts()
# weights = (1 / freq)*10000
# data['Column0_weight'] = data['Column0'].map(weights)

In [None]:
# x['Column0'].map(weights)*100000

In [None]:
# plt.figure(figsize=(10, 6))
# sns.boxplot(x['Column0'].map(weights)*100000)
# # plt.title('Log-Transformed Distribution of Column0')
# # plt.xlabel('Log(Column0 + 1)')
# plt.ylabel('Frequency')
# plt.show()

In [None]:
# x['Column0'].value_counts()

In [None]:
# Q1 = x["Column0"].quantile(0.25)
# Q3 = x["Column0"].quantile(0.75)
# IQR = Q3 - Q1

# # Find rows where any value is an outlier
# outliers = x["Column0"][((x["Column0"] < (Q1 - 1.5 * IQR)) | (x["Column0"] > (Q3 + 1.5 * IQR)))]




# skewness handling

In [None]:
# x.skew()

In [None]:
# skewness_handle = ["Column0","Column3","Column5","Column6","Column7","Column8","Column10","Column15","Column16","Column17","Column18","Column19","Column20","Column21"]

In [None]:
# x['Column5_transformed'] = np.arcsinh(x['Column5'])
# x['Column5_transformed'].skew()

# Model functions 

In [None]:
# Calculate metrics for each model
def mats(y_pred_xgb,y_pred_rfr,y_pred_LGBMR,y_test):
    metrics = {
                'XGB': {
                    'Accuracy': accuracy_score(y_test, y_pred_xgb),
                    'MAE': mean_absolute_error(y_test, y_pred_xgb),
                    'MSE': mean_squared_error(y_test, y_pred_xgb),
                    'R2': r2_score(y_test, y_pred_xgb),
                    "rmse" : np.sqrt(mean_squared_error(y_test, y_pred_xgb))
                    },
                'RFR': {
                    'Accuracy': accuracy_score(y_test, y_pred_rfr),
                    'MAE': mean_absolute_error(y_test, y_pred_rfr),
                    'MSE': mean_squared_error(y_test, y_pred_rfr),
                    'R2': r2_score(y_test, y_pred_rfr),
                    'rmse_rfr': np.sqrt(mean_squared_error(y_test, y_pred_rfr))
                    },
                'LGBMR': {
                    'Accuracy': accuracy_score(y_test, y_pred_LGBMR),
                    'MAE': mean_absolute_error(y_test, y_pred_LGBMR),
                    'MSE': mean_squared_error(y_test, y_pred_LGBMR),
                    'R2': r2_score(y_test, y_pred_LGBMR),
                    'rmse_LGBMR':  np.sqrt(mean_squared_error(y_test, y_pred_LGBMR))
                    }
                }

    metrics_df = pd.DataFrame(metrics).T
    print(metrics_df)



def roc_auc(xgb_model,rfr_model,LGBMR_model,X_test,y_test):
    # Assuming your target variable y_test is binary (0 and 1)
    y_pred_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]
    y_pred_prob_rfr = rfr_model.predict_proba(X_test)[:, 1]  # If RandomForestClassifier is used
    y_pred_prob_LGBMR = LGBMR_model.predict_proba(X_test)[:, 1]

    # Calculate AUC
    auc_xgb = roc_auc_score(y_test, y_pred_prob_xgb)
    auc_rfr = roc_auc_score(y_test, y_pred_prob_rfr)
    auc_LGBMR = roc_auc_score(y_test, y_pred_prob_LGBMR)
    
    y_pred_xgb_binary = (y_pred_prob_xgb > 0.5).astype(int)
    y_pred_rfr_binary = (y_pred_prob_rfr > 0.5).astype(int)
    y_pred_LGBMR_binary = (y_pred_prob_LGBMR > 0.5).astype(int)

    # Calculate F1 Score
    f1_xgb = f1_score(y_test, y_pred_xgb_binary)
    f1_rfr = f1_score(y_test, y_pred_rfr_binary)
    f1_LGBMR = f1_score(y_test, y_pred_LGBMR_binary)

    # Print AUC values
    print(f"AUC (XGB): {auc_xgb}")
    print(f"AUC (RFR): {auc_rfr}")
    print(f"AUC (LGBMR): {auc_LGBMR}")

def confusion(xgb_model,rfr_model,LGBMR_model,X_test,y_test):
       
    y_pred_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]
    y_pred_prob_rfr = rfr_model.predict_proba(X_test)[:, 1]  # If RandomForestClassifier is used
    y_pred_prob_LGBMR = LGBMR_model.predict_proba(X_test)[:, 1]

    # Convert probabilities to binary predictions
    y_pred_xgb_binary = (y_pred_prob_xgb > 0.5).astype(int)
    y_pred_rfr_binary = (y_pred_prob_rfr > 0.5).astype(int)
    y_pred_LGBMR_binary = (y_pred_prob_LGBMR > 0.5).astype(int)

    # Compute confusion matrices
    conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb_binary)
    conf_matrix_rfr = confusion_matrix(y_test, y_pred_rfr_binary)
    conf_matrix_LGBMR = confusion_matrix(y_test, y_pred_LGBMR_binary)

    # Visualize confusion matrices
    def plot_confusion_matrices(cms, titles):
        plt.figure(figsize=(15, 5))  # Adjust size for 3 subplots in a row
        for i, (cm, title) in enumerate(zip(cms, titles), 1):
            plt.subplot(1, len(cms), i)  # Create subplot for each confusion matrix
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                        xticklabels=['Class 0', 'Class 1'], 
                        yticklabels=['Class 0', 'Class 1'])
            plt.title(title)
            plt.xlabel('Predicted Label')
            plt.ylabel('True Label')
        plt.tight_layout()  # Adjust subplots to fit into figure area
        plt.show()

    cms = [conf_matrix_xgb, conf_matrix_rfr, conf_matrix_LGBMR]
    titles = ['XGB', 'RFR', 'LGBMR']
    plot_confusion_matrices(cms, titles)

mats(y_pred_xgb,y_pred_rfr,y_pred_LGBMR,y_test) <br>
roc_auc(xgb_model,rfr_model,LGBMR_model,X_test,y_test) <br>
confusion(xgb_model,rfr_model,LGBMR_model)


In [None]:
# Replace LogisticRegression with LinearRegression
def stackm(df_x: pd.DataFrame, df_y: pd.DataFrame):
    xgb_model = XGBClassifier()
    rfr_model = RandomForestClassifier()
    LGBMR_model = LGBMClassifier()

    # Corrected variable assignment
    X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.3, random_state=42)
    
    # Fit the models
    xgb_model.fit(X_train, y_train)
    rfr_model.fit(X_train, y_train)
    LGBMR_model.fit(X_train, y_train)
    
    return xgb_model, rfr_model, LGBMR_model, X_test, y_test


In [None]:
# Assuming df_x and df_y are your feature matrix and target vector, respectively
xgb_model, rfr_model, LGBMR_model, xtest1, ytest1 = stackm(X_resampled, y_resampled)

# on cleaned data, and some outliers removed
y_pred_xgb = xgb_model.predict(xtest1)
y_pred_rfr = rfr_model.predict(xtest1)
y_pred_LGBMR = LGBMR_model.predict(xtest1)
print()
mats(y_pred_xgb,y_pred_rfr,y_pred_LGBMR,ytest1)
print()
roc_auc(xgb_model,rfr_model,LGBMR_model,xtest1,ytest1)
print()
confusion(xgb_model,rfr_model,LGBMR_model,xtest1,ytest1)

In [None]:
xgb_model2, rfr_model2, LGBMR_model2, xtest2, ytest2 = stackm(dfx, yfx)

# on cleaned data, and some outliers removed
y_pred_xgb2 = xgb_model2.predict(xtest2)
y_pred_rfr2 = rfr_model2.predict(xtest2)
y_pred_LGBMR2 = LGBMR_model2.predict(xtest2)
print()
mats(y_pred_xgb2,y_pred_rfr2,y_pred_LGBMR2,ytest2)
print()
roc_auc(xgb_model2,rfr_model2,LGBMR_model2,xtest2,ytest2)
print()
confusion(xgb_model2,rfr_model2,LGBMR_model2,xtest2,ytest2)

In [None]:
type(xgb_model2)