In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time

## Read in data by chunks

In [37]:
previous =  'https://storage.googleapis.com/home_credit_files/previous_application.csv'
train_main = 'https://storage.googleapis.com/home_credit_files/application_train.csv'
test_main =  'https://storage.googleapis.com/home_credit_files/application_test.csv'

chunk_size = 25000

# Initialize empty list to store chunks
chunks_train = []
chunks_test = []
chunks_previous = []

# Read train data in chunks
for train_chunk in pd.read_csv(train_main, index_col=False, chunksize=chunk_size, low_memory=False):
    # Process train_chunk here (e.g., clean, transform, analyze)
    chunks_train.append(train_chunk)

# Concatenate train chunks to create final DataFrame
df_t = pd.concat(chunks_train, ignore_index=True)

# Read test data in chunks
for test_chunk in pd.read_csv(test_main, index_col=False, chunksize=chunk_size, low_memory=False):
    # Process test_chunk here (e.g., clean, transform, analyze)
    chunks_test.append(test_chunk)

# Concatenate test chunks to create final DataFrame
df_e = pd.concat(chunks_test, ignore_index=True)

# Read previous data in chunks
for prev_chunk in pd.read_csv(previous, index_col=False, chunksize=chunk_size, low_memory=False):
    # Process prev_chunk here (e.g., clean, transform, analyze)
    chunks_previous.append(prev_chunk)

# Concatenate previous chunks to create final DataFrame
previous_app_df = pd.concat(chunks_previous, ignore_index=True)

## Reference Data Types

In [38]:
pd.set_option('display.max_rows', None)
data_types = df_t.dtypes
print(data_types)

SK_ID_CURR                        int64
TARGET                            int64
NAME_CONTRACT_TYPE               object
CODE_GENDER                      object
FLAG_OWN_CAR                       bool
FLAG_OWN_REALTY                    bool
CNT_CHILDREN                      int64
AMT_INCOME_TOTAL                float64
AMT_CREDIT                      float64
AMT_ANNUITY                     float64
AMT_GOODS_PRICE                 float64
NAME_TYPE_SUITE                  object
NAME_INCOME_TYPE                 object
NAME_EDUCATION_TYPE              object
NAME_FAMILY_STATUS               object
NAME_HOUSING_TYPE                object
REGION_POPULATION_RELATIVE      float64
DAYS_BIRTH                        int64
DAYS_EMPLOYED                     int64
DAYS_REGISTRATION               float64
DAYS_ID_PUBLISH                   int64
OWN_CAR_AGE                     float64
FLAG_MOBIL                        int64
FLAG_EMP_PHONE                    int64
FLAG_WORK_PHONE                   int64


In [39]:
# numeric_feats = df_t.dtypes[df_t.dtypes != "object"].index
# plt.figure(figsize=(80,18))
# sns.heatmap(df_t[numeric_feats].corr(), annot=False, square=True, cmap='coolwarm')
# plt.show()

## Correlations

In [40]:
# Filter out object type columns from the DataFrame
numeric_df = df_t.select_dtypes(include=['int64', 'float64'])

# Compute correlations with the TARGET column
correlations = numeric_df.corr()['TARGET'].sort_values(ascending=False)

# Select the top 30 positive correlations
top_positive_correlations = correlations.head(30)

# Select the top 30 negative correlations
top_negative_correlations = correlations.tail(30)

# Display the results
print("Top 30 positive correlations:")
print(top_positive_correlations)
print("\nTop 30 negative correlations:")
print(top_negative_correlations)

Top 30 positive correlations:
TARGET                         1.000000
DAYS_BIRTH                     0.078239
REGION_RATING_CLIENT_W_CITY    0.060893
REGION_RATING_CLIENT           0.058899
DAYS_LAST_PHONE_CHANGE         0.055218
DAYS_ID_PUBLISH                0.051457
REG_CITY_NOT_WORK_CITY         0.050994
FLAG_EMP_PHONE                 0.045982
REG_CITY_NOT_LIVE_CITY         0.044395
FLAG_DOCUMENT_3                0.044346
DAYS_REGISTRATION              0.041975
OWN_CAR_AGE                    0.037612
LIVE_CITY_NOT_WORK_CITY        0.032518
DEF_30_CNT_SOCIAL_CIRCLE       0.032248
DEF_60_CNT_SOCIAL_CIRCLE       0.031276
FLAG_WORK_PHONE                0.028524
AMT_REQ_CREDIT_BUREAU_YEAR     0.019930
CNT_CHILDREN                   0.019187
CNT_FAM_MEMBERS                0.009308
OBS_30_CNT_SOCIAL_CIRCLE       0.009131
OBS_60_CNT_SOCIAL_CIRCLE       0.009022
REG_REGION_NOT_WORK_REGION     0.006942
REG_REGION_NOT_LIVE_REGION     0.005576
FLAG_DOCUMENT_2                0.005417
FLAG_DOCUM

## Display numeric NA's

In [41]:
all_Xdata_na = (df_t.isnull().sum() / len(df_t)) * 100
all_Xdata_na = all_Xdata_na.drop(all_Xdata_na[all_Xdata_na == 0].index).sort_values(ascending=False)[:300]
missing_data = pd.DataFrame({'Missing Data Percent' :all_Xdata_na})

In [42]:
pd.set_option('display.max_rows', None)
missing_data

Unnamed: 0,Missing Data Percent
COMMONAREA_MEDI,69.872297
COMMONAREA_AVG,69.872297
COMMONAREA_MODE,69.872297
NONLIVINGAPARTMENTS_MEDI,69.432963
NONLIVINGAPARTMENTS_MODE,69.432963
NONLIVINGAPARTMENTS_AVG,69.432963
FONDKAPREMONT_MODE,68.386172
LIVINGAPARTMENTS_MODE,68.354953
LIVINGAPARTMENTS_MEDI,68.354953
LIVINGAPARTMENTS_AVG,68.354953


## Missing Numeric data to convert to median

## What missing data remains?

In [43]:
columns_with_missing_data = df_t.columns[df_t.isnull().any()].tolist()

# Display the columns with missing data
print(columns_with_missing_data)

['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'OWN_CAR_AGE', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI',

## Object missing data

In [44]:
df_na_columns = df_t[columns_with_missing_data]

# Show the head of the DataFrame
df_na_columns.head()

Unnamed: 0,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,OWN_CAR_AGE,OCCUPATION_TYPE,CNT_FAM_MEMBERS,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,...,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,13059.0,360000.0,Unaccompanied,,,2.0,,0.593984,0.819318,,...,0.0,0.0,0.0,-3219.0,0.0,0.0,0.0,0.0,2.0,1.0
1,57685.5,675000.0,Unaccompanied,,Core staff,2.0,0.434314,0.067658,0.189595,,...,0.0,8.0,0.0,-1244.0,0.0,0.0,0.0,0.0,1.0,2.0
2,9000.0,180000.0,Unaccompanied,,Drivers,1.0,,0.037153,0.286652,,...,0.0,0.0,0.0,-265.0,0.0,0.0,0.0,0.0,0.0,1.0
3,21775.5,675000.0,Unaccompanied,,Sales staff,2.0,,0.355276,0.581484,,...,0.0,0.0,0.0,-135.0,0.0,0.0,0.0,0.0,0.0,1.0
4,25407.0,450000.0,Unaccompanied,13.0,,1.0,,0.656941,0.231439,,...,0.0,0.0,0.0,-1062.0,0.0,0.0,0.0,0.0,1.0,1.0


In [45]:
pd.set_option('display.max_rows', None)
missing_percentages = (df_na_columns.isna().sum() / len(df_na_columns)) * 100
print(missing_percentages)

AMT_ANNUITY                      0.003902
AMT_GOODS_PRICE                  0.090403
NAME_TYPE_SUITE                  0.420148
OWN_CAR_AGE                     65.990810
OCCUPATION_TYPE                 31.345545
CNT_FAM_MEMBERS                  0.000650
EXT_SOURCE_1                    56.381073
EXT_SOURCE_2                     0.214626
EXT_SOURCE_3                    19.825307
APARTMENTS_AVG                  50.749729
BASEMENTAREA_AVG                58.515956
YEARS_BEGINEXPLUATATION_AVG     48.781019
YEARS_BUILD_AVG                 66.497784
COMMONAREA_AVG                  69.872297
ELEVATORS_AVG                   53.295980
ENTRANCES_AVG                   50.348768
FLOORSMAX_AVG                   49.760822
FLOORSMIN_AVG                   67.848630
LANDAREA_AVG                    59.376738
LIVINGAPARTMENTS_AVG            68.354953
LIVINGAREA_AVG                  50.193326
NONLIVINGAPARTMENTS_AVG         69.432963
NONLIVINGAREA_AVG               55.179164
APARTMENTS_MODE                 50

# Merge

In [46]:
# previous_app_df = pd.read_csv(f'previous_application.csv', index_col=False, low_memory=False)
original_id = df_e['SK_ID_CURR'].tolist()
previous_app_df.drop_duplicates(subset='SK_ID_CURR', inplace=True)

df_merged = pd.merge(df_t, previous_app_df, on="SK_ID_CURR", how="left")

# Drop previous DUPLICATES
df_e = pd.merge(df_e, previous_app_df, on="SK_ID_CURR", how="left")

len(df_merged.columns)


158

In [47]:
df_merged['is_test'] = 0  # For train data, set to 0
df_e['is_test'] = 1  # For test data, set to 1
# Concatenate the train and test datasets
df_combined = pd.concat([df_merged, df_e], ignore_index=True)
df_combined.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE_x,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,...,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,is_test
0,278284,0.0,Cash loans,F,False,True,0,90000.0,360000.0,13059.0,...,,XNA,Cash,,,,,,,0
1,278285,0.0,Cash loans,F,False,True,0,135000.0,728460.0,57685.5,...,10.0,low_normal,POS household with interest,365243.0,-1213.0,-943.0,-973.0,-968.0,0.0,0
2,278289,0.0,Revolving loans,M,False,True,0,225000.0,180000.0,9000.0,...,12.0,middle,POS mobile with interest,,,,,,,0
3,278290,0.0,Cash loans,F,False,False,1,72000.0,675000.0,21775.5,...,10.0,high,POS mobile with interest,365243.0,-1356.0,-1086.0,-1086.0,-1082.0,0.0,0
4,278292,0.0,Cash loans,M,True,False,0,157500.0,545040.0,25407.0,...,24.0,middle,POS industry with interest,,,,,,,0


In [48]:
# df_e = df_combined[df_combined['is_test'] == 1]
# 
# # Remove the 'is_test' column from the combined DataFrame
# df_combined.drop(columns=['is_test'], inplace=True)
# df_e.drop(columns=['is_test'], inplace=True)
# df_e.drop(columns=['TARGET'],inplace=True)
# len(df_e)

# df_combined[df_combined['is_test'] == 1].head()

In [49]:
threshold = len(df_combined) * 0.2

# Drop columns with 20% or more NaN values
df_combined = df_combined.dropna(axis=1, thresh=threshold)

for col in df_combined.columns:
    # Check if the column is of object data type and has missing or NA values
    if df_combined[col].dtype == 'object' and df_combined[col].isnull().any():
        # Calculate the mode of the column
        mode_values = df_combined[col].mode().iloc[0]
        # Fill missing values with the mode
        df_combined[col].fillna(mode_values, inplace=True)
        
        
for col in df_combined.columns:
    # Check if the column is of float64 data type and has missing or NA values
    if df_combined[col].dtype == 'float64' and df_combined[col].isnull().any():
        # Calculate the median of the column
        median_value = df_combined[col].median()
        # Fill missing values with the median
        df_combined[col].fillna(median_value, inplace=True)

## Dummy Encode

In [50]:
# categorical_vars = df_combined.select_dtypes(include=['object','category']).columns.tolist()


bool_obj_columns = df_combined.select_dtypes(include=['bool', 'object']).columns

# Convert selected columns to category data type
df_combined[bool_obj_columns] = df_combined[bool_obj_columns].astype('category')


# # Perform one-hot encoding on the identified categorical variables
# df_encoded = pd.get_dummies(df_combined, columns=categorical_vars)


# Optionally, if you want to limit one-hot encoding to specific columns, you can replace 'categorical_vars' with a subset of column names
# Example: df_encoded = pd.get_dummies(df, columns=['NAME_CONTRACT_TYPE_x', 'CODE_GENDER'])

# Check the shape of the original and encoded dataframe to understand the transformation
print("Original shape:", df_merged.shape)
print("Encoded shape:", df_combined.shape)


# Display the first few rows of the encoded dataframe
#print(df_encoded.head())
#print(categorical_vars)

Original shape: (307511, 159)
Encoded shape: (356255, 157)


In [51]:
len(df_combined.columns)


157

In [52]:
pd.set_option('display.max_rows', None)
data_types = df_combined.dtypes
print(data_types)

SK_ID_CURR                         int64
TARGET                           float64
NAME_CONTRACT_TYPE_x            category
CODE_GENDER                     category
FLAG_OWN_CAR                    category
FLAG_OWN_REALTY                 category
CNT_CHILDREN                       int64
AMT_INCOME_TOTAL                 float64
AMT_CREDIT_x                     float64
AMT_ANNUITY_x                    float64
AMT_GOODS_PRICE_x                float64
NAME_TYPE_SUITE_x               category
NAME_INCOME_TYPE                category
NAME_EDUCATION_TYPE             category
NAME_FAMILY_STATUS              category
NAME_HOUSING_TYPE               category
REGION_POPULATION_RELATIVE       float64
DAYS_BIRTH                         int64
DAYS_EMPLOYED                      int64
DAYS_REGISTRATION                float64
DAYS_ID_PUBLISH                    int64
OWN_CAR_AGE                      float64
FLAG_MOBIL                         int64
FLAG_EMP_PHONE                     int64
FLAG_WORK_PHONE 

In [53]:
# Filter numeric columns excluding object, category, and bool
numeric_feats = df_combined.select_dtypes(include=['int', 'float']).columns

# Calculate correlations with the target variable
correlations = df_combined[numeric_feats].corr()['TARGET'].sort_values(ascending=False)

# Select the top 30 positive correlations
top_positive_correlations = correlations.head(30)


# Select the top 30 negative correlations
top_negative_correlations = correlations.tail(30)

In [54]:
# Display the results
print("Top 30 positive correlations:")
print(top_positive_correlations)
print("\nTop 30 negative correlations:")
print(top_negative_correlations)

Top 30 positive correlations:
TARGET                         1.000000
DAYS_BIRTH                     0.072610
REGION_RATING_CLIENT_W_CITY    0.057435
DAYS_LAST_PHONE_CHANGE         0.055645
REGION_RATING_CLIENT           0.055240
DAYS_ID_PUBLISH                0.048677
REG_CITY_NOT_WORK_CITY         0.047668
FLAG_EMP_PHONE                 0.043327
REG_CITY_NOT_LIVE_CITY         0.041123
DAYS_REGISTRATION              0.038525
FLAG_DOCUMENT_3                0.035020
LIVE_CITY_NOT_WORK_CITY        0.030602
DAYS_DECISION                  0.029778
DEF_30_CNT_SOCIAL_CIRCLE       0.029214
DEF_60_CNT_SOCIAL_CIRCLE       0.028386
FLAG_WORK_PHONE                0.025808
CNT_CHILDREN                   0.018802
OWN_CAR_AGE                    0.015392
CNT_PAYMENT                    0.012909
AMT_REQ_CREDIT_BUREAU_YEAR     0.009475
CNT_FAM_MEMBERS                0.008866
OBS_30_CNT_SOCIAL_CIRCLE       0.007635
OBS_60_CNT_SOCIAL_CIRCLE       0.007463
DAYS_TERMINATION               0.006951
DAYS_LAST_

In [55]:
columns_to_drop = correlations[(correlations > -0.05) & (correlations < 0.04)].index


In [56]:
columns_to_keep = ['is_test', 'SK_ID_CURR']  # Columns to keep

# Drop the specified columns from df_encoded
df_encoded = df_combined.drop(columns=set(columns_to_drop) - set(columns_to_keep))


In [57]:

# Divide the values of columns containing 'DAYS' by 365
for column in df_encoded.columns:
    if 'DAYS' in column:
        df_encoded[column] = df_encoded[column] / 365



In [58]:
df_e = df_encoded[df_encoded['is_test'] == 1]

# Remove the 'is_test' column from the combined DataFrame
df_encoded.drop(columns=['is_test'], inplace=True)
df_e.drop(columns=['is_test'], inplace=True)
df_e.drop(columns=['TARGET'],inplace=True)
len(df_e)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_e.drop(columns=['is_test'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_e.drop(columns=['TARGET'],inplace=True)


48744

## Downsample

In [59]:
from sklearn.utils import resample

# Assuming df_combined is your combined DataFrame with 'TARGET' column and 'is_test' column



# Separate majority and minority classes
majority_class = df_encoded[df_encoded['TARGET'] == 0] 
minority_class = df_encoded[df_encoded['TARGET'] == 1] 

downsampled_majority = resample(majority_class, 
                                replace=True,  # Sample with replacement to increase the number of instances
                                n_samples=int(len(majority_class)/2),  # Downsample to match desired ratio
                                random_state=31)  # Set random state for reproducibility
print(len(downsampled_majority))



# Combine minority class with upsampled minority class
df_upsampled = pd.concat([downsampled_majority, minority_class])
print(len(df_upsampled))
# Shuffle the DataFrame to mix up the order of the rows
df_upsampled = df_upsampled.sample(frac=1, random_state=14).reset_index(drop=True)


165715
190540


## Remove excess columns

In [60]:
common_columns = df_upsampled.columns.intersection(df_e.columns)

# Drop columns from test set that are not in the train set, excluding 'SK_ID_CURR'
columns_to_drop = df_e.columns.difference(df_upsampled.columns)
columns_to_drop = [col for col in columns_to_drop if col != 'SK_ID_CURR']  # Exclude 'SK_ID_CURR' from the columns to drop
df_e.drop(columns=columns_to_drop, inplace=True)

columns_merged = df_upsampled.columns
columns_e = df_e.columns

# Find the columns that are in df_merged but not in df_e
columns_diff_merged = set(columns_merged) - set(columns_e)

# Find the columns that are in df_e but not in df_merged
columns_diff_e = set(columns_e) - set(columns_merged)

# Print the different columns
print("Columns in df_merged but not in df_e:", columns_diff_merged)
print("Columns in df_e but not in df_merged:", columns_diff_e)


Columns in df_merged but not in df_e: {'TARGET'}
Columns in df_e but not in df_merged: set()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_e.drop(columns=columns_to_drop, inplace=True)


In [61]:
df_upsampled['TARGET'].value_counts()

TARGET
0.0    165715
1.0     24825
Name: count, dtype: int64

## Modeling

### Data Partition

In [62]:
# from category_encoders import TargetEncoder
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# 
# # Define the preprocessing steps for categorical variables
# categorical_features = df_t.select_dtypes(include=['category']).columns
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values if any
#     ('target_encoder', TargetEncoder())  # Target encode categorical variables
# ])
# 
# # Combine preprocessing steps for categorical features
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', categorical_transformer, categorical_features)
#     ])
# 
# # Create a pipeline with preprocessing and RandomForest classifier
# pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                            ('classifier', RandomForestClassifier())])
# 
# # Separate features and target variable
# X = df_t.drop(columns=['TARGET'])
# y = df_t['TARGET']
# 
# # Fit the pipeline on training data
# pipeline.fit(X, y)

In [63]:
# from sklearn.model_selection import train_test_split
# 
# # Define your features (X) and target variable (y) based on your EDA
# X_train = df_upsampled.drop(columns=['TARGET'])
# y_train = df_upsampled['TARGET']
# 
# 
# X_test = df_e
# y_test = None  # You don't have the ground truth labels for the test set
# 
# 
# print(len(X_train))
# print(len(y_train))

In [64]:
# X_test.head()

In [65]:
# y_train.value_counts()

### Random Forest

## Train metrics before test!

In [66]:
# predict_proba

In [67]:
# from sklearn.model_selection import train_test_split
# 
# # Define your features (X) and target variable (y) based on your EDA
# # X = df_downsampled.drop(columns=['TARGET'])
# # y = df_downsampled['TARGET']
# 
# # Split the data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=32)
# 
# 
# # from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
# 
# random_forest = RandomForestClassifier(
#     n_estimators=900,  # Number of trees in the forest
#     max_depth=70,      # Maximum depth of the trees
#     min_samples_split=5,  # Minimum number of samples required to split a node
#     min_samples_leaf=2,   # Minimum number of samples required at each leaf node
#     random_state=12     # Random seed for reproducibility
# )
# 
# # Initialize models
# # logistic_regression = LogisticRegression(random_state=42, max_iter=1000)
# # gradient_boosting = GradientBoostingClassifier(random_state=42)
# 
# 
# # Fit models
# # logistic_regression.fit(X_train, y_train)
# random_forest.fit(X_train, y_train)
# # gradient_boosting.fit(X_train, y_train)
# 
# # Make predictions
# # y_pred_lr = logistic_regression.predict(X_val)
# y_pred_rf = random_forest.predict(X_val)
# # y_pred_gb = gradient_boosting.predict(X_val)
# 
# # Calculate evaluation metrics
# metrics_lr = [accuracy_score, recall_score, f1_score, roc_auc_score]
# metrics_rf_gb = [accuracy_score, recall_score, f1_score, roc_auc_score]
# 
# results = {}
# 
# # Logistic Regression Metrics
# # results['Logistic Regression'] = {metric.__name__: metric(y_val, y_pred_lr) for metric in metrics_lr}
# 
# # Random Forest Metrics
# results['Random Forest'] = {metric.__name__: metric(y_val, y_pred_rf) for metric in metrics_rf_gb}
# 
# # Gradient Boosting Metrics
# # results['Gradient Boosting'] = {metric.__name__: metric(y_val, y_pred_gb) for metric in metrics_rf_gb}
# 
# # Print results
# for model, metrics in results.items():
#     print(f"{model} Metrics:")
#     for metric, value in metrics.items():
#         print(f"{metric}: {value}")
#     print()

In [68]:
start_time = time.time()
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Define your features (X) and target variable (y) based on your EDA
X = df_upsampled.drop(columns=['TARGET'])
y = df_upsampled['TARGET']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=3)

# Define the preprocessing steps for categorical variables
categorical_features = X_train.select_dtypes(include=['category']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values if any
    ('target_encoder', TargetEncoder())  # Target encode categorical variables
])

# Combine preprocessing steps for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with preprocessing and RandomForest classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(n_estimators=800,
                                                                 max_depth=90,
                                                                 min_samples_split=5,
                                                                 min_samples_leaf=2,
                                                                 random_state=46))])

# Fit the pipeline on training data
pipeline.fit(X_train, y_train)

# Make predictions
y_pred_rf = pipeline.predict(X_val)

# Calculate evaluation metrics
metrics_rf = {
    'Accuracy': accuracy_score(y_val, y_pred_rf),
    'Recall': recall_score(y_val, y_pred_rf),
    'F1 Score': f1_score(y_val, y_pred_rf),
    'ROC AUC': roc_auc_score(y_val, y_pred_rf)
}

# Print results
print("Random Forest Metrics:")
for metric, value in metrics_rf.items():
    print(f"{metric}: {value}")


  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  

Random Forest Metrics:
Accuracy: 0.8683740946782827
Recall: 0.004988028731045491
F1 Score: 0.00986971969996052
ROC AUC: 0.5020558949553222


# This is the real code for test csv

In [69]:
# This is the real code for test csv
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

X_test = df_e
# gradient_boosting = GradientBoostingClassifier(random_state=42)

# Fit models
pipeline.fit(X_train, y_train)
# gradient_boosting.fit(X_train, y_train)

# Make predictions using predict_proba
y_pred_rf_proba = pipeline.predict_proba(X_test)[:, 1]  # Probability of positive class y_pred_gb_proba = gradient_boosting.predict_proba(X_test)[:, 1]  # Probability of positive class

# Create DataFrame with predictions
predictions_df = pd.DataFrame({
    'SK_ID_CURR': df_e['SK_ID_CURR'],  # Assuming df_e_encoded contains the original data with IDs
    'TARGET': y_pred_rf_proba
})

# Save DataFrame to CSV
predictions_df.to_csv('predictions.csv', index=False)



  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  

In [70]:
end_time = time.time()
execution_time = end_time - start_time
execution_time

225.3110966682434