In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


## Read in data by chunks

In [2]:
previous =  'https://storage.googleapis.com/home_credit_files/previous_application.csv'
train_main = 'https://storage.googleapis.com/home_credit_files/application_train.csv'
test_main =  'https://storage.googleapis.com/home_credit_files/application_test.csv'

chunk_size = 25000

# Initialize empty list to store chunks
chunks_train = []
chunks_test = []
chunks_previous = []

# Read train data in chunks
for train_chunk in pd.read_csv(train_main, index_col=False, chunksize=chunk_size, low_memory=False,nrows=100000):
    # Process train_chunk here (e.g., clean, transform, analyze)
    chunks_train.append(train_chunk)

# Concatenate train chunks to create final DataFrame
df_t = pd.concat(chunks_train, ignore_index=True)

# Read test data in chunks
for test_chunk in pd.read_csv(test_main, index_col=False, chunksize=chunk_size, low_memory=False):
    # Process test_chunk here (e.g., clean, transform, analyze)
    chunks_test.append(test_chunk)

# Concatenate test chunks to create final DataFrame
df_e = pd.concat(chunks_test, ignore_index=True)

# Read previous data in chunks
for prev_chunk in pd.read_csv(previous, index_col=False, chunksize=chunk_size, low_memory=False,nrows=100000):
    # Process prev_chunk here (e.g., clean, transform, analyze)
    chunks_previous.append(prev_chunk)

# Concatenate previous chunks to create final DataFrame
previous_app_df = pd.concat(chunks_previous, ignore_index=True)

## Reference Data Types

In [3]:
pd.set_option('display.max_rows', None)
data_types = df_t.dtypes
print(data_types)

SK_ID_CURR                        int64
TARGET                            int64
NAME_CONTRACT_TYPE               object
CODE_GENDER                      object
FLAG_OWN_CAR                       bool
FLAG_OWN_REALTY                    bool
CNT_CHILDREN                      int64
AMT_INCOME_TOTAL                float64
AMT_CREDIT                      float64
AMT_ANNUITY                     float64
AMT_GOODS_PRICE                 float64
NAME_TYPE_SUITE                  object
NAME_INCOME_TYPE                 object
NAME_EDUCATION_TYPE              object
NAME_FAMILY_STATUS               object
NAME_HOUSING_TYPE                object
REGION_POPULATION_RELATIVE      float64
DAYS_BIRTH                        int64
DAYS_EMPLOYED                     int64
DAYS_REGISTRATION                 int64
DAYS_ID_PUBLISH                   int64
OWN_CAR_AGE                     float64
FLAG_MOBIL                        int64
FLAG_EMP_PHONE                    int64
FLAG_WORK_PHONE                   int64


In [4]:
df_e.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Columns: 121 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(40), object(16)
memory usage: 45.0+ MB


* Comment out heatmap

In [5]:
# numeric_feats = df_t.dtypes[df_t.dtypes != "object"].index
# plt.figure(figsize=(80,18))
# sns.heatmap(df_t[numeric_feats].corr(), annot=False, square=True, cmap='coolwarm')
# plt.show()

## Correlations

In [6]:
# Filter out object type columns from the DataFrame
numeric_df = df_t.select_dtypes(include=['int64', 'float64'])

# Compute correlations with the TARGET column
correlations = numeric_df.corr()['TARGET'].sort_values(ascending=False)

# Select the top 30 positive correlations
top_positive_correlations = correlations.head(30)

# Select the top 30 negative correlations
top_negative_correlations = correlations.tail(30)

# Display the results
print("Top 30 positive correlations:")
print(top_positive_correlations)
print("\nTop 30 negative correlations:")
print(top_negative_correlations)

Top 30 positive correlations:
TARGET                         1.000000
DAYS_BIRTH                     0.085606
DAYS_LAST_PHONE_CHANGE         0.064829
REGION_RATING_CLIENT_W_CITY    0.061884
REGION_RATING_CLIENT           0.061581
FLAG_EMP_PHONE                 0.053128
DAYS_ID_PUBLISH                0.052184
REG_CITY_NOT_WORK_CITY         0.049873
REG_CITY_NOT_LIVE_CITY         0.049431
FLAG_DOCUMENT_3                0.046787
DAYS_REGISTRATION              0.044409
NONLIVINGAREA_MODE             0.039588
NONLIVINGAREA_MEDI             0.036809
NONLIVINGAREA_AVG              0.036423
DEF_30_CNT_SOCIAL_CIRCLE       0.034605
OWN_CAR_AGE                    0.032658
DEF_60_CNT_SOCIAL_CIRCLE       0.032143
FLAG_WORK_PHONE                0.026592
LIVE_CITY_NOT_WORK_CITY        0.025545
AMT_REQ_CREDIT_BUREAU_YEAR     0.024406
CNT_CHILDREN                   0.021518
FLAG_DOCUMENT_2                0.010917
CNT_FAM_MEMBERS                0.008742
REG_REGION_NOT_LIVE_REGION     0.008591
FLAG_CONT_

In [7]:
df_t.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,278284,0,Cash loans,F,False,True,0,90000.0,360000.0,13059.0,...,0,0,0,0,0.0,0.0,0.0,0.0,2.0,1.0
1,278285,0,Cash loans,F,False,True,0,135000.0,728460.0,57685.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0
2,278289,0,Revolving loans,M,False,True,0,225000.0,180000.0,9000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,278290,0,Cash loans,F,False,False,1,72000.0,675000.0,21775.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
4,278292,0,Cash loans,M,True,False,0,157500.0,545040.0,25407.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0


## Combine Train and Test

In [8]:
# Add a new column to indicate the source dataset
df_t['is_test'] = 0  # For train data, set to 0
df_e['is_test'] = 1   # For test data, set to 1

# Concatenate the train and test datasets
df_combined = pd.concat([df_t, df_e], ignore_index=True)

In [9]:
# # Read the data into a DataFrame
# # Categorize variables with fewer than 10 unique values and not already integers
# for col in df_combined.columns:
#     if df_combined[col].dtype != 'float64' and df_combined[col].nunique() < 10:
#         df_combined[col] = df_combined[col].astype('category')

In [10]:
# KYLES CODE FOR CATEGORIZING
for col in df_combined.columns:
    if df_combined[col].dtype == 'object' and df_combined[col].nunique() < 25:
        df_combined[col] = df_combined[col].astype('category')

In [11]:
#for col in df_combined.columns:
   # print(df_combined[col].dtype)

## Display numeric NA's

In [12]:
all_Xdata_na = (df_combined.isnull().sum() / len(df_combined)) * 100
all_Xdata_na = all_Xdata_na.drop(all_Xdata_na[all_Xdata_na == 0].index).sort_values(ascending=False)[:300]
missing_data = pd.DataFrame({'Missing Data Percent' :all_Xdata_na})

In [13]:
pd.set_option('display.max_rows', None)
missing_data

Unnamed: 0,Missing Data Percent
COMMONAREA_MODE,89.725972
COMMONAREA_MEDI,89.725972
COMMONAREA_AVG,89.725972
NONLIVINGAPARTMENTS_MEDI,89.640591
NONLIVINGAPARTMENTS_AVG,89.640591
NONLIVINGAPARTMENTS_MODE,89.640591
LIVINGAPARTMENTS_AVG,89.215699
LIVINGAPARTMENTS_MEDI,89.215699
LIVINGAPARTMENTS_MODE,89.215699
FONDKAPREMONT_MODE,89.211666


## Missing Numeric data to convert to median

In [14]:
for col in df_combined.columns:
    # Check if the column is of float64 data type and has missing or NA values
    if df_combined[col].dtype == 'float64' and df_combined[col].isnull().any():
        # Calculate the median of the column
        median_value = df_combined[col].median()
        # Fill missing values with the median
        df_combined[col].fillna(median_value, inplace=True)

## What missing data remains?

In [15]:
columns_with_missing_data = df_combined.columns[df_combined.isnull().any()].tolist()

# Display the columns with missing data
print(columns_with_missing_data)

['NAME_TYPE_SUITE', 'OCCUPATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']


## Object missing data

In [16]:
df_na_columns = df_combined[columns_with_missing_data]

# Show the head of the DataFrame
df_na_columns.head()

Unnamed: 0,NAME_TYPE_SUITE,OCCUPATION_TYPE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE
0,Unaccompanied,,,,,
1,Unaccompanied,Core staff,,,,
2,Unaccompanied,Drivers,,,,
3,Unaccompanied,Sales staff,,,,
4,Unaccompanied,,,,,


In [17]:
pd.set_option('display.max_rows', None)
missing_percentages = (df_na_columns.isna().sum() / len(df_na_columns)) * 100
print(missing_percentages)

NAME_TYPE_SUITE         0.873985
OCCUPATION_TYPE        31.499758
FONDKAPREMONT_MODE     89.211666
HOUSETYPE_MODE         81.780105
WALLSMATERIAL_MODE     82.061125
EMERGENCYSTATE_MODE    77.987011
dtype: float64


In [18]:
for col in df_combined.columns:
    # Check if the column is of object data type and has missing or NA values
    if df_combined[col].dtype == 'object' and df_combined[col].isnull().any():
        # Calculate the mode of the column
        mode_values = df_combined[col].mode().iloc[0]
        # Fill missing values with the mode
        df_combined[col].fillna(mode_values, inplace=True)

## Categorical Data to be converted next. Take a look at any float or object data with less than 10 unique values and convert to category
* Except target
* Gender -> Category
* Name Contract type -> Category

### Convert Test back to it's own dataframe

In [19]:
df_e = df_combined[df_combined['is_test'] == 1]

# Remove the 'is_test' column from the combined DataFrame
df_combined.drop(columns=['is_test'], inplace=True)
df_e.drop(columns=['is_test'], inplace=True)

len(df_e)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_e.drop(columns=['is_test'], inplace=True)


48744

In [20]:
df_t = df_combined.copy()

## Merged is Train and Application

In [21]:
# previous_app_df = pd.read_csv(f'previous_application.csv', index_col=False, low_memory=False)
original_id = df_e['SK_ID_CURR'].tolist()
previous_app_df.drop_duplicates(subset='SK_ID_CURR', inplace=True)

df_merged = pd.merge(df_t, previous_app_df, on="SK_ID_CURR", how="left")

# Drop previous DUPLICATES
df_e = pd.merge(df_e, previous_app_df, on="SK_ID_CURR", how="left")



In [22]:
#for col in df_merged.columns:
 #   if df_merged[col].dtype == 'object' and df_merged[col].nunique() < 25:
  #      df_merged[col] = df_merged[col].astype('category')

In [23]:
categorical_vars = df_merged.select_dtypes(include=['object','category']).columns.tolist()
test_categorical_vars = df_e.select_dtypes(include=['object','category']).columns.tolist()



# Perform one-hot encoding on the identified categorical variables
df_encoded = pd.get_dummies(df_merged, columns=categorical_vars)
df_e_encoded = pd.get_dummies(df_e, columns=test_categorical_vars)

# Optionally, if you want to limit one-hot encoding to specific columns, you can replace 'categorical_vars' with a subset of column names
# Example: df_encoded = pd.get_dummies(df, columns=['NAME_CONTRACT_TYPE_x', 'CODE_GENDER'])

# Check the shape of the original and encoded dataframe to understand the transformation
print("Original shape:", df_merged.shape)
print("Encoded shape:", df_encoded.shape)
print("Test Encoded shape:", df_e_encoded.shape)

# Display the first few rows of the encoded dataframe
#print(df_encoded.head())
#print(categorical_vars)

Original shape: (148744, 158)
Encoded shape: (148744, 410)
Test Encoded shape: (48744, 408)


## REMOVE NA's AGAIN FOR COMBINED DF

In [24]:
#CLEAN NEW COMBINED DF
all_Xdata_na = (df_encoded.isnull().sum() / len(df_combined)) #* 100
all_Xdata_na = all_Xdata_na.drop(all_Xdata_na[all_Xdata_na == 0].index).sort_values(ascending=False)[:300]
missing_data = pd.DataFrame({'Missing Data Percent' :all_Xdata_na})

all_test_na = (df_e_encoded.isnull().sum() / len(df_e)) #* 100
all_test_na = all_test_na.drop(all_test_na[all_test_na == 0].index).sort_values(ascending=False)[:300]
test_missing = pd.DataFrame({'Missing Data Percent' :all_test_na})



In [25]:
pd.set_option('display.max_rows', None)
missing_data

Unnamed: 0,Missing Data Percent
RATE_INTEREST_PRIVILEGED,0.999086
RATE_INTEREST_PRIMARY,0.999086
AMT_DOWN_PAYMENT,0.88056
RATE_DOWN_PAYMENT,0.88056
DAYS_TERMINATION,0.851913
DAYS_LAST_DUE,0.851913
DAYS_LAST_DUE_1ST_VERSION,0.851913
DAYS_FIRST_DUE,0.851913
DAYS_FIRST_DRAWING,0.851913
NFLAG_INSURED_ON_APPROVAL,0.851913


In [26]:
pd.set_option('display.max_rows', None)
test_missing

Unnamed: 0,Missing Data Percent
RATE_INTEREST_PRIVILEGED,0.99879
RATE_INTEREST_PRIMARY,0.99879
AMT_DOWN_PAYMENT,0.881011
RATE_DOWN_PAYMENT,0.881011
DAYS_TERMINATION,0.849212
DAYS_LAST_DUE,0.849212
DAYS_LAST_DUE_1ST_VERSION,0.849212
DAYS_FIRST_DUE,0.849212
DAYS_FIRST_DRAWING,0.849212
NFLAG_INSURED_ON_APPROVAL,0.849212


In [27]:
for col in df_encoded.columns:
    # Check if the column is of float64 data type and has missing or NA values
    if df_encoded[col].dtype == 'float64' and df_encoded[col].isnull().any():
        # Calculate the median of the column
        median_value = df_encoded[col].median()
        # Fill missing values with the median
        df_encoded[col].fillna(median_value, inplace=True)

# Test
for col in df_e_encoded.columns:
    # Check if the column is of float64 data type and has missing or NA values
    if df_e_encoded[col].dtype == 'float64' and df_e_encoded[col].isnull().any():
        # Calculate the median of the column
        median_value = df_e_encoded[col].median()
        # Fill missing values with the median
        df_e_encoded[col].fillna(median_value, inplace=True) 
        
df_e_encoded.head()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,AMT_GOODS_PRICE_x,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,PRODUCT_COMBINATION_Cash X-Sell: low,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest
0,100001,0.0,0,135000.0,568800.0,20560.5,450000.0,0.01885,-19241,-2329,...,False,False,False,False,False,False,False,False,False,False
1,100005,0.0,0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064,-4469,...,False,False,False,False,False,False,False,False,False,False
2,100013,0.0,0,202500.0,663264.0,69777.0,630000.0,0.019101,-20038,-4458,...,False,False,False,False,False,False,True,False,False,False
3,100028,0.0,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,-13976,-1866,...,False,False,False,False,False,False,False,False,False,False
4,100038,0.0,1,180000.0,625500.0,32067.0,625500.0,0.010032,-13040,-2191,...,False,False,False,False,False,False,False,False,False,False


In [28]:
pd.set_option('display.max_rows', None)
df_encoded.head()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,AMT_GOODS_PRICE_x,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,PRODUCT_COMBINATION_Cash X-Sell: low,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest
0,278284,0.0,0,90000.0,360000.0,13059.0,360000.0,0.01452,-21950,365243,...,False,False,False,False,False,False,False,False,False,False
1,278285,0.0,0,135000.0,728460.0,57685.5,675000.0,0.015221,-15645,-8615,...,False,False,False,False,False,False,False,False,False,False
2,278289,0.0,0,225000.0,180000.0,9000.0,180000.0,0.025164,-11926,-1049,...,False,False,False,False,False,False,False,False,False,False
3,278290,0.0,1,72000.0,675000.0,21775.5,675000.0,0.019101,-15597,-795,...,False,False,False,False,False,False,False,False,False,False
4,278292,0.0,0,157500.0,545040.0,25407.0,450000.0,0.006629,-13220,-1769,...,False,False,False,False,False,False,False,False,False,False


## Downsample

In [29]:
from sklearn.utils import resample

# Assuming df_combined is your combined DataFrame with 'TARGET' column and 'is_test' column

# Separate majority and minority classes
majority_class = df_encoded[df_encoded['TARGET'] == 0]
minority_class = df_encoded[df_encoded['TARGET'] == 1]

# Downsample majority class
desired_ratio = 1.8 # Desired ratio between minority and majority class after downsampling
downsampled_majority = resample(majority_class, 
                                replace=False,  # Sample without replacement to avoid duplication
                                n_samples=int(len(majority_class)/desired_ratio),  # Downsample to match desired ratio
                                random_state=123)  # Set random state for reproducibility

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([downsampled_majority, minority_class])

# Shuffle the DataFrame to mix up the order of the rows
df_downsampled = df_downsampled.sample(frac=1, random_state=113).reset_index(drop=True)

## Remove excess columns

In [30]:

columns_merged = df_downsampled.columns
columns_e = df_e_encoded.columns

# Find the columns that are in df_merged but not in df_e
columns_diff_merged = set(columns_merged) - set(columns_e)

# Find the columns that are in df_e but not in df_merged
columns_diff_e = set(columns_e) - set(columns_merged)

# Print the different columns
print("Columns in df_merged but not in df_e:", columns_diff_merged)
print("Columns in df_e but not in df_merged:", columns_diff_e)

df_downsampled.drop(columns=['NAME_GOODS_CATEGORY_Weapon','NAME_GOODS_CATEGORY_Additional Service'], inplace=True)

Columns in df_merged but not in df_e: {'NAME_GOODS_CATEGORY_Weapon', 'NAME_GOODS_CATEGORY_Additional Service'}
Columns in df_e but not in df_merged: set()


In [31]:
df_downsampled['TARGET'].value_counts()

TARGET
0.0    77544
1.0     9164
Name: count, dtype: int64

## Modeling

### Data Partition

In [32]:
from sklearn.model_selection import train_test_split

# Define your features (X) and target variable (y) based on your EDA
X_train = df_downsampled.drop(columns=['TARGET'])
y_train = df_downsampled['TARGET']

df_e_encoded.drop(columns=['TARGET'], inplace=True)

X_test = df_e_encoded
y_test = None  # You don't have the ground truth labels for the test set


# Check the shape of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)

Training set shape: (86708, 407) (86708,)


In [33]:
y_train.value_counts()

TARGET
0.0    77544
1.0     9164
Name: count, dtype: int64

### Random Forest

## Train metrics before test!

In [34]:
from sklearn.model_selection import train_test_split

# Define your features (X) and target variable (y) based on your EDA
X = df_downsampled.drop(columns=['TARGET'])
y = df_downsampled['TARGET']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)


# from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score

random_forest = RandomForestClassifier(
    n_estimators=700,  # Number of trees in the forest
    max_depth=70,      # Maximum depth of the trees
    min_samples_split=5,  # Minimum number of samples required to split a node
    min_samples_leaf=2,   # Minimum number of samples required at each leaf node
    random_state=99     # Random seed for reproducibility
)

# Initialize models
# logistic_regression = LogisticRegression(random_state=42, max_iter=1000)
random_forest = RandomForestClassifier(random_state=42)
gradient_boosting = GradientBoostingClassifier(random_state=42)


# Fit models
# logistic_regression.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
gradient_boosting.fit(X_train, y_train)

# Make predictions
# y_pred_lr = logistic_regression.predict(X_val)
y_pred_rf = random_forest.predict(X_val)
y_pred_gb = gradient_boosting.predict(X_val)

# Calculate evaluation metrics
metrics_lr = [accuracy_score, recall_score, f1_score, roc_auc_score]
metrics_rf_gb = [accuracy_score, recall_score, f1_score]

results = {}

# Logistic Regression Metrics
# results['Logistic Regression'] = {metric.__name__: metric(y_val, y_pred_lr) for metric in metrics_lr}

# Random Forest Metrics
results['Random Forest'] = {metric.__name__: metric(y_val, y_pred_rf) for metric in metrics_rf_gb}

# Gradient Boosting Metrics
results['Gradient Boosting'] = {metric.__name__: metric(y_val, y_pred_gb) for metric in metrics_rf_gb}

# Print results
for model, metrics in results.items():
    print(f"{model} Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
    print()

Training set shape: (69366, 407) (69366,)
Validation set shape: (17342, 407) (17342,)
Random Forest Metrics:
accuracy_score: 0.896205743282205
recall_score: 0.012756516916250694
f1_score: 0.024918743228602384

Gradient Boosting Metrics:
accuracy_score: 0.897935647560835
recall_score: 0.06988352745424292
f1_score: 0.12462908011869434



# This is the real code for test csv

In [35]:
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Initialize models
# logistic_regression = LogisticRegression(random_state=42, max_iter=1000)

# gradient_boosting = GradientBoostingClassifier(random_state=42)

# Fit models
# logistic_regression.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
# gradient_boosting.fit(X_train, y_train)

# Make predictions
# y_pred_lr = logistic_regression.predict(X_val)
y_pred_rf = random_forest.predict(X_test)
# y_pred_gb = gradient_boosting.predict(X_test)

# Calculate evaluation metrics
metrics_lr = [accuracy_score, recall_score, f1_score, roc_auc_score]
# metrics_rf_gb = [accuracy_score, recall_score, f1_score]

results = {}

# Logistic Regression Metrics
# results['Logistic Regression'] = {metric.__name__: metric(y_val, y_pred_lr) for metric in metrics_lr}

# Random Forest Metrics
# results['Random Forest'] = {metric.__name__: metric(y_test, y_pred_rf) for metric in metrics_rf_gb}

# # Gradient Boosting Metrics
# results['Gradient Boosting'] = {metric.__name__: metric(y_test, y_pred_gb) for metric in metrics_rf_gb}



In [36]:
ids = df_e_encoded['SK_ID_CURR']
predictions_df = pd.DataFrame({
    'SK_ID_CURR': ids,
    'TARGET': y_pred_rf
})
predictions_df.to_csv('predictions.csv', index=False)


In [37]:
# # from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
# 
# # Initialize models
# # logistic_regression = LogisticRegression(random_state=42, max_iter=1000)
# random_forest = RandomForestClassifier(random_state=42)
# gradient_boosting = GradientBoostingClassifier(random_state=42)
# 
# # Fit models
# # logistic_regression.fit(X_train, y_train)
# random_forest.fit(X_train, y_train)
# gradient_boosting.fit(X_train, y_train)
# 
# # Make predictions
# # y_pred_lr = logistic_regression.predict(X_val)
# y_pred_rf = random_forest.predict(df_e)
# y_pred_gb = gradient_boosting.predict(df_e)
# 
# # Calculate evaluation metrics
# metrics_lr = [accuracy_score, recall_score, f1_score, roc_auc_score]
# metrics_rf_gb = [accuracy_score, recall_score, f1_score]
# 
# results = {}
# 
# # Logistic Regression Metrics
# # results['Logistic Regression'] = {metric.__name__: metric(y_val, y_pred_lr) for metric in metrics_lr}
# 
# # Random Forest Metrics
# results['Random Forest'] = {metric.__name__: metric(y_val, y_pred_rf) for metric in metrics_rf_gb}
# 
# # Gradient Boosting Metrics
# results['Gradient Boosting'] = {metric.__name__: metric(y_val, y_pred_gb) for metric in metrics_rf_gb}
# 
# # Print results
# for model, metrics in results.items():
#     print(f"{model} Metrics:")
#     for metric, value in metrics.items():
#         print(f"{metric}: {value}")
#     print()

### Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score

# Drop the 'NAME_CONTRACT_TYPE' column from both X_train and X_val
#X_train.drop(columns=['ORGANIZATION_TYPE'], inplace=True, errors='ignore')  
#X_train.drop(columns=['NAME_CASH_LOAN_PURPOSE'], inplace=True, errors='ignore') 
#X_train.drop(columns=['NAME_GOODS_CATEGORY'], inplace=True, errors='ignore') 
#X_val.drop(columns=['ORGANIZATION_TYPE'], inplace=True, errors='ignore')  
#X_val.drop(columns=['NAME_CASH_LOAN_PURPOSE'], inplace=True, errors='ignore')  
#X_val.drop(columns=['NAME_GOODS_CATEGORY'], inplace=True, errors='ignore') 
# Initialize logistic regression model
model = LogisticRegression(random_state=42, max_iter=1000)

# Fit the model on training data
model.fit(X_train, y_train)

# Make predictions on validation set
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]

# Calculate evaluation metrics
accuracy = accuracy_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_proba)

# Print the evaluation metrics
print("Performance Metrics:")
print("Accuracy:", accuracy)
print("Recall:", recall)
print("F1-score:", f1)
print("AUC-ROC Score:", roc_auc)


Performance Metrics:
Accuracy: 0.896032752854342
Recall: 0.0
F1-score: 0.0
AUC-ROC Score: 0.6380328643328754


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
