In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

data=pd.read_csv('/content/drive/MyDrive/Convolve_hackathon/Dev_data_to_be_shared.csv')
data.head()

Unnamed: 0,account_number,bad_flag,onus_attribute_1,transaction_attribute_1,transaction_attribute_2,transaction_attribute_3,transaction_attribute_4,transaction_attribute_5,transaction_attribute_6,transaction_attribute_7,...,bureau_enquiry_47,bureau_enquiry_48,bureau_enquiry_49,bureau_enquiry_50,onus_attribute_43,onus_attribute_44,onus_attribute_45,onus_attribute_46,onus_attribute_47,onus_attribute_48
0,1,0,,,,,,,,,...,0.0,0.0,0.0,1.0,,,,,,
1,2,0,221000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0,25000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.0,,,,,,
3,4,0,86000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,30.0,,,,,,
4,5,0,215000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,,,,,,


## Dropping columns(features) which are more than 80% missing

In [5]:
# Define the threshold for missing values (80% in this case)
threshold = 0.8

# Identify columns where the proportion of missing values exceeds the threshold
columns_to_drop = [col for col in data.columns if data[col].isnull().mean() > threshold]

# Drop those columns
df_cleaned = data.drop(columns=columns_to_drop)

# Print results
print("Columns dropped (more than 20% missing):", columns_to_drop)
print("\nCleaned DataFrame:")
df_cleaned.head()



Columns dropped (more than 20% missing): ['bureau_148', 'bureau_436', 'bureau_438', 'bureau_444', 'bureau_446', 'bureau_447', 'bureau_448', 'bureau_449', 'onus_attribute_43', 'onus_attribute_44', 'onus_attribute_45', 'onus_attribute_46', 'onus_attribute_47', 'onus_attribute_48']

Cleaned DataFrame:


Unnamed: 0,account_number,bad_flag,onus_attribute_1,transaction_attribute_1,transaction_attribute_2,transaction_attribute_3,transaction_attribute_4,transaction_attribute_5,transaction_attribute_6,transaction_attribute_7,...,bureau_enquiry_41,bureau_enquiry_42,bureau_enquiry_43,bureau_enquiry_44,bureau_enquiry_45,bureau_enquiry_46,bureau_enquiry_47,bureau_enquiry_48,bureau_enquiry_49,bureau_enquiry_50
0,1,0,,,,,,,,,...,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,0,221000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,2.0,4.0,0.0,1.0,0.0,0.0,0.0,2.0,3.0
2,3,0,25000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.0,0.0,11.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0
3,4,0,86000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.0,0.0,38.0,0.0,6.0,0.0,0.0,0.0,0.0,30.0
4,5,0,215000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Filling in the empty values with the average of the remaining values

In [6]:
from sklearn.impute import SimpleImputer
import numpy as np
# Configure an instance of the SimpleImputer class
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit the imputer on the DataFrame
X = df_cleaned.values
imputer = imputer.fit(X[:, ::])
# Apply the transform to the DataFrame
X[:, ::] = imputer.transform(X[:, ::])

df_cleaned.iloc[:, ::] = X[:, ::]


##Tuning the hyperparameters to get the best model

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier
import random

# Initialize results list
results = []

# Set the range of random values for hyperparameters
num_trials = 50  # Number of random trials
rf_ratio_range = (1, 40)  # Range for the ratio of negatives to positives in Random Forest
xgb_ratio_range = (30, 40)  # Range for the ratio of negatives to positives in XGBoost
top_features_range = (50, 500)  # Range for the number of top features to select

# Perform random search
for trial in range(num_trials):
    # Randomly sample hyperparameters
    rf_ratio = random.uniform(*rf_ratio_range)
    xgb_ratio = random.uniform(*xgb_ratio_range)
    top_features_count = random.randint(*top_features_range)

    # Step 1: Balance the dataset for Random Forest
    df_is_flag = df_cleaned[df_cleaned['bad_flag'] == 1]
    df_is_not_flag = df_cleaned[df_cleaned['bad_flag'] == 0]
    df_is_not_flag_rf = df_is_not_flag.sample(n=int(rf_ratio * len(df_is_flag)), random_state=42)
    df_rf = pd.concat([df_is_flag, df_is_not_flag_rf])

    # Step 2: Split the data into features and target for Random Forest
    X_rf = df_rf.drop(columns=['bad_flag', 'account_number'])
    y_rf = df_rf['bad_flag']

    # Step 3: Perform feature selection using Random Forest
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_rf, y_rf)
    importances = rf_model.feature_importances_
    feature_names = X_rf.columns
    important_features = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    top_features = important_features.head(top_features_count).index

    # Step 4: Balance the dataset for XGBoost
    df_is_not_flag_xgb = df_is_not_flag.sample(n=int(xgb_ratio * len(df_is_flag)), random_state=42)
    df_xgb = pd.concat([df_is_flag, df_is_not_flag_xgb])

    # Update X_train and y_train after balancing
    X_xgb = df_xgb[top_features]
    y_xgb = df_xgb['bad_flag']

    # Step 5: Train XGBoost with `scale_pos_weight`
    scale_pos_weight = len(y_xgb[y_xgb == 0]) / len(y_xgb[y_xgb == 1])
    model = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
    model.fit(X_xgb, y_xgb)

    # Step 6: Evaluate the model on the entire dataset
    X_new = df_cleaned[top_features]
    y_new = df_cleaned['bad_flag']
    y_pred = model.predict(X_new)
    accuracy = accuracy_score(y_new, y_pred)
    cm = confusion_matrix(y_new, y_pred)
    print("trial:",trial+1, 'rf_ratio:', rf_ratio,'xgb_ratio:', xgb_ratio, 'top_features:',top_features_count, 'accuracy:', accuracy,'\n','confusion_matrix:', cm)
    # Log the results
    results.append({
        'trial': trial + 1,
        'rf_ratio': rf_ratio,
        'xgb_ratio': xgb_ratio,
        'top_features': top_features_count,
        'accuracy': accuracy,
        'confusion_matrix': cm
    })
    precision=cm[1][1]/(cm[1][1]+cm[0][1])
    recall=cm[1][1]/(cm[1][1]+cm[1][0])
    f1_score=2*precision*recall/(precision+recall)
    print("Precision:",precision)
    print("Recall:",recall)
    print("f1_score:",f1_score)

# Convert results to a DataFrame for analysis
results_df = pd.DataFrame(results)

# Display the best configuration
best_result = results_df.loc[results_df['accuracy'].idxmax()]
print("Best Configuration:")
print(best_result)



trial: 1 rf_ratio: 31.211679830680836 xgb_ratio: 37.03889159705788 top_features: 213 accuracy: 0.9728116025866166 
 confusion_matrix: [[92802  2632]
 [    0  1372]]
Precision: 0.34265734265734266
Recall: 1.0
f1_score: 0.5104166666666666
trial: 2 rf_ratio: 11.367104689452031 xgb_ratio: 33.47314210323922 top_features: 388 accuracy: 0.9727289630808008 
 confusion_matrix: [[92794  2640]
 [    0  1372]]
Precision: 0.3419740777666999
Recall: 1.0
f1_score: 0.5096582466567607
trial: 3 rf_ratio: 21.929826845837695 xgb_ratio: 36.74397593020819 top_features: 399 accuracy: 0.9723880751193108 
 confusion_matrix: [[92761  2673]
 [    0  1372]]
Precision: 0.3391841779975278
Recall: 1.0
f1_score: 0.5065534428650544
trial: 4 rf_ratio: 4.586878673780888 xgb_ratio: 39.37284190293552 top_features: 417 accuracy: 0.9659421936656819 
 confusion_matrix: [[92138  3296]
 [    1  1371]]
Precision: 0.2937647310906364
Recall: 0.999271137026239
f1_score: 0.45404868355688033
trial: 5 rf_ratio: 30.218331861038653 xgb

##Training the data on the best model

In [10]:
# Balance the dataset using sampling
df_is_flag = df_cleaned[df_cleaned['bad_flag'] == 1]
df_is_not_flag = df_cleaned[df_cleaned['bad_flag'] == 0]

# Oversample the minority class
df_is_not_flag = df_is_not_flag.sample(n=int(8.3092*len(df_is_flag)), random_state=42)
df = pd.concat([df_is_flag, df_is_not_flag])

# Split the data into features and target
X = df.drop(columns=['bad_flag', 'account_number'])
y = df['bad_flag']

# Perform feature selection using Random Forest
# Train a Random Forest to compute feature importance
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X, y)

# Get feature importances and select the top 200 features
importances = rf_model.feature_importances_
feature_names = X.columns
important_features = pd.Series(importances, index=feature_names).sort_values(ascending=False)
top_features = important_features.head(236).index  # Select top 200 features

# Reduce X to the top features
X = X[top_features]

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balance the training set for XGBoost
df_is_flag = df_cleaned[df_cleaned['bad_flag'] == 1]
df_is_not_flag = df_cleaned[df_cleaned['bad_flag'] == 0]
df_is_not_flag = df_is_not_flag.sample(n=int(33.0699*len(df_is_flag)), random_state=42)
df_balanced = pd.concat([df_is_flag, df_is_not_flag])

# Update X_train and y_train after balancing
X_train = df_balanced[top_features]
y_train = df_balanced['bad_flag']

# Train XGBoost with `scale_pos_weight`
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
model = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
model.fit(X_train, y_train)


# Get predicted class labels
X_new=df_cleaned[top_features]
y_new=df_cleaned['bad_flag']

# Get predicted class labels
y_pred = model.predict(X_new)

# Evaluate the model
accuracy = accuracy_score(y_new, y_pred)
print(f"XGBoost with sampling and feature selection: {accuracy:.2f}")
cm = confusion_matrix(y_new, y_pred)
print(f"Confusion Matrix:\n{cm}")

Predicted Probabilities:
[[9.9867713e-01 1.3228599e-03]
 [9.9923527e-01 7.6473318e-04]
 [9.6963561e-01 3.0364379e-02]
 [9.9862206e-01 1.3779576e-03]
 [9.9818152e-01 1.8184721e-03]]
XGBoost with sampling and feature selection: 0.97
Confusion Matrix:
[[92827  2607]
 [    0  1372]]


##Predicting the probabilities of the validation dataset using the best model

In [13]:
best_model1=model
data_share=pd.read_csv('/content/drive/MyDrive/Convolve_hackathon/validation_data_to_be_shared.csv')
data_share.head()

# Define the threshold for missing values (80% in this case)
threshold = 0.8

# Identify columns where the proportion of missing values exceeds the threshold
columns_to_drop = [col for col in data_share.columns if data_share[col].isnull().mean() > threshold]

# Drop those columns
df_share_cleaned = data_share.drop(columns=columns_to_drop)

# Print results
print("Columns dropped (more than 80% missing):", columns_to_drop)

from sklearn.impute import SimpleImputer
import numpy as np
# Configure an instance of the SimpleImputer class
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit the imputer on the DataFrame
X = df_share_cleaned.values
imputer = imputer.fit(X[:, ::])
# Apply the transform to the DataFrame
X[:, ::] = imputer.transform(X[:, ::])

df_share_cleaned.iloc[:, ::] = X[:, ::]

y_proba = best_model1.predict_proba(df_share_cleaned[top_features])

df_final=pd.DataFrame()
df_final['account_number']=df_share_cleaned['account_number']
df_final['predicted_probability']=y_proba[:,1]
df_final.to_csv('/content/drive/MyDrive/Convolve_hackathon/output.csv',index=False)

Columns dropped (more than 20% missing): ['bureau_148', 'bureau_436', 'bureau_438', 'bureau_444', 'bureau_446', 'bureau_447', 'bureau_448', 'bureau_449', 'onus_attribute_43', 'onus_attribute_44', 'onus_attribute_45', 'onus_attribute_46', 'onus_attribute_47', 'onus_attribute_48']

Cleaned DataFrame:
