In [3]:
import pandas as pd
import sklearn

In [77]:
df = pd.read_csv('supply_chain_train.csv')

In [78]:
df['Marital_Status'] = df['Marital_Status'].replace('Unknown', df['Marital_Status'].mode()[0])
df['Education_Level'] = df['Education_Level'].replace('Unknown', df['Education_Level'].mode()[0])
df['Income_Category'] = df['Income_Category'].replace('Unknown', df['Income_Category'].mode()[0])
dummy_df = pd.get_dummies(df[['Gender', 'Marital_Status']])
df = pd.concat([df, dummy_df], axis=1)
df.drop(['Gender', 'Gender_F', 'train_idx', 'CLIENTNUM', 'Marital_Status'], axis=1, inplace=True)

In [79]:
# create a new column that represents age values as discrete bins
df['Customer_Age'] = pd.cut(df['Customer_Age'], bins=range(0, 101, 10), right=False, labels=range(10))

In [80]:
rankings = {'Uneducated': 1, 'High School': 2, 'College': 3, 'Graduate': 4, 'Post-Graduate': 5, 'Doctorate': 6}

# create a new column that maps each category to its corresponding ranking
df['Education_Level'] = df['Education_Level'].map(rankings)

In [81]:
rankings = {'Less than $40K': 1, '$40K - $60K': 2, '$60K - $80K': 3, '$80K - $120K': 4, '$120K +': 5}

# create a new column that maps each category to its corresponding ranking
df['Income_Category'] = df['Income_Category'].map(rankings)


In [86]:
rankings = {'Blue': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4}

# create a new column that maps each category to its corresponding ranking
df['Card_Category'] = df['Card_Category'].map(rankings)


In [92]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import xgboost as xgb

# Load data
data = df

# Split features and target variable
X = data.drop('Attrition_Flag', axis=1)
y = data['Attrition_Flag']

# Split numerical and categorical features
num_features = ['Customer_Age', 'Dependent_count', 'Education_Level', 'Income_Category',
       'Card_Category', 'Months_on_book', 'Total_Relationship_Count',
       'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit',
       'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1',
       'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1',
       'Avg_Utilization_Ratio']

# Normalize numerical features
scaler = StandardScaler()
X[num_features] = scaler.fit_transform(X[num_features])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameters to tune
params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200]
}

# Train XGBoost model with cross-validation to find best hyperparameters
model = xgb.XGBClassifier()
grid_search = GridSearchCV(model, params, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate model on test set using best hyperparameters
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print('F1 score:', f1)

F1 score: 0.9856115107913669
