In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')

In [3]:
col=['id','CustomerId','Surname']
col2=['CustomerId','Surname']
train_data.drop(col,axis=1,inplace=True)
test_data.drop(col2,axis=1,inplace=True)

In [4]:
encoder=LabelEncoder()

In [5]:
train_data['Geography']=encoder.fit_transform(train_data['Geography'])
train_data['Gender']=encoder.fit_transform(train_data['Gender'])

In [6]:
test_data['Geography']=encoder.fit_transform(test_data['Geography'])
test_data['Gender']=encoder.fit_transform(test_data['Gender'])

In [7]:
cols=['CreditScore', 'Age', 'Tenure', 'Balance','EstimatedSalary']

In [8]:
ss_scaler=StandardScaler()

In [9]:
train_data[cols]=ss_scaler.fit_transform(train_data[cols])
test_data[cols]=ss_scaler.fit_transform(test_data[cols])

In [10]:
X=train_data.drop('Exited',axis=1)
y=train_data['Exited']

In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score


In [12]:
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.3,random_state=42)

In [13]:
xgb_classifier = XGBClassifier(
    objective='binary:logistic',  # For binary classification
    scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),  # Handle class imbalance
    random_state=42
)

In [14]:
# Fit the model
xgb_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = xgb_classifier.predict(X_val)


In [15]:
# Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, xgb_classifier.predict_proba(X_val)[:, 1])
conf_matrix = confusion_matrix(y_val, y_val_pred)
classification_rep = classification_report(y_val, y_val_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.82
ROC-AUC: 0.89
Confusion Matrix:
[[32378  6755]
 [ 2332  8046]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.83      0.88     39133
           1       0.54      0.78      0.64     10378

    accuracy                           0.82     49511
   macro avg       0.74      0.80      0.76     49511
weighted avg       0.85      0.82      0.83     49511



In [16]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'scale_pos_weight': [1, 2, 3]
}

In [17]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'scale_pos_weight': [1, 2, 3]
}



In [18]:
randomized_search = RandomizedSearchCV(XGBClassifier(objective='binary:logistic', random_state=42), param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy')
randomized_search.fit(X_train, y_train)


In [19]:
best_params = randomized_search.best_params_


In [20]:
best_params

{'subsample': 0.8,
 'scale_pos_weight': 1,
 'n_estimators': 100,
 'min_child_weight': 1,
 'max_depth': 3,
 'learning_rate': 0.2,
 'colsample_bytree': 1.0}

In [21]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Best hyperparameters obtained from randomized search
best_params = {
    'subsample': 1.0,
    'scale_pos_weight': 1,
    'n_estimators': 200,
    'min_child_weight': 2,
    'max_depth': 3,
    'learning_rate': 0.1,
    'colsample_bytree': 0.9
}

# Create XGBoost classifier with best hyperparameters
best_model = XGBClassifier(objective='binary:logistic', random_state=42, **best_params)


In [22]:

# Train the model on the entire training dataset
best_model.fit(X_train, y_train)


In [23]:

# Make predictions on the validation set
y_val_pred = best_model.predict(X_val)

# Evaluate the model performance on the validation set
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on Validation Set: {accuracy:.2f}")


Accuracy on Validation Set: 0.87


In [24]:
test_data

Unnamed: 0,id,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,-0.878176,0,0,-1.706504,-1.067887,-0.881274,2,0,1,0.967874
1,165035,0.329567,0,0,0.888990,-1.067887,-0.881274,1,1,0,-0.790939
2,165036,-0.006609,0,0,-0.465181,0.713922,-0.881274,2,1,0,0.528413
3,165037,0.304665,0,1,-0.239486,1.070284,-0.881274,1,1,0,0.032150
4,165038,1.188684,1,1,-0.013791,1.783008,1.050038,1,1,0,0.539331
...,...,...,...,...,...,...,...,...,...,...,...
110018,275052,-1.077392,2,1,-1.029419,0.713922,0.967796,1,1,1,0.711510
110019,275053,-1.015137,0,0,-0.239486,-0.355164,1.954171,1,1,1,-1.394946
110020,275054,0.690645,0,1,-0.803724,-1.067887,-0.881274,2,1,0,-1.909981
110021,275055,0.653292,0,0,-0.690876,-0.711526,-0.881274,1,1,1,0.924908


In [25]:
from sklearn.metrics import accuracy_score

In [26]:
idval=test_data['id']
test_data.drop('id',axis=1,inplace=True)

In [27]:
idval

0         165034
1         165035
2         165036
3         165037
4         165038
           ...  
110018    275052
110019    275053
110020    275054
110021    275055
110022    275056
Name: id, Length: 110023, dtype: int64

In [28]:
# Assuming 'test_data' does not contain the target column
    # Predict probabilities for class 1 (Exited)
y_test_probabilities = best_model.predict_proba(test_data)[:, 1]


In [29]:
y_test_probabilities[:10]

array([0.0278517 , 0.82121557, 0.02846728, 0.22666486, 0.3266    ,
       0.04531394, 0.04600472, 0.10137632, 0.6211459 , 0.01435687],
      dtype=float32)

In [30]:
type(y_test_probabilities)

numpy.ndarray

In [31]:
type(idval)

pandas.core.series.Series

In [32]:
prob=pd.Series(y_test_probabilities)

In [33]:
type(prob)

pandas.core.series.Series

In [34]:
df = pd.concat([idval, prob], axis=1)

In [35]:
df.columns


Index(['id', 0], dtype='object')

In [36]:
df = df.rename(columns={0: 'Exited'})

In [37]:
df

Unnamed: 0,id,Exited
0,165034,0.027852
1,165035,0.821216
2,165036,0.028467
3,165037,0.226665
4,165038,0.326600
...,...,...
110018,275052,0.040529
110019,275053,0.088544
110020,275054,0.017866
110021,275055,0.143228


In [38]:
df.to_csv('check.csv',index=False)

In [39]:
sample_sub=pd.read_csv('sample_submission.csv')

In [40]:
combined_df = pd.merge(sample_sub, df, on='id', how='left')


In [41]:
combined_df

Unnamed: 0,id,Exited_x,Exited_y
0,165034,0.5,0.027852
1,165035,0.5,0.821216
2,165036,0.5,0.028467
3,165037,0.5,0.226665
4,165038,0.5,0.326600
...,...,...,...
110018,275052,0.5,0.040529
110019,275053,0.5,0.088544
110020,275054,0.5,0.017866
110021,275055,0.5,0.143228


In [42]:
combined_df

Unnamed: 0,id,Exited_x,Exited_y
0,165034,0.5,0.027852
1,165035,0.5,0.821216
2,165036,0.5,0.028467
3,165037,0.5,0.226665
4,165038,0.5,0.326600
...,...,...,...
110018,275052,0.5,0.040529
110019,275053,0.5,0.088544
110020,275054,0.5,0.017866
110021,275055,0.5,0.143228
