In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Loading And Concating Datasets

In [114]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

In [115]:
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)


In [116]:
df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0.0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0.0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0.0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0.0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1.0


In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 18 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   id         1000000 non-null  int64  
 1   age        1000000 non-null  int64  
 2   job        1000000 non-null  object 
 3   marital    1000000 non-null  object 
 4   education  1000000 non-null  object 
 5   default    1000000 non-null  object 
 6   balance    1000000 non-null  int64  
 7   housing    1000000 non-null  object 
 8   loan       1000000 non-null  object 
 9   contact    1000000 non-null  object 
 10  day        1000000 non-null  int64  
 11  month      1000000 non-null  object 
 12  duration   1000000 non-null  int64  
 13  campaign   1000000 non-null  int64  
 14  pdays      1000000 non-null  int64  
 15  previous   1000000 non-null  int64  
 16  poutcome   1000000 non-null  object 
 17  y          750000 non-null   float64
dtypes: float64(1), int64(8), object(9)
memory u

In [118]:
df.drop('id',axis=1, inplace=True)

In [119]:
df.drop(columns=['duration'], inplace=True)

## Categorical Columns

In [120]:
cat_cols= df.select_dtypes(include=['object']).columns.tolist()
len(cat_cols)

9

In [121]:
for col in cat_cols:
    print(f"{col}: {df[col].unique()} unique values \n")

job: ['technician' 'blue-collar' 'student' 'admin.' 'management' 'entrepreneur'
 'self-employed' 'unknown' 'services' 'retired' 'housemaid' 'unemployed'] unique values 

marital: ['married' 'single' 'divorced'] unique values 

education: ['secondary' 'primary' 'tertiary' 'unknown'] unique values 

default: ['no' 'yes'] unique values 

housing: ['no' 'yes'] unique values 

loan: ['no' 'yes'] unique values 

contact: ['cellular' 'unknown' 'telephone'] unique values 

month: ['aug' 'jun' 'may' 'feb' 'apr' 'nov' 'jul' 'jan' 'oct' 'mar' 'sep' 'dec'] unique values 

poutcome: ['unknown' 'other' 'failure' 'success'] unique values 



In [122]:
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
len(num_cols)

7

## Applying Label Enocoding

In [123]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

for col in cat_cols:
    if col != 'month':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

## Numaric Features

In [124]:
for col in num_cols:
    print(f"{col}: {df[col].nunique()} unique values \n")

age: 78 unique values 

balance: 8469 unique values 

day: 31 unique values 

campaign: 52 unique values 

pdays: 614 unique values 

previous: 53 unique values 

y: 2 unique values 



## Cyclical encoding Of days

In [125]:
df['day_sin'] = np.sin(2 * np.pi * df['day'] / 30)
df['day_cos'] = np.cos(2 * np.pi * df['day'] / 30)


In [126]:
df['day_sin']

0        -8.660254e-01
1        -5.877853e-01
2         2.079117e-01
3        -4.067366e-01
4         5.877853e-01
              ...     
999995   -5.877853e-01
999996   -7.431448e-01
999997    5.877853e-01
999998   -1.133108e-15
999999    9.510565e-01
Name: day_sin, Length: 1000000, dtype: float64

In [127]:
df['day_cos']

0         0.500000
1        -0.809017
2        -0.978148
3         0.913545
4         0.809017
            ...   
999995   -0.809017
999996   -0.669131
999997    0.809017
999998    1.000000
999999    0.309017
Name: day_cos, Length: 1000000, dtype: float64

In [128]:
df.drop('day',axis=1,inplace=True)

## Cyclical encoding Of Months

In [129]:
## Month mapping
df['month']=df['month'].map({'jan':0,'feb':1,'mar':2,'apr':3,'may':4,'jun':5,'jul':6,'aug':7,'sep':8,'oct':9,'nov':10,'dec':11})

In [130]:
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

In [131]:
df.drop('month',axis=1,inplace=True)

## Handling Pdays

In [132]:
df['pdays'].max()

np.int64(871)

In [133]:
df["pdays"] = df["pdays"].replace(-1, 0)

## Handeling Previous

In [134]:
df['previous'].unique()

array([  0,   3,   4,   2,   1,   5,   6,  10,  11,   9,   7,  14,  13,
        17,   8,  24,  23,  37,  12,  27,  25,  38,  15,  16,  29,  19,
        20,  32,  55,  18,  34,  22,  26,  21,  28,  35,  39,  51,  31,
        43,  30,  36,  33,  41,  40,  47,  46, 200,  48,  58,  45,  53,
       150])

### As they are discrete count and don't have special Value so we leave it as it as

In [135]:
# 1. Fill missing balance values if any
df['balance'] = df['balance'].fillna(0)

# 2. Shift balance so that all values are >= 0
shift_value = -df['balance'].min() + 1  # ensures minimum becomes 1
df['balance_shifted'] = df['balance'] + shift_value

# 3. Apply log1p safely
df['balance_log'] = np.log1p(df['balance_shifted'])

# Define bins
bins = [-np.inf, 0, 1000, 5000, 20000, np.inf]
labels = ['negative', 'zero_to_1k', '1k_to_5k', '5k_to_20k', 'above_20k']

# Create binned feature
df['balance_bin'] = pd.cut(df['balance'], bins=bins, labels=labels)

# Optional: convert to numeric codes for XGBoost
df['balance_bin_code'] = df['balance_bin'].cat.codes





In [136]:
df.drop(columns=['balance_bin'], inplace=True)


In [137]:
df.head()  # Display the first few rows of the combined DataFrame


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,campaign,...,previous,poutcome,y,day_sin,day_cos,month_sin,month_cos,balance_shifted,balance_log,balance_bin_code
0,42,9,1,1,0,7,0,0,0,3,...,0,3,0.0,-0.866025,0.5,-0.5,-0.866025,8027,8.990691,1
1,38,1,1,1,0,514,0,0,2,1,...,0,3,0.0,-0.587785,-0.809017,0.5,-0.866025,8534,9.051931,1
2,36,1,1,1,0,602,1,0,2,2,...,0,3,0.0,0.207912,-0.978148,0.866025,-0.5,8622,9.062188,1
3,27,8,2,1,0,34,1,0,2,2,...,0,3,0.0,-0.406737,0.913545,0.866025,-0.5,8054,8.994048,1
4,26,9,1,1,0,889,1,0,0,1,...,0,3,1.0,0.587785,0.809017,0.5,0.866025,8909,9.09493,1


In [138]:
df.isnull().sum()

age                      0
job                      0
marital                  0
education                0
default                  0
balance                  0
housing                  0
loan                     0
contact                  0
campaign                 0
pdays                    0
previous                 0
poutcome                 0
y                   250000
day_sin                  0
day_cos                  0
month_sin                0
month_cos                0
balance_shifted          0
balance_log              0
balance_bin_code         0
dtype: int64

In [139]:
df_train=df[df['y'].notnull()]
df_test=df[df['y'].isnull()]

In [140]:
df_test.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,campaign,...,previous,poutcome,y,day_sin,day_cos,month_sin,month_cos,balance_shifted,balance_log,balance_bin_code
750000,32,1,1,1,0,1397,1,0,2,1,...,0,3,,-0.951057,-0.309017,0.8660254,-0.5,9417,9.150378,2
750001,44,4,1,2,0,23,1,0,0,2,...,0,3,,0.587785,0.809017,1.0,6.123234000000001e-17,8043,8.992682,1
750002,36,6,1,0,0,46,1,1,0,2,...,0,3,,0.406737,-0.913545,0.8660254,-0.5,8066,8.995537,1
750003,58,1,1,1,0,-1380,1,1,2,1,...,0,3,,-0.207912,0.978148,0.8660254,-0.5,6640,8.801018,0
750004,28,9,2,1,0,1950,1,0,0,1,...,0,3,,-0.994522,-0.104528,1.224647e-16,-1.0,9970,9.207436,2


In [141]:
df_train.tail()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,campaign,...,previous,poutcome,y,day_sin,day_cos,month_sin,month_cos,balance_shifted,balance_log,balance_bin_code
749995,29,7,2,1,0,1282,0,1,2,2,...,0,3,1.0,0.743145,0.669131,1.224647e-16,-1.0,9302,9.138092,2
749996,69,5,0,2,0,631,0,0,0,1,...,0,3,0.0,-0.743145,-0.669131,-0.5,-0.8660254,8651,9.065546,1
749997,50,1,1,1,0,217,1,0,0,1,...,0,3,0.0,-0.406737,-0.913545,1.0,6.123234000000001e-17,8237,9.016513,1
749998,32,9,1,1,0,-274,0,0,0,6,...,0,3,0.0,-0.743145,0.669131,-0.5,-0.8660254,7746,8.955061,0
749999,42,9,1,1,0,1559,0,0,0,1,...,7,0,0.0,0.743145,0.669131,-0.5,-0.8660254,9579,9.167433,2


In [142]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from collections import Counter

# 1. Separate features and target
X = df_train.drop("y", axis=1)
y = df_train["y"]

# 2. Train/test split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Compute scale_pos_weight (for imbalance handling)
counter = Counter(y_train)
neg, pos = counter[0], counter[1]
scale_pos_weight = neg / pos
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

# 4. Initialize model with early stopping built-in
model = xgb.XGBClassifier(
    n_estimators=5000,           # large, early stopping will cut it down
    learning_rate=0.01,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric="auc",           # evaluation metric
    scale_pos_weight=scale_pos_weight,
    early_stopping_rounds=50     # ✅ now belongs here
)

# 5. Train
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)

# 6. Predictions
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]

# 7. Evaluation
print(f"\nAccuracy: {accuracy_score(y_val, y_pred):.5f}")
print(f"ROC AUC: {roc_auc_score(y_val, y_proba):.5f}")
print("\nClassification Report:\n", classification_report(y_val, y_pred))


scale_pos_weight: 7.29
[0]	validation_0-auc:0.78959


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


[1]	validation_0-auc:0.80441
[2]	validation_0-auc:0.81038
[3]	validation_0-auc:0.80928
[4]	validation_0-auc:0.81001
[5]	validation_0-auc:0.81007
[6]	validation_0-auc:0.81082
[7]	validation_0-auc:0.81058
[8]	validation_0-auc:0.81075
[9]	validation_0-auc:0.81262
[10]	validation_0-auc:0.81226
[11]	validation_0-auc:0.81210
[12]	validation_0-auc:0.81172
[13]	validation_0-auc:0.81158
[14]	validation_0-auc:0.81154
[15]	validation_0-auc:0.81131
[16]	validation_0-auc:0.81134
[17]	validation_0-auc:0.81122
[18]	validation_0-auc:0.81123
[19]	validation_0-auc:0.81121
[20]	validation_0-auc:0.81160
[21]	validation_0-auc:0.81231
[22]	validation_0-auc:0.81210
[23]	validation_0-auc:0.81238
[24]	validation_0-auc:0.81302
[25]	validation_0-auc:0.81351
[26]	validation_0-auc:0.81336
[27]	validation_0-auc:0.81315
[28]	validation_0-auc:0.81347
[29]	validation_0-auc:0.81340
[30]	validation_0-auc:0.81366
[31]	validation_0-auc:0.81346
[32]	validation_0-auc:0.81353
[33]	validation_0-auc:0.81337
[34]	validation_0-a

In [143]:
df_test.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,campaign,...,previous,poutcome,y,day_sin,day_cos,month_sin,month_cos,balance_shifted,balance_log,balance_bin_code
750000,32,1,1,1,0,1397,1,0,2,1,...,0,3,,-0.951057,-0.309017,0.8660254,-0.5,9417,9.150378,2
750001,44,4,1,2,0,23,1,0,0,2,...,0,3,,0.587785,0.809017,1.0,6.123234000000001e-17,8043,8.992682,1
750002,36,6,1,0,0,46,1,1,0,2,...,0,3,,0.406737,-0.913545,0.8660254,-0.5,8066,8.995537,1
750003,58,1,1,1,0,-1380,1,1,2,1,...,0,3,,-0.207912,0.978148,0.8660254,-0.5,6640,8.801018,0
750004,28,9,2,1,0,1950,1,0,0,1,...,0,3,,-0.994522,-0.104528,1.224647e-16,-1.0,9970,9.207436,2


In [150]:
# x_test = df_test.drop('y', axis=1)
x_test=df_test.drop(columns=['y'],axis=1)
y_pred_proba = model.predict_proba(x_test)[:, 1]  # probability of y=1

# ----------------------------
# 4. Prepare submission
# ----------------------------
submission = pd.DataFrame({
    'id': pd.read_csv('test.csv')['id'],   # replace 'id' with your actual ID column name
    'result': y_pred_proba
})

submission.to_csv('submission.csv', index=False)
print("CSV file created: submission.csv")


CSV file created: submission.csv


In [149]:
df_test.info()
# df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250000 entries, 750000 to 999999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   age               250000 non-null  int64  
 1   job               250000 non-null  int64  
 2   marital           250000 non-null  int64  
 3   education         250000 non-null  int64  
 4   default           250000 non-null  int64  
 5   balance           250000 non-null  int64  
 6   housing           250000 non-null  int64  
 7   loan              250000 non-null  int64  
 8   contact           250000 non-null  int64  
 9   campaign          250000 non-null  int64  
 10  pdays             250000 non-null  int64  
 11  previous          250000 non-null  int64  
 12  poutcome          250000 non-null  int64  
 13  y                 0 non-null       float64
 14  day_sin           250000 non-null  float64
 15  day_cos           250000 non-null  float64
 16  month_sin         25

In [None]:
import xgboost as xgb
print(xgb.__version__)


3.0.4


In [None]:
df_train[['y','balance_positive']].corr()

Unnamed: 0,y,balance_bin_code
y,1.0,0.189161
balance_bin_code,0.189161,1.0


In [None]:
# df['balance'].plot(kind='hist')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Bin balance into categories (quantiles or fixed ranges)
df['balance_bin'] = pd.qcut(df['balance'], q=10, duplicates='drop')  
# q=10 → deciles (10 groups with ~equal number of rows)

# 2. Compute target rate per bin
balance_target = df.groupby('balance_bin')['y'].mean()

# 3. Plot
balance_target.plot(kind='bar', figsize=(10,5))
plt.ylabel("Target rate (y=1 probability)")
plt.title("Relationship between balance and target")
plt.xticks(rotation=45)
plt.show()
