In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# Load the dataset
df = pd.read_csv(r'C:/vscodefolder/AML_Lab/Datasets/income.csv')

# Display the first few rows of the dataset to ensure it loaded correctly
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [12]:
# Handle missing values or placeholders like '?'
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)  # Drop rows with missing values

In [13]:
# Separate features (X) and target (y)
X = df.iloc[:, :-1].values  # All columns except the last one (income) as features
y = df.iloc[:, -1].values   # The last column (income) as target


In [14]:

# Encode categorical features and the target variable
categorical_columns = [1, 3, 5, 6, 7, 8, 9, 13]  # Indices of categorical columns
for column in categorical_columns:
    le = LabelEncoder()
    X[:, column] = le.fit_transform(X[:, column])

# Encode the target variable
le_income = LabelEncoder()
y = le_income.fit_transform(y)

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)


In [16]:
# Standardize the features
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [17]:


# Train a Gradient Boosting Classifier
gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=42, max_features=2)
gbc.fit(X_train_std, y_train)

# Evaluate the model
print("GBC accuracy is %2.2f%%" % (accuracy_score(y_test, gbc.predict(X_test_std)) * 100))
print("Confusion Matrix : \n", confusion_matrix(y_test, gbc.predict(X_test_std)))



GBC accuracy is 85.52%
Confusion Matrix : 
 [[6534  269]
 [1041 1201]]


In [18]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'random_state': [42, 100, 200],
    'max_features': [4, 5, 6]
}
gb = GradientBoostingClassifier()
gb_cv = GridSearchCV(gb, param_grid, cv=4)
gb_cv.fit(X_train_std, y_train)

# Evaluate the model after hyperparameter tuning
print("Best Parameters : ", gb_cv.best_params_)
print("Train Score : ", gb_cv.best_score_)
print('Test Score :', gb_cv.score(X_test_std, y_test))
y_pred = gb_cv.predict(X_test_std)




Best Parameters :  {'learning_rate': 0.1, 'max_features': 5, 'n_estimators': 300, 'random_state': 42}
Train Score :  0.8673744037197667
Test Score : 0.8707573244886678


In [19]:
print("Confusion Matrix after hyperparameter tuning :")
print(confusion_matrix(y_test, y_pred))

print("GBC accuracy after hyperparameter tuning is %2.2f%%" % (accuracy_score(y_test, y_pred) * 100))
print("Precision: %2.4f " % precision_score(y_test, y_pred))
print("Recall: %2.4f " % recall_score(y_test, y_pred))
print("F1 Score: %2.4f " % f1_score(y_test, y_pred))

Confusion Matrix after hyperparameter tuning :
[[6411  392]
 [ 777 1465]]
GBC accuracy after hyperparameter tuning is 87.08%
Precision: 0.7889 
Recall: 0.6534 
F1 Score: 0.7148 


: 