In [40]:
# 1.1 Load and Preprocess the Dataset

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the dataset from your local directory
file_path = 'adult income.csv'
data = pd.read_csv(file_path, na_values=' ?', skipinitialspace=True)

# Display the first few rows of the dataset
print(data.head())

   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country incom

In [27]:
# Select relevant features 
selected_columns = ['age', 'workclass', 'education', 'occupation', 'relationship', 'race', 'gender', 
                    'capital-gain', 'capital-loss', 'hours-per-week', 'income']

data = data[selected_columns]

# Handling missing values by replacing them with the most frequent value
for column in data.columns:
    if data[column].isnull().sum() > 0:
        data[column].fillna(data[column].mode()[0], inplace=True)

# Encode categorical variables
categorical_columns = data.select_dtypes(include=['object']).columns.drop('income')
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])

# Convert income to binary (0 and 1)
data['income'] = data['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)

# Normalize the data
scaler = StandardScaler()
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns.drop('income')
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])


# Display the first few rows after preprocessing
print(data.head())

        age  workclass  education  occupation  relationship      race  \
0 -0.995129   0.088484  -2.397350    0.099824      0.971649 -1.971746   
1 -0.046942   0.088484   0.183660   -0.372938     -0.900852  0.392384   
2 -0.776316  -1.277432  -0.848744    1.045346     -0.900852  0.392384   
3  0.390683   0.088484   1.216063    0.099824     -0.900852 -1.971746   
4 -1.505691  -2.643348   1.216063   -1.554840      0.971649  0.392384   

    gender  capital-gain  capital-loss  hours-per-week  income  
0  0.70422     -0.144804     -0.217127       -0.034087       0  
1  0.70422     -0.144804     -0.217127        0.772930       0  
2  0.70422     -0.144804     -0.217127       -0.034087       1  
3  0.70422      0.886874     -0.217127       -0.034087       1  
4 -1.42001     -0.144804     -0.217127       -0.841104       0  


In [39]:
# 4.3 Random Forest

# Make predictions with the best random forest model
y_pred_random_forest = best_random_forest.predict(X_test)

# Calculate accuracy
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
print("Random Forest Accuracy:", accuracy_random_forest)

# Calculate other classification metrics
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_random_forest))

# Confusion matrix
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_random_forest))


Random Forest Accuracy: 0.8738867847271983
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      7479
           1       0.78      0.64      0.70      2290

    accuracy                           0.87      9769
   macro avg       0.84      0.79      0.81      9769
weighted avg       0.87      0.87      0.87      9769

Random Forest Confusion Matrix:
[[7070  409]
 [ 823 1467]]


In [28]:
# 1.2 Split the Dataset into Training and Test Sets

# Define the features and target variable
X = data.drop('income', axis=1)
y = data['income']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (39073, 10)
Test set shape: (9769, 10)


In [29]:
# 2.1 Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the logistic regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Make predictions
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate the model
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print("Logistic Regression Accuracy:", accuracy_log_reg)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log_reg))
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))

Logistic Regression Accuracy: 0.8074521445388474
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.96      0.88      7479
           1       0.69      0.32      0.44      2290

    accuracy                           0.81      9769
   macro avg       0.76      0.64      0.66      9769
weighted avg       0.79      0.81      0.78      9769

Logistic Regression Confusion Matrix:
[[7156  323]
 [1558  732]]


In [30]:
# 2.2 Decision Tree

from sklearn.tree import DecisionTreeClassifier

# Initialize and train the decision tree model
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Make predictions
y_pred_decision_tree = decision_tree.predict(X_test)

# Evaluate the model
accuracy_decision_tree = accuracy_score(y_test, y_pred_decision_tree)
print("Decision Tree Accuracy:", accuracy_decision_tree)
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_decision_tree))
print("Decision Tree Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_decision_tree))

Decision Tree Accuracy: 0.8228068379568021
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.88      7479
           1       0.62      0.61      0.62      2290

    accuracy                           0.82      9769
   macro avg       0.75      0.75      0.75      9769
weighted avg       0.82      0.82      0.82      9769

Decision Tree Confusion Matrix:
[[6631  848]
 [ 883 1407]]


In [31]:
#2.3 Random Forest
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the random forest model
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Make predictions
y_pred_random_forest = random_forest.predict(X_test)

# Evaluate the model
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
print("Random Forest Accuracy:", accuracy_random_forest)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_random_forest))
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_random_forest))

Random Forest Accuracy: 0.8586344559320299
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      7479
           1       0.72      0.65      0.68      2290

    accuracy                           0.86      9769
   macro avg       0.81      0.79      0.80      9769
weighted avg       0.85      0.86      0.86      9769

Random Forest Confusion Matrix:
[[6905  574]
 [ 807 1483]]


In [33]:
# 3.1 Logistic Regression
from sklearn.model_selection import GridSearchCV

# Define hyperparameters for logistic regression
param_grid_log_reg = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                      'penalty': ['l1', 'l2']}

# Initialize logistic regression model
log_reg = LogisticRegression(random_state=42)

# Perform grid search cross-validation
grid_search_log_reg = GridSearchCV(estimator=log_reg, param_grid=param_grid_log_reg, cv=5, scoring='accuracy')
grid_search_log_reg.fit(X_train, y_train)

# Get the best parameters and best score
best_params_log_reg = grid_search_log_reg.best_params_
best_score_log_reg = grid_search_log_reg.best_score_

print("Best Parameters for Logistic Regression:", best_params_log_reg)
print("Best Accuracy Score for Logistic Regression:", best_score_log_reg)

# Get the best model
best_log_reg = grid_search_log_reg.best_estimator_

Best Parameters for Logistic Regression: {'C': 0.01, 'penalty': 'l2'}
Best Accuracy Score for Logistic Regression: 0.8027025233675926


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ilaydadenizikendiz/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ilaydadenizikendiz/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ilaydadenizikendiz/anaconda3/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, 

In [34]:
# 3.2 Decision Tree

# Define hyperparameters for decision tree
param_grid_decision_tree = {'max_depth': [None, 10, 20, 30, 40],
                            'min_samples_split': [2, 5, 10],
                            'min_samples_leaf': [1, 2, 4]}

# Initialize decision tree model
decision_tree = DecisionTreeClassifier(random_state=42)

# Perform grid search cross-validation
grid_search_decision_tree = GridSearchCV(estimator=decision_tree, param_grid=param_grid_decision_tree, cv=5, scoring='accuracy')
grid_search_decision_tree.fit(X_train, y_train)

# Get the best parameters and best score
best_params_decision_tree = grid_search_decision_tree.best_params_
best_score_decision_tree = grid_search_decision_tree.best_score_

print("Best Parameters for Decision Tree:", best_params_decision_tree)
print("Best Accuracy Score for Decision Tree:", best_score_decision_tree)

# Get the best model
best_decision_tree = grid_search_decision_tree.best_estimator_

Best Parameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best Accuracy Score for Decision Tree: 0.8541447286650714


In [36]:
from sklearn.model_selection import RandomizedSearchCV
import time

# Define hyperparameters for random forest
param_dist_random_forest = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize random forest model
random_forest = RandomForestClassifier(random_state=42)

# Perform randomized search cross-validation
random_search_random_forest = RandomizedSearchCV(estimator=random_forest, 
                                                 param_distributions=param_dist_random_forest, 
                                                 n_iter=50,  # Number of parameter settings sampled
                                                 cv=3,       # Number of folds in cross-validation
                                                 scoring='accuracy', 
                                                 n_jobs=-1,  # Use all available CPU cores
                                                 random_state=42)

start_time = time.time()
random_search_random_forest.fit(X_train, y_train)
end_time = time.time()

# Get the best parameters and best score
best_params_random_forest = random_search_random_forest.best_params_
best_score_random_forest = random_search_random_forest.best_score_

print(f"Best Parameters for Random Forest: {best_params_random_forest}")
print(f"Best Accuracy Score for Random Forest: {best_score_random_forest:.4f}")
print(f"Random search took {end_time - start_time:.2f} seconds")

# Get the best model
best_random_forest = random_search_random_forest.best_estimator_


Best Parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': None, 'bootstrap': True}
Best Accuracy Score for Random Forest: 0.8631
Random search took 131.99 seconds


In [37]:
# 4.1 Logistic Regression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions with the best logistic regression model
y_pred_log_reg = best_log_reg.predict(X_test)

# Calculate accuracy
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print("Logistic Regression Accuracy:", accuracy_log_reg)

# Calculate other classification metrics
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log_reg))

# Confusion matrix
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))

Logistic Regression Accuracy: 0.8079639676527792
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.96      0.88      7479
           1       0.71      0.31      0.43      2290

    accuracy                           0.81      9769
   macro avg       0.76      0.63      0.66      9769
weighted avg       0.79      0.81      0.78      9769

Logistic Regression Confusion Matrix:
[[7185  294]
 [1582  708]]


In [38]:
# 4.2 Decision Tree
# Make predictions with the best decision tree model
y_pred_decision_tree = best_decision_tree.predict(X_test)

# Calculate accuracy
accuracy_decision_tree = accuracy_score(y_test, y_pred_decision_tree)
print("Decision Tree Accuracy:", accuracy_decision_tree)

# Calculate other classification metrics
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_decision_tree))

# Confusion matrix
print("Decision Tree Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_decision_tree))

Decision Tree Accuracy: 0.8632408639574163
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7479
           1       0.78      0.58      0.66      2290

    accuracy                           0.86      9769
   macro avg       0.83      0.76      0.79      9769
weighted avg       0.86      0.86      0.86      9769

Decision Tree Confusion Matrix:
[[7113  366]
 [ 970 1320]]


In [39]:
# 4.3 Random Forest

# Make predictions with the best random forest model
y_pred_random_forest = best_random_forest.predict(X_test)

# Calculate accuracy
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
print("Random Forest Accuracy:", accuracy_random_forest)

# Calculate other classification metrics
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_random_forest))

# Confusion matrix
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_random_forest))


Random Forest Accuracy: 0.8738867847271983
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      7479
           1       0.78      0.64      0.70      2290

    accuracy                           0.87      9769
   macro avg       0.84      0.79      0.81      9769
weighted avg       0.87      0.87      0.87      9769

Random Forest Confusion Matrix:
[[7070  409]
 [ 823 1467]]
