In [None]:
# Notebook setup: install the third-party packages used below.
# pip has no `upgrade` subcommand, and the PyPI distribution is named
# `scikit-learn` (it is only *imported* as `sklearn`).
pip install imbalanced-learn
pip install --upgrade scikit-learn
pip install scikit-plot
# Importing all the libraries
import pandas as pd
from sklearn.svm import SVC
from collections import Counter
import seaborn as sns
import numpy as np
import scikitplot as skplt
import time
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import validation_curve, learning_curve


In [None]:
# Data link: https://archive.ics.uci.edu/ml/datasets/adult (download adult.data)
# The data file has no header row, so the column names are supplied here.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Please change the path below to the folder where you have downloaded the dataset.
# skipinitialspace=True drops the blank that follows each comma in the file;
# without it every text value keeps a leading space (" ?", " Private", ...),
# so exact comparisons such as the later '?' filter never match.
data = pd.read_csv(
    "/Users/imsazulfiqar/Downloads/adult.data",
    names=column_names,
    skipinitialspace=True,
)
data

# For every column, show its distinct values followed by how often each occurs.
for column_name in column_names:
    column = data[column_name]
    print(column.unique(), ' ')
    print(column.value_counts())

# 'education' carries the same information as 'education-num', and
# 'relationship' does not look useful, so both are removed in one call.
data = data.drop(columns=['education', 'relationship'])

# Drop every row in which any text column holds the '?' placeholder.
# Values are stripped before comparing, so whitespace-padded placeholders
# (e.g. " ?" when the file is read without skipinitialspace) are caught too;
# an exact isin(['?']) match would miss them.
text_columns = data.select_dtypes(exclude='number')
is_placeholder = text_columns.apply(lambda column: column.str.strip() == '?')
data = data[~is_placeholder.any(axis=1)]
data.shape

# Checking data types of each variable
data.dtypes

# Class balance: share of each income class, as a percentage of all rows.
target_percentage = data['income'].value_counts(normalize=True).mul(100)
print(target_percentage)

balance_ax = target_percentage.plot(kind='bar')
balance_ax.set_title("Distribution of target variable")
balance_ax.set_xlabel("Income")
balance_ax.set_ylabel("Percentage")
plt.show()

# The target has two values (<=50K / >50K), so it is label-encoded
# into integers in place.
encoder = LabelEncoder()
encoder.fit(data['income'])
data['income'] = encoder.transform(data['income'])

# Shows the encoding order: 0 for <=50k, 1 for >50k
encoder.classes_

# Correlation heatmap of the numeric columns against the (encoded) target.
# numeric_only=True is required: the frame still holds string columns here,
# and pandas >= 2.0 raises on them instead of silently skipping them.
plt.figure(figsize=(10, 8))
sns.heatmap(data.corr(numeric_only=True), cmap="rocket", annot=True)
plt.show()

# One histogram per numeric feature to inspect its distribution.
numerical_columns = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
data.hist(column=numerical_columns, bins=20, figsize=(15, 10))
plt.suptitle("Histograms of Numerical Columns")
plt.show()

# Remove numeric columns judged uninformative:
#  - fnlwgt shows no correlation with the other numeric values
#  - capital-gain / capital-loss are 0 for 29849 and 31042 rows respectively,
#    so they contribute little to prediction
data = data.drop(columns=['fnlwgt', 'capital-loss', 'capital-gain'])

# Work on a copy so the helper column added below does not end up in `data`.
df = data.copy()

# Plot income against age by bucketing age into ten-year ranges.
# The last bin is open-ended to match its '80+' label; with a hard upper edge
# of 89, anyone older fell outside every bin and was dropped from the plot.
age_bins = [10, 19, 29, 39, 49, 59, 69, 79, float('inf')]
age_labels = ['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80+']
df['agerange'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, include_lowest=True)

ax = sns.countplot(x="agerange", data=df, hue="income")
# The bars count people per age group and income class.
ax.set(xlabel='Age groups', ylabel='Number of People')

# ref:https://contactsunny.medium.com/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621
# Categorical features are one-hot encoded rather than label encoded: integer
# codes in a single column would suggest an ordering (0 < 1 < 2) to the model.
numerical_columns_ = ['age', 'education-num', 'hours-per-week']
categorical_columns = [
    'workclass',
    'marital-status',
    'occupation',
    'race',
    'sex',
    'native-country',
]
data = pd.get_dummies(data, columns=categorical_columns)

# Partitioning the data into X and y.
# The target is selected by name: get_dummies appends the indicator columns
# after the untouched ones, so 'income' is no longer the last column and a
# positional split would use a dummy column as the target and leak 'income'
# into the features.
X = data.drop(columns='income')
y = data['income']

# Splitting the dataset into training and testing sets (before scaling).
# stratify keeps the class ratio equal in both parts; random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=10
)
# Explicit copies so the column assignments below do not write into views of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Scaling the numerical features: the scaler is fitted on the training part
# only and then applied to the test part, so no test-set statistics leak
# into training.
scaler = StandardScaler()
X_train[numerical_columns_] = scaler.fit_transform(X_train[numerical_columns_])
X_test[numerical_columns_] = scaler.transform(X_test[numerical_columns_])
X_train

len(y_test)

# ref: https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.RandomUnderSampler.html
# The dataset is imbalanced, so the majority class of the TRAINING set is
# randomly under-sampled until both classes have the same size
# (sampling_strategy=1.0 is the minority/majority ratio after resampling).
# random_state makes the drawn subset, and everything trained on it, repeatable.
rus = RandomUnderSampler(sampling_strategy=1.0, random_state=10)
X_res, y_res = rus.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % len(y_res))

# ref: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# Defining the MLP model; 10% of the training data is held out internally
# for early stopping.
mlp = MLPClassifier(validation_fraction=0.1, early_stopping=True)

# Hyperparameters that apply to every solver.
shared_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (50, 50, 50), (100, 100, 100)],
    'alpha': [0.0001, 0.001, 0.01],
    'activation': ['logistic', 'relu', 'tanh'],
}

# 'learning_rate' and 'momentum' are only used by the 'sgd' solver. Crossing
# them with 'adam' in a single grid fitted six identical adam models per
# setting, so the grid is split per solver (648 -> 378 candidates).
parameters = [
    {**shared_grid, 'solver': ['adam']},
    {
        **shared_grid,
        'solver': ['sgd'],
        'learning_rate': ['adaptive', 'invscaling'],
        'momentum': [0.1, 0.5, 1],
    },
]

# Performing grid search with 5 fold cross-validation; refit=True retrains the
# best candidate on all of X_res and exposes it as best_estimator_.
grid_search_mlp = GridSearchCV(mlp, parameters, cv=5, scoring='accuracy', verbose=3, refit=True)

# Timing the fit with a monotonic clock meant for measuring durations.
start_time = time.perf_counter()

# Fitting the model on the random sampled data
grid_search_mlp.fit(X_res, y_res)

# Ending the time and calculating the time elapsed
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time)

# ref: https://stackoverflow.com/questions/41475539/using-best-params-from-gridsearchcv
# Printing the best hyperparameters chosen
print("Best hyperparameters:", grid_search_mlp.best_params_)

# Use the model GridSearchCV already refitted on X_res (refit=True).
# Rebuilding it from best_params_ alone dropped the early_stopping /
# validation_fraction settings the search was run with, so the final model
# did not match the one that was selected.
mlp = grid_search_mlp.best_estimator_
y_pred = mlp.predict(X_test)

# Printing the loss computed with the loss function
print('Loss: ', mlp.loss_)

# Plotting the training loss recorded at each iteration
plt.figure()
plt.plot(mlp.loss_curve_)
plt.title("Loss Curve (MLP)")
plt.xlabel("Number of iterations")
plt.ylabel("Loss")
plt.show()

# ref:https://campus.datacamp.com/courses/hyperparameter-tuning-in-python/grid-search?ex=10
# Mean cross-validation score of every hyperparameter combination tried.
mean_test_scores = grid_search_mlp.cv_results_['mean_test_score']

# Candidate indices ordered from best to worst mean score.
sorted_indices = np.flip(np.argsort(mean_test_scores))

# Hyperparameter dicts of the 18 best-scoring candidates.
params_list = grid_search_mlp.cv_results_['params']
top_18_hyperparams = [params_list[rank] for rank in sorted_indices[:18]]

# Refit each of the top hyperparameter sets on the resampled training data
# and record its accuracy on X_test.
# NOTE: these are test-set accuracies; the variable keeps its original name.
#
# The loop uses its own names (`candidate`, `candidate_pred`). Reusing `mlp`
# and `y_pred` left them bound to the LAST (lowest-ranked) candidate, so the
# validation curve, ROC curve, confusion matrix and report further down were
# produced for that model instead of the best one.
# Candidates start from the base estimator's settings so they are trained the
# same way as during the grid search.
base_mlp_params = grid_search_mlp.estimator.get_params()

validation_accuracies = []
for hyperparams in top_18_hyperparams:
    candidate = MLPClassifier(**{**base_mlp_params, **hyperparams})
    candidate.fit(X_res, y_res)
    candidate_pred = candidate.predict(X_test)
    validation_accuracies.append(accuracy_score(y_test, candidate_pred) * 100)

# Combining the hyperparameters and their accuracies in a list
results = list(zip(top_18_hyperparams, validation_accuracies))
results

# ref:https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
# Validation curve for the current MLP over alpha (the regularization
# constant), sampled at nine log-spaced values from 1e-5 to 1e3.
param_range = np.logspace(-5, 3, num=9)

# 5-fold cross-validated training / validation scores for each alpha.
train_scores, val_scores = validation_curve(
    mlp,
    X_res,
    y_res,
    param_name="alpha",
    param_range=param_range,
    cv=5,
)

# Average across the folds to get one score per alpha value.
train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

# Both curves share a logarithmic x axis so the behaviour at very small
# alpha values stays visible.
curves = (
    (train_mean, "Training score", "darkorange"),
    (val_mean, "Validation score", "navy"),
)
for curve_values, curve_label, curve_color in curves:
    plt.semilogx(param_range, curve_values, label=curve_label, color=curve_color)
plt.title("Validation Curve for MLP")
plt.xlabel("Alpha")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# ref: https://scikit-plot.readthedocs.io/en/stable/metrics.html
# --- MLP: evaluation on the test data ---

# Overall accuracy of the current predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# ROC curve from the predicted class probabilities
Predict = mlp.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, Predict)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()

# Per-class precision / recall / F1
mlp_report = classification_report(y_test, y_pred)
print(mlp_report)

# ref: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
# ref: https://stackoverflow.com/questions/30972029/how-does-the-class-weight-parameter-in-scikit-learn-work
# Defining the SVM model; class_weight='balanced' helps with the imbalanced
# class problem.
svm = SVC(random_state=10, class_weight='balanced')

# Making a grid of the hyperparameters to tune
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'tol': [1e-4, 1e-3, 1e-2],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

# Performing grid search with 5 fold cross-validation
grid_search_svm = GridSearchCV(
    svm,
    param_grid,
    cv=5,
    scoring='accuracy',
    refit=True,
    verbose=4,
    return_train_score=True,
)

# Timing the fit with a monotonic clock meant for measuring durations.
start_time = time.perf_counter()

# Fitting the model on the random sampled data
grid_search_svm.fit(X_res, y_res)

# Ending the time and calculating the time elapsed.
# It is printed like the MLP timing; previously it was computed but never shown.
end_time = time.perf_counter()
elapsed_time_svm = end_time - start_time
print("Elapsed time:", elapsed_time_svm)

# Printing the best hyperparameters chosen
print("Best hyperparameters:", grid_search_svm.best_params_)

# Fitting SVM on the best hyperparameters.
# Start from the settings of the estimator that was searched
# (random_state, class_weight, ...) and override them with the winning
# values; building SVC from best_params_ alone dropped those base settings.
# probability=True is switched on because predict_proba is needed later for
# the ROC curve.
best_svm_params = {
    **grid_search_svm.estimator.get_params(),
    **grid_search_svm.best_params_,
}
best_svm_params['probability'] = True
svm = SVC(**best_svm_params)
svm.fit(X_res, y_res)
y_pred = svm.predict(X_test)

# Mean cross-validation score of every SVM hyperparameter combination tried.
mean_test_scores = grid_search_svm.cv_results_['mean_test_score']

# Candidate indices ordered from best to worst mean score.
sorted_indices = np.flip(np.argsort(mean_test_scores))

# Hyperparameter dicts of the 18 best-scoring candidates.
params_list = grid_search_svm.cv_results_['params']
top_18_hyperparams = [params_list[rank] for rank in sorted_indices[:18]]

# Refit each of the top hyperparameter sets on the resampled training data
# and record its accuracy on X_test.
# NOTE: these are test-set accuracies; the variable keeps its original name.
#
# The loop uses its own names (`candidate`, `candidate_pred`). Reusing `svm`
# and `y_pred` left them bound to the LAST (lowest-ranked) candidate, so the
# validation curve, ROC curve, confusion matrix, report and decision-function
# histogram further down were produced for that model instead of the best one.
# Candidates start from the searched estimator's settings (random_state,
# class_weight). Only predict() is called here, so probability estimates are
# not requested.
base_svm_params = grid_search_svm.estimator.get_params()

validation_accuracies = []
for hyperparams in top_18_hyperparams:
    candidate = SVC(**{**base_svm_params, **hyperparams})
    candidate.fit(X_res, y_res)
    candidate_pred = candidate.predict(X_test)
    validation_accuracies.append(accuracy_score(y_test, candidate_pred) * 100)

# Combining the hyperparameters and their accuracies in a list
results = list(zip(top_18_hyperparams, validation_accuracies))
results

# Validation curve for the current SVM over C (the regularization constant),
# sampled at eight log-spaced values from 1e-6 to 1e1.
param_range = np.logspace(-6, 1, num=8)

# 5-fold cross-validated training / validation scores for each C.
train_scores, test_scores = validation_curve(
    svm,
    X_res,
    y_res,
    param_name='C',
    param_range=param_range,
    cv=5,
)

# Average across the folds to get one score per C value.
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)

# Both curves share a logarithmic x axis so the behaviour at very small
# C values stays visible.
curves = (
    (train_mean, 'Training score', 'blue'),
    (test_mean, 'Cross-validation score', 'red'),
)
for curve_values, curve_label, curve_color in curves:
    plt.semilogx(param_range, curve_values, label=curve_label, color=curve_color)
plt.title('Validation curve for SVM')
plt.xlabel('C')
plt.ylabel('Score')
plt.legend(loc='best')
plt.show()

# --- SVM: evaluation on the test data ---

# Overall accuracy of the current predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# ROC curve from the predicted class probabilities
Predict = svm.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, Predict)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()

# Per-class precision / recall / F1
svm_report = classification_report(y_test, y_pred)
print(svm_report)

# Histogram of the SVM decision-function values on the test set
# (the sign of each value determines the predicted class).
decision_values = svm.decision_function(X_test)
plt.hist(x=decision_values, bins=50)
plt.xlabel("Decision Function Value")
plt.ylabel("Frequency")
plt.show()