Task 1: Import the adult dataset from the ucimlrepo

In [None]:
#pip install ucimlrepo

In [None]:
#pip install numpy

In [None]:
#pip install pandas

In [None]:
#pip install matplotlib

In [None]:
#pip install scikit-learn

In [None]:
#importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

In [None]:
from ucimlrepo import fetch_ucirepo

# Download the adult dataset (repository id 2)
adult = fetch_ucirepo(id=2)

# Features and targets (as pandas dataframes)
X, y = adult.data.features, adult.data.targets

# Print the dataset metadata first, then the variable information
for section in (adult.metadata, adult.variables):
    print(section)


Task 2: Take a quick look at the data structure (i.e., X) using .head(), .info(), .describe(), and .shape.

In [None]:
# Preview the first five rows of the feature table
X.head(n=5)

In [None]:
# info() prints a concise summary of the feature table: column names,
# non-null counts and dtypes
X.info()

In [None]:
# describe() shows summary statistics of the numerical attributes
X.describe()


In [None]:
# (number of rows, number of columns) of the feature table
X.shape

Task 2.1: Plot a histogram of the data.

In [None]:
# Plot a histogram of the data with hist() on a 20 x 16 figure,
# then render it
X.hist(figsize=(20, 16))
plt.show()

Task 3: There are missing values in this dataset that are entered as ?, check for the number of these missing values.

In [None]:
# Per-column count of entries equal to the '?' missing-value marker
(X == '?').sum()


In [None]:
# Turn the '?' placeholders into real missing values (NaN), then re-run
# info() to show the data again
X = X.replace(to_replace='?', value=np.nan)
X.info()

Task 5: Create and apply a preprocessing pipeline to:
1. Fill in the missing numerical values with the mean using a SimpleImputer.
2. Scale the numerical columns using StandardScaler. Do not scale the target.
3. Fill in the missing categorical values with the most_frequent value using SimpleImputer.
4. Encode the categorical columns using OneHotEncoder. Do not encode the target.
- Display your pipeline.
- Print X_prepared.shape.
Tips:
- If you are facing an issue with the preprocessing pipeline producing a sparse matrix, pass a “sparse_output=False” option to the OneHotEncoder in the pipeline, i.e., OneHotEncoder(sparse_output=False)
- X_prepared.shape should be (48842, 105) at this point.

In [None]:
# Partition the feature names of X by dtype:
# numerical columns on one side, every other column on the other.
num_cols = list(X.select_dtypes(include='number').columns)
cat_cols = list(X.select_dtypes(exclude='number').columns)

print(num_cols, cat_cols, sep='\n')

# Numerical branch: fill missing values with the mean, then standardise
num_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)

# Categorical branch: fill missing values with the most frequent value,
# then one-hot encode
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(),
)

# Route each group of columns through its own branch; any column not listed
# in either group is passed through unchanged
preprocessing = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
    remainder='passthrough',
)


In [None]:
# Display the list of numerical column names
num_cols

In [None]:
# Display the preprocessing ColumnTransformer (both branches)
preprocessing

In [None]:
# Fit the preprocessing transformer on X and transform X in one step,
# then show the shape of the prepared feature matrix
X_prepared = preprocessing.fit_transform(X)
X_prepared.shape

Task 6: Check the target value_counts.

In [None]:
# Use value_counts() to get the count of each distinct target value
y.value_counts()

Task 7: Remove the period at the end of the >50K. and <=50K. i.e., replace all instances that are <=50K. with <=50K , and replace all the instances that are >50K. with >50K

In [None]:
# Strip the trailing period from both label variants in a single pass,
# then re-check the class counts
label_fixes = {'<=50K.': '<=50K', '>50K.': '>50K'}
y = y.replace(label_fixes)
y.value_counts()

Task 8: Split the data into 80% training set and 20% testing set, print the shape of X_train, X_test, y_train, y_test in one command.

In [None]:
# Hold out 20% of the prepared data as the test set; the other 80% is the
# training set (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X_prepared,
    y,
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

Task 9: Train an SVM model (SVC) on the training set to predict whether the adult's income exceeds 50K, using kernel = poly, gamma = 1, and C = 0.1. Call your model model_svm.

In [None]:
# Polynomial-kernel SVC with gamma = 1 and C = 0.1
model_svm = SVC(kernel='poly', C=0.1, gamma=1)

# X_train is produced by train_test_split(X_prepared, ...), so it is an
# array / sparse matrix rather than a DataFrame and has no `.iloc`; it is
# passed to fit() as-is. The model is fitted once, on the whole training set,
# and the target DataFrame is flattened because SVC expects a 1-D label vector.
model_svm.fit(X_train, y_train.values.ravel())

Task 9.1: Test your model on the X_Test, and report the classification_report on the y_test and y_predict.

In [None]:
# Predict on the held-out test set with the trained model_svm
# (the model was built with C = 0.1, which the header line reports)
y_predict = model_svm.predict(X_test)
print('classification_report for C = 0.1')
print(classification_report(y_test, y_predict))

Task 9.2: Display the confusion matrix of your test results using ConfusionMatrixDisplay.from_predictions(y_test, y_predict)

In [None]:
# Display the confusion matrix of the test labels against the predictions
ConfusionMatrixDisplay.from_predictions(y_test, y_predict)

Task 10: Use GridSearchCV to find the best value of kernel, gamma, and C

Task 10.1: Split the dataset into 60% training, 20% validation, and 20% testing. Use the code below to perform the split:

In [None]:
# Split the preprocessed features, not the raw X: the raw table still holds
# string categories and NaNs, which an SVC cannot be fitted on.
# X_prepared is wrapped in a DataFrame (densified first if the transformer
# returned a sparse matrix) so that the resulting splits still support
# `.iloc` row slicing.
X_prepared_dense = X_prepared.toarray() if hasattr(X_prepared, 'toarray') else X_prepared
X_features = pd.DataFrame(X_prepared_dense, index=X.index)

# hold out 40% of the data; the remaining 60% is the training set
X_train, X_validation_test, y_train, y_validation_test = train_test_split(X_features, y, test_size=0.4, random_state=42)

# split the held-out 40% in half: validation and test sets (each 20% of the original dataset)
X_validation, X_test, y_validation, y_test = train_test_split(X_validation_test, y_validation_test, test_size=0.5, random_state=42)

print(X_train.shape, y_train.shape, X_validation.shape, y_validation.shape, X_test.shape, y_test.shape)

Task 10.2: Use the below code snippet to pass the following hyperparameters for the GridSearchCV to find the best ones:

In [None]:
# code author luisguiserrano

from sklearn.model_selection import GridSearchCV

# Search grid: RBF kernel only, four values of C and three values of gamma
svm_parameters = {
    'kernel': ['rbf'],
    'C': [0.01, 0.1, 1, 10],
    'gamma': [0.01, 1, 10],
}
svm = SVC()
svm_gs = GridSearchCV(svm, svm_parameters)

# Run the search on the first 10000 training rows only, with the target
# flattened to a 1-D array
svm_gs.fit(X_train.iloc[:10000], y_train.iloc[:10000].values.ravel())

# Keep the best estimator from the search and score it on the validation set
svm_winner = svm_gs.best_estimator_
svm_winner.score(X_validation, y_validation)

Task 10.3: Check the svm winner parameters using svm_winner

In [None]:
# Display the winning SVM estimator (svm_winner) and its parameters
svm_winner