## 1) Prepare a classification model using SVM for salary data 

Data Description:

age -- age of a person
workclass	-- A work class is a grouping of work 
education	-- Education of an individual
maritalstatus -- Marital status of an individual
occupation	 -- Occupation of an individual
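relationship -- Family relationship of an individual (description missing in the original; confirm against the dataset)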
race --  Race of an Individual
sex --  Gender of an Individual
capitalgain --  profit received from the sale of an investment	
capitalloss	-- A decrease in the value of a capital asset
hoursperweek -- Number of hours worked per week
native -- Native country of an individual
Salary -- salary of an individual


In [None]:
#Import Libraries 
# Import necessary libraries for data manipulation and visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Import libraries for machine learning
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [None]:
#Load and Preprocess Training Data
# Load the training dataset from a CSV file (path is relative to the working directory)
train_data = pd.read_csv('SalaryData_Train(1).csv')

# Preview the first few rows of the raw dataset
train_data.head()

# Remove duplicate rows. drop_duplicates() returns a filtered frame that pandas
# may flag as a copy of `train_data`; taking an explicit copy makes
# `train_data_clean` an independent DataFrame, so later column assignments
# do not raise SettingWithCopyWarning.
train_data_clean = train_data.drop_duplicates().copy()

# Print column names, non-null counts and data types of the cleaned dataset
train_data_clean.info()

In [None]:
#Encode Categorical Variables
# LabelEncoder converts the labels of a column into integer codes
from sklearn.preprocessing import LabelEncoder

# One encoder instance, re-fitted for each column in turn
encoder = LabelEncoder()

# Categorical columns to encode (the 'Salary' target is encoded as well)
categorical_columns = [
    'workclass', 'education', 'maritalstatus', 'occupation',
    'relationship', 'race', 'sex', 'native', 'Salary',
]
for column in categorical_columns:
    encoded_values = encoder.fit_transform(train_data_clean[column])
    train_data_clean[column] = encoded_values

# Show the first few rows of the encoded dataset
train_data_clean.head()

In [None]:
#Visualize Data
# Pairwise plots of all columns, with points colored by the 'Salary' column
sns.pairplot(data=train_data_clean, hue='Salary')

# Correlation matrix of the encoded columns, drawn as an annotated heatmap
correlation_matrix = train_data_clean.corr()
plt.figure(figsize=(18, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.show()

In [None]:
#Load and Preprocess Test Data
# Load the test dataset from Google Drive
test_data = pd.read_csv('/content/drive/MyDrive/Assignment DS/Assignment 17/SalaryData_Test(1).csv')

# Remove duplicate rows; copy so the column assignments below operate on an
# independent DataFrame and do not raise SettingWithCopyWarning
test_data_clean = test_data.drop_duplicates().copy()

# Encode categorical columns with the mapping learned from the TRAINING labels.
# Re-fitting the encoder on the test column would assign different integer codes
# whenever the test set lacks (or adds) a category, so the model would see
# features that mean something else than during training.
# `train_data` still holds the raw string labels, so the encoder is fitted on it
# and the test column is only transformed. transform() raises ValueError if the
# test data contains a label that never occurs in the training data.
for column in ['workclass', 'education', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'native', 'Salary']:
    encoder.fit(train_data[column])
    test_data_clean[column] = encoder.transform(test_data_clean[column])

# Display the first few rows of the encoded test dataset
test_data_clean.head()

In [None]:
#Split Features and Labels
# Name of the label column that is separated from the features
target_column = 'Salary'

# Training set: labels are the target column, features are all remaining columns
y_train = train_data_clean[target_column]
X_train = train_data_clean.drop(columns=target_column)

# Test set: same separation of labels and features
y_test = test_data_clean[target_column]
X_test = test_data_clean.drop(columns=target_column)

# Shapes of the resulting feature matrices and label vectors
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
#Train and Evaluate SVM with Different Kernels
# Train and evaluate SVM with RBF kernel.
# SVMs are not scale invariant, so the features are standardized first. The
# scaler lives in the same pipeline as the classifier: it is fitted on the
# training data only and then applied unchanged when predicting on the test data.
svm_rbf = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
svm_rbf.fit(X_train, y_train)
predictions_rbf = svm_rbf.predict(X_test)
print("RBF Kernel Accuracy:", accuracy_score(y_test, predictions_rbf))
print(confusion_matrix(y_test, predictions_rbf))

In [None]:
# Train and evaluate SVM with Linear kernel.
# SVMs are not scale invariant, so the features are standardized first. The
# scaler lives in the same pipeline as the classifier: it is fitted on the
# training data only and then applied unchanged when predicting on the test data.
svm_linear = make_pipeline(StandardScaler(), SVC(kernel='linear'))
svm_linear.fit(X_train, y_train)
predictions_linear = svm_linear.predict(X_test)
print("Linear Kernel Accuracy:", accuracy_score(y_test, predictions_linear))
print(confusion_matrix(y_test, predictions_linear))

In [None]:
# Train and evaluate SVM with Polynomial kernel.
# SVMs are not scale invariant, so the features are standardized first. The
# scaler lives in the same pipeline as the classifier: it is fitted on the
# training data only and then applied unchanged when predicting on the test data.
svm_poly = make_pipeline(StandardScaler(), SVC(kernel='poly'))
svm_poly.fit(X_train, y_train)
predictions_poly = svm_poly.predict(X_test)
print("Polynomial Kernel Accuracy:", accuracy_score(y_test, predictions_poly))
print(confusion_matrix(y_test, predictions_poly))

In [None]:
# Train and evaluate SVM with Sigmoid kernel.
# SVMs are not scale invariant, so the features are standardized first. The
# scaler lives in the same pipeline as the classifier: it is fitted on the
# training data only and then applied unchanged when predicting on the test data.
svm_sigmoid = make_pipeline(StandardScaler(), SVC(kernel='sigmoid'))
svm_sigmoid.fit(X_train, y_train)
predictions_sigmoid = svm_sigmoid.predict(X_test)
print("Sigmoid Kernel Accuracy:", accuracy_score(y_test, predictions_sigmoid))
print(confusion_matrix(y_test, predictions_sigmoid))

In [None]:
#Hyperparameter Tuning with GridSearchCV
# Define parameter grid for RBF kernel
param_grid_rbf = {'kernel': ['rbf'], 'gamma': [50, 5, 0.5], 'C': [15, 12, 7, 3, 0.1, 0.0001]}

# SVMs are not scale invariant, so standardize the features. The scaler is part
# of the estimator handed to GridSearchCV, so every cross-validation fold fits
# it on that fold's training split only.
pipeline_rbf = make_pipeline(StandardScaler(), SVC())

# Initialize GridSearchCV for RBF kernel. Pipeline parameters are addressed as
# '<step>__<param>'; make_pipeline names the SVC step 'svc'.
grid_rbf = GridSearchCV(
    pipeline_rbf,
    {f'svc__{name}': values for name, values in param_grid_rbf.items()},
    cv=10,
)
grid_rbf.fit(X_train, y_train)

# Display best parameters (keys carry the 'svc__' prefix) and accuracy
print("Best Parameters for RBF Kernel:", grid_rbf.best_params_)
print("Best Cross-Validation Accuracy for RBF Kernel:", grid_rbf.best_score_)

# Evaluate the best model (scaler + SVC) on the test set
best_rbf = grid_rbf.best_estimator_
test_accuracy_rbf = accuracy_score(y_test, best_rbf.predict(X_test))
print("Test Accuracy with Best RBF Kernel:", test_accuracy_rbf)

In [None]:
# Define parameter grid for Linear kernel
param_grid_linear = {'kernel': ['linear'], 'C': [15, 11, 7, 2, 0.1, 0.0001]}

# SVMs are not scale invariant, so standardize the features. The scaler is part
# of the estimator handed to GridSearchCV, so every cross-validation fold fits
# it on that fold's training split only.
pipeline_linear = make_pipeline(StandardScaler(), SVC())

# Initialize GridSearchCV for Linear kernel. Pipeline parameters are addressed as
# '<step>__<param>'; make_pipeline names the SVC step 'svc'.
grid_linear = GridSearchCV(
    pipeline_linear,
    {f'svc__{name}': values for name, values in param_grid_linear.items()},
    cv=10,
)
grid_linear.fit(X_train, y_train)

# Display best parameters (keys carry the 'svc__' prefix) and accuracy
print("Best Parameters for Linear Kernel:", grid_linear.best_params_)
print("Best Cross-Validation Accuracy for Linear Kernel:", grid_linear.best_score_)

# Evaluate the best model (scaler + SVC) on the test set
best_linear = grid_linear.best_estimator_
test_accuracy_linear = accuracy_score(y_test, best_linear.predict(X_test))
print("Test Accuracy with Best Linear Kernel:", test_accuracy_linear)

In [None]:
# Define parameter grid for Polynomial kernel
param_grid_poly = {'kernel': ['poly'], 'gamma': [45, 5, 0.5, 0.1], 'C': [15, 10, 5, 3, 0.1, 0.0001]}

# SVMs are not scale invariant, so standardize the features. The scaler is part
# of the estimator handed to GridSearchCV, so every cross-validation fold fits
# it on that fold's training split only.
pipeline_poly = make_pipeline(StandardScaler(), SVC())

# Initialize GridSearchCV for Polynomial kernel. Pipeline parameters are addressed
# as '<step>__<param>'; make_pipeline names the SVC step 'svc'.
grid_poly = GridSearchCV(
    pipeline_poly,
    {f'svc__{name}': values for name, values in param_grid_poly.items()},
    cv=10,
)
grid_poly.fit(X_train, y_train)

# Display best parameters (keys carry the 'svc__' prefix) and accuracy
print("Best Parameters for Polynomial Kernel:", grid_poly.best_params_)
print("Best Cross-Validation Accuracy for Polynomial Kernel:", grid_poly.best_score_)

# Evaluate the best model (scaler + SVC) on the test set
best_poly = grid_poly.best_estimator_
test_accuracy_poly = accuracy_score(y_test, best_poly.predict(X_test))
print("Test Accuracy with Best Polynomial Kernel:", test_accuracy_poly)

In [None]:
# Define parameter grid for Sigmoid kernel
param_grid_sigmoid = {'kernel': ['sigmoid'], 'gamma': [50, 5, 0.5], 'C': [15, 12, 7, 0.1, 0.0001]}

# SVMs are not scale invariant, so standardize the features. The scaler is part
# of the estimator handed to GridSearchCV, so every cross-validation fold fits
# it on that fold's training split only.
pipeline_sigmoid = make_pipeline(StandardScaler(), SVC())

# Initialize GridSearchCV for Sigmoid kernel. Pipeline parameters are addressed as
# '<step>__<param>'; make_pipeline names the SVC step 'svc'.
grid_sigmoid = GridSearchCV(
    pipeline_sigmoid,
    {f'svc__{name}': values for name, values in param_grid_sigmoid.items()},
    cv=10,
)
grid_sigmoid.fit(X_train, y_train)

# Display best parameters (keys carry the 'svc__' prefix) and accuracy
print("Best Parameters for Sigmoid Kernel:", grid_sigmoid.best_params_)
print("Best Cross-Validation Accuracy for Sigmoid Kernel:", grid_sigmoid.best_score_)

# Evaluate the best model (scaler + SVC) on the test set
best_sigmoid = grid_sigmoid.best_estimator_
test_accuracy_sigmoid = accuracy_score(y_test, best_sigmoid.predict(X_test))
print("Test Accuracy with Best Sigmoid Kernel:", test_accuracy_sigmoid)