In [13]:
# Step 1: Load the dataset
import pandas as pd
data = pd.read_csv('loan_data.csv')
data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


# Laboratory Exercise 2 - SVM Implementation

Steps:
1. Load the dataset
2. Identify the shape of your data
3. Enlist the column names in your dataset
4. Establish X and Y Matrix
5. Perform 70/30 Data Split
6. Provide data dimension (train and test)
7. Define the Model
8. Build the training model
9. Perform prediction on test data
10. Print Model Performance

In [14]:
# Step 2: Identify the shape of your data
data.shape

(9578, 14)

In [15]:
# Step 3: Enlist the column names in your dataset
data.columns.tolist()

['credit.policy',
 'purpose',
 'int.rate',
 'installment',
 'log.annual.inc',
 'dti',
 'fico',
 'days.with.cr.line',
 'revol.bal',
 'revol.util',
 'inq.last.6mths',
 'delinq.2yrs',
 'pub.rec',
 'not.fully.paid']

In [16]:
# Step 4: Establish X and Y Matrix
X = data.drop('not.fully.paid', axis=1)  # Features
Y = data['not.fully.paid']  # Target
print('Features shape:', X.shape)
print('Target shape:', Y.shape)
print('\nFeature columns:', X.columns.tolist())
print('\nTarget values:', Y.value_counts())

Features shape: (9578, 13)
Target shape: (9578,)

Feature columns: ['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths', 'delinq.2yrs', 'pub.rec']

Target values: not.fully.paid
0    8045
1    1533
Name: count, dtype: int64


In [17]:
# Step 5: Perform 70/30 Data Split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [18]:
# Step 6: Provide data dimension (train and test)
print('Training data shape:', X_train.shape, Y_train.shape)
print('Testing data shape:', X_test.shape, Y_test.shape)

Training data shape: (6704, 13) (6704,)
Testing data shape: (2874, 13) (2874,)


In [19]:
# Step 6.1: Data Preprocessing (Handle categorical variables and scaling)
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd

# Check data types
print("Data types:")
print(X_train.dtypes)
print("\nCategorical columns:", X_train.select_dtypes(include=['object']).columns.tolist())

# Handle categorical variables using Label Encoding
le = LabelEncoder()
X_train_processed = X_train.copy()
X_test_processed = X_test.copy()

# Encode categorical columns
categorical_columns = X_train.select_dtypes(include=['object']).columns
for col in categorical_columns:
    # Fit on training data and transform both train and test
    X_train_processed[col] = le.fit_transform(X_train[col].astype(str))
    X_test_processed[col] = le.transform(X_test[col].astype(str))

# Scale the features (important for SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)

print("\nAfter preprocessing:")
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)

Data types:
credit.policy          int64
purpose               object
int.rate             float64
installment          float64
log.annual.inc       float64
dti                  float64
fico                   int64
days.with.cr.line    float64
revol.bal              int64
revol.util           float64
inq.last.6mths         int64
delinq.2yrs            int64
pub.rec                int64
dtype: object

Categorical columns: ['purpose']

After preprocessing:
X_train_scaled shape: (6704, 13)
X_test_scaled shape: (2874, 13)


In [20]:
# Step 7: Define the Model
from sklearn.svm import SVC
Llatuna = SVC()

In [21]:
# Step 8: Build the training model (using preprocessed data)
Llatuna.fit(X_train_scaled, Y_train)
print("Model training completed successfully!")

Model training completed successfully!


In [22]:
# Step 9: Perform prediction on test data (using preprocessed data)
Y_pred = Llatuna.predict(X_test_scaled)
print("First 10 predictions:", Y_pred[:10])
print("Prediction completed!")

First 10 predictions: [0 0 0 0 0 0 0 0 0 0]
Prediction completed!


In [23]:
# Step 10: Print Model Performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print('Accuracy:', accuracy_score(Y_test, Y_pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, Y_pred))
print('Classification Report:\n', classification_report(Y_test, Y_pred))

Accuracy: 0.8389004871259569
Confusion Matrix:
 [[2405    3]
 [ 460    6]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91      2408
           1       0.67      0.01      0.03       466

    accuracy                           0.84      2874
   macro avg       0.75      0.51      0.47      2874
weighted avg       0.81      0.84      0.77      2874

