In [1]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
import joblib
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Load the dataset
file_path = '/kaggle/input/dataset-1/credit_scores.csv'  # Adjust the path as necessary
df = pd.read_csv(file_path)

# Display the first few rows
df.head()


Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Score,Count_Auto Loan,Count_Credit-Builder Loan,Count_Personal Loan,Count_Home Equity Loan,Count_Not Specified,Count_Mortgage Loan,Count_Student Loan,Count_Debt Consolidation Loan,Count_Payday Loan
0,0x1608,CUS_0xd40,July,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.12,1824.843333,3.0,...,Good,1.0,1.0,1.0,1.0,0.0,0,0,0,0
1,0x160f,CUS_0x21b1,February,Rick Rothackerj,28.0,004-07-5839,Teacher,34847.84,3037.986667,2.0,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0
2,0x1612,CUS_0x21b1,May,Rick Rothackerj,28.0,004-07-5839,Teacher,34847.84,3037.986667,2.0,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0
3,0x1613,CUS_0x21b1,June,Rick Rothackerj,28.0,004-07-5839,Teacher,34847.84,3037.986667,2.0,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0
4,0x1615,CUS_0x21b1,August,Rick Rothackerj,28.0,004-07-5839,Teacher,34847.84,3037.986667,2.0,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0


In [3]:
# Drop specified columns
df.drop(columns=["Name", "SSN", "ID", "Customer_ID"], inplace=True)

# Define target variable
target = 'Credit_Score'
X = df.drop(columns=[target])
y = df[target]


In [4]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [5]:
# Select numerical and categorical columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Define numerical transformer
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define categorical transformer
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

# Apply preprocessing
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [6]:
# Sample a portion of the dataset
sample_frac = 0.1  
df_sample = df.sample(frac=sample_frac, random_state=1)

# Define target and features on the sampled data
X_sample = df_sample.drop(columns=[target])
y_sample = df_sample[target]

# Split the sampled data
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=1)

# Select numerical and categorical columns
num_cols = X_sample.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_sample.select_dtypes(include=['object']).columns

# Define numerical transformer
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define categorical transformer
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

# Apply preprocessing on the sampled data
X_train_sample = preprocessor.fit_transform(X_train_sample)
X_test_sample = preprocessor.transform(X_test_sample)

# Print the shape of the processed sample data
print(f"Processed sample data shapes - X_train_sample: {X_train_sample.shape}, X_test_sample: {X_test_sample.shape}")


Processed sample data shapes - X_train_sample: (2672, 59), X_test_sample: (668, 59)


In [7]:
# Define the model
svm = SVC()

# Define hyperparameters
param_grid = {
    'kernel': ['rbf', 'linear'],
    'C': [0.01, 10, 20]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(svm, param_grid, cv=3) 
grid_search.fit(X_train_sample, y_train_sample)

# Get the best model
best_model = grid_search.best_estimator_

# Report the best hyperparameters
print(f"Best hyperparameters: {grid_search.best_params_}")


Best hyperparameters: {'C': 10, 'kernel': 'rbf'}


In [8]:
# Predict on the test set
y_pred_sample = best_model.predict(X_test_sample)

# Evaluate the accuracy
accuracy_sample = accuracy_score(y_test_sample, y_pred_sample)
print(f"Test Accuracy on Sampled Data: {accuracy_sample:.2f}")


Test Accuracy on Sampled Data: 0.61


In [9]:
# Retrain the model on the entire dataset
best_model.fit(preprocessor.fit_transform(X), y)
# Save the model
joblib.dump(best_model, 'best_model.pkl')

['best_model.pkl']