# Credit Scoring Model

## Develop a credit scoring model to predict the creditworthiness of individuals based on historical financial data. Utilize classification algorithms and assess the model's accuracy.

## Import Libraries and Load Data

In [1]:
# Import necessary libraries
import pandas as pd

# Load the datasets
train_data = pd.read_csv('CreditScore_train.csv')
test_data = pd.read_csv('CreditScore_test.csv')

# Display the column names and first few rows of the datasets
print("Training Data Columns and First Few Rows:")
print(train_data.columns)
print(train_data.head())

print("\nTesting Data Columns and First Few Rows:")
print(test_data.columns)
print(test_data.head())


Training Data Columns and First Few Rows:
Index(['x001', 'x002', 'x003', 'x004', 'x005', 'x006', 'x007', 'x008', 'x009',
       'x010',
       ...
       'x296', 'x297', 'x298', 'x299', 'x300', 'x301', 'x302', 'x303', 'x304',
       'y'],
      dtype='object', length=305)
      x001   x002  x003   x004   x005  x006  x007  x008  x009  x010  ...  \
0  1084094  426.0  39.0  128.0  426.0     0     0     0     0     0  ...   
1  1287777  160.0   2.0   64.0  160.0     1     1     2     0     1  ...   
2  1483016  163.0  16.0  104.0  239.0     0     0     0     1     0  ...   
3   959054    NaN   NaN    NaN  102.0     0     0     0     0     0  ...   
4  1342113    3.0   2.0    2.0   62.0     0     2     2     0     0  ...   

    x296    x297  x298  x299  x300  x301  x302  x303  x304    y  
0      0     NaN     0     0     0     0   NaN     0   NaN  807  
1  17318  0.8417     1     1     1     0   NaN     0   NaN  819  
2      0     NaN     0     0     0     0   NaN     0   NaN  803  
3     

#### This block imports the necessary libraries, loads the training and testing datasets, and displays the first few rows of each dataset.

## Data Preprocessing

In [2]:
# Check the column names and adjust accordingly
target_column = 'y'  # Credit score column

# Separate features and target variable from training data
X_train = train_data.drop(target_column, axis=1)
y_train = train_data[target_column]

# Separate features and target variable from testing data
X_test = test_data.drop(target_column, axis=1)
y_test = test_data[target_column]

# Handling missing values by filling with mean
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

# Standardize the feature variables
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for future use
import joblib
joblib.dump(scaler, 'scaler.joblib')


['scaler.joblib']

#### This block separates the features and target variable from the training and testing datasets based on the correct target column name. It handles missing values by filling them with the mean of the columns. It then standardizes the feature variables using StandardScaler and saves the scaler for future use.

## Train the Model

In [3]:
# Initialize the Linear Regression model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Train the model on the training data
model.fit(X_train_scaled, y_train)

# Save the trained model
joblib.dump(model, 'credit_scoring_model.joblib')


['credit_scoring_model.joblib']

#### This block initializes a Linear Regression model and trains it on the standardized training data. The trained model is then saved using joblib.

## Evaluate the Model

In [4]:
# Load the trained model and scaler
loaded_model = joblib.load('credit_scoring_model.joblib')
loaded_scaler = joblib.load('scaler.joblib')

# Predict the credit scores on the test data
y_pred_scores = loaded_model.predict(X_test_scaled)

# Define a threshold to determine creditworthiness
threshold = 700  # Example threshold, can be adjusted

# Determine creditworthiness based on the threshold
y_pred_creditworthiness = ['Good' if score >= threshold else 'Bad' for score in y_pred_scores]

# Calculate accuracy and print evaluation metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
y_test_creditworthiness = ['Good' if score >= threshold else 'Bad' for score in y_test]

accuracy = accuracy_score(y_test_creditworthiness, y_pred_creditworthiness)
conf_matrix = confusion_matrix(y_test_creditworthiness, y_pred_creditworthiness)
class_report = classification_report(y_test_creditworthiness, y_pred_creditworthiness)

print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Accuracy: 0.93155

Confusion Matrix:
[[13876   489]
 [  880  4755]]

Classification Report:
              precision    recall  f1-score   support

         Bad       0.94      0.97      0.95     14365
        Good       0.91      0.84      0.87      5635

    accuracy                           0.93     20000
   macro avg       0.92      0.90      0.91     20000
weighted avg       0.93      0.93      0.93     20000



#### This block loads the trained model and scaler, predicts the credit scores on the test data, determines creditworthiness based on a defined threshold, calculates the accuracy, and prints the evaluation metrics including the confusion matrix and classification report.

## Save the Results

In [5]:
# Save the predictions and actual values in a DataFrame
results = pd.DataFrame({
    'Actual Credit Score': y_test,
    'Predicted Credit Score': y_pred_scores,
    'Actual Creditworthiness': y_test_creditworthiness,
    'Predicted Creditworthiness': y_pred_creditworthiness
})

# Save the results to a CSV file
results.to_csv('credit_scoring_results.csv', index=False)

print("Results saved to 'credit_scoring_results.csv'")


Results saved to 'credit_scoring_results.csv'
