In [1]:
import pandas as pd

df = pd.read_csv('credit_scores.csv')

In [20]:
import numpy as np
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline

In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,age,education,n_yrs_employed,n_yrs_in_address,income,debt_to_income_ratio,credit_to_debit_ratio,other_debit,defaulted_loan,credit_score
0,0,41,3,17,12,176,9.3,11.359392,5.008608,1.0,575.934522
1,1,27,1,10,6,31,17.3,1.362202,4.000798,0.0,478.33496
2,2,40,1,15,14,55,5.5,0.856075,2.168925,0.0,508.423785
3,3,41,1,15,14,120,2.9,2.65872,0.82128,0.0,548.412256
4,4,24,2,2,0,28,17.3,1.787436,3.056564,1.0,469.753713


In [22]:
df.drop_duplicates(inplace=True)

In [23]:
df.isna().sum()

Unnamed: 0               0
age                      0
education                0
n_yrs_employed           0
n_yrs_in_address         0
income                   0
debt_to_income_ratio     0
credit_to_debit_ratio    0
other_debit              0
defaulted_loan           0
credit_score             0
dtype: int64

In [24]:
X, y = df.drop(columns=['credit_score']), df['credit_score']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
df.columns

Index(['Unnamed: 0', 'age', 'education', 'n_yrs_employed', 'n_yrs_in_address',
       'income', 'debt_to_income_ratio', 'credit_to_debit_ratio',
       'other_debit', 'defaulted_loan', 'credit_score'],
      dtype='object')

In [26]:
ct = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), ['age', 'n_yrs_employed', 'n_yrs_in_address', 'income']),
        ('cat', StandardScaler(), ['debt_to_income_ratio', 'credit_to_debit_ratio','other_debit']),
    ],
    remainder='passthrough', sparse_threshold=0)

In [27]:
model = Pipeline(steps=[
    ('preprocessor', ct),
    ('Linear_Regression', LinearRegression())
])

In [29]:
model.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [32]:
model.score(x_test, y_test)

1.0

In [33]:
model.predict(x_test)

array([479.2098652, 468.947166 , 482.020472 , 486.8622976, 486.75159  ,
       484.27411  , 486.5766256, 487.3789384, 544.240338 , 474.9604172,
       480.9312076, 480.872752 , 475.8133632, 491.3277732, 476.1224514,
       472.201155 , 482.527795 , 475.639    , 483.8407108, 481.093228 ,
       485.5420138, 485.7039412, 618.238924 , 481.88844  , 474.53094  ,
       494.34816  , 582.217826 , 481.957344 , 490.07728  , 498.218792 ,
       509.586088 , 483.397004 , 471.01256  , 495.827386 , 509.54604  ,
       506.7572224, 510.5334948, 478.35196  , 489.3291016, 497.11238  ,
       463.2254372, 539.80352  , 481.732036 , 481.8640192, 489.5755136,
       479.595412 , 482.1831164, 493.0311376, 473.9297852, 483.3492176,
       471.8448674, 479.8242476, 482.9939328, 470.8290976, 498.99803  ,
       475.6789302, 488.213694 , 481.1716   , 504.882824 , 488.3029388,
       475.510234 , 539.0592898, 483.07603  , 473.6607636, 542.0983952,
       512.643798 , 494.742016 , 473.6749744, 501.3786612, 478.6

In [34]:
joblib.dump(model, 'credit_score_model.pkl')  

['credit_score_model.pkl']