In [47]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



In [48]:
# Paths to the accepted-loans CSV and its reduced variants.
# NOTE(review): judging by the file names, "*_sample" files are presumably
# random samples and "*_head" files the first rows of the full file - confirm.

# Complete dataset.
FULL_DATASET_NAME = 'data/landing_club/accepted_2007_to_2018Q4.csv'

# Medium-sized subsets.
MEDIUM_HEAD_DATASET_NAME = 'data/landing_club/accepted_2007_to_2018Q4_medium_head.csv'
MEDIUM_SAMPLE_DATASET_NAME = 'data/landing_club/accepted_2007_to_2018Q4_medium_sample.csv'

# Small subsets.
SMALL_HEAD_DATASET_NAME = 'data/landing_club/accepted_2007_to_2018Q4_small_head.csv'
SMALL_SAMPLE_DATASET_NAME = 'data/landing_club/accepted_2007_to_2018Q4_small_sample.csv'

# The dataset selected for this run; change this single assignment to switch.
USED_DATASET_NAME = MEDIUM_HEAD_DATASET_NAME

In [49]:
# Column roles used throughout the notebook.

# Target column and the columns used to predict it.
LABEL_NAME = 'int_rate'
FEATURE_NAMES = ['loan_amnt', 'home_ownership', 'annual_inc', 'grade']

# Every column read from the CSV: the features followed by the label.
COL_NAMES = [*FEATURE_NAMES, LABEL_NAME]

# Categorical features, split into unordered (nominal) and ordered (ordinal).
NOMINAL_COLS = ['home_ownership']
ORDINAL_COLS = ['grade']
CAT_COLS = [*NOMINAL_COLS, *ORDINAL_COLS]

In [50]:
# Load a fixed window of the full CSV: the header line (file line 0) is kept,
# file lines 1..FIRST_DATA_LINE-1 are skipped, and the next N_ROWS rows are read.
# NOTE(review): USED_DATASET_NAME is defined in the configuration cell, but this
# cell reads FULL_DATASET_NAME directly - confirm which one is intended.
N_ROWS = 10000
FIRST_DATA_LINE = 20000
df = pd.read_csv(
    FULL_DATASET_NAME,
    nrows=N_ROWS,
    skiprows=range(1, FIRST_DATA_LINE),
    usecols=COL_NAMES,
    low_memory=False,
)

# Numeric columns are detected from the loaded dtypes, so the label column is
# included here whenever it is numeric.
NUMERIC_COLS = df.select_dtypes(include="number").columns
print(NUMERIC_COLS)

# Only the last expression of a cell is rendered automatically; intermediate
# summaries must be passed to display() (provided by the Jupyter kernel),
# otherwise their result is silently discarded.
display(df.describe())
df.info()
display(df.head())

# Sorted list of the distinct (non-null) grade values in the loaded window.
df.grade.value_counts().keys().sort_values().tolist()


Index(['loan_amnt', 'int_rate', 'annual_inc'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   loan_amnt       10000 non-null  float64
 1   int_rate        10000 non-null  float64
 2   grade           10000 non-null  object 
 3   home_ownership  10000 non-null  object 
 4   annual_inc      10000 non-null  float64
dtypes: float64(3), object(2)
memory usage: 390.8+ KB


['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [55]:
# One-hot encoding for the categorical columns. sparse_output=False makes the
# encoder emit a dense array, so the combined ColumnTransformer output is always
# a plain ndarray that can be handed to pd.DataFrame below. Categories not seen
# during fit are encoded as all zeros (handle_unknown="ignore").
cat_pipeline = Pipeline(
    steps=[
        ("oh_enc", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ]
)

# Sorted distinct grade values, wrapped in an outer list.
# NOTE(review): not consumed in this cell - presumably meant as the
# `categories` argument of an ordinal encoder for ORDINAL_COLS; confirm.
GRADE_VALUES = [df.grade.value_counts().keys().sort_values().tolist()]

# Categorical columns go through cat_pipeline, numeric columns are standardised.
# Previously a second, sparse OneHotEncoder was built inline here and
# cat_pipeline was left unused; the result was dense only because the overall
# density happened to exceed ColumnTransformer's sparse_threshold.
# NOTE(review): NUMERIC_COLS is passed as-is, so any label column it contains is
# standardised too, and the scaler is fitted on all rows before any train/test
# split - confirm this is intended.
pre_processor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, CAT_COLS),
        ("num", StandardScaler(), NUMERIC_COLS),
    ]
)

# Fit on the loaded frame and transform it in one pass.
prepared_np = pre_processor.fit_transform(df)

# Output column names are prefixed with the transformer name ("cat__", "num__").
feature_names = pre_processor.get_feature_names_out()

df_1 = pd.DataFrame(
    data=prepared_np,
    columns=feature_names,
)

df_1.head()


Unnamed: 0,cat__home_ownership_ANY,cat__home_ownership_MORTGAGE,cat__home_ownership_OWN,cat__home_ownership_RENT,cat__grade_A,cat__grade_B,cat__grade_C,cat__grade_D,cat__grade_E,cat__grade_F,cat__grade_G,num__loan_amnt,num__int_rate,num__annual_inc
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.242159,0.227091,-0.161681
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.682005,0.086951,0.023319
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.913046,-1.42133,-0.082869
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.499312,-0.238458,-0.041389
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.797525,0.086951,-0.331749


In [56]:
# Separate the standardised label column from the feature columns.
X_df = df_1.drop(columns=['num__int_rate'])
y_df = df_1['num__int_rate']

# Hold out 20% of the rows for testing. With shuffle=False the split keeps the
# original row order: the leading rows form the training set and the trailing
# rows the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_df.values, y_df.values, test_size=.2, shuffle=False
)
# NOTE(review): a stray "´" character that made this cell a SyntaxError was
# removed. The recorded output of this cell is a printed array, but no
# statement producing it survives in the cell - restore it if it is needed.


[-0.23845766 -1.02941478 -1.02941478 ... -0.23845766  0.22709068
 -0.58524366]
