In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns

In [3]:
df = pd.read_csv("s3://german-credit-20221001/datos/original/german_credit_data.csv")

In [4]:
df

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,996,40,male,3,own,little,little,3857,30,car,good
997,997,38,male,2,own,little,,804,12,radio/TV,good
998,998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   Age               1000 non-null   int64 
 2   Sex               1000 non-null   object
 3   Job               1000 non-null   int64 
 4   Housing           1000 non-null   object
 5   Saving accounts   817 non-null    object
 6   Checking account  606 non-null    object
 7   Credit amount     1000 non-null   int64 
 8   Duration          1000 non-null   int64 
 9   Purpose           1000 non-null   object
 10  Risk              1000 non-null   object
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


In [6]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,35.546,1.904,3271.258,20.903
std,288.819436,11.375469,0.653614,2822.736876,12.058814
min,0.0,19.0,0.0,250.0,4.0
25%,249.75,27.0,2.0,1365.5,12.0
50%,499.5,33.0,2.0,2319.5,18.0
75%,749.25,42.0,2.0,3972.25,24.0
max,999.0,75.0,3.0,18424.0,72.0


In [7]:
df["Risk"].value_counts()

good    700
bad     300
Name: Risk, dtype: int64

In [8]:
train, test = train_test_split(df, test_size=0.3, random_state=42, stratify=df["Risk"])

In [9]:
train["Risk"].value_counts()/train["Risk"].count()

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [10]:
test["Risk"].value_counts()/test["Risk"].count()

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [11]:
train.to_csv("s3://german-credit-20221001/datos/train/train.csv", index=False)

In [12]:
test.to_csv("s3://german-credit-20221001/datos/test/test.csv")

In [13]:
pd.crosstab(index=train["Housing"], columns=train["Risk"], margins=True)

Risk,bad,good,All
Housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free,29,45,74
own,133,371,504
rent,48,74,122
All,210,490,700


In [14]:
pd.crosstab(index=train["Duration"], columns=train["Risk"], margins=True)

Risk,bad,good,All
Duration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,0,6,6
6,7,49,56
7,0,2,2
8,1,3,4
9,7,26,33
10,3,19,22
11,0,5,5
12,37,83,120
13,0,3,3
14,1,3,4


In [15]:
tabla_sex =pd.crosstab(index=train["Sex"], columns=train["Risk"], margins=True)

In [16]:
tabla_sex["proba"] = tabla_sex.iloc[:,1]/tabla_sex.iloc[:,2]

In [17]:
tabla_sex

Risk,bad,good,All,proba
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,76,138,214,0.64486
male,134,352,486,0.72428
All,210,490,700,0.7


In [18]:
##Preprosesamiento
from sklearn.impute import SimpleImputer

In [19]:
#Se imputan variables que no contienen datos, esto es para producción
#Age","Credit amount","Duration",
X_train = train[["Saving accounts","Checking account"]]

In [20]:
imputer = SimpleImputer(strategy="constant", fill_value='UPS')

In [21]:
imputer.fit(X_train)

SimpleImputer(fill_value='UPS', strategy='constant')

In [22]:
imputer.statistics_

array(['UPS', 'UPS'], dtype=object)

In [23]:
imputer.transform([['moderate',np.nan],[np.nan,np.nan]])



array([['moderate', 'UPS'],
       ['UPS', 'UPS']], dtype=object)

In [24]:
imputer.fit_transform(X_train)

array([['little', 'rich'],
       ['little', 'UPS'],
       ['UPS', 'moderate'],
       ...,
       ['little', 'UPS'],
       ['little', 'UPS'],
       ['little', 'little']], dtype=object)

## Prueba con KNN

In [25]:
df2 = pd.DataFrame( [[40,19,3.0],[42,20,3.1],[44,21,np.nan],[45,23,4.1],[39,25,5.0],[80,27,np.nan],[82,30,4.8]], columns=["Peso","Edad","Nota"])

In [26]:
from sklearn.impute import KNNImputer

In [27]:
imputer = KNNImputer(n_neighbors=4)

In [28]:
imputer.fit(df2)

KNNImputer(n_neighbors=4)

In [29]:
imputer.transform(df2)

array([[40.  , 19.  ,  3.  ],
       [42.  , 20.  ,  3.1 ],
       [44.  , 21.  ,  3.8 ],
       [45.  , 23.  ,  4.1 ],
       [39.  , 25.  ,  5.  ],
       [80.  , 27.  ,  3.75],
       [82.  , 30.  ,  4.8 ]])

## Escalado

In [30]:
 from sklearn.preprocessing import StandardScaler, RobustScaler

In [31]:
X_train = train[["Age","Credit amount","Duration"]]

In [32]:
scaler = StandardScaler()

In [33]:
scaler.fit(X_train[["Age"]])

StandardScaler()

In [34]:
scaler.mean_

array([35.40285714])

In [35]:
scaler.var_ ** 0.5

array([11.23479253])

In [37]:
scaler.inverse_transform([[-0.39189483]])

array([[31.00000003]])

In [38]:
X_train["Age"].quantile(0.5)

33.0

In [39]:
X_train["Age"] - X_train["Age"].mean() / (X_train["Age"].quantile(0.75) - X_train["Age"].quantile(0.25)*1.5)

328   -16.20381
891    -1.20381
255   -20.20381
243   -20.20381
492   -20.20381
         ...   
73     -6.20381
401   -19.20381
769    15.79619
2       1.79619
617   -10.20381
Name: Age, Length: 700, dtype: float64

In [41]:
X_train["Age"].median()

33.0

In [57]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [52]:
encoder.categories_

[array(['female', 'male'], dtype=object)]

In [54]:
encoder.inverse_transform([[0]])

array([['female']], dtype=object)

In [55]:
from sklearn.pipeline import Pipeline

In [59]:
numeric_pipeline = Pipeline(
    [
        ('Imputación con la media', SimpleImputer(strategy='mean')),
        ('Escalado minmax', MinMaxScaler())
    ]
)

In [64]:
numeric_pipeline.fit(train[["Age","Credit amount","Duration"]])

Pipeline(steps=[('Imputación con la media', SimpleImputer()),
                ('Escalado minmax', MinMaxScaler())])

In [66]:
numeric_pipeline[0].statistics_

array([  35.40285714, 3236.12142857,   21.04857143])

In [69]:
numeric_pipeline[1].min_

array([-0.36363636, -0.01375592, -0.05882353])

In [72]:
numeric_pipeline[1].data_max_

array([   75., 18424.,    72.])

In [73]:
train[["Age","Credit amount","Duration"]].max()

Age                 75
Credit amount    18424
Duration            72
dtype: int64

In [75]:
numeric_pipeline.transform(train[["Age","Credit amount","Duration"]])

array([[0.2       , 0.23236492, 0.47058824],
       [0.47272727, 0.08688236, 0.16176471],
       [0.12727273, 0.3944096 , 0.82352941],
       ...,
       [0.78181818, 0.07730824, 0.11764706],
       [0.52727273, 0.10157368, 0.11764706],
       [0.30909091, 0.18851106, 0.02941176]])

In [77]:
numeric_pipeline.transform([[np.nan, np.nan, np.nan]])



array([[0.28005195, 0.16430733, 0.25071429]])

In [86]:
#Sex, Purpose, Housing
categorical_pipeline = Pipeline(
    [
        ('Imputación con la moda', SimpleImputer(strategy='most_frequent')),
        ('Escalado OneHotEncoder', OneHotEncoder(sparse=False))
    ]
)

In [87]:
categorical_pipeline.fit(train[["Sex","Purpose","Housing"]])

Pipeline(steps=[('Imputación con la moda',
                 SimpleImputer(strategy='most_frequent')),
                ('Escalado OneHotEncoder', OneHotEncoder(sparse=False))])

In [88]:
categorical_pipeline.transform(train[["Sex","Purpose","Housing"]])

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [89]:
categorical_pipeline[1].categories_

[array(['female', 'male'], dtype=object),
 array(['business', 'car', 'domestic appliances', 'education',
        'furniture/equipment', 'radio/TV', 'repairs', 'vacation/others'],
       dtype=object),
 array(['free', 'own', 'rent'], dtype=object)]