In [116]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [117]:
df = pd.read_csv('s3://creditdata2080/Datos/Original/german_credit_data.csv')

In [118]:
df

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,996,40,male,3,own,little,little,3857,30,car,good
997,997,38,male,2,own,little,,804,12,radio/TV,good
998,998,23,male,2,free,little,little,1845,45,radio/TV,bad


## Exploracion

In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   Age               1000 non-null   int64 
 2   Sex               1000 non-null   object
 3   Job               1000 non-null   int64 
 4   Housing           1000 non-null   object
 5   Saving accounts   817 non-null    object
 6   Checking account  606 non-null    object
 7   Credit amount     1000 non-null   int64 
 8   Duration          1000 non-null   int64 
 9   Purpose           1000 non-null   object
 10  Risk              1000 non-null   object
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


In [120]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,35.546,1.904,3271.258,20.903
std,288.819436,11.375469,0.653614,2822.736876,12.058814
min,0.0,19.0,0.0,250.0,4.0
25%,249.75,27.0,2.0,1365.5,12.0
50%,499.5,33.0,2.0,2319.5,18.0
75%,749.25,42.0,2.0,3972.25,24.0
max,999.0,75.0,3.0,18424.0,72.0


In [121]:
df['Risk'].value_counts()

good    700
bad     300
Name: Risk, dtype: int64

In [122]:
train, test = train_test_split(df, test_size = 0.3, random_state = 42,stratify = df['Risk'])

In [123]:
train['Risk'].value_counts()/train['Risk'].count()

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [124]:
test['Risk'].value_counts()/test['Risk'].count()

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [125]:
train.to_csv('s3://creditdata2080/Datos/train/train.csv', index = False)

In [126]:
test.to_csv('s3://creditdata2080/Datos/test/train.csv',index = False)

In [127]:
#Verificar la cantidad de valores nulos en el dataset
train.isna().sum()

Unnamed: 0            0
Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     125
Checking account    274
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [128]:
#Cantidad de reuniones por grupo
train.groupby(['Risk']).size().reset_index(name='cantidad de reuniones por grupo')

Unnamed: 0,Risk,cantidad de reuniones por grupo
0,bad,210
1,good,490


In [129]:
#Cantidad de reuniones por grupo
train.groupby(['Risk','Sex']).size().reset_index(name='cantidad de reuniones por grupo')

Unnamed: 0,Risk,Sex,cantidad de reuniones por grupo
0,bad,female,76
1,bad,male,134
2,good,female,138
3,good,male,352


In [130]:
#Cantidad de reuniones por grupo
train.groupby(['Sex']).size().reset_index(name='cantidad de reuniones por grupo')

Unnamed: 0,Sex,cantidad de reuniones por grupo
0,female,214
1,male,486


In [131]:
#Cantidad de reuniones por grupo
train.groupby(['Age','Risk']).size().reset_index(name='cantidad de reuniones por grupo')

Unnamed: 0,Age,Risk,cantidad de reuniones por grupo
0,20,bad,3
1,20,good,8
2,21,bad,3
3,21,good,5
4,22,bad,9
...,...,...,...
89,68,bad,2
90,68,good,1
91,70,good,1
92,74,good,1


In [132]:
pd.crosstab(index=train['Housing'], columns=train['Risk'], margins =True)

Risk,bad,good,All
Housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free,29,45,74
own,133,371,504
rent,48,74,122
All,210,490,700


In [133]:
tabla_sex=pd.crosstab(index=train['Sex'], columns=train['Risk'], margins =True)

In [134]:
tabla_sex

Risk,bad,good,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,76,138,214
male,134,352,486
All,210,490,700


In [135]:
tabla_sex['probabilidad'] = tabla_sex.iloc[:,1]/tabla_sex.iloc[:,2]

In [136]:
tabla_sex

Risk,bad,good,All,probabilidad
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,76,138,214,0.64486
male,134,352,486,0.72428
All,210,490,700,0.7


## Preprocesamiento

## Imputación

In [137]:
from sklearn.impute import SimpleImputer
import numpy as np

In [138]:
X_train = train[['Age', 'Credit amount','Duration']]

In [139]:
imputer = SimpleImputer(strategy = 'mean')

In [140]:
imputer.statistics_

AttributeError: 'SimpleImputer' object has no attribute 'statistics_'

In [None]:
X_train['Age'].mean()

In [None]:
imputer.transform([[24,100,np.nan], [np.nan,np.nan,12]])

In [None]:
imputer.fit_transform(X_train)

In [None]:
X_train = train[['Saving accounts', 'Checking account']]

In [None]:
imputer = SimpleImputer(strategy = 'most_frequent')

In [None]:
imputer.fit(X_train)

In [None]:
imputer.statistics_

In [None]:
imputer.fit_transform(X_train)

In [None]:
from sklearn.impute import KNNImputer

In [None]:
imputer = KNNImputer(n_neighbors = 2)

In [None]:
df2 = pd.DataFrame([[40, 19, 3], [42,20,3.1], [44,21,np.nan], [45,23,4.1], [39,25, 5.0], [80,27,np.nan], [82,3.0,4.8]], columns=["Peso", "Edad", "nota"])

In [None]:
df2

In [None]:
result = imputer.fit_transform(df2)

In [None]:
result

## Escalada

In [143]:
from sklearn.preprocessing import StandardScaler, RobustScaler

In [144]:
scaler = StandardScaler()

In [145]:
scaler.fit(X_train[['Age']])

StandardScaler()

In [146]:
scaler.mean_

array([35.40285714])

In [148]:
scaler.var_**0.5

array([11.23479253])

In [149]:
scaler.transform(X_train[['Age']])

array([[-0.39189483],
       [ 0.94324331],
       [-0.74793167],
       [-0.74793167],
       [-0.74793167],
       [ 0.05315121],
       [-0.035858  ],
       [-1.0149593 ],
       [-0.48090404],
       [ 1.12126173],
       [-0.48090404],
       [-1.28198693],
       [ 1.65531698],
       [-1.10396851],
       [-0.65892246],
       [-0.65892246],
       [ 1.29928014],
       [ 1.03225252],
       [-0.74793167],
       [ 1.8333354 ],
       [ 2.27838145],
       [ 1.12126173],
       [-0.21387641],
       [-0.83694088],
       [ 0.58720647],
       [ 1.03225252],
       [-0.74793167],
       [ 1.38828935],
       [-0.74793167],
       [-0.035858  ],
       [ 2.27838145],
       [ 1.74432619],
       [ 0.23116963],
       [ 0.32017884],
       [ 0.05315121],
       [-0.65892246],
       [-0.92595009],
       [-0.035858  ],
       [ 1.38828935],
       [ 0.23116963],
       [ 0.05315121],
       [-1.10396851],
       [-0.39189483],
       [-0.92595009],
       [ 0.40918805],
       [-0

In [151]:
scaler.inverse_transform([[-0.39189483]])

array([[31.00000003]])

In [152]:
#Robust scaler
X_train['Age'].quantile(0.5)

33.0

In [156]:
median = X_train['Age'].median()

In [154]:
q1 = X_train['Age'].quantile(0.25)

In [155]:
q3 = X_train['Age'].quantile(0.75)