In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns

In [205]:
df = pd.read_csv("s3://german-credit-20221001/datos/original/german_credit_data.csv")

In [163]:
df

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,996,40,male,3,own,little,little,3857,30,car,good
997,997,38,male,2,own,little,,804,12,radio/TV,good
998,998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [164]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   Age               1000 non-null   int64 
 2   Sex               1000 non-null   object
 3   Job               1000 non-null   int64 
 4   Housing           1000 non-null   object
 5   Saving accounts   817 non-null    object
 6   Checking account  606 non-null    object
 7   Credit amount     1000 non-null   int64 
 8   Duration          1000 non-null   int64 
 9   Purpose           1000 non-null   object
 10  Risk              1000 non-null   object
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


In [165]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,35.546,1.904,3271.258,20.903
std,288.819436,11.375469,0.653614,2822.736876,12.058814
min,0.0,19.0,0.0,250.0,4.0
25%,249.75,27.0,2.0,1365.5,12.0
50%,499.5,33.0,2.0,2319.5,18.0
75%,749.25,42.0,2.0,3972.25,24.0
max,999.0,75.0,3.0,18424.0,72.0


In [166]:
df["Risk"].value_counts()

good    700
bad     300
Name: Risk, dtype: int64

In [167]:
train, test = train_test_split(df, test_size=0.3, random_state=42, stratify=df["Risk"])

In [168]:
train["Risk"].value_counts()/train["Risk"].count()

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [169]:
test["Risk"].value_counts()/test["Risk"].count()

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [170]:
train.to_csv("s3://german-credit-20221001/datos/train/train.csv", index=False)

In [171]:
test.to_csv("s3://german-credit-20221001/datos/test/test.csv")

In [172]:
pd.crosstab(index=train["Housing"], columns=train["Risk"], margins=True)

Risk,bad,good,All
Housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free,29,45,74
own,133,371,504
rent,48,74,122
All,210,490,700


In [173]:
pd.crosstab(index=train["Duration"], columns=train["Risk"], margins=True)

Risk,bad,good,All
Duration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,0,6,6
6,7,49,56
7,0,2,2
8,1,3,4
9,7,26,33
10,3,19,22
11,0,5,5
12,37,83,120
13,0,3,3
14,1,3,4


In [174]:
tabla_sex =pd.crosstab(index=train["Sex"], columns=train["Risk"], margins=True)

In [175]:
tabla_sex["proba"] = tabla_sex.iloc[:,1]/tabla_sex.iloc[:,2]

In [176]:
tabla_sex

Risk,bad,good,All,proba
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,76,138,214,0.64486
male,134,352,486,0.72428
All,210,490,700,0.7


In [177]:
##Preprosesamiento
from sklearn.impute import SimpleImputer

In [178]:
#Se imputan variables que no contienen datos, esto es para producción
#Age","Credit amount","Duration",
X_train = train[["Saving accounts","Checking account"]]

In [179]:
imputer = SimpleImputer(strategy="constant", fill_value='UPS')

In [180]:
imputer.fit(X_train)

SimpleImputer(fill_value='UPS', strategy='constant')

In [181]:
imputer.statistics_

array(['UPS', 'UPS'], dtype=object)

In [182]:
imputer.transform([['moderate',np.nan],[np.nan,np.nan]])



array([['moderate', 'UPS'],
       ['UPS', 'UPS']], dtype=object)

In [183]:
imputer.fit_transform(X_train)

array([['little', 'rich'],
       ['little', 'UPS'],
       ['UPS', 'moderate'],
       ...,
       ['little', 'UPS'],
       ['little', 'UPS'],
       ['little', 'little']], dtype=object)

## Prueba con KNN

In [184]:
df2 = pd.DataFrame( [[40,19,3.0],[42,20,3.1],[44,21,np.nan],[45,23,4.1],[39,25,5.0],[80,27,np.nan],[82,30,4.8]], columns=["Peso","Edad","Nota"])

In [185]:
from sklearn.impute import KNNImputer

In [192]:
imputer = KNNImputer(n_neighbors=4)

In [193]:
imputer.fit(df2)

KNNImputer(n_neighbors=4)

In [194]:
imputer.transform(df2)

array([[40.  , 19.  ,  3.  ],
       [42.  , 20.  ,  3.1 ],
       [44.  , 21.  ,  3.8 ],
       [45.  , 23.  ,  4.1 ],
       [39.  , 25.  ,  5.  ],
       [80.  , 27.  ,  3.75],
       [82.  , 30.  ,  4.8 ]])

## Escalado

In [197]:
 from sklearn.preprocessing import StandardScaler, RobustScaler

In [207]:
X_train = train[["Age","Credit amount","Duration"]]

In [208]:
scaler = StandardScaler()

In [211]:
scaler.fit(X_train[["Age"]])

StandardScaler()

In [213]:
scaler.mean_

array([35.40285714])

In [214]:
scaler.var_ ** 0.5

array([11.23479253])

In [215]:
scaler.transform(X_train[["Age"]])

array([[-0.39189483],
       [ 0.94324331],
       [-0.74793167],
       [-0.74793167],
       [-0.74793167],
       [ 0.05315121],
       [-0.035858  ],
       [-1.0149593 ],
       [-0.48090404],
       [ 1.12126173],
       [-0.48090404],
       [-1.28198693],
       [ 1.65531698],
       [-1.10396851],
       [-0.65892246],
       [-0.65892246],
       [ 1.29928014],
       [ 1.03225252],
       [-0.74793167],
       [ 1.8333354 ],
       [ 2.27838145],
       [ 1.12126173],
       [-0.21387641],
       [-0.83694088],
       [ 0.58720647],
       [ 1.03225252],
       [-0.74793167],
       [ 1.38828935],
       [-0.74793167],
       [-0.035858  ],
       [ 2.27838145],
       [ 1.74432619],
       [ 0.23116963],
       [ 0.32017884],
       [ 0.05315121],
       [-0.65892246],
       [-0.92595009],
       [-0.035858  ],
       [ 1.38828935],
       [ 0.23116963],
       [ 0.05315121],
       [-1.10396851],
       [-0.39189483],
       [-0.92595009],
       [ 0.40918805],
       [-0

In [218]:
scaler.inverse_transform([[-0.39189483]])

array([[31.00000003]])

In [220]:
X_train["Age"].quantile(0.5)

33.0

In [222]:
X_train["Age"] - X_train["Age"].mean() / (X_train["Age"].quantile(0.75) - X_train["Age"].quantile(0.25)*1.5)

328   -16.20381
891    -1.20381
255   -20.20381
243   -20.20381
492   -20.20381
         ...   
73     -6.20381
401   -19.20381
769    15.79619
2       1.79619
617   -10.20381
Name: Age, Length: 700, dtype: float64