In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
from sklearn.preprocessing import LabelEncoder
from math import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
mirrored_strategy = tf.distribute.MirroredStrategy()

Num GPUs Available:  1
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [4]:
df = pd.read_csv('home-credit-default-risk//application_train.csv')

# visualisation data

In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(df.isna())

## valeur vide ou null 

In [None]:
(df.isna().sum()/df.shape[0]).sort_values()

In [None]:
#on suppr toutes les colonnes à plus de 90% de val manquante
df = df[df.columns[(df.isna().sum()/df.shape[0]) < 0.9]]

# Variables du csv

In [None]:
df.dtypes.value_counts()

In [None]:
pbar = tqdm(total=1)
for col in df.select_dtypes('float'):
    plt.figure()
    sns.distplot(df[col])
    pbar.update(1/len(df.select_dtypes('float')))
pbar.close()

In [None]:
pbar = tqdm(total=1)
for col in df.select_dtypes('object'):
    print(f'{col :-<40} => {df[col].unique()}')
    plt.figure()
    df[col].value_counts(normalize=True, dropna=False).plot.pie()
    pbar.update(1/len(df.select_dtypes('object')))
pbar.close()

# encoder des 16 colonnes objects

In [None]:
#df2 = pd.get_dummies(df)

In [5]:
le = []
for i,col in enumerate(df.select_dtypes('object').columns):
    try:
        le.append(LabelEncoder())
        df[col] = le[i].fit_transform(df[col].astype(str))
    except:
        print('error : '+col)

In [None]:
#le1 = LabelEncoder()
#le2 = LabelEncoder()

In [None]:
#df['EMERGENCYSTATE_MODE_1'] = df['EMERGENCYSTATE_MODE'].astype(str)

In [None]:
#le1.fit_transform(df['EMERGENCYSTATE_MODE_1'])

In [None]:
#le2.fit_transform(df['ORGANIZATION_TYPE'])

In [None]:
#le1.classes_

In [None]:
#le2.classes_

In [None]:
#df['EMERGENCYSTATE_MODE_1'] = le1.fit_transform(df['EMERGENCYSTATE_MODE_1'])
#df['ORGANIZATION_TYPE_1'] = le2.fit_transform(df['ORGANIZATION_TYPE'])

# difference entre positif et negatif

In [None]:
#df['TARGET']

In [None]:
df.dtypes.value_counts()

In [None]:
positif = df[df['TARGET'] == 1 ]
negatif = df[df['TARGET'] != 1 ]

In [None]:
df_int32_columns = df.select_dtypes('int32').columns
df_int64_columns = df.select_dtypes('int64').columns
df_float_columns = df.select_dtypes('float').columns

In [None]:
pbar = tqdm(total=1)
x=0
y=0
nbr = 3
fig, axs = plt.subplots(ceil(len(df_int32_columns)/nbr), nbr,  figsize=(20, 20))
for col in (df_int32_columns):
    if(x>=nbr):
        x=0
        y+=1
    sns.distplot(positif[col], label='positif', ax=axs[y,x])
    sns.distplot(negatif[col], label='negatif', ax=axs[y,x])
    axs[y,x].legend()
    x+=1
    pbar.update(1/len(df_int32_columns))
pbar.close()

In [None]:
pbar = tqdm(total=1)
x=0
y=0
nbr = 3
fig, axs = plt.subplots(ceil(len(df_int64_columns)/nbr), nbr,  figsize=(20, 80))
for col in (df_int64_columns):
    if(x>=nbr):
        x=0
        y+=1
    sns.distplot(positif[col], label='positif', ax=axs[y,x])
    sns.distplot(negatif[col], label='negatif', ax=axs[y,x])
    axs[y,x].legend()
    x+=1
    pbar.update(1/len(df_int64_columns))
pbar.close()

In [None]:
pbar = tqdm(total=1)
x=0
y=0
nbr = 3
fig, axs = plt.subplots(ceil(len(df_float_columns)/nbr), nbr,  figsize=(20, 120))
for col in (df_float_columns):
    if(x>=nbr):
        x=0
        y+=1
    sns.distplot(positif[col], label='positif', ax=axs[y,x])
    sns.distplot(negatif[col], label='negatif', ax=axs[y,x])
    axs[y,x].legend()
    x+=1
    pbar.update(1/len(df_float_columns))
pbar.close()

# nouveau dataTrain

In [6]:
data_train = df[['TARGET','EMERGENCYSTATE_MODE','HOUSETYPE_MODE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','FONDKAPREMONT_MODE','DAYS_BIRTH','DAYS_ID_PUBLISH','FLAG_PHONE','REG_CITY_NOT_WORK_CITY','FLAG_DOCUMENT_3','AMT_CREDIT','AMT_GOODS_PRICE','REGION_POPULATION_RELATIVE','DAYS_REGISTRATION','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','OBS_60_CNT_SOCIAL_CIRCLE','DEF_30_CNT_SOCIAL_CIRCLE','OBS_30_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE']]
data_train.head()

Unnamed: 0,TARGET,EMERGENCYSTATE_MODE,HOUSETYPE_MODE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,FONDKAPREMONT_MODE,DAYS_BIRTH,DAYS_ID_PUBLISH,FLAG_PHONE,REG_CITY_NOT_WORK_CITY,FLAG_DOCUMENT_3,AMT_CREDIT,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_REGISTRATION,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_60_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE
0,1,0,0,8,6,5,1,0,1,7,4,3,-9461,-2120,1,0,1,406597.5,351000.0,0.018801,-3648.0,0.083037,0.262949,0.139376,2.0,2.0,2.0,2.0
1,0,0,0,3,1,39,0,0,0,4,1,3,-16765,-291,1,0,1,1293502.5,1129500.0,0.003541,-1186.0,0.311267,0.622246,,1.0,0.0,1.0,0.0
2,0,2,1,8,1,11,1,1,1,7,4,0,-19046,-2531,1,0,0,135000.0,135000.0,0.010032,-4260.0,,0.555912,0.729567,0.0,0.0,0.0,0.0
3,0,2,1,8,6,5,0,0,1,7,4,0,-19005,-2437,0,0,1,312682.5,297000.0,0.008019,-9833.0,,0.650442,,2.0,0.0,2.0,0.0
4,0,2,1,3,4,37,1,0,1,7,4,0,-19932,-3458,0,1,0,513000.0,513000.0,0.028663,-4311.0,,0.322738,,0.0,0.0,0.0,0.0


In [None]:
(data_train.isna().mean()).sort_values()

In [None]:
data_train['EMERGENCYSTATE_MODE'].isna().sum()

In [None]:
pbar = tqdm(total=1)
for col in data_train.columns:
    if((data_train[col].isna().sum()) > 0):
        print(f'{col :-<40} => {data_train[col].unique()}')
        plt.figure()
        data_train[col].value_counts(normalize=True, dropna=False).plot.pie()
    pbar.update(1/len(data_train.columns))
pbar.close()

# suppr les 1021 lignes vide 

In [None]:
a=data_train['OBS_30_CNT_SOCIAL_CIRCLE'].isna()
a2=data_train['OBS_60_CNT_SOCIAL_CIRCLE'].isna()
a3=data_train['DEF_60_CNT_SOCIAL_CIRCLE'].isna()

In [None]:
b= (a==1)&(a2==1)&(a3==1)

In [None]:
print(b==a)

In [None]:
data_train['OBS_30_CNT_SOCIAL_CIRCLE'].isna().sum()

In [7]:
data_train = data_train.dropna(subset=['OBS_30_CNT_SOCIAL_CIRCLE'])

In [None]:
(data_train.isna().sum()/data_train.shape[0]).sort_values()

# train et test

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data_train.drop(columns=['TARGET']), data_train['TARGET'], test_size=0.20, random_state=42)

# Reseaux neuronnes 

# Model

In [22]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(27, activation='relu', input_shape=(X_train.shape)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(30, activation='relu',kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation='softmax')
])

In [23]:
model.compile(loss='mse',
             optimizer='adam',
             metrics=['accuracy'])

In [24]:
X_train.shape

(245192, 27)

# Entrainement

In [25]:
history = model.fit(X_train,
                    epochs=100,
                    batch_size= 40,
                   validation_data=y_train)

Epoch 1/100


AttributeError: in user code:

    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\training.py:543 train_step  **
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:391 update_state
        self._build(y_pred, y_true)
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:321 _build
        self._metrics = nest.map_structure_up_to(y_pred, self._get_metric_objects,
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\util\nest.py:1114 map_structure_up_to
        return map_structure_with_tuple_paths_up_to(
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\util\nest.py:1213 map_structure_with_tuple_paths_up_to
        results = [func(*args, **kwargs) for args in zip(flat_path_list,
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\util\nest.py:1213 <listcomp>
        results = [func(*args, **kwargs) for args in zip(flat_path_list,
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\util\nest.py:1116 <lambda>
        lambda _, *values: func(*values),  # Discards the path arg.
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:421 _get_metric_objects
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:421 <listcomp>
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:442 _get_metric_object
        y_t_rank = len(y_t.shape.as_list())

    AttributeError: 'NoneType' object has no attribute 'shape'


In [20]:
history = model.fit(X_train,
                    epochs=100,
                    batch_size= 40,
                   validation_data=y_train, 
                    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),
                               tfdocs.modeling.EpochDots(),
                               tf.keras.callbacks.ModelCheckpoint("weights.best.hdf5", 
                                                                   monitor='val_accuracy', 
                                                                   verbose=0, 
                                                                   save_best_only=True, 
                                                                   save_weights_only=False, 
                                                                   mode='max', 
                                                                   periode=1)])

Epoch 1/100


AttributeError: in user code:

    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\training.py:543 train_step  **
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:391 update_state
        self._build(y_pred, y_true)
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:321 _build
        self._metrics = nest.map_structure_up_to(y_pred, self._get_metric_objects,
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\util\nest.py:1114 map_structure_up_to
        return map_structure_with_tuple_paths_up_to(
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\util\nest.py:1213 map_structure_with_tuple_paths_up_to
        results = [func(*args, **kwargs) for args in zip(flat_path_list,
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\util\nest.py:1213 <listcomp>
        results = [func(*args, **kwargs) for args in zip(flat_path_list,
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\util\nest.py:1116 <lambda>
        lambda _, *values: func(*values),  # Discards the path arg.
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:421 _get_metric_objects
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:421 <listcomp>
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    C:\Users\donovan\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:442 _get_metric_object
        y_t_rank = len(y_t.shape.as_list())

    AttributeError: 'NoneType' object has no attribute 'shape'


In [28]:
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import *

In [29]:
model1 = SGDClassifier(random_state=0)
model2 = DecisionTreeClassifier(random_state=0)
model3 = KNeighborsClassifier(n_neighbors=2)

#voting hard voting, soft voting
model4 = VotingClassifier([('sgd', model1), 
                          ('tree', model2),
                          ('kn', model3)],
                         voting='hard')

In [30]:
for model in (model1, model2, model3, model4):
    model.fit(X_train, y_train)
    print(model.__class__.__name__, model.score(X_test, y_test))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

# normalisation de la data

# valeurs vide 

In [None]:
imputer = KNNImputer(n_neighbors=2)

In [None]:
lol = imputer.fit_transform([data_train['EXT_SOURCE_2']])[0]

In [None]:
lol.shape

In [None]:
columns = (data_train.isna().sum()/data_train.shape[0]).sort_values() >0
columns_index = columns.index[columns.values == True]

imputer = KNNImputer(n_neighbors= 2)

pbar = tqdm(total=1)
for column in columns_index:
    
    impute_with_2 = imputer.fit_transform([data_train[column]])[0]
    print("Colonne : "+ column + " -- taille KNN : "+str(len(impute_with_2)) + " -- taille colonne: "+str(len(data_train[column])))
#     print(len(data_train[column]))
#     data_train[column].fillna(impute_with_2)
        
#     i=0
#     for value in empty_values:
#         print(value)
#         value.fillna(impute_with_2[i])
#         i+=1
#     data_train[column] = impute_with_2
    
#     col = data_train[column]
    nulls = data_train[data_train[column].isna()]
#     print(nulls)
    for i, ni in enumerate(nulls.index[:len(impute_with_2)]):
        data_train[column].loc[ni] = impute_with_2[i]
#         print(str(data_train[column].loc[ni]) + " devient "+ str(impute_with_2[i]))

    pbar.update(1/(len(columns_index)))
pbar.close()


# data_train.head()

In [None]:
(data_train.isna().mean()).sort_values()

In [None]:
df.shape

In [None]:
df_test = pd.read_csv('home-credit-default-risk//application_test.csv')

In [None]:
for i,col in enumerate(df_test.select_dtypes('object').columns):
    try:
        df_test[col] = le[i].transform(df_test[col].astype(str))
    except:
        print('error : '+col)

# autre

In [None]:
test = df.groupby(['TARGET'])
test

In [None]:
test.head()

In [None]:
#test['CODE_GENDER'].value_counts()

In [None]:
test['FLAG_OWN_CAR'].value_counts()

In [None]:

x=0
y=0

# test.hist()
# test.plot()
# test.size()

# test['TARGET'].columns

# print(test.group.columns)

columns_name = []

for name, group in test:
   columns_name = group.columns


# negatif_test = test[test['TARGET'] != 1]
# positif_test = test['TARGET'] == 1


# test['TARGET'].hist()

test.hist(figsize=(25, 25))

# fig, axs = plt.subplots(ceil(len(columns_name)/4), 4,  figsize=(30, 20))
# for col in (columns_name):
#     if(x>=4):
#         x=0
#         y+=1
        
#     sns.distplot(test[col], label='positif', ax=axs[y,x])
# #     sns.distplot(positif_test[col], label='positif', ax=axs[y,x])
# #     sns.distplot(negatif_test[col], label='negatif', ax=axs[y,x])
#     axs[y,x].legend()
#     x+=1