In [None]:
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
#import optbinning as opt
%matplotlib inline
import pandas as pd
from sklearn.impute import SimpleImputer
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from sklearn.preprocessing import OneHotEncoder, StandardScaler,KBinsDiscretizer, LabelEncoder
from sklearn.impute import SimpleImputer
from scipy.stats import mannwhitneyu ,chi2_contingency, anderson, f_oneway
import statsmodels.api as sm 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score


: 

In [None]:
data=pd.read_csv("../data/application_train_vf.csv",parse_dates=["date_mensuelle"], index_col=0)

: 

In [None]:
data["SK_ID_CURR"].value_counts().max()

: 

In [None]:
data["NAME_CONTRACT_TYPE"].value_counts()

: 

In [None]:
data=data[data["NAME_CONTRACT_TYPE"]=="Cash loans"]
data.drop(columns=["NAME_CONTRACT_TYPE"], inplace = True)

: 

In [None]:
pd.DataFrame(data.isna().mean())

: 

In [None]:
data["TARGET"].value_counts(normalize=True)

: 

#### Variables CREDIT BUREAU

In [None]:
credit_bureau_data=pd.read_csv("../data/cb_findings.csv", index_col=0)
data=data.merge(credit_bureau_data, left_on="SK_ID_CURR", right_on="CB_SK_ID_CURR")

: 

In [None]:
credit_bureau_data.isna().mean()

: 

#### Étude des variables -- prédicteurs potentiels

In [None]:
data.dtypes

: 

In [None]:
data["HAS_CHILDREN"]=data["CNT_CHILDREN"].apply(lambda x : "Y" if x > 0 else "N")
data.drop(columns=["CNT_CHILDREN"], inplace=True)

: 

In [None]:
data.nunique()

: 

In [None]:
drop_documents_cols=["FLAG_DOCUMENT_2","FLAG_DOCUMENT_3","FLAG_DOCUMENT_4" , 
"FLAG_DOCUMENT_5" , "FLAG_DOCUMENT_6","FLAG_DOCUMENT_7" ,              
"FLAG_DOCUMENT_8" , "FLAG_DOCUMENT_9","FLAG_DOCUMENT_10"  ,"FLAG_DOCUMENT_11" , "FLAG_DOCUMENT_12",              
"FLAG_DOCUMENT_13" ,"FLAG_DOCUMENT_14" ,"FLAG_DOCUMENT_15", "FLAG_DOCUMENT_16" ,
"FLAG_DOCUMENT_17", "FLAG_DOCUMENT_18","FLAG_DOCUMENT_19", "FLAG_DOCUMENT_20","FLAG_DOCUMENT_21"]

: 

In [None]:
data.drop(columns=drop_documents_cols, inplace=True)

: 

In [None]:
def convert_numeric_to_category(df: pd.DataFrame) -> None:
    """Cast low-cardinality columns of ``df`` to the ``category`` dtype, in place.

    A column is converted when its dtype compares equal to ``"number"`` and it
    has at most 10 distinct values. Nothing is returned; ``df`` is mutated.

    NOTE(review): ``"number"`` is the selector used with ``select_dtypes`` later
    in this notebook, not a dtype name, so the ``dtype == "number"`` comparison
    presumably never holds and this function is likely a no-op -- TODO confirm.
    Later cells pick columns with ``select_dtypes(include="number")``, so making
    the conversion effective would change what those cells see.
    """
    for colname in df.columns.tolist():
        # `&` (not `and`): both conditions are evaluated for every column.
        if (df[colname].dtype=="number") & (df[colname].nunique() <=10):
            df[colname]=df[colname].astype("category")
            pass
        else:
            pass

: 

In [None]:
convert_numeric_to_category(data)

: 

In [None]:
data["date_annee"]=data["date_mensuelle"].dt.year

: 

In [None]:
out_of_sample_data=data[data["date_annee"]==2020]
#data=data[data["date_annee"]<2020]

: 

: 

In [None]:
out_of_sample_data["TARGET"].value_counts(normalize=True)


: 

In [None]:
data["TARGET"].value_counts(normalize=True)

: 

In [None]:
categorical_vars=data.select_dtypes(include="object").columns.tolist()
numerical_vars=data.select_dtypes(include="number").columns.tolist()
binary_vars=[var for var in numerical_vars if (data[var].nunique()==2)&(var !="TARGET")]

: 

In [None]:
def cramers_v(confusion_matrix):
    """Bias-corrected Cramér's V computed from a two-way contingency table.

    Args:
        confusion_matrix: 2-D table of counts exposing ``.shape`` and
            ``.sum().sum()`` (e.g. the result of ``pd.crosstab``).

    Returns:
        Square root of the corrected phi-squared divided by the smaller of the
        corrected (rows - 1) and (columns - 1) terms.
    """
    chi2_stat = chi2_contingency(confusion_matrix)[0]
    n_rows, n_cols = confusion_matrix.shape
    total = confusion_matrix.sum().sum()

    # Phi-squared with the small-sample bias removed, floored at zero.
    phi2_adjusted = max(0, chi2_stat / total - ((n_cols - 1) * (n_rows - 1)) / (total - 1))

    # Table dimensions shrunk by the same small-sample correction.
    rows_adjusted = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_adjusted = n_cols - ((n_cols - 1) ** 2) / (total - 1)

    return np.sqrt(phi2_adjusted / min((cols_adjusted - 1), (rows_adjusted - 1)))


def mannwhitney_test(df:pd.DataFrame,variable:str,target:str):
    """Run a Mann-Whitney U test of ``variable`` between the two target classes.

    Rows where ``target == 0`` form the first sample and rows where
    ``target == 1`` the second; missing values of ``variable`` are dropped
    within each sample before testing.

    Args:
        df: Frame holding both ``variable`` and ``target`` columns.
        variable: Name of the column to compare.
        target: Name of the 0/1 column used to split the rows.

    Returns:
        tuple: ``(stat, p_value)`` as returned by ``scipy.stats.mannwhitneyu``.
        The same values are also printed, so existing callers that only rely
        on the printed report keep working.
    """
    # Split the observations by target class, dropping NaNs per group.
    sample_non_event = df.loc[df[target] == 0, variable].dropna()
    sample_event = df.loc[df[target] == 1, variable].dropna()

    stat, p_value = mannwhitneyu(sample_non_event, sample_event)

    # Printed report (kept identical to the historical output).
    print(f"Variable: {variable}")
    print(f"Mann-Whitney U-statistic: {stat}")
    print(f"P-value: {p_value}")
    print(10*"===")

    return stat, p_value


def calculate_information_value_from_contingency_table(contingency_table):
    """
    Compute the Information Value (IV) from a target-by-variable contingency table.

    Args:
        contingency_table (pd.DataFrame): Counts with the non-event class in the
            first row (position 0), the event class in the second row
            (position 1) and one column per modality of the variable.

    Returns:
        float: Sum over modalities of
            ``(event_share - non_event_share) * ln(event_share / non_event_share)``.
            Returns 0.0 when any modality has an empty cell in either row,
            because the log-ratio is undefined (or infinite) there.
    """
    non_event_rate = contingency_table.iloc[0] / contingency_table.iloc[0].sum()
    event_rate = contingency_table.iloc[1] / contingency_table.iloc[1].sum()

    # Both distributions must be strictly positive everywhere. The previous
    # guard tested event_rate.mean(), which is positive whenever the row has
    # any event, so an empty event cell slipped through and produced log(0).
    if not ((non_event_rate.min() > 0) and (event_rate.min() > 0)):
        return 0.0

    contributions = (event_rate - non_event_rate) * np.log(event_rate / non_event_rate)
    return float(contributions.sum())

: 

In [None]:

def show_volume_stability_overtime(data:pd.DataFrame,colname:str,threshold=0.05):
    """Plot, per year, the share of rows falling in each modality of ``colname``.

    Args:
        data: Frame containing ``colname`` and a ``date_annee`` column.
        colname: Column whose modality shares are tracked over time.
        threshold: Height of the dashed horizontal reference line.
    """
    # Share of each modality within every year (modalities absent in a year -> 0).
    yearly_shares = (
        data[[colname, "date_annee"]]
        .groupby(by=["date_annee"])
        .value_counts(normalize=True)
        .unstack()
        .fillna(0)
    )
    first_year = data["date_annee"].min()
    last_year = data["date_annee"].max()

    # One line per modality, plus the reference threshold across the whole period.
    sns.set(style="whitegrid")
    ax = yearly_shares.plot(kind='line', marker='o', markersize=8)
    ax.set_title('Pourcentage de chaque modalité au fil du temps')
    ax.hlines(y=threshold, xmin=first_year, xmax=last_year, linestyles="dashed")
    ax.set_xlabel('Temps')
    ax.set_ylabel('Pourcentage')
    ax.legend(title=f"{colname}")
    plt.show()
    
    
def show_risk_stability_overtime(data:pd.DataFrame,colname:str):
    """Plot the yearly share of ``TARGET == 1`` for each modality of ``colname``.

    Args:
        data: Frame containing ``colname``, ``date_annee`` and ``TARGET``.
        colname: Column whose modalities are compared over time.
    """
    # Distribution of TARGET within each (modality, year) pair; keep the share of 1s.
    target_shares = (
        data.groupby([colname, "date_annee"])['TARGET']
        .value_counts(normalize=True)
        .unstack()
        .fillna(0)
    )
    default_rate = target_shares[1]
    years = default_rate.index.get_level_values("date_annee")
    modalities = default_rate.index.get_level_values(f"{colname}")

    # One line per modality.
    _, ax = plt.subplots(figsize=(10, 6))
    sns.lineplot(x=years, y=default_rate.values, hue=modalities, marker='o', ax=ax)

    # Axis labels, title and legend.
    ax.set_xlabel('date_annee')
    ax.set_ylabel('Taux de défaut')
    ax.set_title(f"Taux de défaut en fonction de {colname} et année")
    ax.legend(title=f"{colname}")

    plt.show()
    

: 

#### Test de stabilité en risque des variables binaires

In [None]:
binary_vars

: 

In [None]:
data["FLAG_MOBIL"].value_counts()

: 

In [None]:
for var in binary_vars:
    show_risk_stability_overtime(data,var)

: 

In [None]:
#### variables à écarter : flag_mobil, flag_cont_mobil, flag_email, reg_region_not_live_region, reg_region_not_work_region,live_region_not_work_region, 

: 

In [None]:
binary_risk_non_stable_vars=["FLAG_MOBIL", "FLAG_CONT_MOBILE", "FLAG_EMAIL", "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION","LIVE_REGION_NOT_WORK_REGION"]

: 

In [None]:
binary_vars=list(filter(lambda x : x not in binary_risk_non_stable_vars, binary_vars))

: 

In [None]:
from scipy.stats import chi2_contingency

: 

In [None]:
for col in binary_vars:
    print(col)
    print(chi2_contingency(pd.crosstab(data["TARGET"], data[col].dropna())))
    print(60*"=")

: 

#### Test de stabilité en volume des variables binaires

In [None]:
for var in binary_vars:
    show_volume_stability_overtime(data,var)

: 

In [None]:
#### variables à virer : flag_mobil , flag_cont_mobil,reg_region_not_live_region, live_region_not_work_region

: 

In [None]:
binary_volume_non_stable_vars=[]

: 

In [None]:
# variables à virer parmi les binaires:
binary_non_stable_vars=list(set(binary_volume_non_stable_vars+binary_risk_non_stable_vars))

: 

In [None]:
binary_vars=list(filter(lambda x : x not in binary_non_stable_vars,binary_vars))

: 

In [None]:
data.drop(columns=binary_non_stable_vars, inplace=True)

: 

In [None]:
numerical_vars=list(filter(lambda x : x not in binary_vars+binary_non_stable_vars,numerical_vars))

: 

In [None]:
numerical_vars

: 

#### Model Benchmark

#### Test de stabilité des variables catégorielles à faible nombre de modalités (2 à 4)

In [None]:
low_category_categorical_vars=[var for var in categorical_vars+numerical_vars if  (var not in binary_vars+binary_non_stable_vars+["TARGET"]) & (data[var].nunique()>=2) & (data[var].nunique()<=4)]

: 

In [None]:
low_category_categorical_vars

: 

In [None]:
for colname in low_category_categorical_vars:
    show_volume_stability_overtime(data,colname)

: 

In [None]:
for col in low_category_categorical_vars:
    show_risk_stability_overtime(data,col)

: 

In [None]:
low_category_non_stable_vars=["FLAG_OWN_REALTY", "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "EMERGENCYSTATE_MODE"]
low_category_categorical_vars=list(filter(lambda x : x not in low_category_non_stable_vars,low_category_categorical_vars))

: 

In [None]:
data["CODE_GENDER"]=data["CODE_GENDER"].apply(lambda x: "F" if x =="XNA" else x )

: 

In [None]:
data.drop(columns=low_category_non_stable_vars)

: 

#### Test de stabilité des autres variables catégorielles

In [None]:
categorical_vars=list(filter(lambda x : x not in binary_non_stable_vars+binary_vars+low_category_categorical_vars+low_category_non_stable_vars, categorical_vars))

: 

In [None]:
for col in categorical_vars:
    print(data[col].value_counts(normalize=True))
    print(40*"=")

: 

In [None]:
for col in categorical_vars:
    n=data.shape[0]
    contingency_table=pd.crosstab(data["TARGET"], data[col])
    chi2, p, _, _ = chi2_contingency(contingency_table, correction=True) #Application d'une correction de Yates par rapport à la faible représentation des classes
    k, r = contingency_table.shape
    v_cramer = np.sqrt(chi2 / (n * min(k-1, r-1)))
    print(f"\nCrosstab for {col}:\n")
    print(contingency_table)
    print("\n" + "-"*40)
    print(f"\nChi-squared: {chi2}")
    print(f"P-value: {p}")
    print(f"Cramer's V: {v_cramer}")
    print("\n" + "="*80)
    

: 

In [None]:
for col in categorical_vars:
    show_volume_stability_overtime(data,col)

: 

In [None]:
data["OCCUPATION_TYPE"].value_counts(normalize=True)

: 

In [None]:
data["REALTY"]="OWN_REALTY_"+data["FLAG_OWN_REALTY"]+"-TYPE_"+data["NAME_HOUSING_TYPE"]

: 

In [None]:
data["REALTY"].value_counts(normalize=True)

: 

In [None]:
# Explicit assignment instead of `inplace=True` on a column selection: the
# chained in-place form may act on a temporary copy and leave `data` unchanged
# depending on the pandas version.
data["OCCUPATION_TYPE"] = data["OCCUPATION_TYPE"].fillna("Unknown")

: 

In [None]:
group_occupation_type={
    0: ['Accountants', 'HR staff', 'High skill tech staff'],
 1: ['Managers', 'Core staff', 'Private service staff', 'Unknown',
        'Medicine staff', 'IT staff', 'Secretaries'],
 2: ['Realty agents', 'Cleaning staff', 'Sales staff', 'Laborers',
        'Cooking staff', 'Security staff'],
 3: ['Drivers', 'Waiters/barmen staff', 'Low-skill Laborers'],
}

: 

In [None]:
data["OCCUPATION_TYPE"]=data["OCCUPATION_TYPE"].map({value: key for key, values in group_occupation_type.items() for value in values})

: 

: 

In [None]:
show_risk_stability_overtime(data,"OCCUPATION_TYPE")

: 

In [None]:
show_volume_stability_overtime(data,"OCCUPATION_TYPE")

: 

In [None]:
for col in categorical_vars:
    show_risk_stability_overtime(data,col)

: 

In [None]:
group_education_type={
    "Graduated" : ["Academic degree", "Higher education"],
    "Non graduated":["Lower secondary", "Secondary / secondary special", "Incomplete higher"]
}

: 

In [None]:
data["NAME_EDUCATION_TYPE"]=data["NAME_EDUCATION_TYPE"].map({value: key for key, values in group_education_type.items() for value in values})

: 

In [None]:
show_risk_stability_overtime(data,"NAME_EDUCATION_TYPE")

: 

In [None]:
show_volume_stability_overtime(data,"NAME_EDUCATION_TYPE")

: 

In [None]:
group_family_status={
    "Already_Married": ["Civil marriage", "Married","Separated", "Widow"],
    "Single" :["Single / not married", "Unknown"]
    # "Separated": []
}

: 

In [None]:
data["NAME_FAMILY_STATUS_2"]=data["NAME_FAMILY_STATUS"].map({value: key for key, values in group_family_status.items() for value in values})

: 

In [None]:
data["NAME_FAMILY_STATUS"].value_counts(normalize=True)

: 

In [None]:
show_risk_stability_overtime(data,"NAME_FAMILY_STATUS_2" )

: 

In [None]:
cramers_v(pd.crosstab(data["HAS_CHILDREN"], data["NAME_FAMILY_STATUS_2"]))

: 

In [None]:
pd.crosstab(data["HAS_CHILDREN"], data["NAME_FAMILY_STATUS_2"])

: 

In [None]:
data["FAM_STATS_CHILD"]=data["NAME_FAMILY_STATUS_2"]+"-"+"HAS_CHILDREN_"+data["HAS_CHILDREN"] 

: 

In [None]:
show_risk_stability_overtime(data, "FAM_STATS_CHILD")

: 

In [None]:
show_volume_stability_overtime(data, "FAM_STATS_CHILD")

: 

In [None]:
data["FAM_STATS_CHILD"].value_counts(normalize=True)

: 

In [None]:
data["FAM_STATS_CHILD"]=data["FAM_STATS_CHILD"].apply(lambda x : "Single" if x in ["Single-HAS_CHILDREN_N","Single-HAS_CHILDREN_Y"] else x)

: 

In [None]:
show_risk_stability_overtime(data, "FAM_STATS_CHILD")

: 

In [None]:
cramers_v(pd.crosstab(data["TARGET"], data["NAME_FAMILY_STATUS_2"]))

: 

In [None]:
cramers_v(pd.crosstab(data["TARGET"], data["HAS_CHILDREN"]))

: 

In [None]:
cramers_v(pd.crosstab(data["TARGET"], data["FAM_STATS_CHILD"]))

: 

In [None]:
calculate_information_value_from_contingency_table(pd.crosstab(data["TARGET"], data["NAME_FAMILY_STATUS_2"]))

: 

In [None]:
calculate_information_value_from_contingency_table(pd.crosstab(data["TARGET"], data["FAM_STATS_CHILD"]))

: 

In [None]:
calculate_information_value_from_contingency_table(pd.crosstab(data["TARGET"], data["HAS_CHILDREN"]))

: 

In [None]:
calculate_information_value_from_contingency_table(pd.crosstab(data["TARGET"], data["NAME_EDUCATION_TYPE"]))

: 

: 

In [None]:
for col in categorical_vars:
    print(data[[col,"TARGET"]].groupby(by=col).agg(np.mean).sort_values(by="TARGET", ascending=False))
    print(60*"=")

: 

#### Test de combinaison de variables numériques

In [None]:
social_vars=["OBS_30_CNT_SOCIAL_CIRCLE",
"DEF_30_CNT_SOCIAL_CIRCLE",
"OBS_60_CNT_SOCIAL_CIRCLE",
"DEF_60_CNT_SOCIAL_CIRCLE", "DAYS_LAST_PHONE_CHANGE"
]

: 

In [None]:
# mannwhitney_test prints its own report; wrapping the call in print() only
# echoed its return value after every variable.
for col in social_vars:
    mannwhitney_test(data, col, "TARGET")

: 

In [None]:
# Missing values are replaced by the column minimum. Explicit assignment is used
# instead of `inplace=True` on a column selection, which may act on a temporary
# copy and leave `data` unchanged depending on the pandas version.
data["DAYS_LAST_PHONE_CHANGE"] = data["DAYS_LAST_PHONE_CHANGE"].fillna(
    data["DAYS_LAST_PHONE_CHANGE"].min()
)

: 

In [None]:
data["AMT_INCOME_TOTAL"].describe()

: 

In [None]:
data["AMT_CREDIT_TO_INCOME"]=(data["AMT_CREDIT"]+data["CB_AMT_CREDIT_SUM"])/data["AMT_INCOME_TOTAL"]

: 

In [None]:
# Plot des distributions kernel pour TARGET==0 et TARGET==1
sns.kdeplot(data.loc[data["TARGET"]==0, "AMT_CREDIT_TO_INCOME"], color='skyblue', label='TARGET=0', fill=True)
sns.kdeplot(data.loc[data["TARGET"]==1, "AMT_CREDIT_TO_INCOME"], color='red', label='TARGET=1', fill=True)

plt.title("Distribution de AMT_CREDIT_TO_INCOME")
plt.xlabel('Valeurs')
plt.ylabel('Densité')
plt.legend()
plt.show()


: 

In [None]:
# Median-impute missing AMT_GOODS_PRICE values.
# The imputer output is flattened to a 1-D array so the assignment is purely
# positional; assigning the imputer's pandas output instead would presumably
# align on index labels and only work while `data` carries a default 0..n-1 index.
median_imputer = SimpleImputer(strategy="median")
data["AMT_GOODS_PRICE"] = median_imputer.fit_transform(data[["AMT_GOODS_PRICE"]]).ravel()

: 

In [None]:
data["AMT_GOODS_PRICE"].isna().mean()

: 

In [None]:
data["AMT_GOODS_PRICE"].describe()

: 

In [None]:
data["AMT_CREDIT_NORM"]=data["AMT_CREDIT"]/data["AMT_GOODS_PRICE"]
data["AMT_ANNUITY"]=(data["AMT_ANNUITY"]+data["CB_AMT_ANNUITY"])/data["AMT_INCOME_TOTAL"]
data["AMT_INCOME_TOTAL_NORM"]=data["AMT_INCOME_TOTAL"]/data["AMT_GOODS_PRICE"]

: 

In [None]:
data["BORROWER_AGE"]=data["DAYS_BIRTH"].apply(np.abs)//365
data["BORROWER_SENIORITY"]=data["DAYS_EMPLOYED"].apply(np.abs)//365
data["BORROWER_FIDELITY"]=data["DAYS_REGISTRATION"].apply(np.abs)//365

: 

In [None]:
for col in ["BORROWER_AGE","BORROWER_SENIORITY","BORROWER_FIDELITY"]:
    plt.figure(figsize=(10, 6))
    # Plot des distributions kernel pour TARGET==0 et TARGET==1
    sns.kdeplot(data.loc[data["TARGET"]==0, col], color='green', label='TARGET=0', fill=False)
    sns.kdeplot(data.loc[data["TARGET"]==1, col], color='red', label='TARGET=1', fill=False)

    plt.title(f"Distribution de {col}")
    plt.xlabel('Valeurs')
    plt.ylabel('Densité')
    plt.legend()
    plt.show()

: 

In [None]:
data["BORROWER_SENIORITY"].quantile(q=0.8)

: 

In [None]:
for col in ["DAYS_LAST_PHONE_CHANGE","AMT_INCOME_TOTAL_NORM", "AMT_INCOME_TOTAL","AMT_CREDIT","AMT_ANNUITY","AMT_GOODS_PRICE"]:
    plt.figure(figsize=(10, 6))
    # Plot des distributions kernel pour TARGET==0 et TARGET==1
    sns.kdeplot(data.loc[data["TARGET"]==0, col], color='green', label='TARGET=0', fill=False)
    sns.kdeplot(data.loc[data["TARGET"]==1, col], color='red', label='TARGET=1', fill=False)

    plt.title(f"Distribution de {col}")
    plt.xlabel('Valeurs')
    plt.ylabel('Densité')
    plt.legend()
    plt.show()

: 

#### Test de rang des variables numériques ? 

In [None]:
numerical_vars=[var for var in numerical_vars if (var not in binary_vars)& (var not in low_category_categorical_vars)]

: 

In [None]:
numerical_vars

: 

In [None]:
data["CNT_FAM_MEMBERS"].value_counts(normalize=True)

: 

In [None]:
data[['AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR']].describe()

: 

In [None]:
tested_numerical_variables=[
    "BORROWER_AGE","BORROWER_SENIORITY","BORROWER_FIDELITY","AMT_INCOME_TOTAL_NORM",
    "AMT_CREDIT_NORM", "AMT_INCOME_TOTAL","AMT_CREDIT",
    "AMT_ANNUITY","AMT_GOODS_PRICE",
    'CB_AMT_CREDIT_SUM_DEBT', 'CB_NB_CREDIT_ACTIVE', 'CB_NB_CREDIT_CLOSED', 'CB_DAYS_CREDIT', 'CB_DAYS_CREDIT_ENDDATE', 'CB_AMT_CREDIT_SUM', 'CB_AMT_ANNUITY'
] 

: 

In [None]:
for col in tested_numerical_variables:
    mannwhitney_test(data,col,"TARGET")

: 

In [None]:
col="CB_DAYS_CREDIT"
plt.figure(figsize=(10, 6))
# Plot des distributions kernel pour TARGET==0 et TARGET==1
sns.kdeplot(data.loc[data["TARGET"]==0, col], color='green', label='TARGET=0', fill=False)
sns.kdeplot(data.loc[data["TARGET"]==1, col], color='red', label='TARGET=1', fill=False)

plt.title(f"Distribution de {col}")
plt.xlabel('Valeurs')
plt.ylabel('Densité')
plt.legend()
plt.show()

: 

In [None]:
# Correlation matrix
plt.figure(figsize=(10,8))
sns.heatmap(data[tested_numerical_variables].corr(method="spearman"), annot=True,fmt=".2f")
plt.show()

: 

In [None]:
# tested_numerical_variables.remove("AMT_ANNUITY")

: 

In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

class DecisionTreeDiscretizer:
    """Supervised binning of a single numeric feature with a shallow decision tree.

    A ``DecisionTreeClassifier`` is fitted on one feature against the target
    given at construction time; the split thresholds it learns become the bin
    edges used by :meth:`transform`.

    Args:
        max_bins: Drives the tree depth, ``max_depth = int(round(max_bins / 2))``.
            With the default of 5 this evaluates to 2, since Python rounds
            halves to the nearest even integer.
        target: Target values passed as ``y`` on every call to :meth:`fit`.
    """

    def __init__(self, max_bins=5, target=None):
        self.tree_max_bins = max_bins
        # Each split and each leaf must hold at least 5% of the samples.
        self.clf = DecisionTreeClassifier(criterion="gini", max_depth=int(round(self.tree_max_bins/2)),
                                          min_samples_split=0.05,
                                          min_samples_leaf=0.05,
                                          class_weight=None)
        self.target = target

    def _check_is_fitted(self, message):
        """Raise ``ValueError(message)`` when the underlying tree has not been fitted."""
        if not hasattr(self.clf, 'tree_') or self.clf.tree_ is None:
            raise ValueError(message)

    def fit(self, X_train):
        """Fit the tree on one feature column against ``self.target``.

        Args:
            X_train: Object exposing ``.values`` (e.g. a ``pd.Series``); it is
                reshaped to a single-column 2-D array.

        Returns:
            self, so that calls can be chained.
        """
        self.clf.fit(X_train.values.reshape(-1, 1), self.target)
        return self

    def _get_tree_thresholds(self):
        """Return the sorted, de-duplicated split thresholds of the fitted tree."""
        tree = self.clf.tree_
        # Internal nodes are those whose left and right children differ.
        is_split_node = tree.children_left != tree.children_right
        return np.unique(tree.threshold[is_split_node])

    def get_thresholds(self):
        """Return the bin edges ``[-inf, t1, ..., tk, +inf]`` in increasing order.

        Raises:
            ValueError: If the classifier has not been fitted.
        """
        self._check_is_fitted("Le classifieur n'est pas entraîné. Utilisez la méthode fit avant d'obtenir les seuils.")
        return [-np.inf] + self._get_tree_thresholds().tolist() + [np.inf]

    def transform(self, X):
        """Map each value of ``X`` to the right-closed interval it falls in.

        Args:
            X: Object exposing ``.values`` (e.g. a ``pd.Series``).

        Returns:
            The result of ``pd.cut`` over the flattened values (intervals
            rather than bin numbers).

        Raises:
            ValueError: If the classifier has not been fitted.
        """
        self._check_is_fitted("Le classifieur n'est pas entraîné. Utilisez la méthode fit avant de transformer les données de test.")

        thresholds = self.get_thresholds()
        return pd.cut(X.values.flatten(), bins=thresholds, include_lowest=True, right=True)

: 

In [None]:
data["AMT_GOODS_PRICE"].isna().mean()

: 

In [None]:
data["AMT_CREDIT_NORM"].isna().mean()

: 

In [None]:
data[binary_vars+low_category_categorical_vars]=data[binary_vars+low_category_categorical_vars].astype("category")

: 

In [None]:
for col in binary_vars:
    print(f"{col} IV {calculate_information_value_from_contingency_table(pd.crosstab(data['TARGET'], data[col]))}")
    print( 60*"*")

: 

In [None]:
for col in low_category_categorical_vars:
    print(f"{col} IV {calculate_information_value_from_contingency_table(pd.crosstab(data['TARGET'], data[col]))}")
    print( 60*"*")

: 

In [None]:
def group_days_credit(days):
    """Bucket a day count (sign ignored) into a coarse age label.

    Returns "< 1 YEAR" below 365 days, "BETWEEN 1-2 YEAR" from 365 up to (but
    excluding) 730 days, and "MORE THAN 2 YEARs" from 730 days on. A value for
    which every comparison is false (e.g. NaN) yields ``None``.
    """
    elapsed = np.abs(days)
    if elapsed < 365:
        return "< 1 YEAR"
    if elapsed < 365 * 2:
        return "BETWEEN 1-2 YEAR"
    if elapsed >= 365 * 2:
        return "MORE THAN 2 YEARs"
    return None
    

: 

In [None]:
def group_number_credit(nb):
    """Bucket a count into "2 OR LESS", "3 OR 4" or "5 OR MORE".

    Any value that is neither ``<= 2`` nor ``<= 4`` (including NaN) falls into
    the last bucket.
    """
    if nb <= 2:
        return "2 OR LESS"
    if nb <= 4:
        return "3 OR 4"
    return "5 OR MORE"

: 

In [None]:
# data["CB_NB_CREDIT_ACTIVE"]=data["CB_NB_CREDIT_ACTIVE"].apply(group_number_credit)

: 

In [None]:
# data["CB_NB_CREDIT_ACTIVE"].value_counts()

: 

In [None]:
# data["CB_DAYS_CREDIT"]=data["CB_DAYS_CREDIT"].apply(group_days_credit)

: 

In [None]:
# data["CB_NB_ACTIVE_CREDIT_DAYS_LAST"]=data["CB_NB_CREDIT_ACTIVE"] + "-" + data["CB_DAYS_CREDIT"]

: 

In [None]:
# show_risk_stability_overtime(data,"CB_DAYS_CREDIT")

: 

In [None]:
# calculate_information_value_from_contingency_table(pd.crosstab(data["TARGET"],data["CB_DAYS_CREDIT"]))

: 

In [None]:
import statsmodels.api as sm
data_train, data_test=train_test_split(data, test_size=0.3, stratify=data["TARGET"], random_state=42)

: 

In [None]:
discretised_cols=["AMT_INCOME_TOTAL_NORM", "AMT_CREDIT_TO_INCOME" , "BORROWER_AGE", "BORROWER_SENIORITY",
                  "BORROWER_FIDELITY", "AMT_CREDIT_NORM", "DAYS_LAST_PHONE_CHANGE"
                  ]
discretised_cols_2=["AMT_ANNUITY","AMT_GOODS_PRICE", "CB_DAYS_CREDIT",
                  'CB_AMT_CREDIT_SUM_DEBT', 'CB_NB_CREDIT_CLOSED', 
                  'CB_DAYS_CREDIT_ENDDATE', 'CB_AMT_CREDIT_SUM', 'CB_AMT_ANNUITY'
]

: 

In [None]:
dt_discretizer=DecisionTreeDiscretizer(target=data_train["TARGET"])

for col in discretised_cols:
    dt_discretizer.fit(data_train[col])
    data_train[col]=dt_discretizer.transform(data_train[col])
    data_test[col]=dt_discretizer.transform(data_test[col])

: 

In [None]:
calculate_information_value_from_contingency_table(pd.crosstab(data_train["TARGET"], data_train["DAYS_LAST_PHONE_CHANGE"]))

: 

In [None]:
dt_discretizer=DecisionTreeDiscretizer(target=data_train["TARGET"])

for col in discretised_cols_2:
    dt_discretizer.fit(data_train[col])
    data_train[col]=dt_discretizer.transform(data_train[col])
    data_test[col]=dt_discretizer.transform(data_test[col])

: 

In [None]:
for col in discretised_cols+discretised_cols_2:
    show_risk_stability_overtime(data_train,col)

: 

In [None]:
for col in discretised_cols+discretised_cols_2:
    show_volume_stability_overtime(data_train,col)

: 

In [None]:
for col in discretised_cols+discretised_cols_2:
    print(f"{col} : {calculate_information_value_from_contingency_table(pd.crosstab(data_train['TARGET'], data_train[col]))}")
    print(60*"=")

: 

In [None]:
cramers_v(pd.crosstab(data["BORROWER_AGE"], data["BORROWER_SENIORITY"]))

: 

In [None]:
data_train[["AMT_CREDIT_NORM" , "BORROWER_AGE" , "BORROWER_FIDELITY" , "FLAG_OWN_CAR"]].isna().mean()

: 

In [None]:
data_train[["REGION_RATING_CLIENT_W_CITY","FAM_STATS_CHILD", "NAME_EDUCATION_TYPE", "AMT_CREDIT_NORM" , "BORROWER_AGE" , "BORROWER_FIDELITY" , "BORROWER_SENIORITY","DAYS_LAST_PHONE_CHANGE"]].isna().mean()

: 

In [None]:
data_train["CB_DAYS_CREDIT"].value_counts()

: 

#### Model selection : toutes les combinaisons possibles

from itertools import combinations
class IterativeFitter:
    """Exhaustive search over fixed-size predictor subsets for a logit model.

    Every combination of exactly ``n_features_max`` candidates is fitted as a
    logistic regression (each predictor wrapped in ``C(...)``, no intercept) on
    a 70% stratified training split, and ranked by AUC on the remaining 30%.

    Args:
        candidates: Iterable of candidate predictor column names.
        target: Name of the target column in ``data``.
        n_features_max: Size of each predictor subset that is tried.
        data: Frame containing the candidates and the target.

    Attributes set by :meth:`get_best_model`: ``best_auc``, ``best_predictors``
    and ``best_model`` (the latter two stay ``None`` when no candidate could
    be fitted). ``aucs`` collects the AUC of every successful fit and
    ``correlated_features`` the subsets whose fit raised.
    """

    def __init__(self, candidates, target, n_features_max, data):
        self.candidates = candidates
        self.target = target
        self.n_features_max = n_features_max
        self.data_train, self.data_test = train_test_split(data, test_size=0.3, stratify=data[target], random_state=42)
        self.model_candidates = self._combinations()
        self.correlated_features = []
        self.aucs=[]
        # Defined up front so the attributes exist before get_best_model() runs.
        self.best_auc = None
        self.best_predictors = None
        self.best_model = None

    def _combinations(self):
        """Return every subset of the candidates of size ``n_features_max``."""
        return list(combinations(self.candidates, self.n_features_max))

    def _fit_one_model(self, features):
        """Fit one logit model and score it on the held-out split.

        Returns:
            ``(auc, fitted_result)``, or ``(None, None)`` when fitting or
            scoring raised; in that case ``features`` is recorded in
            ``self.correlated_features`` and the error is printed.
        """
        categorized_features =[f"C({feature})" for feature in features ]
        try:
            formula = f"{self.target} ~ " + " + ".join(categorized_features) + " - 1"

            # Fit the logistic model.
            model = sm.Logit.from_formula(formula=formula, data=self.data_train)
            result = model.fit(disp=0)

            # Predict probabilities on the held-out split and compute the AUC.
            y_pred_proba = result.predict(self.data_test)
            auc = roc_auc_score(self.data_test[self.target] , y_pred_proba)

            return auc, result

        except Exception as e:
            # Deliberate best-effort: a failing subset (e.g. singular matrix)
            # is recorded and reported, and the search moves on.
            self.correlated_features.append(features)
            print("Une erreur est survenue lors de l'ajustement du modèle :", str(e))
            return None, None

    def get_best_model(self):
        """Fit every candidate subset and keep the one with the highest AUC.

        Results are stored on the instance (``best_auc``, ``best_predictors``,
        ``best_model``); nothing is returned.
        """
        best_model = None
        best_auc = 0
        # Initialised so the final assignment cannot hit an unbound local when
        # every candidate fails to fit.
        best_predictors = None

        for predictors_set in self.model_candidates:
            auc, result = self._fit_one_model(predictors_set)
            if auc is None:
                continue
            self.aucs.append(auc)
            if auc > best_auc:
                best_auc = auc
                best_model = result
                best_predictors = predictors_set

        self.best_auc = best_auc
        self.best_predictors = best_predictors
        self.best_model = best_model


In [None]:
candidates_predicors = ['BORROWER_AGE',
 'BORROWER_SENIORITY',
 'AMT_CREDIT_NORM',
 'CB_NB_CREDIT_CLOSED', 'CB_DAYS_CREDIT',
 'HAS_CHILDREN',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS_2',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE', "DAYS_LAST_PHONE_CHANGE"] #'NAME_INCOME_TYPE',


: 

In [None]:
#iterative_fitter=IterativeFitter(candidates=candidates_predicors, target="TARGET", n_features_max=7, data=data)

: 

In [None]:
#len(iterative_fitter.model_candidates)

: 

In [None]:
#iterative_fitter.get_best_model()

: 

In [None]:
#iterative_fitter.best_auc

: 

In [None]:
#iterative_fitter.best_predictors

: 

In [None]:
#print(iterative_fitter.best_model.summary())

: 

In [None]:
#max(iterative_fitter.aucs)

: 

In [None]:
data["NAME_FAMILY_STATUS_2"]

: 

#### Model estimation 

In [None]:
formula="TARGET ~ C(OCCUPATION_TYPE,Treatment(reference=0)) + C(NAME_EDUCATION_TYPE,Treatment(reference='Non graduated'))  + C(AMT_CREDIT_NORM,Treatment(reference=3)) + C(BORROWER_AGE,Treatment(reference=0)) + C(BORROWER_SENIORITY,Treatment(reference=0)) + C(CB_NB_CREDIT_CLOSED, Treatment(reference=0))+ C(CB_DAYS_CREDIT,Treatment(reference=3)) - 1" # + C(DAYS_LAST_PHONE_CHANGE, Treatment(reference=3)) + C(FAM_STATS_CHILD,Treatment(reference='Single')) + +BORROWER_FIDELITY 
# à spécifier les modalités de références pour avoir 

: 

In [None]:
model_logit=sm.Logit.from_formula(formula=formula,data=data_train).fit()

: 

In [None]:
print(model_logit.summary())

: 

In [None]:
y_train_proba=model_logit.predict(data_train)
gini=2*roc_auc_score(data_train["TARGET"],y_train_proba) - 1
print(f"{gini = :.3f}")

: 

In [None]:
y_test_proba=model_logit.predict(data_test)
gini=2*roc_auc_score(data_test["TARGET"],y_test_proba) - 1
print(f"{gini = :.3f}")

: 

In [None]:
#lol

: 

#### Construction de la grille de score

In [None]:
def grid_score(data_train,results_model_logit, variables_utilisees) -> pd.DataFrame:
    """Build the scorecard (score grid) of a fitted categorical logit model.

    Args:
        data_train: Training frame containing every model variable and a
            ``TARGET`` column.
        results_model_logit: Fitted model results exposing ``params`` and
            ``pvalues`` whose index labels look like ``C(VAR, ...)[T.level]``.
        variables_utilisees: Names of the variables used in the model; each of
            their modalities found in ``data_train`` gets a row, so modalities
            without a fitted parameter appear with a coefficient of 0.

    Returns:
        pd.DataFrame with one row per (Variable, Modalités) and the columns:
        ``Coefficient`` and ``p-value`` (rounded), ``effectif`` (share of the
        training rows, in %), ``Note`` (points: 1000 * (highest coefficient of
        the variable - coefficient) / sum over variables of the coefficient
        ranges), ``Contribution`` (in %, the variable's effectif-weighted
        dispersion of notes relative to the sum over all variables) and
        ``tx_defaut`` (mean of ``TARGET`` in %, on ``data_train``).
    """
    # --- 1. Parse parameter labels into (variable, modality) pairs ---------
    variables_logit = []
    modalites_variables = []
    for ligne in results_model_logit.params.index:
        variables_logit.append(ligne.split(",")[0].replace("C(",""))

        # Turn "...[T.Interval(a, b, closed='right')]" into "(a, b]" and
        # "...[T.x]" into "x", so labels match str() of the data modalities.
        modalite = ligne.split("[")[-1].replace("]","")
        if "T.Interval" in modalite :
            modalite = modalite.replace("T.Interval","")
        if ", closed='right'" in modalite :
            modalite = modalite.replace(", closed='right')", "]")
        if "T." in modalite :
            modalite = modalite.replace("T.","")
        modalites_variables.append(modalite)

    # Coefficients come from the results object passed in, not from a
    # notebook-level global.
    df_coef = pd.DataFrame({'Variable': variables_logit,
                            'Modalités': modalites_variables,
                            'Coefficient' : list(results_model_logit.params),
                            'p-value' : list(results_model_logit.pvalues)})

    # --- 2. One row per modality observed in the training data -------------
    grid = {'Variable':[],'Modalités':[],'effectif':[],}
    for var in variables_utilisees:
        # Iterating the normalized counts directly avoids relying on how
        # reset_index() names its columns across pandas versions.
        for modalite, effectif_pct in data_train[var].value_counts(normalize=True).items():
            grid['Variable'].append(var)
            grid['Modalités'].append(modalite)
            grid['effectif'].append(effectif_pct)
    grid=pd.DataFrame(grid)
    grid['Modalités'] =grid['Modalités'].apply(str)

    # --- 3. Attach coefficients; modalities without a parameter sum to 0 ---
    grid_df=pd.merge(grid,df_coef, on=['Variable', 'Modalités'], how='outer')
    grid_df=grid_df.pivot_table(index=['Variable', 'Modalités'], values=['effectif', 'Coefficient', 'p-value'], aggfunc='sum')
    grid_df=grid_df.reset_index()
    grid_df['Modalités'] =grid_df['Modalités'].apply(str)

    grid_df = grid_df.drop_duplicates(subset=['Variable','Modalités'],keep='last').reset_index(drop=True)

    # --- 4. Points per modality ---------------------------------------------
    sum_diff = sum([coefficients.max() - coefficients.min() for variable, coefficients in grid_df.groupby('Variable')['Coefficient']])
    coef_max_par_variable = grid_df.groupby('Variable')['Coefficient'].transform('max')
    grid_df['Note'] = 1000 * ((coef_max_par_variable - grid_df['Coefficient']) / sum_diff)

    # --- 5. Share of each modality, recomputed from the training data -------
    for var in grid_df['Variable'].unique():
        tmp = data_train[var].value_counts(normalize=True)
        for modal in grid_df[grid_df['Variable'] == var]['Modalités'].unique():
            proportion = tmp[tmp.index.astype(str) == modal].iloc[0]
            grid_df.loc[(grid_df['Variable'] == var) & (grid_df['Modalités'] == modal), 'effectif'] = proportion

    # --- 6. Contribution of each variable -----------------------------------
    # The dispersion is a per-variable quantity, so it is computed once per
    # variable and then broadcast to that variable's rows.
    dispersions = {}
    for var in grid_df['Variable'].unique():
        lignes_var = grid_df.loc[grid_df['Variable'] == var]
        moyenne_var = lignes_var['Note'].mean()
        dispersions[var] = np.sqrt(np.sum([(row['effectif'] * (row['Note'] - moyenne_var)**2) for _, row in lignes_var.iterrows()]))
    denominator = np.sum(list(dispersions.values()))
    grid_df['Contribution'] = [(dispersions[var] / denominator) * 100 for var in grid_df['Variable']]

    # --- 7. Observed default rate per modality -------------------------------
    tx_df = {'Variable':[],'Modalités':[],'tx_defaut':[]}
    for var in grid_df['Variable'].unique():
        modalites_str = data_train[var].apply(str)
        for modalite in grid_df[grid_df['Variable']==var]['Modalités'].unique():
            cible = data_train[modalites_str==modalite]['TARGET']
            tx_defaut = cible.sum()/cible.shape[0] * 100
            tx_df['Variable'].append(var)
            tx_df['Modalités'].append(modalite)
            tx_df['tx_defaut'].append(tx_defaut)
    tx_df= pd.DataFrame(tx_df)

    # --- 8. Final table, rounded for display ---------------------------------
    Grille_score = pd.merge(grid_df, tx_df, on=['Variable', 'Modalités'], how='left')
    Grille_score['Contribution'] = Grille_score['Contribution'].apply(lambda row: round(row,2))
    Grille_score['Note'] = Grille_score['Note'].apply(lambda row: round(row))
    Grille_score['tx_defaut'] = Grille_score['tx_defaut'].apply(lambda row: round(row,2))
    Grille_score['Coefficient'] = Grille_score['Coefficient'].apply(lambda row: round(row,4))
    Grille_score['p-value'] = Grille_score['p-value'].apply(lambda row: round(row,3))
    Grille_score['effectif'] = Grille_score['effectif'].apply(lambda row: round(row*100,1))

    return Grille_score

: 

In [None]:
liste_variables_utilisees = ["OCCUPATION_TYPE", "NAME_EDUCATION_TYPE"  , "AMT_CREDIT_NORM" , "BORROWER_AGE" , "BORROWER_SENIORITY" , "CB_NB_CREDIT_CLOSED", "CB_DAYS_CREDIT"]
grille = grid_score(data_train,model_logit, liste_variables_utilisees)

: 

In [None]:
grille

: 

In [None]:
# Contribution is repeated on every row of a variable: take it once per variable
# and sum. De-duplicating on the value itself would drop a variable whenever two
# variables happen to share the same contribution.
grille.groupby("Variable")["Contribution"].first().sum()

: 

In [None]:
grille.groupby("Variable")["Note"].max().sum()

: 

In [None]:
# import openpyxl
# grille.to_excel("../data/grille_de_score_revolving.xlsx")

: 

In [None]:
def attribute_score(grid_score, data):
    """Add a ``Note`` column to ``data`` by summing the grid notes of each row's modalities.

    Parameters
    ----------
    grid_score : pd.DataFrame
        Score grid with at least the columns ``Variable``, ``Modalités`` and
        ``Note``. ``Modalités`` is compared against the string form of the
        values found in ``data``.
    data : pd.DataFrame
        Observations to score. Must contain one column per distinct value of
        ``grid_score["Variable"]``. Modified in place: the ``Note`` column is
        (re)created from zero on every call, so calling twice gives the same
        result.

    Returns
    -------
    None
        The score is written to ``data["Note"]``. A value whose string form
        matches no modality of the grid contributes 0.
    """
    data['Note'] = 0
    for var in grid_score["Variable"].unique():
        var_grid = grid_score[grid_score['Variable'] == var]
        # String form of the column is the same for every modality: compute it once.
        labels = data[var].apply(str)
        for modalite in var_grid['Modalités'].unique():
            # First matching row wins if the grid lists a modality more than once.
            note = var_grid.loc[var_grid['Modalités'] == str(modalite), 'Note'].values[0]
            data['Note'] = np.where(labels == modalite, data['Note'] + note, data['Note'])

: 

In [None]:
attribute_score(grille,data_train)
attribute_score(grille,data_test)

: 

In [None]:
data_train.shape

: 

In [None]:
data_train["Note"]

: 

In [None]:
def show_conditionnal_density(data:pd.DataFrame, colname:str, dataset_label:str=""):
    """Plot the kernel density of ``colname`` separately for TARGET == 0 and TARGET == 1.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``TARGET`` column (0/1) and the column ``colname``.
    colname : str
        Numeric column whose conditional densities are drawn.
    dataset_label : str, optional
        Name of the dataset appended to the title (e.g. ``"train"`` or
        ``"test"``). Empty by default: the title previously always read
        "sur le test", which was wrong when plotting the training set.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    # One KDE curve per TARGET value, drawn on the same explicit axes.
    for target_value, color in ((0, 'green'), (1, 'red')):
        sns.kdeplot(
            data.loc[data["TARGET"] == target_value, colname],
            color=color,
            label=f'TARGET={target_value}',
            fill=False,
            ax=ax,
        )

    title_suffix = f" sur le {dataset_label}" if dataset_label else ""
    ax.set_title(f"Distribution de {colname}{title_suffix}")
    ax.set_xlabel(f"{colname}")
    # The y axis of a KDE is a density, not a percentage.
    ax.set_ylabel('Densité')
    ax.legend()
    plt.show()

: 

In [None]:
show_conditionnal_density(data_train,"Note")

: 

In [None]:
show_conditionnal_density(data_test,"Note")

: 

#### Segmentation -- CHR

In [None]:
from jenkspy import JenksNaturalBreaks

: 

In [None]:
# Fit a 7-class Jenks natural-breaks segmentation on the training scores ("Note").
segment=JenksNaturalBreaks(n_classes=7)
segment.fit(data_train["Note"].values)

: 

In [None]:
segment.breaks_

: 

In [None]:
segment.goodness_of_variance_fit(data_train["Note"])

: 

In [None]:
# Assign a segment to every observation of both sets with the same fitted `segment` object.
data_train["Segment"]=segment.predict(data_train["Note"].values)
data_test["Segment"]=segment.predict(data_test["Note"].values)

: 

In [None]:
# Export the segmented training set.
# NOTE(review): the notebook reads its inputs from "../data/" but this writes to
# "./data/" — confirm the target directory is the intended one and that it exists.
csv_file_path = "./data/data_seg_train_2020_cash.csv"
# index=False: the DataFrame index is not written to the file.
data_train.to_csv(csv_file_path, index=False)

print(f"DataFrame sauvegardé avec succès sous {csv_file_path}")

: 

In [None]:
# Export the segmented test set.
# NOTE(review): the notebook reads its inputs from "../data/" but this writes to
# "./data/" — confirm the target directory is the intended one and that it exists.
csv_file_path = "./data/data_seg_test_2020_cash.csv"
# index=False: the DataFrame index is not written to the file.
data_test.to_csv(csv_file_path, index=False)

print(f"DataFrame sauvegardé avec succès sous {csv_file_path}")

: 

In [None]:
data_train["Segment"].value_counts(normalize=True)

: 

In [None]:
data_train[["Segment","TARGET"]].groupby("Segment").mean()

: 

In [None]:
def subplot_segment_default_rate(data, title='Répartition des CHR et des taux de défaut par CHR'):
    """Plot, per segment, the share of observations (bars) and the mean TARGET (line).

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``Segment`` and ``TARGET``.
    title : str, optional
        Figure title. Defaults to the original fixed title so existing calls
        are unaffected; pass a dataset-specific title to reuse the function
        for several datasets.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    segment_share = data['Segment'].value_counts(normalize=True).sort_index()
    mean_target_by_segment = data.groupby('Segment')['TARGET'].mean().reset_index()

    fig, ax = plt.subplots(figsize=(10, 6))
    # Explicit x/y: how seaborn interprets a bare Series passed positionally
    # has changed between releases, so do not rely on it.
    sns.barplot(
        x=segment_share.index,
        y=segment_share.values,
        color='lightblue',
        label='Distribution des Segments',
        ax=ax,
    )
    # Secondary y axis for the default rate; target it explicitly rather than
    # relying on twinx() having made it the current axes.
    ax2 = ax.twinx()
    sns.lineplot(
        x='Segment',
        y='TARGET',
        data=mean_target_by_segment,
        marker='o',
        color='red',
        linewidth=2,
        label='Taux de défaut',
        ax=ax2,
    )
    ax.set_ylabel('Taux d\'observations par segment', color='blue')
    # Label colour matches the red line it describes.
    ax2.set_ylabel('Taux de défaut', color='red')
    ax2.set_title(title)
    plt.show()

: 

In [None]:
subplot_segment_default_rate(data_train)

: 

In [None]:
# Training set: share of observations per segment (bars, left axis)
# and mean TARGET per segment (line, right axis).
mean_target_by_segment = data_train.groupby('Segment', as_index=False)['TARGET'].mean()
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data_train['Segment'].value_counts(normalize=True),
    color='lightblue',
    label='Distribution des Segments',
)
ax2 = ax.twinx()
sns.lineplot(
    data=mean_target_by_segment,
    x='Segment',
    y='TARGET',
    marker='o',
    color='red',
    linewidth=2,
    label='Taux de défaut',
    ax=ax2,
)
ax.set_ylabel('Taux d\'observations par segment', color='blue')
ax2.set_ylabel('Taux de défaut', color='blue')
ax2.set_title('Répartition des CHR et des taux de défaut par CHR sur le jeu d\'entrainement')
plt.show()

: 

In [None]:
# Test set: share of observations per segment (bars, left axis)
# and mean TARGET per segment (line, right axis).
mean_target_by_segment = data_test.groupby('Segment', as_index=False)['TARGET'].mean()

# Configure the seaborn style
# sns.set(style="whitegrid")

# Build the chart
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data_test['Segment'].value_counts(normalize=True),
    color='lightblue',
    label='Distribution des Segments',
)
ax2 = ax.twinx()
sns.lineplot(
    data=mean_target_by_segment,
    x='Segment',
    y='TARGET',
    marker='o',
    color='red',
    linewidth=2,
    label='Taux de défaut',
    ax=ax2,
)

# Axis labels and title
ax.set_ylabel('Taux d\'observations par segment', color='blue')
ax2.set_ylabel('Taux de défaut', color='blue')
ax2.set_title('Répartition des CHR et des taux de défaut par CHR sur le jeu de test')

# Render the chart
plt.show()

: 

In [None]:
data_test[["Segment","TARGET"]].groupby("Segment").mean()

: 

In [None]:
show_volume_stability_overtime(data_train,"Segment",0.03)

: 

In [None]:
show_risk_stability_overtime(data_train,"Segment")

: 

#### Model calibration (méthodes d'inférence non paramétrique)

##### MoC C

In [None]:
# Number of bootstrap resamples drawn per segment
num_bootstrap_samples = 1000
moc_c_segment={}
# Seeded generator so the resamples (and therefore MoC C) are reproducible on re-run
bootstrap_rng = np.random.default_rng(42)

def lra_calculation(sample):
    """Return the mean over years of the per-year column means of ``sample``.

    ``sample`` must contain a ``date_annee`` column; every other column is
    averaged within each year, then across years. The result is a NumPy array
    with one value per non-grouping column.
    """
    lra = sample.groupby("date_annee").mean().mean()
    return lra.values

# One pass per segment; 7 matches n_classes of the Jenks segmentation.
# The loop variable is deliberately not named `segment`: that name holds the
# fitted JenksNaturalBreaks object and would otherwise be overwritten by an int.
for seg in range(7):
    # Filter the segment once instead of once per bootstrap draw
    segment_data = data_test.loc[data_test["Segment"]==seg, ["TARGET","date_annee"]]
    sample_size=segment_data.shape[0]

    # Bootstrap resamples: same size as the segment, drawn with replacement
    bootstrap_samples = [
        segment_data.sample(frac=1, replace=True, random_state=bootstrap_rng)
        for _ in range(num_bootstrap_samples)
    ]

    # Long-run average default rate of each resample
    lra_s = [lra_calculation(sample) for sample in bootstrap_samples]

    # MoC C = 90th percentile of the bootstrap distribution minus its mean
    percentile_90 = np.percentile(lra_s, 90)
    mean_rate = np.mean(lra_s)
    moc_c=percentile_90 - mean_rate
    moc_c_segment[seg]=moc_c

    temp_df = pd.DataFrame({
        'Size':[sample_size],
        'Segment': [seg],
        'Percentile_90': [percentile_90],
        'Mean_Rate': [mean_rate],
        "MoC_C": [moc_c]
    })

    print(temp_df)

: 

In [None]:
moc_c_segment

: 

##### MoC A

In [None]:
show_volume_stability_overtime(data_train,"TARGET")

: 

In [None]:
show_volume_stability_overtime(data_test,"TARGET")

: 

In [None]:
# Quantification: relative gap, in percent, between the mean TARGET of the
# observations with date_annee < 2019 and the mean TARGET over the whole of `data`.
deficiency_impact=((data.loc[data["date_annee"]<2019,"TARGET"].mean())/(data["TARGET"].mean()) - 1 )*100

: 

In [None]:
deficiency_impact

: 

In [None]:
# Number of bootstrap resamples drawn per segment
num_bootstrap_samples = 1000
moc_a_segment={}
# Seeded generator so the resamples (and therefore MoC A) are reproducible on re-run
bootstrap_rng = np.random.default_rng(42)

def calculate_adjustment(sample:pd.DataFrame):
    """Return the pre-2019 yearly-average TARGET minus the all-years yearly-average TARGET.

    Both terms are computed the same way: mean TARGET per ``date_annee``, then
    the mean of those yearly means. The first term keeps only rows with
    ``date_annee < 2019``. The result is a one-element NumPy array.
    """
    yearly_cols = ["TARGET","date_annee"]
    lra_before_2019 = sample.loc[sample["date_annee"]<2019, yearly_cols].groupby("date_annee").mean().mean()
    lra_all_years = sample[yearly_cols].groupby("date_annee").mean().mean()
    ajustement = lra_before_2019 - lra_all_years
    return ajustement.values

# One pass per segment; 7 matches n_classes of the Jenks segmentation.
# The loop variable is deliberately not named `segment`: that name holds the
# fitted JenksNaturalBreaks object and would otherwise be overwritten by an int.
for seg in range(7):
    # Filter the segment once instead of once per bootstrap draw
    segment_data = data_test.loc[data_test["Segment"]==seg, ["TARGET","date_annee"]]
    sample_size=segment_data.shape[0]

    # Bootstrap resamples: same size as the segment, drawn with replacement
    bootstrap_samples = [
        segment_data.sample(frac=1, replace=True, random_state=bootstrap_rng)
        for _ in range(num_bootstrap_samples)
    ]

    # Adjustment computed on each resample
    adjustments = [calculate_adjustment(sample) for sample in bootstrap_samples]

    # MoC A = 90th percentile of the bootstrap distribution minus its mean
    percentile_90 = np.percentile(adjustments, 90)
    mean_rate = np.mean(adjustments)
    moc_a= percentile_90 - mean_rate
    moc_a_segment[seg] = moc_a
    # One-row summary for the current segment
    temp_df_a = pd.DataFrame({
        'Size':[sample_size],
        'Segment': [seg],
        'Percentile_90': [percentile_90],
        'Mean_Rate': [mean_rate],
        "MoC_A": [moc_a]
    })

    print(temp_df_a)

: 

In [None]:
moc_a_segment

: 

In [None]:
# Mean TARGET for each (Segment, date_annee) pair of the training set, as a flat table.
lra_=data_train[["date_annee","Segment","TARGET"]].groupby(["Segment","date_annee"]).mean().reset_index()

: 

In [None]:
# Per segment, average the yearly mean TARGET values (each year weighs the same).
lra=lra_[["Segment","TARGET"]].groupby("Segment").mean()

: 

In [None]:
# Per-segment table: LRA (still named TARGET here) plus both margins of conservatism.
# The MoC dicts are keyed by segment id, so turn them into Series indexed by that
# id and let concat align on it, instead of relying on the position of the values
# in the dict matching the row order of `lra`.
summary=pd.concat(
    [
        lra,
        pd.Series(moc_a_segment, name="MOC_A"),
        pd.Series(moc_c_segment, name="MOC_C"),
    ],
    axis=1,
)

: 

In [None]:
# Final PD = sum of the other columns of the row (LRA and the two MoC).
# Any existing "PD" column is excluded first, so re-running this cell does not
# add the previous PD into the new one.
summary["PD"]=summary.drop(columns="PD", errors="ignore").sum(axis=1)

: 

In [None]:
summary.rename(columns={"TARGET":"LRA"}, inplace=True)

: 

In [None]:
summary

: 

: 