## Initial steps

The first thing to do when working with files stored in Google Drive is to mount the drive with the following script:

In [1]:
# Mount Google Drive at /content/drive so that later cells can open files
# stored in the drive through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#@title You can give a title to your cells by using @title before a comment on your cell { run: "auto", form-width: "40%" }

# A trailing "#@param" comment turns the assignment into a form field, so the
# value of the variable can be changed from the form instead of the code.
variable_name = 0 #@param {type:"integer"}

show = True #@param {type:"boolean"}

# The message is printed only while `show` is True.
if show:
    print("If this is being showed, it means the variable show was set as True")


If this is being showed, it means the variable show was set as True


In [3]:
#@title Import python libraries
import numpy as np
import pandas as pd

#libraries useful for plotting graphs
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
#@title **Location** of the dataset
# Folder and file name of the dataset; later cells join them as path+filename.
# NOTE(review): `path` has no leading slash, so it is resolved against the
# current working directory — presumably /content, where the drive is mounted;
# confirm if the working directory is ever changed.
path = "drive/MyDrive/Credit Project Prototypes/Datasets/PAKDD2009/"
filename = 'PAKDDdataset.csv'

## Reading the dataset

### Sometimes it is necessary to specify the encoding of the file you want to read.
[Source of the code below](https://stackoverflow.com/questions/54133455/importing-csv-using-pd-read-csv-invalid-start-byte-error)

In [5]:
#@title Using chardet to find the right **encoding** of a file
import chardet

# Read the raw bytes inside a context manager so the file handle is closed as
# soon as the content is loaded; a bare open(...).read() leaves the handle
# open until the garbage collector happens to reclaim it.
with open(path+filename, 'rb') as raw_file:
    rawdata = raw_file.read()

# chardet.detect returns a dict; its 'encoding' entry holds the guessed
# encoding name, which is reused when the CSV is read.
result = chardet.detect(rawdata)
charenc = result['encoding']
print(charenc)

ascii


In [6]:
#@title Reading the dataset from a **csv** file
# The file is parsed as tab-separated (sep='\t') and decoded with the encoding
# stored in `charenc`.
df = pd.read_csv(path+filename,sep='\t', encoding=charenc)

# Basic info of the dataset

Functions:
*   df.columns
*   df.shape
*   df.head()
*   df.info()
*   df.describe()
*   df[col].unique()
*   df[col].value_counts()
*   df[col].dtype
*   max(df[col])
*   min(df[col])
*   df[col].mean()



In [7]:
#@title Show the **columns** of the dataset
# df.columns is the Index holding every column label of the dataframe.
print(df.columns)

Index(['ID_SHOP', 'SEX', 'MARITAL_STATUS', 'AGE', 'FLAG_RESIDENCIAL_PHONE',
       'AREA_CODE_RESIDENCIAL_PHONE', 'PAYMENT_DAY', 'SHOP_RANK',
       'RESIDENCE_TYPE', 'MONTHS_IN_RESIDENCE', 'FLAG_MOTHERS_NAME',
       'FLAG_FATHERS_NAME', 'FLAG_RESIDENCE_TOWN_eq_WORKING_TOWN',
       'FLAG_RESIDENCE_STATE_eq_WORKING_STATE', 'MONTHS_IN_THE_JOB',
       'PROFESSION_CODE', 'MATE_INCOME',
       'FLAG_RESIDENCIAL_ADDRESS_eq_POSTAL_ADDRESS', 'FLAG_OTHER_CARD',
       'QUANT_BANKING_ACCOUNTS', 'FLAG_MOBILE_PHONE', 'FLAG_CONTACT_PHONE',
       'PERSONAL_NET_INCOME', 'COD_APPLICATION_BOOTH',
       'QUANT_ADDITIONAL_CARDS_IN_THE_APPLICATION',
       'FLAG_CARD_INSURANCE_OPTION', 'TARGET_LABEL_BAD=1', 'Unnamed: 27'],
      dtype='object')


In [8]:
#@title Show the **shape** of the dataset
# df.shape is a (rows, columns) tuple: show it as a whole, then unpack it once
# and report each dimension on its own line.
print(df.shape)
row_count, column_count = df.shape
print('The number of rows is:', row_count)
print('The number of columns is:', column_count)

(40000, 28)
The number of rows is: 40000
The number of columns is: 28


In [9]:
#@title Show the first rows of the dataset with **head** function
# n_rows is a form field: it sets how many rows from the top are displayed.
n_rows = 5 #@param {type:"integer"}
df.head(n_rows)

Unnamed: 0,ID_SHOP,SEX,MARITAL_STATUS,AGE,FLAG_RESIDENCIAL_PHONE,AREA_CODE_RESIDENCIAL_PHONE,PAYMENT_DAY,SHOP_RANK,RESIDENCE_TYPE,MONTHS_IN_RESIDENCE,...,FLAG_OTHER_CARD,QUANT_BANKING_ACCOUNTS,FLAG_MOBILE_PHONE,FLAG_CONTACT_PHONE,PERSONAL_NET_INCOME,COD_APPLICATION_BOOTH,QUANT_ADDITIONAL_CARDS_IN_THE_APPLICATION,FLAG_CARD_INSURANCE_OPTION,TARGET_LABEL_BAD=1,Unnamed: 27
0,22,F,O,44,N,31,12,0,P,12,...,N,0,N,N,300,0,0,N,0,
1,15,F,S,18,Y,31,20,0,P,216,...,N,0,N,N,300,0,0,N,0,
2,24,F,C,22,Y,31,8,0,P,48,...,N,0,N,N,229,0,0,N,0,
3,12,F,C,47,N,31,25,0,P,180,...,N,0,N,N,304,0,0,N,0,
4,16,F,S,28,Y,31,25,0,O,12,...,N,0,N,N,250,0,0,N,0,


In [10]:
#@title Show the basic info of all columns of the dataset with the **info** function{ vertical-output: true, form-width: "70%" }
df.info()

# The form settings in the title (vertical-output, form-width) are used in this
# cell as an opportunity to demonstrate a different way of laying out the
# outputs of a cell.

















<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 28 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   ID_SHOP                                     40000 non-null  int64  
 1   SEX                                         39999 non-null  object 
 2   MARITAL_STATUS                              40000 non-null  object 
 3   AGE                                         40000 non-null  int64  
 4   FLAG_RESIDENCIAL_PHONE                      40000 non-null  object 
 5   AREA_CODE_RESIDENCIAL_PHONE                 40000 non-null  int64  
 6   PAYMENT_DAY                                 40000 non-null  int64  
 7   SHOP_RANK                                   40000 non-null  int64  
 8   RESIDENCE_TYPE                              40000 non-null  object 
 9   MONTHS_IN_RESIDENCE                         40000 non-null  int64  
 10  FLAG_MOTHE

In [11]:
#@title Show the basic statistics of all numeric columns of the dataset with the **describe** function
# The result has one column per numeric column of df and one row per statistic
# (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,ID_SHOP,AGE,AREA_CODE_RESIDENCIAL_PHONE,PAYMENT_DAY,SHOP_RANK,MONTHS_IN_RESIDENCE,MONTHS_IN_THE_JOB,PROFESSION_CODE,MATE_INCOME,QUANT_BANKING_ACCOUNTS,COD_APPLICATION_BOOTH,QUANT_ADDITIONAL_CARDS_IN_THE_APPLICATION,Unnamed: 27
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,12.0
mean,20.872575,34.633575,33.824475,15.327675,0.016875,153.4062,50.6772,482.284975,54.502327,0.0,0.35475,0.150175,0.166667
std,14.621861,13.055208,10.389486,7.165507,0.217581,136.421218,74.242608,382.038576,931.590036,0.0,31.415285,0.40457,0.389249
min,1.0,15.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12.0,23.0,31.0,9.0,0.0,36.0,12.0,79.0,0.0,0.0,0.0,0.0,0.0
50%,21.0,33.0,31.0,12.0,0.0,120.0,24.0,514.0,0.0,0.0,0.0,0.0,0.0
75%,24.0,43.0,31.0,20.0,0.0,240.0,60.0,864.0,0.0,0.0,0.0,0.0,0.0
max,96.0,95.0,69.0,28.0,3.0,1188.0,1176.0,999.0,150000.0,0.0,5000.0,3.0,1.0


In [12]:
#@title Show the unique values of a column with the **unique** function
# The returned array lists each distinct value once; a missing value shows up as nan.
df['SEX'].unique()

array(['F', 'M', nan], dtype=object)

In [13]:
#@title Show the number of each unique value in a column with the **value_counts** function
# Counts how many rows hold each distinct value; missing values (NaN) are not
# included in the counts.
df["SEX"].value_counts()

F    27842
M    12157
Name: SEX, dtype: int64

In [14]:
#@title Show the type of the values of a column with the **dtype** attribute
# dtype is an attribute of the column (no call parentheses).
df['SEX'].dtype

dtype('O')

The dtype attribute identifies each kind of value with a different one-character code:
* 'b'&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;boolean
* 'i'&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;(signed) integer
* 'u'&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;unsigned integer
* 'f'&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;floating-point
* 'c'&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;complex-floating point
* 'O'&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;(Python) objects
* 'S', 'a'&nbsp; &nbsp; &nbsp;(byte-)string
* 'U'&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;Unicode
* 'V'&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;raw data (void)

[//]: # (This may be the most platform independent markdown comment)
[//]: # (&nbsp; adds a markdown space between words)

[Source comment](https://stackoverflow.com/questions/4823468/comments-in-markdown)

[Source dtype](https://stackoverflow.com/questions/37561991/what-is-dtypeo-in-pandas)

[Source markdown spaces](https://steemit.com/markdown/@jamesanto/how-to-add-multiple-spaces-between-texts-in-markdown)


In [15]:
#@title Show **maximum value**, **minimum value** and **mean value** of a column { vertical-output: true }
# Use the vectorised Series reductions instead of the Python built-ins: the
# built-in max()/min() walk the column element by element and give
# order-dependent results when a NaN is present, whereas Series.max()/min()
# skip missing values, just like Series.mean() does.
print("The max value of df['AGE'] is:",df['AGE'].max())
print("The min value of df['AGE'] is:", df['AGE'].min())
print("The mean value of df['AGE'] is:",df['AGE'].mean())

The max value of df['AGE'] is: 95
The min value of df['AGE'] is: 15
The mean value of df['AGE'] is: 34.633575


In [16]:
#@title Iterating over all the dataframe columns to show all the unique values of each one { vertical-output: true }
# Per-column report:
#   * fewer than 100 distinct values -> print every distinct value;
#   * otherwise -> print a placeholder, followed by min/max/mean when the
#     column is not of object dtype.
for c in df.columns:
    column = df[c]
    unique_values = column.unique()  # computed once, reused for the test and the printout
    print(c,':')
    if len(unique_values) < 100:
        print(unique_values)
    else:
        print('[Two many values to unpack]')
        if column.dtype != 'O':
            # Series.min()/max() are vectorised and skip NaN; the built-in
            # min()/max() loop in Python and are order-dependent with NaN.
            print('Min:', column.min(),'Max:', column.max(),'Mean:', column.mean())
    print()

ID_SHOP :
[22 15 24 12 16 55  6  3 23 25  7 19 96  9  1 21 20 14 11 66 13 17  8 10
 18  4  2  5 50 81 77]

SEX :
['F' 'M' nan]

MARITAL_STATUS :
['O' 'S' 'C' 'V' 'D']

AGE :
[44 18 22 47 28 26 21 27 57 53 32 36 49 46 20 17 41 64 71 33 24 23 40 31
 30 39 55 66 43 34 35 19 42 62 38 45 50 75 58 61 56 51 37 48 52 25 74 65
 29 54 60 16 59 72 69 73 68 67 76 82 78 63 77 70 79 80 86 15 83 88 81 84
 95]

FLAG_RESIDENCIAL_PHONE :
['N' 'Y']

AREA_CODE_RESIDENCIAL_PHONE :
[31 23  1 27 50  5 34 33 49 24 32 68 62 38 56 29 15 41 42  8 46 44 45 18
  6 17 12  2 14 39 54 22 43 59 53 26  7 52  9 37 36 35 69 25 61 10 58  3
 40 19 20 48 67 30 13 28 11 60 21]

PAYMENT_DAY :
[12 20  8 25 28 18  3  1 23 22  9 16 11  6 27 15]

SHOP_RANK :
[0 3 2]

RESIDENCE_TYPE :
['P' 'O' 'A' 'C']

MONTHS_IN_RESIDENCE :
[  12  216   48  180    0   24   60  120  240   72  360  264   96  396
  276   36  228  168  252   84  300  144  408  108  156  288  336  132
  372  192  492  456  384  324  204  480  612  540  528  576  444  

# **Query** Function

Functions:
*   df.query
*   df.astype


In [17]:
# Copy 'Unnamed: 27' into a column whose name is a plain identifier, so that
# the following df.query call can refer to it directly as `Unnamed`.
df['Unnamed'] = df['Unnamed: 27']

In [18]:
# NaN is never equal to itself, so "Unnamed != Unnamed" is True only where the
# value is missing: this keeps the rows with NaN in `Unnamed` and discards the
# rows that do hold a value in that column.
df = df.query("Unnamed != Unnamed") 

In [19]:
#@title Use **query** function to consult what samples of the dataset respect certain condition
# df.query returns a dataframe holding only the rows that satisfy the expression.
# Since NaN != NaN, "SEX == SEX" is False exactly for the rows where SEX is
# missing, so this removes the rows with a NaN in SEX and keeps all the others.
df = df.query("SEX == SEX")

### A cell containing a np.nan value will not be equal to anything, including another np.nan value
[Source](https://stackoverflow.com/questions/26535563/querying-for-nan-and-other-names-in-pandas)


In [20]:
#@title the **query** function works with variables
# Inside the query string, the "@" prefix refers to a Python variable
# (here mean_age) rather than to a column of the dataframe.
mean_age = df['AGE'].mean()
df.query("AGE < @mean_age").head()

Unnamed: 0,ID_SHOP,SEX,MARITAL_STATUS,AGE,FLAG_RESIDENCIAL_PHONE,AREA_CODE_RESIDENCIAL_PHONE,PAYMENT_DAY,SHOP_RANK,RESIDENCE_TYPE,MONTHS_IN_RESIDENCE,...,QUANT_BANKING_ACCOUNTS,FLAG_MOBILE_PHONE,FLAG_CONTACT_PHONE,PERSONAL_NET_INCOME,COD_APPLICATION_BOOTH,QUANT_ADDITIONAL_CARDS_IN_THE_APPLICATION,FLAG_CARD_INSURANCE_OPTION,TARGET_LABEL_BAD=1,Unnamed: 27,Unnamed
1,15,F,S,18,Y,31,20,0,P,216,...,0,N,N,300,0,0,N,0,,
2,24,F,C,22,Y,31,8,0,P,48,...,0,N,N,229,0,0,N,0,,
4,16,F,S,28,Y,31,25,0,O,12,...,0,N,N,250,0,0,N,0,,
5,24,M,S,26,N,31,28,0,P,180,...,0,N,N,800,0,0,N,0,,
6,55,F,S,22,Y,31,12,0,A,0,...,0,N,N,410,0,0,N,0,,


In [21]:
#@title Using the **astype** function to convert the type of a column

print("The type of PERSONAL_NET_INCOME columns is:",df['PERSONAL_NET_INCOME'].dtype)

# Flag every entry that is present but cannot be parsed as a number.
# Looking only for the literal 'N' lets any other non-numeric token through,
# and astype(float) below would then raise a ValueError. Entries that are
# already missing (NaN) are not flagged, so they are kept as before.
income_as_number = pd.to_numeric(df['PERSONAL_NET_INCOME'], errors='coerce')
not_numeric = income_as_number.isna() & df['PERSONAL_NET_INCOME'].notna()

print("The number of 'not numeric' values in this columns is: ", int(not_numeric.sum()))

df = df.loc[~not_numeric] #here we are discarding the rows that have 'not numeric' values for this column

df = df.astype({'PERSONAL_NET_INCOME':float}) #here we are converting the type of this column from object to float

The type of PERSONAL_NET_INCOME columns is: object
The number of 'not numeric' values in this columns is:  0


# **Where** function

Functions:
*   np.where

In [22]:
#@title Defining our threshold for personal net income { run: "auto" }
threshold = 10000 #@param {type:"slider", min:1000, max:100000, step:1000}

# Count the samples on each side of the threshold first, then report both counts.
at_or_above_count = len(df.query("PERSONAL_NET_INCOME >= @threshold"))
below_count = len(df.query("PERSONAL_NET_INCOME < @threshold"))

print("We verify that only", at_or_above_count, "samples have a personal net income bigger than {}".format(threshold))
print("We verify that", below_count, "samples have a personal net income smaller than {}".format(threshold))

We verify that only 36 samples have a personal net income bigger than 10000
We verify that 39951 samples have a personal net income smaller than 10000


In [23]:
#@title Using the **where** function to change the values of a column according to a certain condition 
# Values below the threshold are kept unchanged; every value that is not below
# the threshold is replaced by twice the value of the threshold.
income = df["PERSONAL_NET_INCOME"]
capped_income = np.where(income < threshold, income, 2 * threshold)
df["PERSONAL_NET_INCOME"] = capped_income

In [24]:
# Names of the Y/N flag columns that are converted together in the next step
columns_flag = [
    "FLAG_RESIDENCIAL_PHONE",
    "FLAG_MOTHERS_NAME",
    "FLAG_FATHERS_NAME",
    "FLAG_RESIDENCE_TOWN_eq_WORKING_TOWN",
    "FLAG_RESIDENCE_STATE_eq_WORKING_STATE",
    "FLAG_RESIDENCIAL_ADDRESS_eq_POSTAL_ADDRESS",
]

In [25]:
#@title Using the where function in multiple columns with similar values
# Build the 0/1 version of every flag column ('Y' -> 1, anything else -> 0)
# and write them all back in a single assign call.
encoded_flags = {
    flag: np.where(df[flag] == 'Y', 1, 0)
    for flag in columns_flag
}
df = df.assign(**encoded_flags)

In [26]:
#@title The where function is very useful to convert categorical values to numerical values in some cases
# Encode SEX as integers: 'M' becomes 0 and every other value becomes 1.
# NOTE(review): presumably only 'F' remains besides 'M' once the NaN rows were
# filtered out earlier — confirm, since any unexpected value would also map to 1.
df["SEX"] = np.where(df["SEX"] == 'M', 0, 1)

# **Drop** Function

Functions:
*   df.drop

In [27]:
#@title Adding all the columns that only have one value to the **drop list**
# A column whose unique() array has a single entry holds the same value in every row.
to_drop = [name for name in df.columns if len(df[name].unique()) == 1]

print(to_drop)

['FLAG_OTHER_CARD', 'QUANT_BANKING_ACCOUNTS', 'FLAG_MOBILE_PHONE', 'FLAG_CONTACT_PHONE', 'COD_APPLICATION_BOOTH', 'FLAG_CARD_INSURANCE_OPTION', 'Unnamed: 27', 'Unnamed']


In [28]:
#@title Using the **drop** function to drop the selected columns
# drop returns a new dataframe without the listed columns; rebind df to it.
df = df.drop(columns=to_drop)

# Checking the current state of the dataset

Functions:
*   df.columns
*   df.shape
*   df.head()
*   df.info()
*   df.describe()
*   df[col].unique()

In [29]:
# Column labels that remain after the drop step.
print(df.columns)

Index(['ID_SHOP', 'SEX', 'MARITAL_STATUS', 'AGE', 'FLAG_RESIDENCIAL_PHONE',
       'AREA_CODE_RESIDENCIAL_PHONE', 'PAYMENT_DAY', 'SHOP_RANK',
       'RESIDENCE_TYPE', 'MONTHS_IN_RESIDENCE', 'FLAG_MOTHERS_NAME',
       'FLAG_FATHERS_NAME', 'FLAG_RESIDENCE_TOWN_eq_WORKING_TOWN',
       'FLAG_RESIDENCE_STATE_eq_WORKING_STATE', 'MONTHS_IN_THE_JOB',
       'PROFESSION_CODE', 'MATE_INCOME',
       'FLAG_RESIDENCIAL_ADDRESS_eq_POSTAL_ADDRESS', 'PERSONAL_NET_INCOME',
       'QUANT_ADDITIONAL_CARDS_IN_THE_APPLICATION', 'TARGET_LABEL_BAD=1'],
      dtype='object')


In [30]:
# df.shape is a (rows, columns) tuple: show it as a whole, then unpack it once
# and report each dimension on its own line.
print(df.shape)
row_count, column_count = df.shape
print('The number of rows is:', row_count)
print('The number of columns is:', column_count)

(39987, 21)
The number of rows is: 39987
The number of columns is: 21


In [31]:
# n_rows is a form field: it sets how many rows from the top are displayed.
n_rows = 5 #@param {type:"integer"}
df.head(n_rows)

Unnamed: 0,ID_SHOP,SEX,MARITAL_STATUS,AGE,FLAG_RESIDENCIAL_PHONE,AREA_CODE_RESIDENCIAL_PHONE,PAYMENT_DAY,SHOP_RANK,RESIDENCE_TYPE,MONTHS_IN_RESIDENCE,...,FLAG_FATHERS_NAME,FLAG_RESIDENCE_TOWN_eq_WORKING_TOWN,FLAG_RESIDENCE_STATE_eq_WORKING_STATE,MONTHS_IN_THE_JOB,PROFESSION_CODE,MATE_INCOME,FLAG_RESIDENCIAL_ADDRESS_eq_POSTAL_ADDRESS,PERSONAL_NET_INCOME,QUANT_ADDITIONAL_CARDS_IN_THE_APPLICATION,TARGET_LABEL_BAD=1
0,22,1,O,44,0,31,12,0,P,12,...,1,0,1,48,731,0.0,1,300.0,0,0
1,15,1,S,18,1,31,20,0,P,216,...,1,1,1,12,853,0.0,1,300.0,0,0
2,24,1,C,22,1,31,8,0,P,48,...,0,1,1,12,40,0.0,1,229.0,0,0
3,12,1,C,47,0,31,25,0,P,180,...,1,0,1,24,35,0.0,1,304.0,0,0
4,16,1,S,28,1,31,25,0,O,12,...,1,1,1,12,24,0.0,1,250.0,0,0


In [32]:
# Non-null count and dtype of every column of the processed dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39987 entries, 0 to 39999
Data columns (total 21 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   ID_SHOP                                     39987 non-null  int64  
 1   SEX                                         39987 non-null  int64  
 2   MARITAL_STATUS                              39987 non-null  object 
 3   AGE                                         39987 non-null  int64  
 4   FLAG_RESIDENCIAL_PHONE                      39987 non-null  int64  
 5   AREA_CODE_RESIDENCIAL_PHONE                 39987 non-null  int64  
 6   PAYMENT_DAY                                 39987 non-null  int64  
 7   SHOP_RANK                                   39987 non-null  int64  
 8   RESIDENCE_TYPE                              39987 non-null  object 
 9   MONTHS_IN_RESIDENCE                         39987 non-null  int64  
 10  FLAG_MOTHE

In [33]:
# Summary statistics of the numeric columns of the processed dataframe.
df.describe()

Unnamed: 0,ID_SHOP,SEX,AGE,FLAG_RESIDENCIAL_PHONE,AREA_CODE_RESIDENCIAL_PHONE,PAYMENT_DAY,SHOP_RANK,MONTHS_IN_RESIDENCE,FLAG_MOTHERS_NAME,FLAG_FATHERS_NAME,FLAG_RESIDENCE_TOWN_eq_WORKING_TOWN,FLAG_RESIDENCE_STATE_eq_WORKING_STATE,MONTHS_IN_THE_JOB,PROFESSION_CODE,MATE_INCOME,FLAG_RESIDENCIAL_ADDRESS_eq_POSTAL_ADDRESS,PERSONAL_NET_INCOME,QUANT_ADDITIONAL_CARDS_IN_THE_APPLICATION
count,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0,39987.0
mean,20.871033,0.695976,34.629004,0.816215,33.823968,15.327982,0.01688,153.396954,0.996299,0.959012,0.456323,0.991447,50.649561,482.297572,54.520046,0.978243,634.70724,0.150224
std,14.619097,0.459999,13.052142,0.387313,10.390134,7.165121,0.217616,136.426384,0.060726,0.198266,0.498095,0.092086,74.203456,382.053854,931.740941,0.145891,860.341725,0.404626
min,1.0,0.0,15.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12.0,0.0,23.0,1.0,31.0,9.0,0.0,36.0,1.0,1.0,0.0,1.0,12.0,79.0,0.0,1.0,270.0,0.0
50%,21.0,1.0,33.0,1.0,31.0,12.0,0.0,120.0,1.0,1.0,0.0,1.0,24.0,514.0,0.0,1.0,400.0,0.0
75%,24.0,1.0,43.0,1.0,31.0,20.0,0.0,240.0,1.0,1.0,1.0,1.0,60.0,864.0,0.0,1.0,742.0,0.0
max,96.0,1.0,95.0,1.0,69.0,28.0,3.0,1188.0,1.0,1.0,1.0,1.0,1176.0,999.0,150000.0,1.0,20000.0,3.0


In [34]:
# Per-column report on the processed dataframe:
#   * fewer than 100 distinct values -> print every distinct value;
#   * otherwise -> print a placeholder, followed by min/max/mean when the
#     column is not of object dtype.
for c in df.columns:
    column = df[c]
    unique_values = column.unique()  # computed once, reused for the test and the printout
    print(c,':')
    if len(unique_values) < 100:
        print(unique_values)
    else:
        print('[Two many values to unpack]')
        if column.dtype != 'O':
            # Series.min()/max() are vectorised and skip NaN; the built-in
            # min()/max() loop in Python and are order-dependent with NaN.
            print('Min:', column.min(),'Max:', column.max(),'Mean:', column.mean())
    print()

ID_SHOP :
[22 15 24 12 16 55  6  3 23 25  7 19 96  9  1 21 20 14 11 66 13 17  8 10
 18  4  2  5 50 81 77]

SEX :
[1 0]

MARITAL_STATUS :
['O' 'S' 'C' 'V' 'D']

AGE :
[44 18 22 47 28 26 21 27 57 53 32 36 49 46 20 17 41 64 71 33 24 23 40 31
 30 39 55 66 43 34 35 19 42 62 38 45 50 75 58 61 56 51 37 48 52 25 74 65
 29 54 60 16 59 72 69 73 68 67 76 82 78 63 77 70 79 80 86 15 83 88 81 84
 95]

FLAG_RESIDENCIAL_PHONE :
[0 1]

AREA_CODE_RESIDENCIAL_PHONE :
[31 23  1 27 50  5 34 33 49 24 32 68 62 38 56 29 15 41 42  8 46 44 45 18
  6 17 12  2 14 39 54 22 43 59 53 26  7 52  9 37 36 35 69 25 61 10 58  3
 40 19 20 48 67 30 13 28 11 60 21]

PAYMENT_DAY :
[12 20  8 25 28 18  3  1 23 22  9 16 11  6 27 15]

SHOP_RANK :
[0 3 2]

RESIDENCE_TYPE :
['P' 'O' 'A' 'C']

MONTHS_IN_RESIDENCE :
[  12  216   48  180    0   24   60  120  240   72  360  264   96  396
  276   36  228  168  252   84  300  144  408  108  156  288  336  132
  372  192  492  456  384  324  204  480  612  540  528  576  444  564
  420  3

# Ignore

In [None]:
# df.to_csv(path+'BR_process.csv')

In [None]:
# df.columns

In [None]:
# c_list = ['ID_SHOP', 'SEX', 'AREA_CODE_RESIDENCIAL_PHONE', 'PROFESSION_CODE']

In [None]:
# df2 = df[df.columns.difference(c_list)]

In [None]:
# df2.info()

In [None]:
# df['TARGET_LABEL_BAD=1'] = np.where(df['TARGET_LABEL_BAD=1'] == '0', 0, 1)

In [None]:
# plt.figure(figsize=(14, 12))

# plt.subplot(221)
# ax1 = sns.histplot(data=df, x='PAYMENT_DAY', hue='TARGET_LABEL_BAD=1', multiple='stack', palette='tab10', kde=True)
# ax1.set_title("Payment Day Distribution", fontsize=20)

# plt.subplot(222)
# ax2 = sns.histplot(data=df, x='MONTHS_IN_RESIDENCE', hue='TARGET_LABEL_BAD=1', multiple='stack', palette='tab10', kde=True)
# ax2.set_title("MONTHS_IN_RESIDENCE Distribution", fontsize=20)

# plt.subplot(223)
# ax3 = sns.histplot(data=df, x='RESIDENCE_TYPE', hue='TARGET_LABEL_BAD=1', multiple='stack', palette='tab10', kde=True)
# ax3.set_title("'RESIDENCE_TYPE' Distribution", fontsize=20)

# plt.show()