In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patches
import seaborn as sns

import warnings

# Statistics functions
from scipy.stats import norm
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats import chi2

In [2]:
# Suppressing a warning 
warnings.filterwarnings("ignore") 

# It is a magic function that renders the figure in the notebook
%matplotlib inline 

# Changing the figure size of a seaborn axes 
sns.set(rc={"figure.figsize": (20, 15)})

# The style parameters control properties like the color of the background and whether a grid is enabled by default.
sns.set_style("whitegrid")

ID
Represents a unique identification of an entry

Customer_ID
Represents a unique identification of a person

Month
Represents the month of the year

Name
Represents the name of a person

Age
Represents the age of the person

SSN
Represents the social security number of a person

Occupation
Represents the occupation of the person

Annual_Income
Represents the annual income of the person

Monthly_Inhand_Salary
Represents the monthly base salary of a person

Num_Bank_Accounts
Represents the number of bank accounts a person holds

Num_Credit_Card
Represents the number of other credit cards held by a person

Interest_Rate
Represents the interest rate on credit card

Num_of_Loan
Represents the number of loans taken from the bank

Type_of_Loan
Represents the types of loan taken by a person

Delay_from_due_date
Represents the average number of days delayed from the payment date

Num_of_Delayed_Payment
Represents the average number of payments delayed by a person

Changed_Credit_Limit
Represents the percentage change in credit card limit

Num_Credit_Inquiries
Represents the number of credit card inquiries

Credit_Mix
Represents the classification of the mix of credits

Outstanding_Debt
Represents the remaining debt to be paid (in USD)

Credit_Utilization_Ratio
Represents the utilization ratio of credit card

Credit_History_Age
Represents the age of credit history of the person

Payment_of_Min_Amount
Represents whether only the minimum amount was paid by the person

Total_EMI_per_month
Represents the monthly EMI payments (in USD)

Amount_invested_monthly
Represents the monthly amount invested by the customer (in USD)

Payment_Behaviour
Represents the payment behavior of the customer (in USD)

Monthly_Balance
Represents the monthly balance amount of the customer (in USD)

Credit_Score
Represents the bracket of credit score (Poor, Standard, Good)

## Loading the Data sets

In [3]:
# train set
df_train = pd.read_csv("train.csv")

In [6]:
df_train.head().T

Unnamed: 0,0,1,2,3,4
ID,0x1602,0x1603,0x1604,0x1605,0x1606
Customer_ID,CUS_0xd40,CUS_0xd40,CUS_0xd40,CUS_0xd40,CUS_0xd40
Month,January,February,March,April,May
Name,Aaron Maashoh,Aaron Maashoh,Aaron Maashoh,Aaron Maashoh,Aaron Maashoh
Age,23,23,-500,23,23
SSN,821-00-0265,821-00-0265,821-00-0265,821-00-0265,821-00-0265
Occupation,Scientist,Scientist,Scientist,Scientist,Scientist
Annual_Income,19114.12,19114.12,19114.12,19114.12,19114.12
Monthly_Inhand_Salary,1824.843333,,,,1824.843333
Num_Bank_Accounts,3,3,3,3,3


In [8]:
df_train.shape

(100000, 28)

In [9]:
# test set

df_test = pd.read_csv("test.csv")

In [10]:
df_test.head().T

Unnamed: 0,0,1,2,3,4
ID,0x160a,0x160b,0x160c,0x160d,0x1616
Customer_ID,CUS_0xd40,CUS_0xd40,CUS_0xd40,CUS_0xd40,CUS_0x21b1
Month,September,October,November,December,September
Name,Aaron Maashoh,Aaron Maashoh,Aaron Maashoh,Aaron Maashoh,Rick Rothackerj
Age,23,24,24,24_,28
SSN,821-00-0265,821-00-0265,821-00-0265,821-00-0265,004-07-5839
Occupation,Scientist,Scientist,Scientist,Scientist,_______
Annual_Income,19114.12,19114.12,19114.12,19114.12,34847.84
Monthly_Inhand_Salary,1824.843333,1824.843333,1824.843333,,3037.986667
Num_Bank_Accounts,3,3,3,3,2


In [11]:
df_test.shape

(50000, 27)

In [13]:
# Checking if column headings are the same in both data set
dif_1 = [x for x in df_train.columns if x not in df_test.columns]
dif_1

['Credit_Score']

In [14]:
dif_2 = [x for x in df_test.columns if x not in df_train.columns]
dif_2

[]

## Checking missing data

In [15]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

In [16]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        50000 non-null  object 
 1   Customer_ID               50000 non-null  object 
 2   Month                     50000 non-null  object 
 3   Name                      44985 non-null  object 
 4   Age                       50000 non-null  object 
 5   SSN                       50000 non-null  object 
 6   Occupation                50000 non-null  object 
 7   Annual_Income             50000 non-null  object 
 8   Monthly_Inhand_Salary     42502 non-null  float64
 9   Num_Bank_Accounts         50000 non-null  int64  
 10  Num_Credit_Card           50000 non-null  int64  
 11  Interest_Rate             50000 non-null  int64  
 12  Num_of_Loan               50000 non-null  object 
 13  Type_of_Loan              44296 non-null  object 
 14  Delay_

In [17]:
# Drop the 'ID' column from the train set

df_train.drop(["ID"], axis=1, inplace=True)

In [23]:
# Save the list of 'Id' before dropping it from the test set

ID_test_list = df_test["ID"].tolist()
df_test.drop(["ID"], axis=1, inplace=True)

## Numeriacl Features

### <font color='darkblue'>I.2.1. Exploring and cleaning numerical features</font>

In [24]:
df_train_num = df_train.select_dtypes(exclude=["object"])
df_train_num

Unnamed: 0,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_Credit_Inquiries,Credit_Utilization_Ratio,Total_EMI_per_month
0,1824.843333,3,4,3,3,4.0,26.822620,49.574949
1,,3,4,3,-1,4.0,31.944960,49.574949
2,,3,4,3,3,4.0,28.609352,49.574949
3,,3,4,3,5,4.0,31.377862,49.574949
4,1824.843333,3,4,3,6,4.0,24.797347,49.574949
...,...,...,...,...,...,...,...,...
99995,3359.415833,4,6,7,23,3.0,34.663572,35.104023
99996,3359.415833,4,6,7,18,3.0,40.565631,35.104023
99997,3359.415833,4,6,5729,27,3.0,41.255522,35.104023
99998,3359.415833,4,6,7,20,3.0,33.638208,35.104023


In [28]:
for col in df_train.columns :
    print(col)
    print(df_train[col].value_counts(dropna=False))
    print("##########################")

Customer_ID
CUS_0xd40     8
CUS_0x9bf4    8
CUS_0x5ae3    8
CUS_0xbe9a    8
CUS_0x4874    8
             ..
CUS_0x2eb4    8
CUS_0x7863    8
CUS_0x9d89    8
CUS_0xc045    8
CUS_0x942c    8
Name: Customer_ID, Length: 12500, dtype: int64
##########################
Month
January     12500
February    12500
March       12500
April       12500
May         12500
June        12500
July        12500
August      12500
Name: Month, dtype: int64
##########################
Name
NaN               9985
Stevex              44
Langep              44
Jessicad            39
Vaughanl            39
                  ... 
Robin Pomeroyz       4
Matt Scuffhamk       4
Julieno              4
Bavierq              4
Timothyl             3
Name: Name, Length: 10140, dtype: int64
##########################
Age
38      2833
28      2829
31      2806
26      2792
32      2749
        ... 
471        1
1520       1
8663       1
3363       1
1342       1
Name: Age, Length: 1788, dtype: int64
#########################

In [30]:
Kontrol edilecek sütunlar
Age
Annual_Income
Num_Bank_Accounts
Num_Credit_Card
Interest_Rate
Num_of_Loan
Num_of_Delayed_Payment
Changed_Credit_Limit
Num_Credit_Inquiries
Num_Credit_Inquiries
Credit_Mix
Outstanding_Debt
Credit_History_Age
Total_EMI_per_month
Amount_invested_monthly
Payment_Behaviour
Monthly_Balance

SyntaxError: invalid syntax (Temp/ipykernel_19460/569891089.py, line 1)

In [34]:
df_train.sample(10)

Unnamed: 0,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
1772,CUS_0x46d4,May,,43,038-09-8528,Scientist,29042.45,2252.204167,3,5,...,Standard,1092.55,28.10245,24 Years and 0 Months,Yes,16.756707,205.5976542885117,Low_spent_Large_value_payments,272.866055840375,Standard
60167,CUS_0xbb43,August,,26_,051-05-4338,Engineer,57539.96,,8,9,...,Bad,3974.8,35.950651,2 Years and 10 Months,Yes,365.006714,142.64728866599046,High_spent_Small_value_payments,216.04566431815311,Standard
70046,CUS_0xaa83,July,Byrnesy,35,331-60-0907,Lawyer,61634.54,4957.211667,3,2,...,Good,793.77,38.14257,29 Years and 7 Months,No,68.856718,__10000__,High_spent_Small_value_payments,454.9853906594503,Good
71324,CUS_0x8638,May,,33,529-04-6046,Developer,117827.22,9684.935,1,1,...,Good,1470.72,43.186536,19 Years and 5 Months,No,120.238504,171.30950780737282,High_spent_Medium_value_payments,926.9454879239124,Standard
38793,CUS_0x802f,February,Emily Flitterg,3369,647-94-9183,Engineer,40334.7,,9,7,...,Bad,1274.12,27.549544,14 Years and 6 Months,Yes,45.341401,140.4811594167865,High_spent_Medium_value_payments,395.4999392139727,Poor
69792,CUS_0x3f6a,January,LaCapra Rossq,32,722-97-6435,Accountant,18426.65,1692.554167,3,7,...,Standard,2329.28,29.08634,14 Years and 11 Months,Yes,38.086238,79.51162837700822,Low_spent_Large_value_payments,321.6575506718197,Poor
72364,CUS_0xba4c,May,,21,507-44-8168,_______,81418.74,7065.895,1,4,...,Good,1369.02,40.507331,,No,121.378505,380.324536168744,Low_spent_Medium_value_payments,484.88645918092135,Good
37539,CUS_0xc18e,April,Raymondr,45,355-28-4498,Musician,172205.96_,14293.496667,2,2,...,Good,1224.88,36.952487,25 Years and 2 Months,No,175.603764,143.84052425074378,High_spent_Large_value_payments,1349.9053786177542,Good
83234,CUS_0x6322,March,Chris Vellacottt,40,843-12-3405,Lawyer,7451.97,463.9975,10,6,...,Bad,1428.75,38.220216,11 Years and 5 Months,Yes,24.087191,38.85830253382922,Low_spent_Large_value_payments,253.4542562023716,Poor
5416,CUS_0x804,January,Dhanya Skariachanx,43,857-32-0525,Media_Manager,20133.61,1671.800833,8,1116,...,Standard,1265.65,39.178989,12 Years and 10 Months,NM,54.973084,103.92911630405773,Low_spent_Small_value_payments,298.2778833577237,Poor


In [36]:
df_train.Age > 100  

TypeError: '>' not supported between instances of 'str' and 'int'