# Data Cleaning - Installments Payments

# 1. Import Library dan Dataset

In [1]:
# Core libraries for data manipulation and numerical analysis
import pandas as pd
import numpy as np

# Libraries for data visualization; xgboost (a gradient-boosting library) is imported alongside them
# NOTE(review): numpy, matplotlib, seaborn and xgboost are not referenced in the cells visible here — confirm they are still needed
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# Silence all Python warnings for the remainder of the session
import warnings
warnings.simplefilter('ignore')

# Remove the column limit so wide DataFrames are displayed in full
pd.set_option('display.max_columns', None)

In [2]:
# Load the installments_payments dataset from the CSV file in the current working directory
installments_payments = pd.read_csv(filepath_or_buffer='installments_payments.csv')

In [3]:
# Display the loaded DataFrame as the cell output to inspect its columns and values
installments_payments

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.360,6948.360
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.000,25425.000
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.130,24350.130
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.040,2160.585
...,...,...,...,...,...,...,...,...
13605396,2186857,428057,0.0,66,-1624.0,,67.500,
13605397,1310347,414406,0.0,47,-1539.0,,67.500,
13605398,1308766,402199,0.0,43,-7.0,,43737.435,
13605399,1062206,409297,0.0,43,-1986.0,,67.500,


# 2. Exploratory Data Analysis

## 2.1 Info dan Statistik Dasar dari DataFrame

In [4]:
# Show summary statistics for the numeric columns
installments_payments.describe()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
count,13605400.0,13605400.0,13605400.0,13605400.0,13605400.0,13602500.0,13605400.0,13602500.0
mean,1903365.0,278444.9,0.8566373,18.8709,-1042.27,-1051.114,17050.91,17238.22
std,536202.9,102718.3,1.035216,26.66407,800.9463,800.5859,50570.25,54735.78
min,1000001.0,100001.0,0.0,1.0,-2922.0,-4921.0,0.0,0.0
25%,1434191.0,189639.0,0.0,4.0,-1654.0,-1662.0,4226.085,3398.265
50%,1896520.0,278685.0,1.0,8.0,-818.0,-827.0,8884.08,8125.515
75%,2369094.0,367530.0,1.0,19.0,-361.0,-370.0,16710.21,16108.42
max,2843499.0,456255.0,178.0,277.0,-1.0,-1.0,3771488.0,3771488.0


## 2.2 Checking Duplicate Values

In [6]:
# Count the rows that are exact duplicates of an earlier row
installments_payments.duplicated().sum()

0

Tidak terdapat baris duplikat (duplicated rows) pada dataset "installments_payments".

## 2.3 Checking Missing Values

In [7]:
# Count the missing (NaN) entries in each column
installments_payments.isna().sum()

SK_ID_PREV                   0
SK_ID_CURR                   0
NUM_INSTALMENT_VERSION       0
NUM_INSTALMENT_NUMBER        0
DAYS_INSTALMENT              0
DAYS_ENTRY_PAYMENT        2905
AMT_INSTALMENT               0
AMT_PAYMENT               2905
dtype: int64

In [8]:
# Number of missing entries in each column
missing_values = installments_payments.isna().sum()

# The same counts expressed as a percentage of the total number of rows
missing_percentage = missing_values / len(installments_payments) * 100

# Combine counts and percentages into one table, keep only the columns that
# actually contain missing entries, and order them from most to least affected
missing_data = (
    pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage})
    .loc[lambda summary: summary['Missing Values'] > 0]
    .sort_values(by='Percentage', ascending=False)
)

missing_data

Unnamed: 0,Missing Values,Percentage
DAYS_ENTRY_PAYMENT,2905,0.021352
AMT_PAYMENT,2905,0.021352


In [9]:
# Keep only the rows in which every column has a value; any row with at least
# one missing entry is removed.
# NOTE(review): the missing entries are all in DAYS_ENTRY_PAYMENT / AMT_PAYMENT —
# confirm that rows without a recorded payment should be discarded rather than kept.
installments_payments_cleaned = installments_payments.dropna(axis=0, how='any')

In [10]:
# Display the DataFrame that remains after the rows with missing values were dropped
installments_payments_cleaned

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.360,6948.360
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.000,25425.000
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.130,24350.130
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.040,2160.585
...,...,...,...,...,...,...,...,...
13605344,2006721,442291,1.0,3,-1311.0,-1318.0,2934.225,2934.225
13605345,1126000,428449,0.0,12,-301.0,-302.0,6793.470,6750.000
13605346,1519070,444122,1.0,5,-399.0,-407.0,4363.830,4363.830
13605347,2784672,444977,0.0,4,-157.0,-157.0,373.005,373.005


In [11]:
# Write the cleaned data to a path relative to the working directory (the same
# place the input CSV is read from) so the notebook runs on any machine; the
# previous absolute path pointed into one specific user's Downloads folder.
csv_file_path = 'installments_payments_cleaned.csv'

# index=False leaves the DataFrame index out of the written file
installments_payments_cleaned.to_csv(csv_file_path, index=False)