In [1]:
import pandas as pd

### Import data file

In [2]:
# Read the CSV file
df = pd.read_csv('application_data.csv')

# Display the contents of the DataFrame
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


### Find Mean, Median and Mode

In [3]:
# Calculate the mean of the numerical columns
num_cols = df.select_dtypes(include='number')
means = num_cols.mean()
medians = num_cols.median()

# Calculate the mode of the categorical columns
cat_cols = df.select_dtypes(include='object')
modes = cat_cols.mode()

# Display the means, medians, and modes
print("Mean:")
print(means)
print("\nMedian:")
print(medians)
print("\nMode:")
print(modes)

Mean:
SK_ID_CURR                    278180.518577
TARGET                             0.080729
CNT_CHILDREN                       0.417052
AMT_INCOME_TOTAL              168797.919297
AMT_CREDIT                    599025.999706
                                  ...      
AMT_REQ_CREDIT_BUREAU_DAY          0.007000
AMT_REQ_CREDIT_BUREAU_WEEK         0.034362
AMT_REQ_CREDIT_BUREAU_MON          0.267395
AMT_REQ_CREDIT_BUREAU_QRT          0.265474
AMT_REQ_CREDIT_BUREAU_YEAR         1.899974
Length: 106, dtype: float64

Median:
SK_ID_CURR                    278202.0
TARGET                             0.0
CNT_CHILDREN                       0.0
AMT_INCOME_TOTAL              147150.0
AMT_CREDIT                    513531.0
                                ...   
AMT_REQ_CREDIT_BUREAU_DAY          0.0
AMT_REQ_CREDIT_BUREAU_WEEK         0.0
AMT_REQ_CREDIT_BUREAU_MON          0.0
AMT_REQ_CREDIT_BUREAU_QRT          0.0
AMT_REQ_CREDIT_BUREAU_YEAR         1.0
Length: 106, dtype: float64

Mode:
  NAME_CO

### Find standard deviation

In [4]:
# Compute the standard deviation of numerical columns
std = num_cols.std()

# Print the standard deviation of numerical columns
print(std)


SK_ID_CURR                    102790.175348
TARGET                             0.272419
CNT_CHILDREN                       0.722121
AMT_INCOME_TOTAL              237123.146279
AMT_CREDIT                    402490.776996
                                  ...      
AMT_REQ_CREDIT_BUREAU_DAY          0.110757
AMT_REQ_CREDIT_BUREAU_WEEK         0.204685
AMT_REQ_CREDIT_BUREAU_MON          0.916002
AMT_REQ_CREDIT_BUREAU_QRT          0.794056
AMT_REQ_CREDIT_BUREAU_YEAR         1.869295
Length: 106, dtype: float64


### Missing Values

In [5]:
# Count the number of missing values in each column
missing_values = df.isnull().sum()

# Display the missing values
print(missing_values)


SK_ID_CURR                        0
TARGET                            0
NAME_CONTRACT_TYPE                0
CODE_GENDER                       0
FLAG_OWN_CAR                      0
                              ...  
AMT_REQ_CREDIT_BUREAU_DAY     41519
AMT_REQ_CREDIT_BUREAU_WEEK    41519
AMT_REQ_CREDIT_BUREAU_MON     41519
AMT_REQ_CREDIT_BUREAU_QRT     41519
AMT_REQ_CREDIT_BUREAU_YEAR    41519
Length: 122, dtype: int64


In [6]:
# Impute the missing values with the mean of the available values in each column
num_cols_mean_imputed = num_cols.fillna(num_cols.mean())

# Replace the original numerical columns with the mean-imputed ones
df[num_cols.columns] = num_cols_mean_imputed


### Outliers

In [7]:
import numpy as np

# Calculate the Z-score of each numerical column
z_scores = np.abs((num_cols - num_cols.mean()) / num_cols.std())

# Find the outliers in each numerical column
outliers = num_cols[z_scores > 3]

# Display the outliers
print(outliers)


        SK_ID_CURR  TARGET  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  \
0              NaN     1.0           NaN               NaN         NaN   
1              NaN     NaN           NaN               NaN         NaN   
2              NaN     NaN           NaN               NaN         NaN   
3              NaN     NaN           NaN               NaN         NaN   
4              NaN     NaN           NaN               NaN         NaN   
...            ...     ...           ...               ...         ...   
307506         NaN     NaN           NaN               NaN         NaN   
307507         NaN     NaN           NaN               NaN         NaN   
307508         NaN     NaN           NaN               NaN         NaN   
307509         NaN     1.0           NaN               NaN         NaN   
307510         NaN     NaN           NaN               NaN         NaN   

        AMT_ANNUITY  AMT_GOODS_PRICE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  \
0               NaN            