### PREDICT CUSTOMER CREDIT SCORE

In [2]:
import pandas as pd
import numpy as np

### IMPORTING DATA
#### import data and make a deep copy. all manipulation are performed on the copy

In [3]:
data = pd.read_excel("../SMART Banking data_4035.xlsx")

In [4]:
customer_data = data.copy(deep=True)
customer_data.columns = customer_data.iloc[0]
customer_data = customer_data[1:]
customer_data.reset_index(drop=True, inplace=True)
customer_data.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,estimated_salary
0,1563462,619,France,Female,42,2,,11348.88
1,15647311,68,Spain,Female,41,1,8387.86,112542.58
2,1561934,52,France,Female,42,8,15966.8,113931.57
3,1571354,699,France,Female,39,1,,93826.63
4,15737888,85,Spain,Female,43,2,12551.82,7984.1


### SHAPE OF THE DATA

In [5]:
rows, columns = customer_data.shape
print(f"Rows: {rows}, Columns: {columns}")

Rows: 2020, Columns: 8


### DATA EXPLORATION

### Data Types

In [6]:
dtypes_df = pd.DataFrame({
    'column': customer_data.columns,
    'datatype': customer_data.dtypes.values
})
dtypes_df

Unnamed: 0,column,datatype
0,customer_id,object
1,credit_score,object
2,country,object
3,gender,object
4,age,object
5,tenure,object
6,balance,object
7,estimated_salary,object


##### Looking at the data, customer id, credit score, age, tenure, balance, estimated salary look like numbers
##### Country, gender, tenure appear to be categorical

In [7]:
customer_data.nunique()

0
customer_id         2003
credit_score         382
country                3
gender                 2
age                   63
tenure                 9
balance             1309
estimated_salary    2016
dtype: int64

#### Missing values

In [8]:
missing_values = pd.DataFrame({
    'column': customer_data.columns,
    'no_of_missing_values': customer_data.isna().sum(),
    'percentages_of_missing_values': (customer_data.isna().sum()/len(customer_data)) * 100
})
missing_values = missing_values.reset_index(drop=True)
missing_values

Unnamed: 0,column,no_of_missing_values,percentages_of_missing_values
0,customer_id,0,0.0
1,credit_score,11,0.544554
2,country,0,0.0
3,gender,0,0.0
4,age,6,0.29703
5,tenure,82,4.059406
6,balance,711,35.19802
7,estimated_salary,0,0.0


### check for rowns without any data

In [9]:
nan_rows_count = customer_data.isna().all(axis=1).sum()
print("Rows with no data: ",int(nan_rows_count))

Rows with no data:  0


In [10]:
customer_data_without_nan = customer_data.dropna()
rows_with_nan_data = len(customer_data) - len(customer_data_without_nan)
print("Number of rows with NaN values is: ",rows_with_nan_data)
print("Percentage of rows with NaN values is: ",(rows_with_nan_data/len(customer_data))*100)

Number of rows with NaN values is:  773
Percentage of rows with NaN values is:  38.26732673267327


### Assign numbers to values in country and gender columns bacause they are categorical

In [11]:
customer_data['country'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [12]:
customer_data['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [13]:
# Define mappings
country_map = {'Germany': 1, 'Spain': 2, 'France': 3}
gender_map = {'Female': 1, 'Male': 2}

# Apply mappings
customer_data['country'] = customer_data['country'].map(country_map)
customer_data['gender'] = customer_data['gender'].map(gender_map)
customer_data.head()


Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,estimated_salary
0,1563462,619,3,1,42,2,,11348.88
1,15647311,68,2,1,41,1,8387.86,112542.58
2,1561934,52,3,1,42,8,15966.8,113931.57
3,1571354,699,3,1,39,1,,93826.63
4,15737888,85,2,1,43,2,12551.82,7984.1


# REPLACEMENT OF NAN
### Column with most NaN is balance. Check if values are nomally distributed

In [14]:
from scipy.stats import shapiro

stat, p = shapiro(customer_data['balance'].dropna())
print('Shapiro-Wilk test: stat=%.3f, p=%.3f' % (stat, p))

Shapiro-Wilk test: stat=0.082, p=0.000


### p < 0.05. Data is not nomally distributed

### USE KNN TO IMPUTE MISSING VALUES
### SELECT COLUMNS TO SCALE
### Scale only continous numerical columns. DROP CUSTOMER ID COLUMN

In [16]:
numerical_cols = ['credit_score', 'age', 'balance', 'estimated_salary']   
categorical_cols = ['country', 'gender','tenure']                      

In [17]:
# Import
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import KNNImputer

# Scale only numeric columns
scaler = MinMaxScaler()
scaled_numeric_data = scaler.fit_transform(customer_data[numerical_cols])

# Keep categorical columns as they are
categorical_data = customer_data[categorical_cols].values

full_data = np.hstack((scaled_numeric_data, categorical_data))

# Apply KNN imputer on full data
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(full_data)

column_names = ['credit_score', 'age', 'balance', 'estimated_salary','country', 'gender','tenure']
imputed_scaled_df = pd.DataFrame(imputed_data, columns= column_names)
imputed_scaled_df.head()

Unnamed: 0,credit_score,age,balance,estimated_salary,country,gender,tenure
0,0.727488,0.5,0.006503,0.05095,3.0,1.0,2.0
1,0.074645,0.4875,0.000585,0.505507,2.0,1.0,1.0
2,0.055687,0.5,0.001114,0.511746,3.0,1.0,8.0
3,0.822275,0.4625,0.002656,0.421436,3.0,1.0,1.0
4,0.094787,0.5125,0.000876,0.035836,2.0,1.0,2.0


In [18]:
imputed_scaled_df.shape

(2020, 7)

In [19]:
imputed_scaled_df.isna().sum()

credit_score        0
age                 0
balance             0
estimated_salary    0
country             0
gender              0
tenure              0
dtype: int64

In [20]:
# Inverse transform to get back original scale
imputed_df = pd.DataFrame(scaler.inverse_transform(imputed_scaled_df[numerical_cols]), columns=numerical_cols)
imputed_df.head()


Unnamed: 0,credit_score,age,balance,estimated_salary
0,619.0,42.0,93101.008,11348.88
1,68.0,41.0,8387.86,112542.58
2,52.0,42.0,15966.8,113931.57
3,699.0,39.0,38038.282,93826.63
4,85.0,43.0,12551.82,7984.1


In [21]:
imputed_scaled_df.update(imputed_df)
imputed_scaled_df.head()


Unnamed: 0,credit_score,age,balance,estimated_salary,country,gender,tenure
0,619.0,42.0,93101.008,11348.88,3.0,1.0,2.0
1,68.0,41.0,8387.86,112542.58,2.0,1.0,1.0
2,52.0,42.0,15966.8,113931.57,3.0,1.0,8.0
3,699.0,39.0,38038.282,93826.63,3.0,1.0,1.0
4,85.0,43.0,12551.82,7984.1,2.0,1.0,2.0


In [22]:
imputed_scaled_df[['country','gender','tenure','age']] = imputed_scaled_df[['country','gender','tenure','age']].astype(int)

In [23]:
customer_data = imputed_scaled_df

### DELETE REDUNDANT VARIABLES TO FREE UP SPACE

In [24]:
import gc
del imputed_data, imputed_df, imputed_scaled_df,categorical_data, scaled_numeric_data, full_data,column_names
del gender_map, country_map, scaler, imputer, numerical_cols, categorical_cols

gc.collect()

0

In [27]:
missing_values = pd.DataFrame({
    'column': customer_data.columns,
    'no_of_missing_values': customer_data.isna().sum(),
    'percentages_of_missing_values': (customer_data.isna().sum()/len(customer_data)) * 100
})
missing_values = missing_values.reset_index(drop=True)
missing_values

Unnamed: 0,column,no_of_missing_values,percentages_of_missing_values
0,credit_score,0,0.0
1,age,0,0.0
2,balance,0,0.0
3,estimated_salary,0,0.0
4,country,0,0.0
5,gender,0,0.0
6,tenure,0,0.0
