## Data Pre-Processing

In [1]:
import pandas as pd


import os

# Location of the German credit CSV. The default is the path on the original
# author's machine; set the GERMAN_CREDIT_DATA environment variable to point at
# a copy elsewhere so the notebook runs without editing this cell.
data_path = os.environ.get(
    "GERMAN_CREDIT_DATA",
    "C:/Users/Hande/Desktop/RR_project_HHRPTeam/data/german_credit_data.csv",
)
data = pd.read_csv(data_path)


In [2]:
data.shape

(1000, 11)

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   Age               1000 non-null   int64 
 2   Sex               1000 non-null   object
 3   Job               1000 non-null   int64 
 4   Housing           1000 non-null   object
 5   Saving accounts   817 non-null    object
 6   Checking account  606 non-null    object
 7   Credit amount     1000 non-null   int64 
 8   Duration          1000 non-null   int64 
 9   Purpose           1000 non-null   object
 10  Risk              1000 non-null   object
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


In [None]:
data.describe().T

### Removing the redundant `Unnamed: 0` column

In [None]:

# Drop the leftover "Unnamed: 0" column. The result is reassigned instead of
# mutating in place, and errors="ignore" makes the cell safe to re-run: a second
# execution no longer raises KeyError once the column is gone.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

### Filling missing values

In [None]:
# True if at least one cell anywhere in the frame is missing
data.isna().to_numpy().any()

In [None]:
# Number of missing values in each column
data.isna().sum()

The dataset has 1,000 records, 183 of which (18.3%) have no value for `Saving accounts`. We impute these missing values with the most frequent category (the mode).

In [None]:
# Most frequent 'Saving accounts' value, ignoring missing entries.
# mode() returns a Series, so the value itself is read from its first element.
savings_mode = data['Saving accounts'].mode(dropna=True)
savings_mode

In [None]:
# Impute missing 'Saving accounts' with the most frequent value.
# The result is assigned back to the column: fillna(..., inplace=True) on a
# selected column is chained assignment, which recent pandas warns about and
# which no longer updates `data` when Copy-on-Write is enabled.
data['Saving accounts'] = data['Saving accounts'].fillna(savings_mode.iloc[0])
# Remaining missing values in the column (expected to be 0 after imputation)
data['Saving accounts'].isnull().sum()

`Checking account` is missing in 394 of the 1,000 records (39.4%). Removing that many rows would discard a large share of the data set, so we fill in these values instead, again using the most frequent category (the mode).

In [17]:
# Most frequent 'Checking account' value, ignoring missing entries.
# mode() returns a Series, so the value itself is read from its first element.
check_mode = data['Checking account'].mode(dropna=True)
check_mode

0    little
Name: Checking account, dtype: object

In [18]:
# Impute missing 'Checking account' with the most frequent value.
# The result is assigned back to the column: fillna(..., inplace=True) on a
# selected column is chained assignment, which recent pandas warns about and
# which no longer updates `data` when Copy-on-Write is enabled.
data['Checking account'] = data['Checking account'].fillna(check_mode.iloc[0])
# Remaining missing values in the column (expected to be 0 after imputation)
data['Checking account'].isnull().sum()

0

In [19]:
data.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Credit amount', 'Duration', 'Purpose', 'Risk'],
      dtype='object')

### Target Variable

In [6]:
data["Risk"].value_counts()

good    700
bad     300
Name: Risk, dtype: int64

### Encoding


In [5]:
from sklearn.preprocessing import LabelEncoder

In [7]:
# Sort the object-dtype columns into two buckets by number of distinct values:
# exactly two -> binary_cols, more than two -> nominal_cols.
# Columns with a single distinct value end up in neither list.
binary_cols, nominal_cols = [], []
for col in data.columns:
    if data[col].dtype != 'object':
        continue
    n_levels = data[col].nunique()
    if n_levels == 2:
        binary_cols.append(col)
    elif n_levels > 2:
        nominal_cols.append(col)

In [8]:

# Replace each two-level text column with its integer category codes.
# NOTE(review): codes presumably follow the sorted label order
# (e.g. bad=0, good=1 for Risk) — confirm before interpreting signs.
for col in binary_cols:
    data[col] = pd.Categorical(data[col]).codes

In [9]:

# Expand each multi-level text column into indicator columns.
# drop_first=True omits the first level of every column, which then acts as
# the implicit baseline category.
dummy_options = {"columns": nominal_cols, "drop_first": True}
data = pd.get_dummies(data, **dummy_options)


In [10]:

# Pearson correlation between every pair of columns, then the 'Risk' column of
# that matrix ordered from the strongest positive to the strongest negative value
correlation_matrix = data.corr(method='pearson')
risk_column = correlation_matrix.loc[:, 'Risk']
target_correlation = risk_column.sort_values(ascending=False)

print(target_correlation)


Risk                           1.000000
Housing_own                    0.134589
Purpose_radio/TV               0.106922
Age                            0.091127
Saving accounts_rich           0.085749
Sex                            0.075493
Saving accounts_quite rich     0.070954
Checking account_rich          0.044009
Purpose_domestic appliances   -0.008016
Purpose_repairs               -0.020828
Purpose_furniture/equipment   -0.020971
Saving accounts_moderate      -0.022255
Purpose_car                   -0.022621
Purpose_vacation/others       -0.028058
Job                           -0.032735
Unnamed: 0                    -0.034606
Purpose_education             -0.049085
Housing_rent                  -0.092785
Checking account_moderate     -0.119581
Credit amount                 -0.154739
Duration                      -0.214927
Name: Risk, dtype: float64


In [11]:

# Keep every column whose absolute correlation with 'Risk' exceeds the cutoff.
# NOTE(review): 'Risk' correlates 1.0 with itself, so the target is part of
# selected_features — exclude it before using the list as model inputs.
threshold = 0.1
is_strong = target_correlation.abs().gt(threshold)
selected_features = target_correlation.loc[is_strong].index.tolist()
print("Selected features based on Pearson correlation:", selected_features)

Selected features based on Pearson correlation: ['Risk', 'Housing_own', 'Purpose_radio/TV', 'Checking account_moderate', 'Credit amount', 'Duration']


Selected features and their Pearson correlation with `Risk` (absolute value above the 0.1 threshold):

Housing_own: 0.134589

Purpose_radio/TV: 0.106922

Checking account_moderate: -0.119581

Credit amount: -0.154739

Duration: -0.214927