In [1]:
import numpy as np
import pandas as pd 
import sklearn 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor


In [2]:
df=pd.read_csv('german_credit_data_biased_training.csv')

In [3]:
df.head()

Unnamed: 0,CheckingStatus,LoanDuration,CreditHistory,LoanPurpose,LoanAmount,ExistingSavings,EmploymentDuration,InstallmentPercent,Sex,OthersOnLoan,...,OwnsProperty,Age,InstallmentPlans,Housing,ExistingCreditsCount,Job,Dependents,Telephone,ForeignWorker,Risk
0,0_to_200,31,credits_paid_to_date,other,1889,100_to_500,less_1,3,female,none,...,savings_insurance,32,none,own,1,skilled,1,none,yes,No Risk
1,less_0,18,credits_paid_to_date,car_new,462,less_100,1_to_4,2,female,none,...,savings_insurance,37,stores,own,2,skilled,1,none,yes,No Risk
2,less_0,15,prior_payments_delayed,furniture,250,less_100,1_to_4,2,male,none,...,real_estate,28,none,own,2,skilled,1,yes,no,No Risk
3,0_to_200,28,credits_paid_to_date,retraining,3693,less_100,greater_7,3,male,none,...,savings_insurance,32,none,own,1,skilled,1,none,yes,No Risk
4,no_checking,28,prior_payments_delayed,education,6235,500_to_1000,greater_7,3,male,none,...,unknown,57,none,own,2,skilled,1,none,yes,Risk


In [4]:
df.isnull().sum()

CheckingStatus              0
LoanDuration                0
CreditHistory               0
LoanPurpose                 0
LoanAmount                  0
ExistingSavings             0
EmploymentDuration          0
InstallmentPercent          0
Sex                         0
OthersOnLoan                0
CurrentResidenceDuration    0
OwnsProperty                0
Age                         0
InstallmentPlans            0
Housing                     0
ExistingCreditsCount        0
Job                         0
Dependents                  0
Telephone                   0
ForeignWorker               0
Risk                        0
dtype: int64

In [14]:
label_encoders={}
for column in df.select_dtypes(include=['object']).columns:
    le=LabelEncoder()
    df[column]=le.fit_transform(df[column])
    label_encoders[column]=le
print(df.head())


   CheckingStatus  LoanDuration  CreditHistory  LoanPurpose  LoanAmount  \
0       -1.498350      0.860709      -0.813744     0.636309   -0.639532   
1        0.151815     -0.303985      -0.813744    -0.801028   -1.213089   
2        0.151815     -0.572761       1.129293     0.276975   -1.298298   
3       -1.498350      0.591933      -0.813744     1.714312    0.085553   
4        0.976898      0.591933       1.129293    -0.082359    1.107264   

   ExistingSavings  EmploymentDuration  InstallmentPercent       Sex  \
0        -1.391793            1.260063            0.015617 -1.279504   
1         0.862245           -1.149543           -0.871708 -1.279504   
2         0.862245           -1.149543           -0.871708  0.781553   
3         0.862245            0.456861            0.015617  0.781553   
4        -0.640447            0.456861            0.015617  0.781553   

   OthersOnLoan  ...  OwnsProperty       Age  InstallmentPlans  Housing  \
0      0.436603  ...      0.654267 -0.369

In [13]:
scalar=StandardScaler()
numerical_feature=df.select_dtypes(include=['int64','float64']).columns
df[numerical_feature]=scalar.fit_transform(df[numerical_feature])

In [7]:
X=df.drop('Risk',axis=1)
y=df['Risk']


In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [17]:
df_model=RandomForestClassifier(n_estimators=100,random_state=42)

In [18]:
df_model.fit(X_train, y_train)

In [12]:
print(df['Risk'].unique())
print(df['Risk'].value_counts())

[-0.70816771  1.41209489]
Risk
-0.708168    3330
 1.412095    1670
Name: count, dtype: int64


In [11]:
print(df.dtypes)

CheckingStatus               object
LoanDuration                float64
CreditHistory                object
LoanPurpose                  object
LoanAmount                  float64
ExistingSavings              object
EmploymentDuration           object
InstallmentPercent          float64
Sex                          object
OthersOnLoan                 object
CurrentResidenceDuration    float64
OwnsProperty                 object
Age                         float64
InstallmentPlans             object
Housing                      object
ExistingCreditsCount        float64
Job                          object
Dependents                  float64
Telephone                    object
ForeignWorker                object
Risk                        float64
dtype: object


In [12]:
df_encoded=pd.get_dummies(df,drop_first=True)

In [14]:
print(df_encoded.head())


   LoanDuration  LoanAmount  InstallmentPercent  CurrentResidenceDuration  \
0      0.860709   -0.639532            0.015617                  0.130693   
1     -0.303985   -1.213089           -0.871708                 -0.765693   
2     -0.572761   -1.298298           -0.871708                  0.130693   
3      0.591933    0.085553            0.015617                 -0.765693   
4      0.591933    1.107264            0.015617                  0.130693   

        Age  ExistingCreditsCount  Dependents      Risk  \
0 -0.369327             -0.823902   -0.443882 -0.708168   
1  0.100268              0.944887   -0.443882 -0.708168   
2 -0.745003              0.944887   -0.443882 -0.708168   
3 -0.369327             -0.823902   -0.443882 -0.708168   
4  1.978648              0.944887   -0.443882  1.412095   

   CheckingStatus_greater_200  CheckingStatus_less_0  ...  \
0                       False                  False  ...   
1                       False                   True  ...   

In [27]:
print(X_train.dtypes)

CheckingStatus                int64
LoanDuration                float64
CreditHistory                 int64
LoanPurpose                   int64
LoanAmount                  float64
ExistingSavings               int64
EmploymentDuration            int64
InstallmentPercent          float64
Sex                           int64
OthersOnLoan                  int64
CurrentResidenceDuration    float64
OwnsProperty                  int64
Age                         float64
InstallmentPlans              int64
Housing                       int64
ExistingCreditsCount        float64
Job                           int64
Dependents                  float64
Telephone                     int64
ForeignWorker                 int64
dtype: object


In [15]:
le=LabelEncoder()
y=le.fit_transform(y)

In [19]:
y_pred=df_model.predict(X_test)

In [20]:
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.80      0.92      0.85       665
           1       0.77      0.55      0.64       335

    accuracy                           0.79      1000
   macro avg       0.78      0.73      0.75      1000
weighted avg       0.79      0.79      0.78      1000



In [21]:
print(confusion_matrix(y_test,y_pred))

[[609  56]
 [151 184]]


In [22]:
print(accuracy_score(y_test,y_pred))

0.793


In [23]:
import joblib
joblib.dump(df_model,'german_creditcard.pkl')

['german_creditcard.pkl']