In [1]:
import pandas as pd
import numpy as np

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
statlog_german_credit_data = fetch_ucirepo(id=144) 
  
# data (as pandas dataframes) 
X = statlog_german_credit_data.data.features 
y = statlog_german_credit_data.data.targets - 1
  
# metadata 
statlog_german_credit_data.metadata
  
# variable information 
statlog_german_credit_data.variables


Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Attribute1,Feature,Categorical,,Status of existing checking account,,no
1,Attribute2,Feature,Integer,,Duration,months,no
2,Attribute3,Feature,Categorical,,Credit history,,no
3,Attribute4,Feature,Categorical,,Purpose,,no
4,Attribute5,Feature,Integer,,Credit amount,,no
5,Attribute6,Feature,Categorical,,Savings account/bonds,,no
6,Attribute7,Feature,Categorical,Other,Present employment since,,no
7,Attribute8,Feature,Integer,,Installment rate in percentage of disposable i...,,no
8,Attribute9,Feature,Categorical,Marital Status,Personal status and sex,,no
9,Attribute10,Feature,Categorical,,Other debtors / guarantors,,no


In [3]:
import re
feature_transformation_dict = {}

with open('./data/german.doc', 'r') as f:
    for line in f:
        attribute_regex = r'.*A\d+.*:.*'
        if re.match(attribute_regex, line):
            old, new = line.split(sep=' : ', maxsplit=1)
            old = old.strip()
            new = new.strip()
            feature_transformation_dict[old] = new

In [4]:
# Rename Attributes columns in a more descriptive way
X.columns = statlog_german_credit_data.variables[:-1]['description']

# Replace values in the dataframe with more descriptive names
for to_replace, value in feature_transformation_dict.items():
    X.replace(to_replace, value, inplace=True)

# print(X)
#print(X.columns)
#for column in X.columns:
#    print(X[column].unique())
X

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


description,Status of existing checking account,Duration,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,Present residence since,Property,Age,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker
0,... < 0 DM,6,critical account/,radio/television,1169,unknown/ no savings account,.. >= 7 years,4,male : single,none,4,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customers name",yes
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,radio/television,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,none,2,real estate,22,none,own,1,skilled employee / official,1,none,yes
2,no checking account,12,critical account/,education,2096,... < 100 DM,4 <= ... < 7 years,2,male : single,none,3,real estate,49,none,own,1,unskilled - resident,2,none,yes
3,... < 0 DM,42,existing credits paid back duly till now,furniture/equipment,7882,... < 100 DM,4 <= ... < 7 years,2,male : single,guarantor,4,if not A121 : building society savings agreement/,45,none,for free,1,skilled employee / official,2,none,yes
4,... < 0 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : single,none,4,unknown / no property,53,none,for free,2,skilled employee / official,2,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking account,12,existing credits paid back duly till now,furniture/equipment,1736,... < 100 DM,4 <= ... < 7 years,3,female : divorced/separated/married,none,4,real estate,31,none,own,1,unskilled - resident,1,none,yes
996,... < 0 DM,30,existing credits paid back duly till now,car (used),3857,... < 100 DM,1 <= ... < 4 years,4,male : divorced/separated,none,4,if not A121 : building society savings agreement/,40,none,own,1,management/ self-employed/,1,"yes, registered under the customers name",yes
997,no checking account,12,existing credits paid back duly till now,radio/television,804,... < 100 DM,.. >= 7 years,4,male : single,none,4,"if not A121/A122 : car or other, not in attrib...",38,none,own,1,skilled employee / official,1,none,yes
998,... < 0 DM,45,existing credits paid back duly till now,radio/television,1845,... < 100 DM,1 <= ... < 4 years,4,male : single,none,4,unknown / no property,23,none,for free,1,skilled employee / official,1,"yes, registered under the customers name",yes


In [5]:
list(X["Telephone"].unique())

['yes, registered under the customers name', 'none']

## Encoding of the features

In [6]:
# Checking account
checking_dict = {'no checking account': 0, 
                 "... <    0 DM": 1, 
                 '0 <= ... <  200 DM': 2,
                 '... >= 200 DM /': 3}
checking_account = X["Status of existing checking account"].apply(lambda i: checking_dict[i])
# Duration
duration = X["Duration"]
# Credit history
credit_history_dict = {'critical account/': 0,
                       'delay in paying off in the past': 1,
                       'no credits taken/': 2,
                       'existing credits paid back duly till now': 3,
                       'all credits at this bank paid back duly': 4}
credit_history = X["Credit history"].apply(lambda i: credit_history_dict[i])
# Purpose
purpose = pd.get_dummies(X["Purpose"], prefix="Purpose")
# Credit amount
credit_amount = X["Credit amount"]
# Savings / bonds
savings_dict = {'unknown/ no savings account': 0,
                '... <  100 DM': 1,
                '100 <= ... <  500 DM': 2,
                '500 <= ... < 1000 DM': 3,
                '.. >= 1000 DM': 4}
savings = X["Savings account/bonds"].apply(lambda i: savings_dict[i])
# Present employment
employment_dict = {'unemployed': 0,
                   '... < 1 year': 1,
                   '1  <= ... < 4 years': 2,
                   '4  <= ... < 7 years': 3,
                   '.. >= 7 years': 4}
employment = X["Present employment since"].apply(lambda i: employment_dict[i])
# Installment rate
installment = X["Installment rate in percentage of disposable income"]
# Personal status and sex
sex = X["Personal status and sex"].apply(lambda s: int(s.startswith("female"))).rename("Sex")
# Other debtors / guarantors
other_debtors = pd.get_dummies(X["Other debtors / guarantors"], prefix="Other debtors")
# Present Residence
present_residence = X["Present residence since"]
# Age
age = X["Age"]
# Housing
housing_dict = {'rent': 0,
                'own': 1,
                'for free': 2}
housing = X["Housing"].apply(lambda i: housing_dict[i])
# Number of existing credits
existing_credits = X["Number of existing credits at this bank"]
# Job
job_dict = {'unemployed/ unskilled  - non-resident': 0,
            'unskilled - resident': 1,
            'skilled employee / official': 2,
            'management/ self-employed/': 3}
job = X["Job"].apply(lambda i: job_dict[i])
# Number of people providing for
number_of_people = X["Number of people being liable to provide maintenance for"]
# Telephone
telephone = X["Telephone"].apply(lambda s: int(s != "none"))
# Foreign workter
foreign_worker = X["foreign worker"].apply(lambda s: int(s == "yes"))

In [7]:
df = pd.concat([checking_account, duration, credit_history, 
                purpose, credit_amount, savings, 
                employment, installment, sex, 
                other_debtors, present_residence, age, 
                housing, existing_credits, job, 
                number_of_people, telephone, foreign_worker], axis=1)

In [8]:
df

Unnamed: 0,Status of existing checking account,Duration,Credit history,Purpose_business,Purpose_car (new),Purpose_car (used),Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_others,...,Other debtors_guarantor,Other debtors_none,Present residence since,Age,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker
0,1,6,0,0,0,0,0,0,0,0,...,0,1,4,67,1,2,2,1,1,1
1,2,48,3,0,0,0,0,0,0,0,...,0,1,2,22,1,1,2,1,0,1
2,0,12,0,0,0,0,0,1,0,0,...,0,1,3,49,1,1,1,2,0,1
3,1,42,3,0,0,0,0,0,1,0,...,1,0,4,45,2,1,2,2,0,1
4,1,24,1,0,1,0,0,0,0,0,...,0,1,4,53,2,2,2,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,12,3,0,0,0,0,0,1,0,...,0,1,4,31,1,1,1,1,0,1
996,1,30,3,0,0,1,0,0,0,0,...,0,1,4,40,1,1,3,1,1,1
997,0,12,3,0,0,0,0,0,0,0,...,0,1,4,38,1,1,2,1,0,1
998,1,45,3,0,0,0,0,0,0,0,...,0,1,4,23,2,1,2,1,1,1


In [9]:
df.to_csv("data/X.csv")
y.to_csv("data/y.csv")

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

categorical_columns = [0,2,3,5,68,9,11,13,14,16]

# Create a column transformer with one-hot encoding for categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ],
    remainder='passthrough'
)

# Create a pipeline with the preprocessor and a random forest classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

#encoder = OneHotEncoder()
#test_column = np.array(X_train.iloc[:,0]).reshape(-1,1)
#res = encoder.fit_transform(X=test_column)
#print(res)

#print(X.head())

#pipeline.fit(X_train, y_train)
print(X_train['Attribute4'])
one_hot_encode = ['Attribute4']

# Make predictions on the test data
#y_pred = pipeline.predict(X_test)

# Evaluate the accuracy of the classifier
#accuracy = accuracy_score(y_test, y_pred)
#print(f"Accuracy: {accuracy}")


KeyError: 'Attribute4'