<a href="https://colab.research.google.com/github/hejasevis/Credit-Approval-Dataset-Analysis/blob/main/Imputation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install UCIMLRepo
!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 27 in the UCI repository
credit_approval = fetch_ucirepo(id=27)

# Split the payload into features (X) and targets (y)
dataset = credit_approval.data
X, y = dataset.features, dataset.targets

# Print the dataset metadata first, then the variable information
for section in (credit_approval.metadata, credit_approval.variables):
    print(section)

# Per-column count of missing values in the features
missing_per_column = X.isnull().sum()
print(missing_per_column)


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
{'uci_id': 27, 'name': 'Credit Approval', 'repository_url': 'https://archive.ics.uci.edu/dataset/27/credit+approval', 'data_url': 'https://archive.ics.uci.edu/static/public/27/data.csv', 'abstract': 'This data concerns credit card applications; good mix of attributes', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 690, 'num_features': 15, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['A16'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1987, 'last_updated': 'Wed Aug 23 2023', 'dataset_doi': '10.24432/C5FS30', 'creators': ['J. R. Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This file concerns 

In [2]:
# Columns whose dtype is int64/float64 are treated as numeric;
# every remaining column is treated as categorical.
numeric_dtypes = ['int64', 'float64']
numeric_columns = X.select_dtypes(include=numeric_dtypes).columns
categorical_columns = X.select_dtypes(exclude=numeric_dtypes).columns

# Report both groups
for label, column_index in (
    ("Sayısal Sütunlar:", numeric_columns),
    ("Kategorik Sütunlar:", categorical_columns),
):
    print(label, column_index)

Sayısal Sütunlar: Index(['A15', 'A14', 'A11', 'A8', 'A3', 'A2'], dtype='object')
Kategorik Sütunlar: Index(['A13', 'A12', 'A10', 'A9', 'A7', 'A6', 'A5', 'A4', 'A1'], dtype='object')


In [3]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Mean imputation, fitted on and applied to the numeric columns only
mean_imputer = SimpleImputer(strategy='mean')
X_numeric_mean_imputed = pd.DataFrame(
    mean_imputer.fit_transform(X[numeric_columns]),
    columns=numeric_columns,
)

# Add back the categorical data (passed through untouched).
# The frame built above carries a fresh 0..n-1 index, while X[categorical_columns]
# keeps X's row labels. pd.concat(axis=1) aligns on labels, so relabel the numeric
# frame with X.index to pair rows by position; this is a no-op when X already has
# a 0..n-1 index, and raises if the row counts ever differ.
X_mean_imputed = pd.concat(
    [X_numeric_mean_imputed.set_axis(X.index, axis=0), X[categorical_columns]],
    axis=1,
)

print(X_mean_imputed.head())

     A15    A14  A11    A8     A3     A2 A13 A12 A10 A9 A7 A6 A5 A4 A1
0    0.0  202.0  1.0  1.25  0.000  30.83   g   f   t  t  v  w  g  u  b
1  560.0   43.0  6.0  3.04  4.460  58.67   g   f   t  t  h  q  g  u  a
2  824.0  280.0  0.0  1.50  0.500  24.50   g   f   f  t  h  q  g  u  a
3    3.0  100.0  5.0  3.75  1.540  27.83   g   t   t  t  v  w  g  u  b
4    0.0  120.0  0.0  1.71  5.625  20.17   s   f   f  t  v  w  g  u  b


In [4]:
# Median imputation, fitted on and applied to the numeric columns only
median_imputer = SimpleImputer(strategy='median')
X_numeric_median_imputed = pd.DataFrame(
    median_imputer.fit_transform(X[numeric_columns]),
    columns=numeric_columns,
)

# Add back the categorical data (passed through untouched).
# The frame built above carries a fresh 0..n-1 index, while X[categorical_columns]
# keeps X's row labels. pd.concat(axis=1) aligns on labels, so relabel the numeric
# frame with X.index to pair rows by position; this is a no-op when X already has
# a 0..n-1 index, and raises if the row counts ever differ.
X_median_imputed = pd.concat(
    [X_numeric_median_imputed.set_axis(X.index, axis=0), X[categorical_columns]],
    axis=1,
)

print(X_median_imputed.head())

     A15    A14  A11    A8     A3     A2 A13 A12 A10 A9 A7 A6 A5 A4 A1
0    0.0  202.0  1.0  1.25  0.000  30.83   g   f   t  t  v  w  g  u  b
1  560.0   43.0  6.0  3.04  4.460  58.67   g   f   t  t  h  q  g  u  a
2  824.0  280.0  0.0  1.50  0.500  24.50   g   f   f  t  h  q  g  u  a
3    3.0  100.0  5.0  3.75  1.540  27.83   g   t   t  t  v  w  g  u  b
4    0.0  120.0  0.0  1.71  5.625  20.17   s   f   f  t  v  w  g  u  b


In [5]:
from sklearn.impute import KNNImputer

# KNN imputation (5 neighbours), fitted on and applied to the numeric columns only
# NOTE(review): the numeric columns are passed in unscaled; if the imputer's
# distance should not be dominated by the largest-valued columns, consider
# scaling first — confirm whether that is intended.
knn_imputer = KNNImputer(n_neighbors=5)
X_numeric_knn_imputed = pd.DataFrame(
    knn_imputer.fit_transform(X[numeric_columns]),
    columns=numeric_columns,
)

# Add back the categorical data (passed through untouched).
# X_numeric_knn_imputed carries a fresh 0..n-1 index, while X[categorical_columns]
# keeps X's row labels. pd.concat(axis=1) aligns on labels, so a relabelled copy of
# the numeric frame is used here to pair rows by position. X_numeric_knn_imputed
# itself is left unchanged because the mode-imputation cell reuses it.
X_knn_imputed = pd.concat(
    [X_numeric_knn_imputed.set_axis(X.index, axis=0), X[categorical_columns]],
    axis=1,
)

print(X_knn_imputed.head())

     A15    A14  A11    A8     A3     A2 A13 A12 A10 A9 A7 A6 A5 A4 A1
0    0.0  202.0  1.0  1.25  0.000  30.83   g   f   t  t  v  w  g  u  b
1  560.0   43.0  6.0  3.04  4.460  58.67   g   f   t  t  h  q  g  u  a
2  824.0  280.0  0.0  1.50  0.500  24.50   g   f   f  t  h  q  g  u  a
3    3.0  100.0  5.0  3.75  1.540  27.83   g   t   t  t  v  w  g  u  b
4    0.0  120.0  0.0  1.71  5.625  20.17   s   f   f  t  v  w  g  u  b


In [6]:
# Apply mode (most frequent value) imputation to the categorical columns
mode_imputer = SimpleImputer(strategy='most_frequent')
X_categorical_mode_imputed = pd.DataFrame(
    mode_imputer.fit_transform(X[categorical_columns]),
    columns=categorical_columns,
)

# Combine the KNN-imputed numeric data with the mode-imputed categorical data.
# The categorical frame built above has a fresh 0..n-1 index; the numeric frame is
# relabelled to the same positional index so the concat pairs rows by position
# whatever labels it arrived with. The combined frame then takes X's row labels
# so it stays aligned with X and y.
X_final_imputed = pd.concat(
    [
        X_numeric_knn_imputed.set_axis(X_categorical_mode_imputed.index, axis=0),
        X_categorical_mode_imputed,
    ],
    axis=1,
).set_axis(X.index, axis=0)

print(X_final_imputed.head())


     A15    A14  A11    A8     A3     A2 A13 A12 A10 A9 A7 A6 A5 A4 A1
0    0.0  202.0  1.0  1.25  0.000  30.83   g   f   t  t  v  w  g  u  b
1  560.0   43.0  6.0  3.04  4.460  58.67   g   f   t  t  h  q  g  u  a
2  824.0  280.0  0.0  1.50  0.500  24.50   g   f   f  t  h  q  g  u  a
3    3.0  100.0  5.0  3.75  1.540  27.83   g   t   t  t  v  w  g  u  b
4    0.0  120.0  0.0  1.71  5.625  20.17   s   f   f  t  v  w  g  u  b


In [7]:
from sklearn.impute import SimpleImputer

# Mode (most frequent value) imputation over every column of X, numeric and
# categorical alike.
# NOTE(review): for the continuous numeric columns the most frequent value may be
# a poor fill value; confirm this is intended as a baseline only.
mode_imputer = SimpleImputer(strategy='most_frequent')

# Imputing the mixed-type frame yields an object array, so without the cast every
# column (including the numeric ones) would come back as dtype object. Keep X's
# row labels and cast each column back to the dtype it has in X.
X_mode_imputed = pd.DataFrame(
    mode_imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
).astype(X.dtypes.to_dict())

print(X_mode_imputed.head())


   A15    A14 A13 A12 A11 A10 A9    A8 A7 A6 A5 A4     A3     A2 A1
0    0  202.0   g   f   1   t  t  1.25  v  w  g  u    0.0  30.83  b
1  560   43.0   g   f   6   t  t  3.04  h  q  g  u   4.46  58.67  a
2  824  280.0   g   f   0   f  t   1.5  h  q  g  u    0.5   24.5  a
3    3  100.0   g   t   5   t  t  3.75  v  w  g  u   1.54  27.83  b
4    0  120.0   s   f   0   f  t  1.71  v  w  g  u  5.625  20.17  b
