1. Para-procesimi i të dhënave: përgatitja e të dhënave për analizë

In [None]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from IPython.display import display

Mbledhja e të dhënave, definimi i tipeve të të dhënave, kualiteti i të
dhënave.

In [None]:
# Load the dataset from the CSV file into a DataFrame and display it
file_path = 'dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
data

In [None]:
# Information about the DataFrame (columns, non-null counts, dtypes, memory usage).
# DataFrame.info() prints its report itself and returns None, so its result is
# neither stored nor passed to print() (which would only emit a stray "None").
data.info()

In [None]:
# Number of rows and columns in the DataFrame, as a (rows, columns) tuple
data.shape

In [None]:
# Data type of every column
data.dtypes

In [None]:
# Show summary statistics for each column of the DataFrame (one row per column)
print(data.describe().transpose())

In [None]:
# Summary statistics for the object (string) columns, one row per column
print(data.describe(include="O").transpose())

In [None]:
# Compute the percentage of fully duplicated rows in the DataFrame
duplicated = data.duplicated().sum()
print(duplicated / len(data) * 100, '%')

In [None]:
# missing_values_table computes the number and percentage of null values per column
def missing_values_table(data):
    """Summarise the missing values of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with two columns:
        'Missing Values' (null count) and '% of Total Values'
        (null count as a percentage of the row count), rounded to 1 decimal.
    """
    null_counts = data.isnull().sum()
    null_share = 100 * null_counts / len(data)
    summary = pd.DataFrame({
        'Missing Values': null_counts,
        '% of Total Values': null_share,
    })
    return summary.round(1)

In [None]:
# Null count and null percentage for every column of the raw data
missing_values_table(data)

In [None]:
# Frequency table for ApplicantDependents.
# rename_axis + reset_index(name=...) names both output columns explicitly, so the
# result is ['ApplicantDependents', 'counts'] on every pandas version (renaming via
# the 'index' key yields ['counts', 'count'] on pandas >= 2.0).
data['ApplicantDependents'].value_counts().rename_axis('ApplicantDependents').reset_index(name='counts')



In [None]:
# Frequency table for ApplicantEmplLength.
# rename_axis + reset_index(name=...) names both output columns explicitly, so the
# result is ['ApplicantEmplLength', 'counts'] on every pandas version (renaming via
# the 'index' key yields ['counts', 'count'] on pandas >= 2.0).
data['ApplicantEmplLength'].value_counts().rename_axis('ApplicantEmplLength').reset_index(name='counts')

In [None]:
# Frequency table for ApplicantGender.
# rename_axis + reset_index(name=...) names both output columns explicitly, so the
# result is ['ApplicantGender', 'counts'] on every pandas version (renaming via
# the 'index' key yields ['counts', 'count'] on pandas >= 2.0).
data['ApplicantGender'].value_counts().rename_axis('ApplicantGender').reset_index(name='counts')

In [None]:
# Frequency table for LoanAmount.
# rename_axis + reset_index(name=...) names both output columns explicitly, so the
# result is ['LoanAmount', 'counts'] on every pandas version (renaming via
# the 'index' key yields ['counts', 'count'] on pandas >= 2.0).
data['LoanAmount'].value_counts().rename_axis('LoanAmount').reset_index(name='counts')

In [None]:
# Frequency table for ApplicantHomeOwn.
# rename_axis + reset_index(name=...) names both output columns explicitly, so the
# result is ['ApplicantHomeOwn', 'counts'] on every pandas version (renaming via
# the 'index' key yields ['counts', 'count'] on pandas >= 2.0).
data['ApplicantHomeOwn'].value_counts().rename_axis('ApplicantHomeOwn').reset_index(name='counts')

In [None]:
# Frequency table for ApplicantEducation.
# rename_axis + reset_index(name=...) names both output columns explicitly, so the
# result is ['ApplicantEducation', 'counts'] on every pandas version (renaming via
# the 'index' key yields ['counts', 'count'] on pandas >= 2.0).
data['ApplicantEducation'].value_counts().rename_axis('ApplicantEducation').reset_index(name='counts')

In [None]:
# Frequency table for ApplicantState.
# rename_axis + reset_index(name=...) names both output columns explicitly, so the
# result is ['ApplicantState', 'counts'] on every pandas version (renaming via
# the 'index' key yields ['counts', 'count'] on pandas >= 2.0).
data['ApplicantState'].value_counts().rename_axis('ApplicantState').reset_index(name='counts')

In [None]:
# Frequency table for ApplicantZIP.
# rename_axis + reset_index(name=...) names both output columns explicitly, so the
# result is ['ApplicantZIP', 'counts'] on every pandas version (renaming via
# the 'index' key yields ['counts', 'count'] on pandas >= 2.0).
data['ApplicantZIP'].value_counts().rename_axis('ApplicantZIP').reset_index(name='counts')

In [None]:
# Stacked bar chart: number of applications per ApplicantGender, split by LoanApproved.
ct = pd.crosstab(data['ApplicantGender'], data['LoanApproved'])
ct.plot(kind='bar', stacked=True, color=['#FF3030', '#7FFFD4'])
plt.xlabel('ApplicantGender')
# Bar heights are the crosstab counts; LoanApproved is what the colours encode.
plt.ylabel('Number of applications')
# Trailing semicolon hides the Text(...) repr of the last call in the cell output.
plt.title('Relationship between Applicant Gender and Loan Approved');

In [None]:
# Stacked bar chart: number of applications per ApplicantHomeOwn, split by LoanApproved.
ct = pd.crosstab(data['ApplicantHomeOwn'], data['LoanApproved'])
ct.plot(kind='bar', stacked=True, color=['#FF3030', '#7FFFD4'])
plt.xlabel('ApplicantHomeOwn')
# Bar heights are the crosstab counts; LoanApproved is what the colours encode.
plt.ylabel('Number of applications')
# Trailing semicolon hides the Text(...) repr of the last call in the cell output.
plt.title('Relationship between Applicant Home Ownership and Loan Approved');

In [None]:
# Preview the first five rows of the raw data
data.head()

In [None]:
# Preview the last 25 rows of the raw data
data.tail(25)

Aggregating Data

In [None]:
# Mean, median and non-null count of ApplicantIncome for each LoanApproved value
income_aggregation = (
    data
    .groupby('LoanApproved')['ApplicantIncome']
    .agg(['mean', 'median', 'count'])
)
display(income_aggregation)

In [None]:
# Total, mean and non-null count of LoanAmount for each ApplicantDependents value
loan_amunt_aggregation = (
    data
    .groupby('ApplicantDependents')['LoanAmount']
    .agg(['sum', 'mean', 'count'])
)
display(loan_amunt_aggregation)

In [None]:
# Smallest and largest LoanAmount for each ApplicantZIP value
loan_city_aggregation = (
    data
    .groupby('ApplicantZIP')['LoanAmount']
    .agg(['min', 'max'])
)
display(loan_city_aggregation)

Sampling

In [None]:
# Draw a random 30% sample of the rows; random_state=42 makes the draw reproducible
data_sample = data.sample(frac=0.3, random_state=42)

In [None]:
# (rows, columns) of the sampled frame
data_sample.shape

Data cleaning and transformation

In [None]:
# Convert the textual ApplicantDependents categories to numeric values (stored as float)
data_sample['ApplicantDependents'] = (
    data_sample['ApplicantDependents']
    .replace({'0 oseb': 0, '1 oseba': 1, '2 osebi': 2, '3+ osebe': 3})
    .astype('float')
)

In [None]:
# Strip the literal 'xx' marker from ApplicantZIP and store the column as int.
# The .str accessor only exists on text columns, so the stripping is skipped when the
# column is already numeric; this keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data_sample['ApplicantZIP']):
    # regex=False: 'xx' is a plain substring, not a pattern
    data_sample['ApplicantZIP'] = data_sample['ApplicantZIP'].str.replace('xx', '', regex=False)
data_sample['ApplicantZIP'] = data_sample['ApplicantZIP'].astype(int)

print(data_sample[['ApplicantZIP']].head())

In [None]:
# Display the sampled frame after the conversions applied so far
data_sample

In [None]:
# Standardize 'ApplicantEmplLength': "< 1 ..." becomes 0.5, any other text becomes its first number
def clean_employment_length(value):
    """Convert a textual employment length to a number.

    Parameters
    ----------
    value : object
        Raw cell value. Only strings are interpreted.

    Returns
    -------
    float or object
        0.5 for strings of the form "< 1 ..." (less than one year),
        the first integer found in the string as a float otherwise,
        and ``value`` unchanged when it is not a string or contains no number.
    """
    if not isinstance(value, str):
        return value
    # This check must run before the generic digit search: "< 1 leto" contains the
    # digit 1, so searching for digits first would return 1.0 and never reach 0.5.
    # (?!\d) keeps values such as "< 10" out of this branch.
    if re.search(r"<\s*1(?!\d)", value):
        return 0.5
    match = re.search(r"(\d+)", value)
    if match:
        return float(match.group(1))
    return value


In [None]:
# Run the cleaner over every ApplicantEmplLength value and store the result as float
data_sample['ApplicantEmplLength'] = (
    data_sample['ApplicantEmplLength']
    .apply(clean_employment_length)
    .astype('float')
)
data_sample

In [None]:
# Convert 'LoanIntRate' to float by removing the % symbol.
# The .str accessor only exists on text columns, so the stripping is skipped when the
# column is already numeric; this keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data_sample['LoanIntRate']):
    # regex=False: '%' is a plain character, not a pattern
    data_sample['LoanIntRate'] = data_sample['LoanIntRate'].str.replace('%', '', regex=False)
data_sample['LoanIntRate'] = data_sample['LoanIntRate'].astype('float')
data_sample

In [None]:
# Fill missing categorical values with the column's most frequent value (mode).
# The filled Series is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which warns on pandas 2.x and does not
# modify the frame under Copy-on-Write.
for column in ['ApplicantGender', 'ApplicantMarried', 'ApplicantSelfEmployed', 'ApplicantCreditHistory']:
    data_sample[column] = data_sample[column].fillna(data_sample[column].mode()[0])


In [None]:
# Fill missing values in the numeric columns with the column median.
# The filled Series is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which warns on pandas 2.x and does not
# modify the frame under Copy-on-Write.
for column in ['LoanAmount', 'LoanTerm', 'ApplicantDependents', 'ApplicantEmplLength']:
    data_sample[column] = data_sample[column].fillna(data_sample[column].median())

In [None]:
# Label missing LoanPurpose values as 'Unknown'.
# Assigned back rather than filled in place: fillna(inplace=True) on a column
# selection is chained assignment and does not modify the frame under Copy-on-Write.
data_sample['LoanPurpose'] = data_sample['LoanPurpose'].fillna('Unknown')

In [None]:
# Remove the LoanDesc column.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises KeyError
# once the column is already gone.
data_sample = data_sample.drop(columns=['LoanDesc'], errors='ignore')

In [None]:
# Print the per-column null counts and the dtypes of data_sample
print("Remaining Missing Values:", data_sample.isna().sum())
print("Data Types:", data_sample.dtypes)

In [None]:
# Bin 'ApplicantIncome' into four levels: Low [0, 2500], Medium (2500, 5000],
# High (5000, 10000] and Very High (above 10000).
# The last edge is open-ended (np.inf) instead of the sample maximum: a maximum at or
# below 10000 would make the edges non-increasing and pd.cut would raise ValueError.
income_bins = [0, 2500, 5000, 10000, np.inf]
income_labels = ['Low', 'Medium', 'High', 'Very High']
# include_lowest=True puts an income of exactly 0 into 'Low' instead of leaving it NaN.
data_sample['IncomeLevel'] = pd.cut(
    data_sample['ApplicantIncome'],
    bins=income_bins,
    labels=income_labels,
    include_lowest=True,
)

In [None]:
# Binarization: replace LoanApproved 'Y' with 1 and 'N' with 0; any other value is left as it is
data_sample['LoanApproved'] = data_sample['LoanApproved'].replace({'Y': 1, 'N': 0})

In [None]:
# Discretize 'LoanTerm' into Short (0, 60], Medium (60, 90] and Long (90, 120] categories.
# NOTE(review): values outside (0, 120] match no bin and become NaN - confirm this range covers the data.
term_bins = [0, 60, 90, 120]  # bin edges for LoanTerm
term_labels = ['Short', 'Medium', 'Long']
data_sample['LoanTermCategory'] = pd.cut(
    data_sample['LoanTerm'],
    bins=term_bins,
    labels=term_labels,
)

In [None]:
# One-hot encode the listed categorical columns into a new frame;
# drop_first=True omits the first level of each column.
df = pd.get_dummies(
    data_sample,
    columns=[
        'ApplicantGender',
        'ApplicantMarried',
        'ApplicantEducation',
        'ApplicantSelfEmployed',
        'ApplicantHomeOwn',
    ],
    drop_first=True,
)

In [None]:
# Per the original author's note, ApplicantState only contains the values
# ['SI' 'si' 'Slovenija' 'Slo' 'slo'], so dropping the column simplifies the dataset
# without losing critical information.
# The guard keeps the cell re-runnable: after the first run the column no longer
# exists and both the lookup and the drop would raise KeyError.
if 'ApplicantState' in df.columns:
    print(df['ApplicantState'].unique())
    df = df.drop(columns=['ApplicantState'])

In [None]:
# Final validation: list the columns of data_sample that still contain nulls,
# print its dtypes, then preview the encoded frame.
# NOTE(review): the checks inspect data_sample while the preview shows df - confirm this is intended.
data_types = data_sample.dtypes
remaining_missing_values = data_sample.isnull().sum()

print("Remaining Missing Values:\n", remaining_missing_values.loc[remaining_missing_values > 0])
print("\nData Types:\n", data_types)

df.head()

In [None]:
# df.drop(labels=['LoanID','ApplicantID'], axis=1, inplace=True)

In [None]:
# # 'LoanPurpose' categories
# loan_purpose_mapping = {
#     'Unknown': 0,
#     'kartica': 1,
#     'zdravljenje': 2,
#     'drugo': 3,
#     'prenova': 4,
#     'investicija': 5,
#     'stanovanje': 6,
#     'selitev': 7,
#     'poèitnice': 8,
#     'obnovljivi_viri': 9
# }
# 
# df['LoanPurpose'] = df['LoanPurpose'].map(loan_purpose_mapping)
# df['LoanPurpose'] = df['LoanPurpose'].astype(int)

In [None]:
# categorical_columns = X.select_dtypes(include=['category']).columns
# X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
# print(X_encoded.head())


In [None]:
# PCA_df = pd.DataFrame(PCAPipeline.fit_transform(X_encoded))
# PCA_df = pd.concat([PCA_df, y], axis=1)
# PCA_df.head()