In [2]:
%%writefile requirements.txt
numpy
pandas==2.2.3
matplotlib==3.10.0

Overwriting requirements.txt


In [3]:
%pip install -r requirements.txt

Collecting numpy (from -r requirements.txt (line 1))
  Downloading numpy-2.3.4-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==2.2.3 (from -r requirements.txt (line 2))
  Downloading pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matplotlib==3.10.0 (from -r requirements.txt (line 3))
  Downloading matplotlib-3.10.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting pytz>=2020.1 (from pandas==2.2.3->-r requirements.txt (line 2))
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas==2.2.3->-r requirements.txt (line 2))
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib=

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Matplotlib is building the font cache; this may take a moment.


First, read in the CSV files.

In [5]:
# Load the credit train and credit test CSV files from the local 'archive' folder.
# df1 holds the training data, df2 the test data.
df1 = pd.read_csv(filepath_or_buffer='archive/credit_train.csv')
df2 = pd.read_csv(filepath_or_buffer='archive/credit_test.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'archive/credit_train.csv'

In [None]:
# Preview the first five rows of the train dataframe.
df1.head(n=5)

In [None]:
# Print a summary of the train dataframe (column names, non-null counts and dtypes).
df1.info()

In [None]:
# Remove duplicate rows first, then discard the two identifier columns.
df1 = df1.drop_duplicates().drop(columns=['Loan ID', 'Customer ID'])

Then, handle all null values.

In [None]:
# Count the missing values in each column of the train dataframe.
df1.isna().sum()

In [None]:
# Standardize textual null placeholders to np.nan so pandas recognizes them as missing.
df1 = df1.replace(['nan','NaN','NULL','',' '], np.nan)

# Impute missing values by assigning the filled Series back to its column.
# Calling fillna(..., inplace=True) on df1[col] is a chained assignment: it acts on
# an intermediate Series, triggers a FutureWarning in pandas 2.2, and leaves df1
# unchanged once Copy-on-Write is enabled.
df1['Current Loan Amount'] = df1['Current Loan Amount'].fillna(0)
df1['Years of Credit History'] = df1['Years of Credit History'].fillna(
    df1['Years of Credit History'].median()
)
# NOTE(review): only these two columns are imputed here; use the null count in the
# next cell to confirm whether any other column still needs handling.

In [None]:
# Re-count missing values per column to confirm the imputation above was applied.
df1.isna().sum()

Feature Engineering

In [None]:
import re
import numpy as np
import pandas as pd


# 1a) Years in current job: extract numeric and bin
def extract_years(s):
    """Return the first run of digits found in ``s`` as an int, or None if there is none.

    ``s`` is converted with ``str()`` before searching, so non-string inputs
    (for example NaN or None) are accepted and yield None when their text
    contains no digit.
    """
    first_number = re.search(r'\d+', str(s))
    return int(first_number.group()) if first_number else None

# Left-closed buckets (right=False): [0, 2), [2, 5), [5, 10) and [10, inf).
bins = [0, 2, 5, 10, float('inf')]
labels = ['0-2', '2-5', '5-10', '10+']

# Parse each entry to a number of years, then place it in its labelled bucket.
df1['Years in current job'] = pd.cut(
    df1['Years in current job'].apply(extract_years),
    bins=bins,
    labels=labels,
    right=False,
)

# 1b) Standardize Purpose values to the canonical list used in the reference.
# Values that are not keys of this mapping are left unchanged by replace().
purpose_canonical = {
    # Housing
    'Home Improvements': 'Home Renovation / Improvement',
    'Buy House': 'Home Purchase / Mortgage',
    'moving': 'Moving / Relocation Costs',
    # Debt
    'Debt Consolidation': 'Debt Consolidation',
    # Business
    'Business Loan': 'Business / Startup Capital',
    'small_business': 'Business / Startup Capital',
    # NOTE(review): 'major_purchase' is mapped to the business category — confirm against the reference list.
    'major_purchase': 'Business / Startup Capital',
    # Vehicles and travel
    'Buy a Car': 'Car / Vehicle Purchase',
    'Take a Trip': 'Vacation / Travel',
    'vacation': 'Vacation / Travel',
    # Life events
    'Medical Bills': 'Medical Expenses',
    'wedding': 'Wedding Expenses',
    'Educational Expenses': 'Education / Tuition Fees',
    # NOTE(review): 'renewable_energy' is mapped to electronics/appliances — confirm against the reference list.
    'renewable_energy': 'Buying Electronics / Appliances',
    # Catch-all (both spellings)
    'other': 'Other / Miscellaneous',
    'Other': 'Other / Miscellaneous',
}
df1['Purpose'] = df1['Purpose'].replace(purpose_canonical)

Then, encode the categorical data.

In [None]:
# Separate categorical and numerical columns.
# 'category' is selected alongside 'object' because pd.cut turns
# 'Years in current job' into a categorical-dtype column, which
# select_dtypes(include='object') does not match; without it that column
# would be reported as numerical.
categorical_dtypes = ['object', 'category']
cat_list = df1.select_dtypes(include=categorical_dtypes).columns.tolist()
num_list = df1.select_dtypes(exclude=categorical_dtypes).columns.tolist()
print("Categorical:", cat_list)
print("Numerical:", num_list)

In [1]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

# 2a) Ordinal encoding for 'Years in current job'
# The binned column can hold NaN: extract_years returns None for entries without
# digits and pd.cut keeps those as missing. With an explicit category list that
# omits NaN, OrdinalEncoder.fit raises "Found unknown categories [nan]", so NaN is
# declared as the last category and given its own code (-1); the real bins keep
# the codes 0..3.
od = OrdinalEncoder(
    categories=[['0-2', '2-5', '5-10', '10+', np.nan]],
    encoded_missing_value=-1,
)
df1['Years in current job'] = od.fit_transform(df1[['Years in current job']])

# 2b) Label encode simple categories
# A separate fitted encoder is kept per column so each integer code can be mapped
# back to its label later (e.g. label_encoders['Loan Status'].inverse_transform).
# `le` still ends up bound to the encoder of the last column, as before.
# NOTE(review): 'Loan Status' is the target used in the split cell; filling it with
# 'Missing' adds an extra class — confirm rows without a target should be kept.
label_encoders = {}
for col in ['Loan Status', 'Term', 'Home Ownership']:
    le = LabelEncoder()
    df1[col] = le.fit_transform(df1[col].fillna('Missing'))
    label_encoders[col] = le

# 2c) One-hot encode 'Purpose'
# drop='first' omits the first category's indicator column; sparse_output=False
# returns a dense array so it can be wrapped in a DataFrame aligned on df1's index.
ohe = OneHotEncoder(sparse_output=False, drop='first')
purpose_encoded = ohe.fit_transform(df1[['Purpose']].fillna('Missing'))
purpose_df = pd.DataFrame(purpose_encoded, columns=ohe.get_feature_names_out(['Purpose']), index=df1.index)
df1 = pd.concat([df1.drop('Purpose', axis=1), purpose_df], axis=1)

ModuleNotFoundError: No module named 'sklearn'

Finally, create a train/test split, scale the features, and save the results.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target vector is 'Loan Status'; the feature matrix is every other column.
y = df1['Loan Status']
X = df1.drop(columns=['Loan Status'])

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Save the processed data to CSV files
import pandas as pd

# Wrap the scaled arrays in DataFrames so the feature names are written as headers.
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(data=X_test_scaled, columns=X_test.columns)

# Write the feature matrices without the index column.
for features, filename in (
    (X_train_scaled_df, "X_train_scaled.csv"),
    (X_test_scaled_df, "X_test_scaled.csv"),
):
    features.to_csv(filename, index=False)

# Write the targets as single-column files with the header 'Loan Status'.
for target, filename in ((y_train, "y_train.csv"), (y_test, "y_test.csv")):
    target.to_csv(filename, index=False, header=["Loan Status"])