In [None]:
import numpy as np
import pandas as pd

In [None]:
# https://www.statsmodels.org/stable/index.html
import statsmodels.api as sm

In [None]:
# Optional: download the dataset from https://www.dropbox.com/scl/fi/32vgpt3jvtztu86avdnwg/Mortgage.xlsx?rlkey=qx1d46hzgn4h67zrcyajdyl3e&dl=1
# and upload it to Colab if you prefer to read it from a local file instead of the URL.

In [None]:
# Load the mortgage dataset straight from the shared Dropbox link.
# To read an uploaded local copy instead, pass "./Mortgage.xlsx" to pd.read_excel.
MORTGAGE_XLSX_URL = "https://www.dropbox.com/scl/fi/32vgpt3jvtztu86avdnwg/Mortgage.xlsx?rlkey=qx1d46hzgn4h67zrcyajdyl3e&dl=1"
mortgageDf = pd.read_excel(MORTGAGE_XLSX_URL)

In [None]:
# Display the loaded DataFrame.
mortgageDf

In [None]:
# Total number of cells in the frame (rows x columns).
mortgageDf.size

In [None]:
# Descriptive summary statistics per column.
mortgageDf.describe()

In [None]:
# (number of rows, number of columns)
mortgageDf.shape

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plotting
# Create an 8x8-inch figure and keep a handle to it in fig1.
# NOTE(review): with an inline notebook backend a figure is typically rendered and
# closed at the end of its own cell, so later cells may not draw on fig1 — confirm.
fig1 = plt.figure(
  figsize=(8, 8)
)

In [None]:
# Scatter of the outcome y against predictor x1, drawn on the current axes.
ax = plt.gca()
ax.scatter(
    mortgageDf["x1"],
    mortgageDf["y"],
    color='blue',
    alpha=0.9,
    label='Data Points - scatter',
)
ax.set_xlabel('x1')
ax.set_ylabel('y')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Scatter of the outcome y against predictor x2, drawn on the current axes.
ax = plt.gca()
ax.scatter(
    mortgageDf["x2"],
    mortgageDf["y"],
    color='blue',
    alpha=0.9,
    label='Data Points - scatter',
)
ax.set_xlabel('x2')
ax.set_ylabel('y')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Ordinary least squares of y on x1 and x2 with an intercept column.
model1Exog = sm.add_constant(mortgageDf[["x1", "x2"]])
model1 = sm.OLS(endog=mortgageDf["y"], exog=model1Exog)
model1Fit = model1.fit()
print(model1Fit.summary())

In [None]:
# In-sample OLS predictions, stored next to the data in a 'predict1' column.
model1Design = sm.add_constant(mortgageDf[["x1", "x2"]])
predict1 = model1Fit.predict(model1Design)
mortgageDf['predict1'] = predict1
mortgageDf

In [None]:
# OLS prediction for a single observation; the row is laid out as [const, x1, x2].
model1Fit.predict(exog=[[1, 20, 30]])

In [None]:
# OLS prediction for a single observation; the row is laid out as [const, x1, x2].
model1Fit.predict(exog=[[1, 20, 15]])

In [None]:
# OLS prediction for a single observation; the row is laid out as [const, x1, x2].
model1Fit.predict(exog=[[1, 40, 50]])

In [None]:
# Logistic regression of y on x1 and x2 with an intercept column.
model2Exog = sm.add_constant(mortgageDf[["x1", "x2"]])
model2 = sm.Logit(endog=mortgageDf["y"], exog=model2Exog)
model2Fit = model2.fit()
print(model2Fit.summary())

In [None]:
# In-sample logit predictions, stored next to the data in a 'predict2' column.
model2Design = sm.add_constant(mortgageDf[["x1", "x2"]])
predict2 = model2Fit.predict(model2Design)
mortgageDf['predict2'] = predict2
mortgageDf

In [None]:
# Logit predictions for three single observations, each laid out as [const, x1, x2].
tuple(
    model2Fit.predict([row])
    for row in ([1, 20, 30], [1, 20, 15], [1, 40, 50])
)

In [None]:
# Ordinary least squares of y on x1 only, with an intercept column.
model3Exog = sm.add_constant(mortgageDf[["x1"]])
model3 = sm.OLS(endog=mortgageDf["y"], exog=model3Exog)
model3Fit = model3.fit()
print(model3Fit.summary())

In [None]:
# Logistic regression of y on x1 only, with an intercept column.
model4Exog = sm.add_constant(mortgageDf[["x1"]])
model4 = sm.Logit(endog=mortgageDf["y"], exog=model4Exog)
model4Fit = model4.fit()
print(model4Fit.summary())

In [None]:
# Lower bound of the x1 range used to build the grid for the fitted curves.
# NOTE(review): this name shadows the built-in min() for the rest of the session;
# a later cell reads it, so a rename has to be coordinated with that cell.
min = 0
min

In [None]:
# Upper bound of the x1 range: the largest observed x1 plus a margin of 10.
# NOTE(review): this name shadows the built-in max() for the rest of the session;
# a later cell reads it, so a rename has to be coordinated with that cell.
max = mortgageDf["x1"].max() + 10
# Show the bound together with the number of x1 observations.
max, len(mortgageDf["x1"])

In [None]:
# Grid of 500 evenly spaced values spanning [min - 5, max + 5] for drawing curves.
x = np.linspace(start=min - 5, stop=max + 5, num=500)

In [None]:
import math

In [None]:
# Fitted curves over the x grid, built from the estimated coefficients of the
# x1-only models rather than from values copied by hand out of the printed
# summaries (previously hard-coded as 0.0141 + 0.0227*x and -2.2077 + 0.1043*x),
# so the curves stay in sync with the fits if the data changes.
# NOTE(review): assumes the hard-coded values came from model3Fit / model4Fit — confirm.
olsParams = model3Fit.params
logitParams = model4Fit.params

# Linear model: y = b0 + b1 * x1
lREq = olsParams["const"] + olsParams["x1"] * x

# Logistic model: p = 1 / (1 + exp(-(b0 + b1 * x1))).
# This form is algebraically equal to exp(z) / (1 + exp(z)) but does not overflow
# to inf/inf for large positive z.
logitLinear = logitParams["const"] + logitParams["x1"] * x
logREq = 1 / (1 + np.exp(-logitLinear))

In [None]:
# Sanity check: number of points on the linear curve (one per grid value in x).
len(lREq)

In [None]:

# Overlay the linear (lREq) and logistic (logREq) curves on the observed (x1, y) points.
ax = plt.gca()
ax.scatter(
    mortgageDf["x1"],
    mortgageDf["y"],
    color='blue',
    alpha=0.9,
    label='Data Points - scatter',
)
ax.plot(x, lREq, color='red', alpha=0.9, label='lREq')
ax.plot(x, logREq, color='green', alpha=0.9, label='logREq')
ax.set_xlabel('x1')
ax.set_ylabel('y')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Display the DataFrame, which by now also carries the predict1 / predict2 columns.
mortgageDf

In [None]:
# Turn the logit probabilities into class labels with a 0.5 cut-off:
# 1 when the predicted probability exceeds 0.5, otherwise 0.
# The comparison is vectorized instead of calling a Python lambda once per row.
mortgageDf['yHat2'] = (mortgageDf['predict2'] > 0.5).astype(int)
mortgageDf

Hold-out

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows as a test set. The seed is fixed so that a
# Restart & Run All reproduces the same split, and therefore the same
# fitted model and accuracy, on every run.
trainSet, testSet = train_test_split(mortgageDf, test_size=0.15, random_state=55)

trainSet.head()

In [None]:
# (rows, columns) of the full data, the training split and the test split.
mortgageDf.shape, trainSet.shape, testSet.shape

In [None]:
# Logistic regression of y on x1 and x2, fitted on the training split only.
holdOutExog = sm.add_constant(trainSet[["x1", "x2"]])
modelHoldOut = sm.Logit(endog=trainSet["y"], exog=holdOutExog)
modelHoldOutFit = modelHoldOut.fit()
print(modelHoldOutFit.summary())

In [None]:
# Re-print the summary of model4Fit (the logit on x1 only, fitted on the full data).
# NOTE(review): the hold-out model uses x1 and x2; if this is meant as a
# like-for-like comparison, model2Fit may be the intended reference — confirm.
print(model4Fit.summary())

In [None]:
# Score the held-out rows with the model fitted on the training split.
# has_constant="add" forces the intercept column even if a predictor happens to be
# constant within the test rows; the default ("skip") would then omit it and the
# design matrix would no longer line up with the fitted coefficients.
testExog = sm.add_constant(testSet[["x1", "x2"]], has_constant="add")
predictHoldOut = modelHoldOutFit.predict(testExog)

# assign() returns a new frame instead of writing a column into the frame handed
# back by train_test_split, which avoids pandas' SettingWithCopyWarning and keeps
# the cell idempotent on re-run.
testSet = testSet.assign(predictHoldOut=predictHoldOut)
testSet

In [None]:
# Classify the held-out rows with a 0.5 cut-off and flag correct classifications.
# Both steps are vectorized (no per-row Python lambda) and use assign() so that
# no column is written into the frame returned by train_test_split.
testSet = testSet.assign(
    yHatHoldOut=(testSet['predictHoldOut'] > 0.5).astype(int),
)
testSet = testSet.assign(
    isHoldOutCorrect=(testSet['y'] == testSet['yHatHoldOut']).astype(int),
)
testSet

In [None]:
# Hold-out accuracy in percent: correctly classified test rows over all test rows.
correctCount = np.sum(testSet['isHoldOutCorrect'])
testSize = len(testSet['yHatHoldOut'])
accuracy = correctCount / testSize * 100
accuracy

K-Fold Cross-Validation

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Five shuffled folds; the fixed seed keeps fold membership stable across runs.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=55,
)


In [None]:
# K-fold cross-validation of the logistic model of y on x1 and x2.
# For every fold: fit on the training rows, classify the validation rows with a
# 0.5 cut-off and record the share of correct classifications in percent.
accuracies = []

# enumerate() numbers the folds from 1, replacing the manual counter.
# Fold-local names (foldTrain / foldVal / foldAccuracy) are used so the loop does
# not overwrite the hold-out trainSet and accuracy computed earlier in the notebook.
for experiment, (train_index, val_index) in enumerate(kf.split(mortgageDf), start=1):
    # Split the data
    foldTrain = mortgageDf.iloc[train_index]
    foldVal = mortgageDf.iloc[val_index]

    # Fit the model on the training rows of this fold
    trainModel = sm.Logit(
        foldTrain["y"],
        sm.add_constant(foldTrain[["x1", "x2"]]),
    )
    trainModelFit = trainModel.fit()

    # Predict on the validation rows. has_constant="add" forces the intercept column
    # even if a predictor is constant within a small fold; the default ("skip")
    # would omit it and break the match with the fitted coefficients.
    valExog = sm.add_constant(foldVal[["x1", "x2"]], has_constant="add")
    val_predictions = trainModelFit.predict(valExog)

    # Work on standalone Series instead of adding columns to an iloc slice of
    # mortgageDf (which triggers SettingWithCopyWarning), and compare vectorized
    # instead of applying a lambda row by row.
    yHatCross = (val_predictions > 0.5).astype(int)
    isCrossCorrect = foldVal["y"] == yHatCross
    foldAccuracy = isCrossCorrect.mean() * 100
    accuracies.append(foldAccuracy)

    # Print summary for each fold (optional)
    print(f'expr={experiment}')
    print(trainModelFit.summary())

In [None]:
# Per-fold validation accuracies, in percent.
accuracies

In [None]:
# Mean validation accuracy (percent) over all folds.
meanAccuracy = sum(accuracies) / len(accuracies)
print(f"Average accuracies across all folds: {meanAccuracy}")