In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Exercise 1: Defining The Problem And Data Collection
#
# Problem statement:
#   Build a predictive model that identifies loan applicants who are likely
#   to default on their loans.
#
# Data collection (hypothetical plan): the code in this notebook runs against a
# sample dataset, read below from a CSV file given by a relative path.
loan_csv_path = 'loan_data.csv'
data = pd.read_csv(loan_csv_path)

In [None]:

# Exercise 2: Feature Selection And Model Choice
# The column names used below are assumed to exist in `data` -- TODO confirm
# against the actual schema of the loaded CSV.

# Feature Selection: the predictors handed to every model in this notebook.
selected_features = ['age', 'income', 'loan_amount', 'repayment_history', 'credit_score']
X = data[selected_features]
y = data['default']  # Assuming 'default' is the target variable

# Train-Test Split: hold out 20% of the rows for evaluation, with a fixed seed
# so the partition is reproducible.
# stratify=y keeps the class proportions of `y` the same in the train and test
# partitions, so precision/recall/F1 on the test set are not distorted by a
# split that happens to under-sample one class (default labels are presumably
# imbalanced -- TODO confirm on the real data).
# Note: stratification raises ValueError if any class has fewer than 2 rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Exercise 3: Training, Evaluating, And Optimizing The Model
# Model Training with Logistic Regression
# max_iter is raised from scikit-learn's default of 100: the features are fed
# to the solver unscaled and sit on very different magnitudes (e.g. age vs.
# income), a situation in which the default iteration budget commonly ends
# with a ConvergenceWarning and coefficients that are not fully optimised.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Model Prediction
y_pred = log_reg.predict(X_test)  # hard class labels for the held-out rows
# Column 1 of predict_proba is the probability of the second entry of
# log_reg.classes_ (assumed to be the "default" class, i.e. labels are 0/1 --
# TODO confirm).
y_proba = log_reg.predict_proba(X_test)[:, 1]

# Model Evaluation: label-based metrics use y_pred, ROC AUC uses the scores.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

In [None]:
# ROC curve points, built from the logistic-regression scores
# (column 1 of predict_proba) on the held-out rows.
positive_scores = log_reg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, positive_scores)

# Confusion matrix of actual vs. predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)

# Report the scalar metrics computed in the training cell.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")

# Figure 1: confusion matrix drawn as an annotated heatmap.
cm_fig, cm_ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

# Figure 2: ROC curve, with the diagonal of a random classifier as reference.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])  # small headroom so the curve is not clipped at TPR = 1
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# Hyperparameter Tuning with Grid Search
# 3 * 4 * 3 = 36 parameter combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(random_state=42)
# n_jobs=-1 runs the 180 cross-validation fits on all available cores; the
# selected parameters are unaffected because the forest is seeded.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# best_estimator_ is the forest refit on the full training set with the
# best-scoring parameter combination.
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)
# Score the test set once and reuse the result for both ROC AUC and the ROC
# curve (column 1 = second entry of best_rf.classes_, assumed to be the
# "default" class -- TODO confirm).
y_proba_rf = best_rf.predict_proba(X_test)[:, 1]

# Evaluating the Tuned Random Forest Model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_proba_rf)

print(f"Random Forest - Accuracy: {accuracy_rf}")
print(f"Random Forest - Precision: {precision_rf}")
print(f"Random Forest - Recall: {recall_rf}")
print(f"Random Forest - F1 Score: {f1_rf}")
print(f"Random Forest - ROC AUC: {roc_auc_rf}")

# Plot ROC Curve for Random Forest
# fpr, tpr and roc_auc are the logistic-regression results from the earlier
# cells, overlaid here for a side-by-side comparison.
fpr_rf, tpr_rf, _rf = roc_curve(y_test, y_proba_rf)
plt.figure()
plt.plot(fpr_rf, tpr_rf, label='Random Forest (area = %0.2f)' % roc_auc_rf)
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # diagonal = random-classifier reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Exercise 4: Designing Machine Learning Solutions For Specific Problems

# Predicting Stock Prices: Supervised Learning (Regression)
# - Using historical stock prices to predict future prices
# - Model: Linear Regression, LSTM, ARIMA

# Organizing a Library of Books: Unsupervised Learning (Clustering)
# - Grouping books into genres or categories
# - Model: K-Means Clustering, Hierarchical Clustering

# Programming a Robot to Navigate a Maze: Reinforcement Learning
# - Teaching a robot to find the shortest path in a maze
# - Model: Q-Learning, Deep Q-Networks (DQN)

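# Example pseudocode for these tasks (kept commented out: the datasets, features
# and environment named below are placeholders, not variables defined above):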
# Stock Prices
# from sklearn.linear_model import LinearRegression
# lr = LinearRegression()
# lr.fit(X_train_stock, y_train_stock)
# y_pred_stock = lr.predict(X_test_stock)

# Books Clustering
# from sklearn.cluster import KMeans
# kmeans = KMeans(n_clusters=5)
# kmeans.fit(book_features)
# clusters = kmeans.predict(book_features)

# Robot Navigation
# from some_reinforcement_learning_library import QLearningAgent
# agent = QLearningAgent()
# agent.train(maze_environment)

In [None]:
# Exercise 5: Designing An Evaluation Strategy For Different ML Models

# Supervised Learning Model (Classification)
# Model Choice: Logistic Regression, Random Forest
# Evaluation Strategy:
# - Metrics: Accuracy, Precision, Recall, F1-Score, ROC-AUC
# - Methods: Cross-validation, Confusion Matrix, ROC Curves

# Unsupervised Learning Model (Clustering)
# Model Choice: K-Means, Hierarchical Clustering
# Evaluation Strategy:
# - Techniques: Silhouette Score, Elbow Method, Cluster Validation Metrics

# Reinforcement Learning Model
# Model Choice: Q-Learning, DQN
# Evaluation Strategy:
# - Metrics: Cumulative Reward, Convergence, Exploration vs. Exploitation Balance