# Installing libraries

In [None]:
# quick & dirty
# better to use a separate requirements.txt file
!pip install matplotlib numpy scipy scikit-learn 

# Importing libraries

In [None]:
# imports
import matplotlib.pyplot as plt
import numpy as np
from scipy.linalg import inv
from sklearn.datasets import make_regression, make_classification
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier, plot_tree

# shared keyword arguments: one fixed seed, passed to every call below that
# accepts `random_state`, so that results are reproducible
kwargs = {"random_state": 42}

# Linear regression

In [None]:
# generate a synthetic regression problem with a single feature
X, y = make_regression(
    n_samples=100,
    n_features=1,
    noise=20,
    bias=50,
    **kwargs,
)

# hold out 20% of the shuffled samples as test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    **kwargs,
)

# scatter plot of the training samples only
plt.scatter(X_train, y_train)
plt.show()

In [None]:
# fit linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# make predictions on test set
y_pred = lr.predict(X_test)

# plot training set, regression line, and errors on test set
plt.scatter(X_train, y_train)
plt.plot(X, lr.predict(X), color="green")
plt.scatter(X_test, y_test, color="red")
# one vertical segment per test sample, from the true target to the prediction
for x_i, y_true_i, y_pred_i in zip(X_test[:, 0], y_test, y_pred):
    plt.plot([x_i, x_i], [y_true_i, y_pred_i], color="red")
plt.show()

# calculate mean squared error and mean absolute error
for name, metric in (("MSE", mean_squared_error), ("MAE", mean_absolute_error)):
    print(f"{name}: {metric(y_test, y_pred)}")

# Multiple regression

In [None]:
# create dataset with 2 features
X, y = make_regression(
    n_samples=100,
    n_features=2,
    noise=20,
    bias=50,
    **kwargs,
)

# reference solution: fit scikit-learn's linear regression model
lr = LinearRegression().fit(X, y)

# calculate intercept and coefficients "by hand" via the normal equation
#   theta = (X^T X)^(-1) X^T y
# the prepended column of ones makes theta[0] the intercept
X = np.column_stack((np.ones(X.shape[0]), X))
theta = inv(X.T @ X) @ X.T @ y

# check that intercept and coefficients are equal
print(theta)
print(lr.intercept_, lr.coef_)

# Logistic regression

In [None]:
# create a binary classification dataset with two features
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    flip_y=0.05,
    n_clusters_per_class=1,
    **kwargs,
)

# split into training and test set (20% held out)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    **kwargs,
)

# visualize training dataset: one scatter call (and thus one color) per class
for label in np.unique(y_train):
    members = X_train[y_train == label]
    plt.scatter(members[:, 0], members[:, 1])

In [None]:
# fit logistic regression model on the training set
logr = LogisticRegression(max_iter=1000, **kwargs).fit(X_train, y_train)

# evaluate model: accuracy of the predictions on the held-out test set
y_pred = logr.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Decision trees / train-validation-test split

In [None]:
# create classification dataset with low signal-to-noise ratio
X, y = make_classification(
    n_samples=10000,
    n_classes=3,
    n_features=10,
    n_informative=2,
    n_redundant=8,
    flip_y=0.2,
    n_clusters_per_class=1,
    **kwargs,
)

# split into training, validation, and test set
# (test set: 20% of all samples; validation set: 20% of the remainder)
X_train_all, X_test, y_train_all, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, **kwargs
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all, y_train_all, test_size=0.2, shuffle=True, **kwargs
)

# compare decision trees with max_depth None (no limit) and 2 on validation set and visualize tree
for max_depth in (None, 2):
    dt = DecisionTreeClassifier(max_depth=max_depth, **kwargs)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    print("Accuracy:", accuracy_score(y_val, y_pred))
    # plot_tree draws on the current axes, so give every tree its own figure;
    # otherwise both trees end up on the same axes and cannot be compared
    plt.figure()
    plot_tree(dt)
    plt.title(f"max_depth={max_depth}")
    plt.show()

# max_depth 2 was better --> re-fit on training + validation set
dt = DecisionTreeClassifier(max_depth=2, **kwargs)
dt.fit(X_train_all, y_train_all)

# evaluate final model on test set
print("Accuracy:", accuracy_score(y_test, dt.predict(X_test)))

# Cross-validation / bias-variance trade-off

In [None]:
# compare max_depth from 1 to 50 using 5-fold cross-validation on training (+ validation) set
# refit=True re-trains the best configuration on all data passed to fit();
# return_train_score=True additionally records the training scores per fold
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(**kwargs),
    param_grid={"max_depth": range(1, 51)},
    cv=KFold(n_splits=5, shuffle=True, **kwargs),
    refit=True,
    return_train_score=True,
)
gs.fit(X_train_all, y_train_all)

# best configuration
gs.best_estimator_

In [None]:
# plot training and validation accuracies for all values of max_depth
# overfitting is apparent: training accuracy increases, validation accuracy decreases
res = gs.cv_results_
depths = res["param_max_depth"].data  # raw values of the masked parameter array
curves = (
    ("mean_test_score", "validation set"),
    ("mean_train_score", "training set"),
)
for score_key, curve_label in curves:
    plt.plot(depths, res[score_key], label=curve_label)
plt.xlabel("max_depth")
plt.ylabel("Mean accuracy")
plt.legend()
plt.show()

In [None]:
# evaluate model with best configuration on test set
# (predicting with the fitted grid search uses the refitted best estimator)
y_pred = gs.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(test_accuracy)