# Chapter 1

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# k-nearest-neighbours classifier configured to use 15 neighbours.
knn = KNeighborsClassifier(n_neighbors=15)

In [None]:
# X, y and X_new are not defined anywhere above this cell, so on a fresh kernel
# the original two lines raised NameError.  Build them here from the same two
# Iris features used in the rest of this chapter so the cell is self-contained.
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
X = iris.data[["sepal length (cm)", "sepal width (cm)"]]
y = iris.target
# Stand-in for previously unseen observations: three rows with the same columns as X.
X_new = X.sample(3, random_state=21)

knn.fit(X, y)
knn.predict(X_new)

In [None]:
# Measuring model performance.
# The Iris dataset is used here rather than the one shown in the videos.
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
# Predictors: the two sepal measurements only.  Labels: the dataset's target column.
X = iris["data"][["sepal length (cm)", "sepal width (cm)"]]
y = iris["target"]

In [None]:
# Hold out 30% of the rows for testing; stratify so both splits keep the label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=21,
)

In [None]:
# Fit a 6-neighbour classifier on the training split, then print its score on the test split.
knn = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)
print(knn.score(X_test, y_test))

In [None]:
# Set-up for the sweep over k: candidate neighbour counts 1..25 and one
# (initially empty) dict per split that will map k -> accuracy.
neighbors = np.arange(1, 26)
train_accuracies, test_accuracies = {}, {}

In [None]:
# Refit the classifier for every candidate k and record its score on both splits.
for neighbor in neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor).fit(X_train, y_train)
    train_accuracies[neighbor], test_accuracies[neighbor] = (
        knn.score(X_train, y_train),
        knn.score(X_test, y_test),
    )


In [None]:
# Display the training scores recorded by the sweep (dict keyed by neighbour count).
train_accuracies

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Accuracy on each split as a function of the number of neighbours.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(neighbors, list(train_accuracies.values()), label="Training accuracy")
ax.plot(neighbors, list(test_accuracies.values()), label="Testing accuracy")
ax.set_title("KNN: Varying number of neighbors")
ax.set_xlabel("Number of neighbors")
ax.set_ylabel("Accuracy")
ax.legend()
plt.show()

# Chapter 2
We pull the diabetes data from Kaggle.

## Introduction to Regression

In [None]:

import pandas as pd

# Download the zipped CSV from Kaggle, lower-case the column names, and keep
# only rows whose bmi AND glucose are both above 1e-6 (to better replicate the
# DataCamp data set).
# Alternative source that was tried:
#   diabetes_df = pd.read_csv("https://hbiostat.org/data/repo/diabetes.csv")
diabetes_df = (
    pd.read_csv(
        "https://www.kaggle.com/api/v1/datasets/download/saurabh00007/diabetescsv",
        compression="zip",
    )
    .rename(columns=str.lower)
    .loc[lambda df: (df["bmi"] > 1e-6) & (df["glucose"] > 1e-6)]
)
print(diabetes_df.head())

In [None]:
# Target is the glucose column; features are every other column.  Both are
# pulled out as NumPy arrays, which the print confirms.
y = diabetes_df["glucose"].to_numpy()
X = diabetes_df.drop(columns="glucose").to_numpy()
print(type(X), type(y))

In [None]:
# The shapes are different than in the tutorial.
# Select bmi by column name instead of by its position in X (previously
# X[:, 4]), so the cell does not silently pick another feature if the column
# order of the source file changes.
X_bmi = diabetes_df["bmi"].values
print(y.shape, X_bmi.shape)

In [None]:
# Turn the bmi vector into a single-column 2-D array before passing it to the estimator.
X_bmi = np.reshape(X_bmi, (-1, 1))
print(X_bmi.shape)

In [None]:
import matplotlib.pyplot as plt

# Blood glucose against body mass index.
fig, ax = plt.subplots()
ax.scatter(X_bmi, y)
ax.set_xlabel("Body Mass Index")
ax.set_ylabel("Blood Glucose (mg/dl)")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Regress glucose on bmi using all rows, then predict on those same rows.
reg = LinearRegression().fit(X_bmi, y)
predictions = reg.predict(X_bmi)

# Observations as points, fitted values as the red line.
fig, ax = plt.subplots()
ax.scatter(X_bmi, y)
ax.plot(X_bmi, predictions, color="red")
ax.set_xlabel("Body Mass Index")
ax.set_ylabel("Blood Glucose (mg/dl)")
plt.show()

## The Basics of Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# 70/30 split of the full feature matrix, then a linear model on all features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

reg_all = LinearRegression().fit(X_train, y_train)
y_pred = reg_all.predict(X_test)

In [None]:
# Score of the fitted linear model on the held-out split (R^2 for scikit-learn regressors).
reg_all.score(X_test, y_test)

In [None]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error
# Mean squared error of the held-out predictions.
print(mean_squared_error(y_test, y_pred))
# RMSE via the dedicated function: looks like mean_squared_error(..., squared=False) is no longer supported.
print(root_mean_squared_error(y_test, y_pred))

## Cross-validation

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Six shuffled folds with a fixed seed; score a fresh linear model on each fold
# of the full X, y.
reg = LinearRegression()
kf = KFold(n_splits=6, shuffle=True, random_state=42)
cv_results = cross_val_score(estimator=reg, X=X, y=y, cv=kf)

In [None]:
# One score per fold, as returned by cross_val_score.
print(cv_results)

In [None]:
# Centre and spread of the fold scores.
print(cv_results.mean(), cv_results.std())

In [None]:
# 2.5% and 97.5% quantiles of the fold scores, i.e. the bounds of the central 95%.
print(np.quantile(cv_results, [0.025, 0.975]))

## Regularized Regression
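Ridge and Lasso both penalise large coefficients. Lasso can additionally select the important features of a dataset, because it shrinks the coefficients of less important features to zero.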

In [None]:
from sklearn.linear_model import Ridge

# Fit Ridge at increasing penalty strengths and collect each model's test-split score.
scores = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    y_pred = ridge.predict(X_test)  # NOTE: Not used
    score = ridge.score(X_test, y_test)
    scores += [score]
print(scores)

In [None]:
from sklearn.linear_model import Lasso

# Same kind of sweep as the Ridge cell, but with Lasso.  Previously this loop
# constructed Ridge(alpha=...) under the name `lasso` and predicted with the
# stale `ridge` model left over from the Ridge cell, so no Lasso model was
# ever fitted and the printed scores were Ridge scores.
scores = []
for alpha in [0.01, 1.0, 10.0, 20.0, 50.0]:
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    score = lasso.score(X_test, y_test)
    scores.append(score)

print(scores)

In [None]:
from sklearn.linear_model import Lasso

# NOTE: "diabetespedigreefunction" is renamed to "dpf" here, mainly so the bar
# chart's tick labels render a bit better.
y = diabetes_df["glucose"].values
X = (
    diabetes_df
    .drop("glucose", axis=1)
    .rename(columns={"diabetespedigreefunction": "dpf"})
    .values
)
names = (
    diabetes_df
    .drop("glucose", axis=1)
    .rename(columns={"diabetespedigreefunction": "dpf"})
    .columns
)

# Fit on all rows and chart one coefficient per feature.
lasso = Lasso(alpha=0.1)
lasso.fit(X, y)
lasso_coef = lasso.coef_

plt.bar(names, lasso_coef)
plt.xticks(rotation=45)
plt.show()


# Chapter 3: Fine-Tuning Your Model

## How good is your model?

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Measuring model performance.
# The Iris dataset is used here rather than the one shown in the videos.
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
# Two sepal measurements as features, the dataset's target column as labels.
X = iris.data.loc[:, ["sepal length (cm)", "sepal width (cm)"]]
y = iris.target

In [None]:
# 60/40 split, then a 7-neighbour classifier fitted on the training part and
# used to label the test part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [None]:
# Confusion matrix of true test labels against the classifier's predictions.
print(confusion_matrix(y_test, y_pred))

In [None]:
# Text report of the classification metrics for the same test labels and predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Number of samples per class in the full Iris target.
iris.target.value_counts()

## Logistic Regression and the ROC Curve

In [None]:
# Measuring model performance.
# The breast cancer dataset provides a binary classification problem.
from sklearn.datasets import load_breast_cancer

dataset = load_breast_cancer(as_frame=True)
X, y = dataset.data, dataset.target

In [None]:
# from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# 70/30 split, then a logistic regression fitted on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# max_iter is raised to 3000 iterations.
reg = LogisticRegression(max_iter=3000).fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [None]:
# Score of the fitted logistic regression on the test split (mean accuracy for scikit-learn classifiers).
reg.score(X_test, y_test)

In [None]:
# Column 1 of predict_proba is kept as the per-row probability used for the ROC analysis.
class_probabilities = reg.predict_proba(X_test)
y_pred_probs = class_probabilities[:, 1]
print(y_pred_probs[:5])

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)

# Dashed diagonal as the reference line, ROC curve drawn on top of it.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], 'k--')
ax.plot(fpr, tpr)
ax.set_title('Logistic Regression ROC Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the predicted probabilities.
print(roc_auc_score(y_true=y_test, y_score=y_pred_probs))

In [None]:
# Manually checking fpr and tpr against roc_curve's output at one threshold.
threshold = thresholds[5]
print(fpr[5], tpr[5])
# Predicted label at this threshold: 1 where the probability reaches it, else 0.
pred_at_threshold = (y_pred_probs >= threshold).astype(int)
# TPR is the share of actual positives predicted positive; FPR is the share of
# actual negatives predicted positive.  The mean of the 0/1 predictions within
# each group is exactly that share.  The previous version read it from
# np.unique(..., return_counts=True) as cts[1] / (cts[1] + cts[0]), which raises
# IndexError whenever a group contains a single predicted class (for example,
# no false positives at a high threshold).
# (Filtering y_test by the prediction mask instead would be the wrong way round.)
print("Estimate of TPR: ", pred_at_threshold[y_test == 1].mean())
print("Estimate of FPR: ", pred_at_threshold[y_test == 0].mean())

In [None]:
# Confusion matrix when a row is labelled 1 iff its probability is at least thresholds[5].
print(confusion_matrix(y_test, (y_pred_probs >= thresholds[5]).astype(int)))

## Hyperparameter Tuning

In [None]:
# Reading the diabetes dataset back in
import pandas as pd

diabetes_df = pd.read_csv(
    "https://www.kaggle.com/api/v1/datasets/download/saurabh00007/diabetescsv",
    compression="zip",
)
diabetes_df = diabetes_df.rename(columns=str.lower)
# Keep only rows whose bmi and glucose both exceed 1e-6, to better replicate
# the DataCamp data set.
keep_rows = (diabetes_df["bmi"] > 1e-6) & (diabetes_df["glucose"] > 1e-6)
diabetes_df = diabetes_df.loc[keep_rows]
print(diabetes_df.head())

# Glucose is the target; all remaining columns are features.
y = diabetes_df["glucose"].values
X = diabetes_df.drop(columns="glucose").values
print(type(X), type(y))

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.model_selection import GridSearchCV, KFold # , cross_val_score
from sklearn.linear_model import Ridge

kf = KFold(n_splits=5, shuffle=True, random_state=42)
# np.arange(0.0001, 1, 10) uses a STEP of 10, so it yields the single value
# 0.0001 and the "grid" had one alpha.  np.linspace gives 10 evenly spaced
# alphas across [0.0001, 1].
param_grid = {"alpha": np.linspace(0.0001, 1, 10), "solver": ["sag", "lsqr"]}
# Search over the fresh estimator created here.  Previously the search was
# handed `ridge`, a model left over from the Chapter 2 loop, while `reg` went
# unused; that only worked through leaked kernel state.
reg = Ridge()
ridge_cv = GridSearchCV(reg, param_grid, cv=kf)
ridge_cv.fit(X_train, y_train)
print(ridge_cv.best_params_, ridge_cv.best_score_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.linear_model import Ridge

kf = KFold(n_splits=5, shuffle=True, random_state=42)
# np.arange(0.0001, 1, 10) uses a STEP of 10 and yields only 0.0001;
# np.linspace gives 10 evenly spaced alphas across [0.0001, 1].
param_grid = {"alpha": np.linspace(0.0001, 1, 10), "solver": ["sag", "lsqr"]}
# Search over the fresh estimator created here rather than the `ridge` object
# leaked from an earlier cell (`reg` was previously created but never used).
reg = Ridge()
# n_iter=2 samples two of the 20 parameter combinations; random_state fixes
# which two, so the cell gives the same result on every run.
ridge_cv = RandomizedSearchCV(reg, param_grid, cv=kf, n_iter=2, random_state=42)
ridge_cv.fit(X_train, y_train)
print(ridge_cv.best_params_, ridge_cv.best_score_)

In [None]:
# Score the fitted search object on the held-out split and print it.
test_score = ridge_cv.score(X_test, y_test)
print(test_score)

# Chapter 4: Preprocessing and Pipelines

## Preprocessing data

In [None]:
import pandas as pd

# Zipped music data set downloaded from Kaggle.  Other Kaggle data sets that were tried:
#   'https://www.kaggle.com/api/v1/datasets/download/shantanuchaubey/musiccsv'
#   'https://www.kaggle.com/api/v1/datasets/download/akiboy96/spotify-hit-predictor-merged-with-genre'
music_df = pd.read_csv(
    'https://www.kaggle.com/api/v1/datasets/download/saurabhshahane/music-dataset-1950-to-2019',
    compression='zip',
)
# One indicator column per genre, with the first level dropped.
music_dummies = pd.get_dummies(music_df.genre, drop_first=True)
print(music_dummies.head())

In [None]:
# Preview the first rows of the downloaded frame.
music_df.head()

In [None]:
# Append the genre indicator columns to the original frame, then remove the
# text "genre" column they replace.
music_dummies2 = (
    pd.concat([music_df, music_dummies], axis=1)
    .drop(columns="genre")
)

In [None]:
# # Encoding dummy variables
# music_dummies = pd.get_dummies(music_df, drop_first=True)
# print(music_dummies.columns)

In [None]:
# Linear regression with dummy variables
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np

# I couldn't find the course's data set online, so the frame is assembled from
# two columns of the Kaggle data plus the genre dummies, and the response is
# "loudness" instead of "popularity".
# (Not needed here: music_dummies = music_dummies.drop("genre", axis=1))
response_variable = "loudness" # "popularity"
data = pd.concat(
    [music_df.loc[:, ["release_date", "loudness"]], music_dummies],
    axis=1,
)

In [None]:
# Display the assembled modelling frame.
data

In [None]:
# Response column as y, every other column as X, both as NumPy arrays.
y = data[response_variable].values
X = data.drop(columns=response_variable).values

# 80/20 split, plus five shuffled folds for cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Cross-validate a linear model on the training split, scored by negated MSE.
linreg = LinearRegression()
linreg_cv = cross_val_score(
    linreg,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=kf,
)

In [None]:
# The fold scores are negated MSEs, so flip the sign and take the square root to get per-fold RMSE.
print(np.sqrt(-linreg_cv))

## Handling missing data
This section is not run as we don't have the actual data set.

In [None]:
# Dropping missing data
# Column names are lower-case elsewhere in this notebook (for example
# music_df["loudness"]); the capitalised "Loudness"/"Liveness" used before
# would raise KeyError.
# NOTE(review): this section is described as not run; confirm the loaded data
# set actually provides all five columns before enabling it.
music_df = music_df.dropna(subset=["genre", "popularity", "loudness", "liveness", "tempo"])
# Remaining missing-value count per column, smallest first.
print(music_df.isna().sum().sort_values())

In [None]:
# Imputation with scikit-learn
from sklearn.impute import SimpleImputer

# Categorical feature (genre) as a single column, numeric features as the rest,
# popularity as the target.
X_cat = music_df["genre"].values.reshape(-1, 1)
X_num = music_df.drop(["genre", "popularity"], axis=1).values
y = music_df["popularity"].values

# Both calls use the same test_size and random_state so the categorical and
# numeric parts are split on the same rows.
X_train_cat, X_test_cat, y_train, y_test = train_test_split(
    X_cat, y, test_size=0.2, random_state=12
)
# The "=" was missing on this statement, which made the whole cell a SyntaxError.
X_train_num, X_test_num, y_train, y_test = train_test_split(
    X_num, y, test_size=0.2, random_state=12
)

# Categorical: fill with the most frequent value learned from the training rows.
imp_cat = SimpleImputer(strategy="most_frequent")
X_train_cat = imp_cat.fit_transform(X_train_cat)
X_test_cat = imp_cat.transform(X_test_cat)

# Numeric: SimpleImputer's default strategy.  The test rows must be filled with
# statistics learned from the TRAINING rows, so use transform here; the former
# fit_transform re-fitted the imputer on the test set.
imp_num = SimpleImputer()
X_train_num = imp_num.fit_transform(X_train_num)
X_test_num = imp_num.transform(X_test_num)

# Recombine numeric and categorical columns side by side.
X_train = np.append(X_train_num, X_train_cat, axis=1)
X_test = np.append(X_test_num, X_test_cat, axis=1)


In [None]:
from sklearn.pipeline import Pipeline

# Column names are lower-case elsewhere in this notebook (for example
# music_df["loudness"]); the capitalised "Loudness"/"Liveness" used before
# would raise KeyError.
music_df = music_df.dropna(subset=["genre", "popularity", "loudness", "liveness", "tempo"])
# Binary target: 1 for "Rock", 0 for every other genre.  .assign returns a new
# frame instead of writing a column into the dropna result in place.
music_df = music_df.assign(genre=np.where(music_df["genre"] == "Rock", 1, 0))
X = music_df.drop("genre", axis=1).values
y = music_df["genre"].values

In [None]:
# Impute missing values, then classify, as one pipeline fitted on a 70/30 split.
steps = [
    ("imputation", SimpleImputer()),
    ("logistic_regresion", LogisticRegression()),
]
pipeline = Pipeline(steps=steps)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

## Centering and scaling

In [None]:
# Scaling in scikit-learn
from sklearn.preprocessing import StandardScaler

y = music_df["genre"].values
X = music_df.drop(columns="genre").values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Mean and standard deviation before scaling (all of X) and after (training split).
print(X.mean(), X.std())
print(X_train_scaled.mean(), X_train_scaled.std())

In [None]:
# Scaling in a pipeline
steps = [("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=6))]
pipeline = Pipeline(steps=steps)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

# Pipeline.fit returns the pipeline itself, so knn_scaled is the fitted pipeline.
knn_scaled = pipeline.fit(X_train, y_train)
y_pred = knn_scaled.predict(X_test)
print(knn_scaled.score(X_test, y_test))

In [None]:
# Comparing performance using unscaled data: same split settings and the same
# k as the scaled pipeline, but no scaler.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

knn_unscaled = KNeighborsClassifier(n_neighbors=6)
knn_unscaled.fit(X_train, y_train)
print(knn_unscaled.score(X_test, y_test))

In [None]:
# CV and scaling in a pipeline
from sklearn.model_selection import GridSearchCV

steps = [("scaler", StandardScaler()), ("knn", KNeighborsClassifier())]
pipeline = Pipeline(steps=steps)
# "<step name>__<parameter>" addresses a parameter of one pipeline step; try k = 1..49.
parameters = {"knn__n_neighbors": np.arange(1, 50)}

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)

y_pred = cv.predict(X_test)
print(cv.best_score_)
print(cv.best_params_)

## Evaluating multiple models

In [None]:
# Display the current music frame.
music_df

In [None]:
# Evaluating classification models
import os

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# This time we'll use the actual DataCamp music data set
import pandas as pd

# The CSV used to be read from an absolute path inside one user's home
# directory, which fails on any other machine.  Default to a file next to the
# notebook and allow an override through the MUSIC_CLEAN_CSV environment variable.
# NOTE(review): confirm music_clean.csv lives in the notebook's working directory.
music_csv_path = os.environ.get("MUSIC_CLEAN_CSV", "music_clean.csv")
music_df = pd.read_csv(music_csv_path).drop(
    columns=["Unnamed: 0"]
)
# music_dummies = pd.get_dummies(music_df["genre"], drop_first=True)
# music = pd.concat([music_df, music_dummies], axis=1)
# This cleaned data set seems to have a simple form of the genre column, no need to create dummies
music = music_df

X = music.drop("genre", axis=1).values
y = music["genre"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Evaluating classification models
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
}

# Every model is scored with the same six shuffled, seeded folds.
kf = KFold(n_splits=6, random_state=42, shuffle=True)
results = []
for model in models.values():
    cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kf)
    results += [cv_results]

# One box of fold scores per model.
# NOTE(review): newer Matplotlib releases rename boxplot's `labels` argument to
# `tick_labels`; confirm against the installed version.
plt.boxplot(results, labels=models.keys())
plt.show()

In [None]:
# Test set performance: refit each model on the scaled training split and
# report its score on the scaled test split.
for name, model in models.items():
    test_score = model.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)
    print("{} Test Set Accuracy: {}".format(name, test_score))

# Experiments

In [None]:
from sklearn.neighbors import KDTree
import numpy as np

# Six 2-D points, indexed in a KD-tree with the Euclidean metric.
X = np.array(
    [
        [-1, -1],
        [-2, -1],
        [-3, -2],
        [1, 1],
        [2, 1],
        [3, 2],
    ]
)
kdt = KDTree(X, metric='euclidean', leaf_size=30)
# Query the tree with its own points, asking for the indices of the 2 nearest entries to each.
kdt.query(X, k=2, return_distance=False)

In [None]:
# Inspect the arrays held by the KD-tree built above.
kdt.get_arrays()