# Classification Practice

In [None]:
# Imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Import it only where it still exists so this cell runs on any version.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    pass
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# I made something for y'all

def eval_classification(model, model_name,
                        X_tr, X_te, y_tr, y_te,
                        to_print=False, average="binary"):
    '''
    Finds predictions for train and test sets, then computes
    (and optionally prints) metrics for classification

    Inputs:
    model : already-fit sklearn model
    model_name : string, name for index for output df
    X_tr : training X (can be scaled, that's fine)
    X_te : testing X
    y_tr : training target
    y_te : testing target
    to_print : boolean, will print output nicely if True
    average : string, forwarded as the `average` argument of
              recall_score, precision_score and f1_score. The default
              "binary" suits a two-class target; pass "macro", "micro"
              or "weighted" for a multi-class target. It has to give
              one number per metric, so average=None is not supported.

    Outputs:
    metric_df - one-row pandas Dataframe indexed by model_name, with a
                <metric>_train and <metric>_test column for each of
                accuracy, recall, precision and f1-score
    '''

    # accuracy_score has no `average` argument, so each metric carries
    # its own keyword arguments
    avg_kwargs = {"average": average}
    metrics = {"Accuracy": (accuracy_score, {}),
               "Recall": (recall_score, avg_kwargs),
               "Precision": (precision_score, avg_kwargs),
               "F1-Score": (f1_score, avg_kwargs)}

    y_pred_tr = model.predict(X_tr)
    y_pred_te = model.predict(X_te)

    # Column name -> score, in the order the columns should appear
    scores = {}

    for name, (metric_function, kwargs) in metrics.items():
        # Score once and reuse the numbers for both the table and the printout
        train_score = metric_function(y_tr, y_pred_tr, **kwargs)
        test_score = metric_function(y_te, y_pred_te, **kwargs)

        scores[f"{name.lower()}_train"] = train_score
        scores[f"{name.lower()}_test"] = test_score

        # Adding to-print option to print the metrics nicely
        if to_print:
            print(f"{name}:")
            print("="*len(name))
            print(f"TRAIN: {train_score:.4f}")
            print(f"TEST: {test_score:.4f}")
            print("*" * 15)

    # Build the single row in one go so the columns come out numeric
    metric_df = pd.DataFrame(scores, index=[model_name])

    return metric_df

In [None]:
# Data import
df = pd.read_csv('data/baseball_height_weight.csv')

In [None]:
df.head()

In [None]:
df['position'].value_counts()

## Problem Definition

I'm curious whether there are clear differences between types of baseball players in terms of their physical attributes. Let's see if we can build a model that predicts whether a player is a pitcher or not.

In [None]:
# Defining our target: 1 when the position text contains "_Pitcher"
# (ignoring case), 0 otherwise; a missing position counts as 0
df['pitcher'] = (
    df['position']
    .str.contains("_Pitcher", case=False, na=False)
    .astype(int)
)

In [None]:
df['pitcher'].value_counts()

In [None]:
# Baseline accuracy: the share of rows labelled pitcher, i.e. the
# accuracy we would get by always predicting pitcher (1)

len(df[df['pitcher'] == 1]) / len(df)

In [None]:
sns.pairplot(df)

## Modeling

#### Model 1: Vanilla Logistic Regression

In [None]:
# Defining our X and y
X = df[['height_in', 'weight_lb', 'age']]
y = df['pitcher']

In [None]:
# Train/test split
# stratify=y keeps the pitcher / non-pitcher proportions the same in the
# train and test sets, matching how the iris split is done further down
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
# Scaling our data
scaler = StandardScaler()

# Fit the scaler on the training rows only, then reuse those fitted
# parameters to transform the test rows
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [None]:
# Instantiating and fitting our first model
logreg = LogisticRegression()
logreg.fit(X_train_sc, y_train)

In [None]:
# Using our predefined function
logreg_scores = eval_classification(logreg, "logreg",
                                    X_train_sc, X_test_sc,
                                    y_train, y_test,
                                    to_print=True)

In [None]:
logreg_scores

#### Model 2: Balanced Logistic Regression

In [None]:
logreg_2 = LogisticRegression(class_weight='balanced')
logreg_2.fit(X_train_sc, y_train)

In [None]:
logreg_bal = eval_classification(logreg_2, "logreg_bal", 
                                 X_train_sc, X_test_sc, 
                                 y_train, y_test,
                                 to_print=True)

In [None]:
# Now, because we have these as dataframes with the same colnames:
metrics = pd.concat([logreg_scores, logreg_bal])
metrics

#### Model 3: KNN with K=3

In [None]:
# Instantiate and fit your model



In [None]:
# Evaluate

In [None]:
# How can I compare KNN results to my earlier logreg?
# Placeholder loop: walks k through the odd values 1, 3, 5, 7, 9
# and does nothing with them yet
for k in range(1, 11, 2):
    pass

In [None]:
metrics

## Evaluating

In [None]:
# Keep only the column names that contain "test"
test_cols = list(filter(lambda col: "test" in col, metrics.columns))

In [None]:
metrics[test_cols]

# Level Up: Multi-Class Classification

In [None]:
from sklearn.datasets import load_iris

In [None]:
iris = load_iris()

In [None]:
iris.keys()

In [None]:
X = iris['data']
y = iris['target']

In [None]:
iris_df = pd.DataFrame(X, columns = iris['feature_names'])
iris_df['target'] = y

In [None]:
iris_df.sample(n=10)

In [None]:
iris_df['target'].value_counts()

In [None]:
iris_df.describe()

## Modeling

In [None]:
# New train test split, now for iris data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
# Still need to scale
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [None]:
# Instantiate and fit a KNN classifier
knn = KNeighborsClassifier()
knn.fit(X_train_sc, y_train)

## Evaluating: How do we evaluate a multi-class model?

https://scikit-learn.org/stable/modules/model_evaluation.html#from-binary-to-multiclass-and-multilabel

In [None]:
print(f"Train: {knn.score(X_train_sc, y_train)}")
print(f"Test: {knn.score(X_test_sc, y_test)}")

In [None]:
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) replaces
# plot_confusion_matrix, which was removed in scikit-learn 1.2.
# The trailing semicolon hides the returned display object's repr.
ConfusionMatrixDisplay.from_estimator(knn, X_test_sc, y_test);