# Logistic Regression

## Helper Functions

In [None]:
class LinearModel:
    """
    Base class for linear models.

    Stores the solver hyper-parameters and the parameter vector ``theta``;
    subclasses supply the actual ``fit``, ``transform`` and ``predict`` logic.
    """

    def __init__(self, step_size=0.2, max_iter=100, eps=1e-5,
                 theta_0=None, verbose=True):
        """
        Args:
            step_size: Step size for iterative solvers only.
            max_iter: Maximum number of iterations for the solver.
            eps: Threshold for determining convergence.
            theta_0: Initial guess for theta. If None, use the zero vector.
            verbose: Print loss values during training.
        """
        # theta doubles as the initial guess and, after fitting, the learned
        # parameters; it stays None until a guess is given or fit() sets it.
        self.theta = theta_0
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose

    def fit(self, x, y):
        """
        Run solver to fit linear model.

        Args:
            x: Training example inputs. Shape (m, n).
            y: Training example labels. Shape (m,).

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('Subclass of LinearModel must implement fit method.')

    def transform(self, x):
        """
        Transforms the input data to the feature space learned from the training data.

        Args:
            x: Inputs of shape (m, n).

        Returns:
            Transformed inputs of shape (m, n').

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('Subclass of LinearModel must implement transform method.')

    def set_params(self, **kwargs):
        """Set the parameters of this model.

        Every keyword is assigned as an attribute of the same name; no
        validation of the names is performed.

        Args:
            **kwargs: A dictionary of parameters and their values.

        Returns:
            This instance, so calls can be chained (the convention followed by
            scikit-learn estimators).
        """
        for name, value in kwargs.items():
            setattr(self, name, value)
        return self

    def predict(self, x):
        """
        Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (m, n).

        Returns:
            Outputs of shape (m,).

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('Subclass of LinearModel must implement predict method.')

In [None]:
import matplotlib.pyplot as plt
import numpy as np


def add_intercept(x):
    """Prepend a column of ones to a 2D array.

    Args:
        x: 2D NumPy array.

    Returns:
        A new array with one more column than x: column 0 holds 1's and the
        remaining columns are a copy of x. The dtype of x is preserved.
    """
    n_rows, n_cols = x.shape[0], x.shape[1]

    # Start from all ones so only the original features need copying in.
    with_bias = np.ones((n_rows, n_cols + 1), dtype=x.dtype)
    with_bias[:, 1:] = x
    return with_bias


def load_dataset(csv_path, label_col='y', add_intercept=False):
    """Load dataset from a CSV file.

    The first line of the file is a comma-separated header. Columns whose name
    starts with 'x' are features; the column named `label_col` is the label.

    Args:
         csv_path: Path to CSV file containing dataset.
         label_col: Name of column to use as labels (should be 'y' or 't').
         add_intercept: Add an intercept entry to x-values.

    Returns:
        xs: Numpy array of x-values (inputs), always 2D with one row per
            example (also for a single example or a single feature).
        ys: Numpy array of y-values (labels), one entry per example.

    Raises:
        ValueError: If label_col is not 'y' or 't', or if the header contains
            no feature column or no column named label_col.
    """

    def add_intercept_fn(x):
        # The boolean parameter `add_intercept` shadows the module-level
        # function of the same name inside load_dataset; the global
        # declaration makes this helper resolve the function instead.
        global add_intercept
        return add_intercept(x)

    # Validate label_col argument
    allowed_label_cols = ('y', 't')
    if label_col not in allowed_label_cols:
        raise ValueError('Invalid label_col: {} (expected {})'
                         .format(label_col, allowed_label_cols))

    # Load headers
    with open(csv_path, 'r') as csv_fh:
        headers = csv_fh.readline().strip().split(',')

    # Locate feature and label columns by header name
    x_cols = [i for i, header in enumerate(headers) if header.startswith('x')]
    l_cols = [i for i, header in enumerate(headers) if header == label_col]
    if not x_cols:
        raise ValueError('No feature columns (names starting with "x") in {}'
                         .format(csv_path))
    if not l_cols:
        raise ValueError('Label column {!r} not found in {}'
                         .format(label_col, csv_path))

    # ndmin keeps the shapes stable: without it a single-row file would come
    # back 1D and be indistinguishable from a single-feature file.
    inputs = np.loadtxt(csv_path, delimiter=',', skiprows=1, usecols=x_cols,
                        ndmin=2)
    labels = np.loadtxt(csv_path, delimiter=',', skiprows=1, usecols=l_cols,
                        ndmin=1)

    if add_intercept:
        inputs = add_intercept_fn(inputs)

    return inputs, labels

## Logistic Regression

### Load Data

In [None]:
# Load the training and validation splits, prepending a column of ones so
# that the first entry of theta acts as the intercept (bias) term.
x_train, y_train = load_dataset('ds1_train.csv', add_intercept=True)
x_eval, y_eval = load_dataset('ds1_valid.csv', add_intercept=True)

### In `sklearn` format

In [None]:
class LogisticRegression(LinearModel):
    """Logistic regression with Newton's Method as the solver.

    The model has no separate bias term: include a column of ones in x if an
    intercept is wanted.

    Example usage:
        > clf = LogisticRegression()
        > clf.fit(x_train, y_train)
        > clf.predict(x_eval)
    """

    def transform(self, x):
        """Return x unchanged; this model learns no feature transformation.

        Args:
            x: Inputs of shape (m, n).

        Returns:
            The same object x.
        """
        return x

    def fit(self, x, y):
        """Run Newton's Method to minimize J(theta) for logistic regression.

        theta is (re-)initialised to the zero vector on every call. Iteration
        stops when the L1 norm of the update falls below `self.eps`, or after
        `self.max_iter` updates, whichever comes first. `self.step_size` is
        not used: Newton's method takes full steps.

        Args:
            x: Training example inputs. Shape (m, n).
            y: Training example labels. Shape (m,).

        Returns:
            This fitted instance.
        """
        m, n = x.shape
        self.theta = np.zeros(n)

        # Bounded loop: the solver cannot spin forever if it fails to converge.
        for _ in range(self.max_iter):
            hx = 1 / (1 + np.exp(-x.dot(self.theta)))

            # Hessian and gradient of the average log-likelihood. hx * (hx - 1)
            # is non-positive, so H is negative semi-definite and the update
            # below moves theta towards the likelihood maximum.
            H = (x.T * hx * (hx - 1)).dot(x) / m
            grad_l = x.T.dot(y - hx) / m

            # Solve H @ step = grad_l rather than forming the explicit inverse.
            step = np.linalg.solve(H, grad_l)
            self.theta = self.theta - step

            if np.linalg.norm(step, ord=1) < self.eps:
                break

        return self

    def predict(self, x):
        """Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (m, n).

        Returns:
            Outputs of shape (m,): the sigmoid of x.dot(theta), i.e. the
            predicted probability of the positive class (not a hard label).
        """
        hx = x.dot(self.theta)
        hx = 1 / (1 + np.exp(-hx))

        return hx

    def score(self, x, y):
        """
        Return accuracy of model on a given dataset (x, y).

        Args:
            x: Inputs of shape (m, n).
            y: Outputs of shape (m,).

        Returns:
            Accuracy (between 0.0 and 1.0).
        """
        # predict() yields probabilities; threshold them at 0.5 to obtain hard
        # labels before comparing with y.
        y_pred = self.predict(x) > 0.5
        return (y_pred == y).sum() / len(y)

### Train!

In [None]:
# Fit the Newton's-method logistic regression on the training split.
model = LogisticRegression()
model.fit(x_train, y_train)

# predict() returns probabilities, so threshold at 0.5 to get hard labels
# before measuring accuracy on the validation split.
y_pred = model.predict(x_eval) > 0.5
print(f"Achieved {100*(y_pred == y_eval).sum()/len(y_eval):.2f}% accuracy.")

### But what's the big deal about using sklearn? I'll talk about `pipelines`, which are very similar to `nn.Sequential` or `keras.Sequential` and so save us a lot of coding. Check out the next cell.

In [None]:
!pip install -q scikit-learn
import sklearn
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Reload both splits without the column of ones; the data is passed through
# a StandardScaler in the pipeline below.
x_train, y_train = load_dataset('ds1_train.csv', add_intercept=False)
x_eval, y_eval = load_dataset('ds1_valid.csv', add_intercept=False)
# try add_intercept=True, can you guess why it fails?!

In [None]:
# Chain feature standardisation and the custom estimator into one object
# that is fitted and applied as a unit.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('estimator', LogisticRegression())])
pipe.fit(x_train, y_train)

# The pipeline's predict() ends in LogisticRegression.predict, which returns
# probabilities, so threshold at 0.5 as before.
y_pred = pipe.predict(x_eval) > 0.5
print(f"Achieved {100*(y_pred == y_eval).sum()/len(y_eval):.2f}% accuracy.")

### Although the accuracy dropped, this helps with writing (annoying) cross-validation code!
Note the `__` separator (pipeline step name, then parameter name) and the warnings, which are fine.

In [None]:
# Parameters of a pipeline step are addressed as "<step name>__<parameter>".
# NOTE(review): LogisticRegression.fit never reads step_size, so every
# candidate here trains the same model -- confirm this grid is only meant to
# demonstrate the API.
param_grid = {"estimator__step_size":[1, 0.1, 1e-3, 1e-4]}
# NOTE(review): scoring='accuracy' is computed from predict(), which returns
# probabilities rather than hard labels; presumably this is the source of the
# warnings mentioned above -- verify.
search = GridSearchCV(pipe, param_grid, scoring='accuracy')
search.fit(x_train, y_train)

print(search.best_params_)

# Evaluate the refitted best pipeline on the validation split.
y_pred = search.best_estimator_.predict(x_eval) > 0.5
print(f"Achieved {100*(y_pred == y_eval).sum()/len(y_eval):.2f}% accuracy.")

## Poisson regression

### You basically have to implement the full workflow above (not just the pipeline!): load the data, then write the `.fit()` and `.predict()` methods.

### Poisson regression

In [None]:
class PoissonRegression(LinearModel):
    """
    Poisson Regression.

    Exercise skeleton: `fit` and `predict` are left for the reader to
    implement between the START/END markers. Until then both methods do
    nothing and return None.

    Example usage:
        > clf = PoissonRegression(step_size=lr)
        > clf.fit(x_train, y_train)
        > clf.predict(x_eval)
    """

    def fit(self, x, y):
        """
        Run gradient ascent to maximize likelihood for Poisson regression.

        Not implemented yet; the body is an exercise placeholder.

        Args:
            x: Training example inputs. Shape (m, n).
            y: Training example labels. Shape (m,).
        """
        # *** START CODE HERE ***
        
        # *** END CODE HERE ***

    def predict(self, x):
        """
        Make a prediction given inputs x.

        Not implemented yet; the body is an exercise placeholder.

        Args:
            x: Inputs of shape (m, n).

        Returns:
            Floating-point prediction for each input, shape (m,).
        """
        # *** START CODE HERE ***
        # *** END CODE HERE ***

### Train!

This tutorial was made by Karan and borrows from the official <a href="https://scikit-learn.org/stable/">scikit-learn docs</a>.