# Advanced Programming for AI

# HW8 Classification1

## Solutions

### Use the following data set for the problems

In [2]:
import pandas as pd
from sklearn.datasets import make_classification

# Synthetic 3-class problem: 1000 samples, 20 features (18 informative, 2 redundant).
# random_state is fixed so the same data set is generated on every
# "Restart Kernel -> Run All"; without it each run draws a different data set.
X, y = make_classification(n_samples=1000, n_features=20, n_informative=18, n_redundant=2,
                           n_repeated=0, n_classes=3, random_state=42)

# Feature columns keep pandas' default integer names; the labels go in a 'class' column.
data = pd.concat([pd.DataFrame(X), pd.DataFrame(y, columns=['class'])], axis=1)

# Problem 1: Make a python class that returns a train test validation split, call it `splitter`.

* Let the arguments be `splitter(data,target)`
    * `data` is the full data set, *including* the target column
    * `target` is the name of the target column; e.g. `'class'`, so that the labels are `data[target]`
* Let there be *optional* arguments of `train_split`, `test_split`, `val_split` to describe the fractional distributions of the whole dataset
    * If those arguments *aren't* specified, then let the train split be 70%, the test split be 15% and the val split be 15%
        * *optional* means their initialized value is `None`
    * Raise a `ValueError('splits must add to 1')` **if** the splits don't add to 1
* Create a method called `split()` that returns `X_train,y_train,X_test,y_test,X_val,y_val` in that order
    * Instantiating it would look like `X_train,y_train,X_test,y_test,X_val,y_val = splitter(data,'class').split()`

### Hint: The beginning of the class should look like:

```python
from sklearn.model_selection import train_test_split
class splitter:
    def __init__(self,data,target,train_split=None,test_split=None,val_split=None):
        self.y = data[target]
        self.X = data.drop(target,axis=1)
        if train_split is None:
            self.train_split=0.7
        else:
            self.train_split=train_split
        # .... finish the rest...
```

In [3]:
from sklearn.model_selection import train_test_split
class splitter:
    """Split a labelled data set into train, test and validation partitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set, including the target column.
    target : str
        Name of the target column in ``data``.
    train_split, test_split, val_split : float, optional
        Fraction of the data set assigned to each partition. When left as
        ``None`` they default to 0.7, 0.15 and 0.15 respectively.

    Raises
    ------
    ValueError
        If the three fractions do not add up to 1.
    """

    def __init__(self,data,target,train_split=None,test_split=None,val_split=None):
        self.y = data[target]
        self.X = data.drop(target,axis=1)

        self.train_split = 0.7 if train_split is None else train_split
        self.test_split = 0.15 if test_split is None else test_split
        self.val_split = 0.15 if val_split is None else val_split

        # Compare with a tolerance: an exact `== 1` rejects valid inputs such as
        # 0.6 + 0.3 + 0.1, which evaluates to 0.9999999999999999 in floating point.
        total = self.train_split + self.test_split + self.val_split
        if abs(total - 1) > 1e-9:
            raise ValueError('splits must add to 1')

    def split(self, random_state=None):
        """Return ``X_train, y_train, X_test, y_test, X_val, y_val`` (in that order).

        The data is split in two stages: first train vs. hold-out, then the
        hold-out portion is divided into test and validation.

        Parameters
        ----------
        random_state : optional
            Forwarded to both ``train_test_split`` calls; pass a fixed value
            for a reproducible split. ``None`` (the default) keeps the split
            random.
        """
        # Fraction of the whole data set that is NOT used for training.
        holdout_size = self.test_split + self.val_split
        # Validation fraction expressed relative to the hold-out portion.
        val_size = self.val_split / holdout_size

        # Use the data stored on the instance, not whatever `X`/`y` happen to
        # exist as notebook globals.
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            self.X, self.y, test_size=holdout_size, random_state=random_state)
        X_test, X_val, y_test, y_val = train_test_split(
            X_holdout, y_holdout, test_size=val_size, random_state=random_state)
        return X_train, y_train, X_test, y_test, X_val, y_val

In [4]:
# Split the full data set with the default 70/15/15 fractions and unpack the
# six partitions in the order split() returns them.
(X_train, y_train,
 X_test, y_test,
 X_val, y_val) = splitter(data, 'class').split()

# Problem 2: Make a class that fits the model just like example 8 in the lecture 8 notebook. That class should inherit the `splitter` class and return a fitted model


* The class should be instantiated using
    * `model(LogisticRegression,[0.0001,0.001,0.01,0.1,1,10],data,'class')`
* Include a `fit()` method that runs through all the possible hyper parameters, finding the best one
* Include a `best_model()` method that finds the best model just like example 6
* Include a `report(y_pred,y_obs)` method that returns an `accuracy_score`



### Hint: The beginning of the class should look like this (with a few more things to add to it in the constructor)

```python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

class model(splitter):
    def __init__(self,model,parameters,data,target):
        self.model = model
        self.model_best = model
        self.parameters = parameters
        self.score_reports = {}
        self.scores = {}
        self.max_score = 0
        self.min_diff = np.inf
        super().__init__(data,target)
        self.X_train,self.y_train,self.X_test,self.y_test,self.X_val,self.y_val = super().split()
```

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

class model(splitter):
    """Tune the ``C`` hyper-parameter of a classifier on a train/val/test split.

    Parameters
    ----------
    model : class
        Estimator class (not an instance). It is instantiated with the keyword
        arguments ``multi_class``, ``solver`` and ``C``, so it must accept
        those (e.g. ``LogisticRegression``).
    parameters : iterable
        Candidate values for ``C``.
    data, target
        Forwarded to ``splitter``; the default split fractions are used.

    Attributes
    ----------
    scores : dict
        Maps each ``C`` to ``(val_score, train_score, abs_difference)``.
    score_reports : dict
        Maps each ``C`` to the validation-set ``classification_report`` text.
    max_score : float
        Highest validation accuracy seen by ``fit()``.
    min_diff : float
        Smallest train/validation accuracy gap seen by ``fit()``.
    c_best
        Set by ``best_model()``: the first ``C`` reaching ``max_score``.
    """

    def __init__(self,model,parameters,data,target):
        self.model = model
        self.model_best = model
        self.parameters = parameters
        self.score_reports = {}
        self.scores = {}
        self.max_score = 0
        self.min_diff = np.inf
        super().__init__(data,target)
        (self.X_train, self.y_train,
         self.X_test, self.y_test,
         self.X_val, self.y_val) = super().split()

    def fit(self):
        """Fit one estimator per candidate ``C`` and record its scores."""
        for c in self.parameters:
            candidate = self.model(multi_class="multinomial",solver="lbfgs",C=c)
            candidate.fit(self.X_train,self.y_train)
            y_pred_val = candidate.predict(self.X_val)
            y_pred_train = candidate.predict(self.X_train)
            train_score = accuracy_score(self.y_train,y_pred_train)
            val_score = accuracy_score(self.y_val,y_pred_val)
            diff = np.abs(val_score-train_score)
            # Score against this instance's own validation labels (not a
            # notebook-global `y_val` from another split), with the arguments
            # in (y_true, y_pred) order so precision and recall are not swapped.
            self.score_reports[c] = classification_report(self.y_val,y_pred_val)
            self.scores[c] = (val_score,train_score,diff)
            if val_score>self.max_score:
                self.max_score = val_score
            if diff<self.min_diff:
                self.min_diff = diff
        return

    def report(self,y_pred,y_obs):
        """Print and return the accuracy of ``y_pred`` against ``y_obs``."""
        score = accuracy_score(y_obs,y_pred)
        print('accuracy score: ',score)
        return score

    def best_model(self):
        """Run the search, refit with the best ``C`` and report test accuracy.

        Returns
        -------
        The estimator fitted on the training set with ``self.c_best``.
        """
        self.fit()
        # First candidate whose validation accuracy equals the best one seen.
        self.c_best = [j for j in self.scores.keys() if self.scores[j][0]==self.max_score][0]
        print("Best C value: ",self.c_best)
        model_best = self.model_best(multi_class="multinomial",solver="lbfgs",C=self.c_best)
        model_best.fit(self.X_train,self.y_train)
        y_pred = model_best.predict(self.X_test)
        self.report(y_pred,self.y_test)
        return model_best

In [8]:
# Search over six candidate C values for LogisticRegression on the 'class'
# target, then print the best C and its test-set accuracy.
m = model(
    LogisticRegression,
    [0.0001, 0.001, 0.01, 0.1, 1, 10],
    data,
    'class',
)
m.best_model()

Best C value:  1
accuracy score:  0.66
