# Majority Class classifier
A simple example of a classifier in the `sklearn` framework.   
https://sklearn-template.readthedocs.io/en/latest/user_guide.html   
This classifier simply identifies the most frequent class and always predicts that.  
Implementing the classifier entails defining the `fit` and `predict` methods. 

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import cross_val_score


In [2]:
penguins_af = pd.read_csv('penguins_af.csv', index_col = 0)
print(penguins_af.shape)
penguins_af.head()

(333, 8)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [3]:
f_names = ['bill_length_mm', 'bill_depth_mm','flipper_length_mm', 'body_mass_g']
penguins = penguins_af[f_names + ['species']]
penguins2C = penguins.loc[penguins['species'].isin(['Adelie','Chinstrap'])]

In [4]:
# copy with modded values
y = penguins.pop('species').values
X_raw = penguins.values
feature_names = penguins.columns
X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=2, test_size=1/2)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)
max_k = X_train.shape[1]
X_train.shape, X_test.shape

((166, 4), (167, 4))

In [5]:
y = penguins2C.pop('species').values
X_raw = penguins2C.values
feature_names = penguins2C.columns
X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=2, test_size=1/2)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)
max_k = X_train.shape[1]
X_train.shape, X_test.shape

((107, 4), (107, 4))

In [6]:
X_train

array([[1.        , 0.40350877, 0.19444444, 0.44155844],
       [0.55212355, 0.40350877, 0.47222222, 0.44155844],
       [0.50579151, 0.19298246, 0.47222222, 0.20779221],
       [0.27413127, 0.54385965, 0.44444444, 0.72727273],
       [0.33590734, 0.50877193, 0.58333333, 0.54545455],
       [0.03861004, 0.10526316, 0.11111111, 0.02597403],
       [0.82625483, 0.77192982, 0.86111111, 0.85714286],
       [0.54054054, 0.47368421, 0.11111111, 0.20779221],
       [0.14671815, 0.19298246, 0.44444444, 0.1038961 ],
       [0.27027027, 0.56140351, 0.19444444, 0.46753247],
       [0.28571429, 0.40350877, 0.38888889, 0.23376623],
       [0.25096525, 0.29824561, 0.69444444, 0.46753247],
       [0.76833977, 0.61403509, 0.63888889, 0.67532468],
       [0.13513514, 0.35087719, 0.47222222, 0.16883117],
       [0.44015444, 0.45614035, 0.77777778, 0.28571429],
       [0.21621622, 0.75438596, 0.66666667, 0.33766234],
       [0.12355212, 0.59649123, 0.36111111, 0.49350649],
       [0.2007722 , 0.87719298,

## Gaussian NB
Running Gaussian Naive Bayes on the penguin dataset.

In [7]:
gnb = GaussianNB()
gnb.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [8]:
gnb.fit(X_train,y_train)
gnb.predict(X_test)

array(['Adelie', 'Chinstrap', 'Chinstrap', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Chinstrap', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Chinstrap', 'Chinstrap', 'Adelie', 'Chinstrap',
       'Adelie', 'Chinstrap', 'Chinstrap', 'Chinstrap', 'Chinstrap',
       'Adelie', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Chinstrap', 'Adelie',
       'Chinstrap', 'Chinstrap', 'Adelie', 'Adelie', 'Chinstrap',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Chinstrap',
       'Adelie', 'Adelie', 'Chinstrap', 'Chinstrap', 'Adelie',
       'Chinstrap', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie',
       'Chinstrap', 'Adelie', 'Chinstrap

In [9]:
gnb.score(X_test, y_test)

0.9626168224299065

## Majority Class Classifier
An implementation of a Majority Class Classifier that fits the framework.

In [10]:
class MyMCC(BaseEstimator, ClassifierMixin):          
    def fit(self, X, y):
        self.X = X
        self.y = y
        
        c_dict = Counter(self.y)
        self.most_freq = max(c_dict, key=c_dict.get)
        print(c_dict, self.most_freq)
        return self
    
    # The predictions are the most common class in the training set.
    def predict(self, Xtes):
        self.Xtes = Xtes
        n_test = self.Xtes.shape[0]
        ytes = np.full((n_test), self.most_freq)
        return ytes
    
    def predict_proba(self, Xtes):
        pass
    # We should really be implementing predict_proba as well. 

In [11]:
# check_estimator(MyMCC())

In [12]:
mcc = MyMCC()

In [38]:
mcc.fit(X_train,y_train)

Counter({'Adelie': 76, 'Chinstrap': 31}) Adelie


MyMCC()

In [39]:
mcc.predict(X_test)

array(['Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adel

The `score` method is inherited from `ClassifierMixin`.

In [40]:
mcc.score(X_test, y_test)

0.6542056074766355

In [41]:
cross_val_score(mcc, X_test, y_test, cv=5, scoring='accuracy', n_jobs=-1)

array([0.63636364, 0.63636364, 0.66666667, 0.66666667, 0.66666667])

In [None]:
# penguins
models = [gaussianPenguin, myPenguin]
for m in models:
    print(m)
    accuracy = cross_val_score(m, penguins, y_penguins, cv=5, scoring='accuracy', n_jobs=-1)
    print(accuracy)