## K-class Bayes Classifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

To experiment, we use a dataset of mobile-phone features whose target is the phone's price range, an integer class from 0 to 3.

Data: [Mobile Price Classification](https://www.kaggle.com/iabhishekofficial/mobile-price-classification)

In [2]:
# Load the training CSV (expected in the notebook's working directory) and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
# Number of rows in the loaded frame
df.shape[0]

2000

In [4]:
# Show the column names: the feature columns plus the 'price_range' target.
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

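For simplicity, we drop the binary (categorical) indicator variables and keep only the numeric features, since the classifier below models each class with a multivariate Gaussian.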

In [5]:
# Columns treated as categorical and excluded from the model.
cat_columns = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
# `df` is rebound in place, so re-running this cell would otherwise raise a
# KeyError (the columns are already gone). errors='ignore' makes it idempotent.
df = df.drop(columns=cat_columns, errors='ignore')
df.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,price_range
0,842,2.2,1,7,0.6,188,2,2,20,756,2549,9,7,19,1
1,1021,0.5,0,53,0.7,136,3,6,905,1988,2631,17,3,7,2
2,563,0.5,2,41,0.9,145,5,6,1263,1716,2603,11,2,9,2
3,615,2.5,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,2
4,1821,1.2,13,44,0.6,141,2,14,1208,1212,1411,8,2,15,1


In [6]:
# Sequential hold-out split (no shuffling): the first 1600 rows train the
# model and the remaining rows are used for evaluation.
n_train = 1600
target = 'price_range'
train_df, test_df = df.iloc[:n_train], df.iloc[n_train:]

# Feature matrices are every column except the target; labels are the target.
trainX = train_df.drop(columns=target).values
trainY = train_df[target].values
testX = test_df.drop(columns=target).values
testY = test_df[target].values

In [7]:
def pluginClassifier(trainX, trainY, testX, nc=4):
    """Return class posterior probabilities for a Gaussian plug-in Bayes classifier.

    Every class ``c`` in ``0 .. nc-1`` is modelled by a multivariate Gaussian
    whose mean and covariance are estimated from the training rows labelled
    ``c``; the class prior is the fraction of training rows carrying that label.

    Parameters
    ----------
    trainX : array-like, shape (n_train, n_features)
        Training feature matrix.
    trainY : array-like, shape (n_train,)
        Integer class labels in ``0 .. nc-1``.
    testX : array-like, shape (n_test, n_features)
        Samples to score.
    nc : int, default 4
        Number of classes. The default matches the four price ranges (0-3)
        this notebook works with.

    Returns
    -------
    numpy.ndarray, shape (n_test, nc)
        Row ``i`` holds the posterior probabilities of each class for
        ``testX[i]``; every row sums to 1. A class with no training rows
        receives probability 0.

    Raises
    ------
    numpy.linalg.LinAlgError
        If a class covariance matrix is singular.
    """
    trainX = np.asarray(trainX, dtype=float)
    trainY = np.asarray(trainY)
    testX = np.asarray(testX, dtype=float)
    if testX.size == 0:
        # Nothing to score; keep the (0, nc) shape of the result.
        return np.empty((0, nc))

    n = len(trainX)
    # Work in log space: multiplying 1/sqrt(det) by exp(-maha/2) directly
    # underflows to 0 for samples far from every class, and the subsequent
    # normalisation then yields 0/0 = NaN.
    log_posteriors = np.full((len(testX), nc), -np.inf)
    for c in range(nc):
        X = trainX[trainY == c]
        if len(X) == 0:
            # Unseen class: prior is 0, so its log-posterior stays at -inf.
            continue
        mean = X.mean(axis=0)
        # atleast_2d: np.cov returns a 0-d array when there is one feature.
        cov = np.atleast_2d(np.cov(X, rowvar=False))
        _, logdet = np.linalg.slogdet(cov)
        diff = testX - mean
        # Squared Mahalanobis distance of every test row, without forming
        # the explicit inverse of the covariance matrix.
        maha = np.sum(diff * np.linalg.solve(cov, diff.T).T, axis=1)
        # The (2*pi)^(-d/2) factor is identical for all classes and cancels
        # in the normalisation, so it is omitted.
        log_posteriors[:, c] = np.log(len(X) / n) - 0.5 * logdet - 0.5 * maha

    # Softmax over classes, shifted by the row maximum for numerical stability.
    log_posteriors -= log_posteriors.max(axis=1, keepdims=True)
    predYs = np.exp(log_posteriors)
    predYs /= predYs.sum(axis=1, keepdims=True)
    return predYs

In [8]:
# Posterior probabilities for the held-out rows: one row per test sample, one column per class.
final_outputs = pluginClassifier(trainX, trainY, testX)

In [9]:
# Write the posterior matrix to disk as comma-separated text.
np.savetxt("probs_test.csv", X=final_outputs, delimiter=",")

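To get the predicted class, we take the argmax of the posterior probabilities along each row; because the classes are labelled 0-3, the column index of the largest posterior is the predicted label.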

In [10]:
# Index of the largest posterior in each row
testPredY = final_outputs.argmax(axis=1)
testPredY

array([1, 0, 3, 1, 0, 2, 1, 2, 1, 2, 3, 2, 3, 2, 3, 3, 3, 3, 1, 2, 2, 3,
       1, 0, 1, 0, 0, 3, 2, 1, 0, 3, 3, 1, 0, 3, 2, 1, 2, 3, 2, 2, 3, 1,
       1, 1, 1, 0, 2, 1, 0, 1, 0, 3, 0, 3, 1, 1, 3, 3, 2, 1, 2, 3, 2, 1,
       2, 1, 1, 1, 3, 0, 1, 1, 3, 2, 1, 2, 2, 3, 2, 1, 1, 1, 3, 3, 3, 0,
       3, 2, 3, 1, 3, 3, 1, 2, 3, 3, 0, 0, 0, 2, 2, 0, 2, 0, 1, 1, 3, 3,
       0, 1, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 1, 2, 3, 0, 3, 0, 0,
       2, 2, 3, 3, 1, 2, 1, 1, 3, 1, 0, 2, 1, 0, 1, 2, 1, 3, 0, 1, 3, 0,
       3, 3, 3, 2, 0, 3, 1, 1, 0, 2, 2, 2, 0, 3, 3, 0, 3, 2, 3, 0, 2, 0,
       2, 1, 2, 3, 0, 3, 2, 0, 2, 1, 2, 3, 1, 1, 3, 1, 1, 1, 3, 0, 1, 0,
       1, 2, 0, 1, 3, 3, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 3, 3, 1, 0, 2,
       0, 3, 2, 2, 2, 1, 2, 3, 3, 0, 0, 1, 2, 2, 2, 1, 2, 1, 0, 2, 3, 0,
       0, 3, 0, 3, 0, 0, 3, 3, 1, 3, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 3,
       0, 2, 3, 2, 3, 3, 2, 3, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1, 0, 0, 3, 0,
       1, 2, 0, 3, 1, 0, 0, 3, 0, 1, 0, 3, 3, 0, 1,

In [11]:
# Accuracy: fraction of test samples whose predicted label equals the true label
np.mean(testY == testPredY)

0.94