In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import os

In [2]:
# One-vs-rest multiclass classifier wrapped in a class (one binary logit per class label)

class multiclass_classification():
    """One-vs-rest multiclass classifier built from binary logistic regressions.

    ``fit`` trains one ``sm.Logit`` model per distinct label in ``y``
    (label k versus everything else).  ``predict`` evaluates every binary
    model, rescales the per-class probabilities so each row sums to 1 and
    picks the most probable class.

    Attributes set by ``fit``:
        X, y    -- the training data exactly as passed in
        ytype   -- list of distinct labels, in order of first appearance in
                   ``y``; column ``'Class k'`` always refers to ``ytype[k-1]``
        resp_b  -- DataFrame of 0/1 one-vs-rest responses, one column per class
        mod     -- list of fitted binary models, parallel to ``ytype``
    """

    def fit(self, y, X):
        """Fit one binary logit per class label.

        Parameters
        ----------
        y : pandas.Series
            Class labels (``y.unique()`` is used, so a Series is required).
        X : array-like / DataFrame
            Design matrix handed unchanged to ``sm.Logit``; no intercept is
            added here, so callers supply their own constant column.
        """
        self.X = X
        self.y = y
        ytype = list(y.unique())
        self.ytype = ytype

        # One 0/1 indicator column per class: 1 where y equals that label.
        resp_b = pd.DataFrame({
            f'Class {k + 1}': (y == label).astype(int)
            for k, label in enumerate(ytype)
        })
        self.resp_b = resp_b

        self.mod = [sm.Logit(resp_b[col], X).fit() for col in resp_b.columns]

    def predict(self, X_tst):
        """Predict class probabilities and labels for ``X_tst``.

        Returns
        -------
        (pred_b, y_pred)
            ``pred_b``: DataFrame with one ``'Class k'`` column per class,
            holding the row-normalized probabilities rounded to 2 decimals.
            ``y_pred``: single-column DataFrame ``'Final prediction'`` with
            the winning label, indexed like ``pred_b``.
        """
        ytype = self.ytype
        pred_b = pd.DataFrame({
            f'Class {k + 1}': self.mod[k].predict(X_tst)
            for k in range(len(ytype))
        })

        # Rescale so that the per-class probabilities of each row sum to 1.
        pred_b = pred_b.div(pred_b.sum(axis=1), axis=0)

        # Choose the winner from the UNROUNDED probabilities: rounding first
        # turns near-ties (e.g. 0.496 vs 0.504) into exact ties that argmax
        # would then resolve in favour of the first class.
        probs = pred_b.to_numpy()
        if probs.shape[1]:
            best = probs.argmax(axis=1)
        else:
            best = np.array([], dtype=int)

        # Share pred_b's index so that a later column-wise concat lines the
        # prediction up with its probabilities even for non 0..n-1 indexes.
        y_pred = pd.DataFrame(
            {'Final prediction': [ytype[k] for k in best]},
            index=pred_b.index,
        )
        return pred_b.round(2), y_pred

    def result(self, y_tst, X_tst):
        """Build the per-row result table for a test set.

        Columns: ``ID`` (the row index), ``Actual Class``, the ``'Class k'``
        probability columns and ``Final prediction``.
        """
        pred_b, y_pred = self.predict(X_tst)
        actual = pd.DataFrame({'Actual Class': y_tst})
        # Labels given as a plain list/array carry no index of their own;
        # pair them positionally with the predictions.
        if not isinstance(y_tst, pd.Series) and len(actual) == len(pred_b):
            actual.index = pred_b.index
        result = pd.concat([actual, pred_b, y_pred], axis=1)
        result.insert(loc=0, column='ID', value=result.index)
        return result

    def print_result(self, y_tst, X_tst):
        """Print a CSV-style preview of ``result``.

        Shows the header, then the first three and last three rows separated
        by a ``skip`` marker.  Tables with six rows or fewer are printed in
        full.  The text is rendered in memory; no temporary file is written.
        The blank line after each row is kept from the historical output.
        """
        result = self.result(y_tst, X_tst)
        header, *rows = result.to_csv(index=False).splitlines()

        print(header + '\n')
        print('-----------------------------------------------------------------')
        if len(rows) > 6:
            for row in rows[:3]:
                print(row + '\n')
            print('skip \n')
            rows = rows[-3:]
        for row in rows:
            print(row + '\n')

    def confusion_matrix_accuracy(self, y_tst, X_tst):
        """Return the confusion matrix and overall accuracy on a test set.

        Returns
        -------
        (C2, acc_tst)
            ``C2``: DataFrame of counts, rows = actual class, columns =
            predicted class, both ordered like ``self.ytype``.
            ``acc_tst``: diagonal sum divided by the matrix total, rounded
            to 3 decimals (NaN when the matrix is empty).  Test rows whose
            actual label is not in ``self.ytype`` are not counted.
        """
        ytype = self.ytype
        result = self.result(y_tst, X_tst)
        actual = result['Actual Class']
        predicted = result['Final prediction']

        counts = [
            [int(((actual == a) & (predicted == p)).sum()) for p in ytype]
            for a in ytype
        ]
        C2 = pd.DataFrame(counts, index=ytype, columns=ytype)

        cells = C2.to_numpy()
        total = cells.sum()
        # Guard the empty case instead of dividing by zero.
        acc_tst = round(np.trace(cells) / total, 3) if total else float('nan')

        return C2, acc_tst

    def print_c_acc(self, y_tst, X_tst):
        """Print the confusion matrix and the overall test accuracy.

        The report is assembled in memory (no temporary files).  Line
        spacing matches the historical output of this method.
        """
        C2, acc_tst = self.confusion_matrix_accuracy(y_tst, X_tst)

        table = ['\tPredicted Class '] + C2.to_csv(sep='\t').splitlines()
        # The first two data rows carry the vertical "Actual Class" caption.
        prefixes = {2: 'Actual \t', 3: 'Class \t'}

        lines = ['Confusion Matrix (Test)',
                 '---------------------------------']
        for pos, row in enumerate(table):
            lines.append(prefixes.get(pos, '\t') + row)
            lines.append('')
        lines += ['',
                  'Model Summary (Test) ',
                  '---------------------------------',
                  f'Overall accuracy = {acc_tst}']

        print('\n\n'.join(lines))

In [4]:
# Training set: headerless file, so columns carry integer labels.
# Column 18 is the class label; every other column is a predictor.
df = pd.read_csv('veh.dat', header=None)
y = df[18]
X = df.drop(columns=18)
# The classifier passes X straight to sm.Logit, so the intercept is supplied
# here as an explicit leading column of ones.
X.insert(0, 'constant', np.ones(X.shape[0]))

# Test set: same layout and same preparation as the training set.
df2 = pd.read_csv('vehtest.dat', header=None)
y_tst = df2[18]
X_tst = df2.drop(columns=18)
X_tst.insert(0, 'constant', np.ones(X_tst.shape[0]))

In [5]:
# Train the one-vs-rest classifier on the training set
# (one binary logit is fitted per distinct label in y).
model = multiclass_classification()
model.fit(y=y, X=X)

Optimization terminated successfully.
         Current function value: 0.360581
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.327391
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.042271
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.027253
         Iterations 16


In [6]:
# Inspect the fitted state: the 0/1 one-vs-rest response table and the list of fitted binary models.
model.resp_b, model.mod

(     Class 1  Class 2  Class 3  Class 4
 0          1        0        0        0
 1          1        0        0        0
 2          1        0        0        0
 3          1        0        0        0
 4          1        0        0        0
 ..       ...      ...      ...      ...
 420        0        0        0        1
 421        0        0        0        1
 422        0        0        0        1
 423        0        0        0        1
 424        0        0        0        1
 
 [425 rows x 4 columns],
 [<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7fa5cd4c3c10>,
  <statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7fa5cd5e37f0>,
  <statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7fa5cd5eccd0>,
  <statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7fa5cd5ecd60>])

In [7]:
# Returns a tuple: row-normalized class probabilities (rounded to 2 decimals) and the final predicted label per test row.
model.predict(X_tst)

(     Class 1  Class 2  Class 3  Class 4
 0       0.05     0.95     0.00     0.00
 1       0.44     0.56     0.00     0.00
 2       0.36     0.64     0.00     0.00
 3       0.02     0.03     0.00     0.95
 4       0.51     0.49     0.00     0.00
 ..       ...      ...      ...      ...
 331     0.03     0.03     0.00     0.94
 332     0.02     0.05     0.02     0.92
 333     0.01     0.01     0.01     0.96
 334     0.05     0.01     0.00     0.94
 335     0.01     0.01     0.01     0.97
 
 [336 rows x 4 columns],
      Final prediction
 0                   2
 1                   2
 2                   2
 3                   4
 4                   1
 ..                ...
 331                 4
 332                 4
 333                 4
 334                 4
 335                 4
 
 [336 rows x 1 columns])

In [8]:
# Result table per test row: ID, actual class, per-class probabilities and the final prediction.
model.result(y_tst,X_tst)

Unnamed: 0,ID,Actual Class,Class 1,Class 2,Class 3,Class 4,Final prediction
0,0,1,0.05,0.95,0.00,0.00,2
1,1,1,0.44,0.56,0.00,0.00,2
2,2,1,0.36,0.64,0.00,0.00,2
3,3,1,0.02,0.03,0.00,0.95,4
4,4,1,0.51,0.49,0.00,0.00,1
...,...,...,...,...,...,...,...
331,331,4,0.03,0.03,0.00,0.94,4
332,332,4,0.02,0.05,0.02,0.92,4
333,333,4,0.01,0.01,0.01,0.96,4
334,334,4,0.05,0.01,0.00,0.94,4


In [9]:
# Print a CSV-style preview of the result table: header, first three rows, "skip", last three rows.
model.print_result(y_tst,X_tst)

ID,Actual Class,Class 1,Class 2,Class 3,Class 4,Final prediction

-----------------------------------------------------------------
0,1,0.05,0.95,0.0,0.0,2

1,1,0.44,0.56,0.0,0.0,2

2,1,0.36,0.64,0.0,0.0,2

skip 

333,4,0.01,0.01,0.01,0.96,4

334,4,0.05,0.01,0.0,0.94,4

335,4,0.01,0.01,0.01,0.97,4



In [10]:
# Returns (confusion matrix with actual classes as rows and predicted classes as columns, overall test accuracy).
model.confusion_matrix_accuracy(y_tst,X_tst)

(    1   2   3   4
 1  50  33   1   2
 2  22  47  11   5
 3   0   0  85   1
 4   1   1   2  75,
 0.765)

In [11]:
# Print the test confusion matrix followed by the overall accuracy.
model.print_c_acc(y_tst, X_tst)

Confusion Matrix (Test)

---------------------------------

		Predicted Class 



		1	2	3	4



Actual 	1	50	33	1	2



Class 	2	22	47	11	5



	3	0	0	85	1



	4	1	1	2	75





Model Summary (Test) 

---------------------------------

Overall accuracy = 0.765
