In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
dataset = pd.read_csv('codon_usage.csv')
dataset

In [4]:
X = dataset.iloc[:,5:]
y = dataset.iloc[:,0]
print(X.shape,y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(13028, 64) (13028,)
(9771, 64) (9771,) (3257, 64) (3257,)
(9771, 64) (9771, 1) (3257, 64) (3257, 1)


In [43]:
class LogisticRegressionOneVsAll:
    def __init__(self, tolerance, maxIter, learningRate):
        self.maxIter = maxIter
        self.learningRate = learningRate
        self.tolerance = tolerance
        self.cost = []
        self.w = []

    
    def sigmoid(self, z):
        return 1/(1 + np.exp(-z))

    def gradient(self, X, y):
        sig = self.sigmoid(X.dot(self.w))
        return np.dot((sig - y),X)

    def logLoss(self, X, y):
        sig = self.sigmoid(X.dot(self.w))
        cost = -(1/X.shape[0]) * ((y * np.log(sig) + (1-y)*np.log(1-sig)).sum())
        return cost

    def costFunction(self, X, y):
        cost = (np.log(np.ones(X.shape[0]) + np.exp(X.dot(self.w))) - X.dot(self.w).dot(y)).sum()
        return cost

    def addBias(self, X):
        """
        Adds a bias to the input
        """
        X = pd.concat([pd.Series(1, index=0, name='00'), X], axis=1)

    def oneHot_encoding(self, X, y):
        y_encoded = np.zeros([X.shape[0], len(y.unique())]) 
        #created 13028 arrays with 11 zeroes in each array denoting the number of instances and number of classes respectively

        y_encoded = pd.DataFrame(y_encoded)

        for i in range(0, len(y.unique())):
            for j in range(0, X.shape[0]):
                if y.unique()[i] == y[j]:
                    y_encoded.iloc[j, i] = 1
        
        return y_encoded

    def gradientDescent(self, X, y):
        errors = []
        last_error = float('inf')
        
        for i in tqdm(range(self.maxIter)):
            self.w = self.w - self.learningRate * self.gradient(X, y)
            current_error = self.costFunction(X, y)
            
            errors.append(current_error)
            diff = last_error - current_error
            last_error = current_error
            
            if np.abs(diff) < self.tolerance:
                print('model stopped learning')
                break 
                
        return self.w

    def fitOvA(self, X, y):

        # self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(dataset.iloc[:,5:].values(), dataset.iloc[:,0], test_size = 0.25, random_state = 42) 
        # print(self.X_train)
        # self.y_train = self.y_train.values.reshape(-1, 1)
        # self.y_test = self.y_test.values.reshape(-1, 1)           
        # self.X_train = self.addBias(self.X_train)
        X = self.addBias(X)
        for i in range(len(np.unique(y))):
            print('Now training for class {} vs all'.format(i))
            wTemp = np.zeros(X.shape[1],dtype=np.float64)
            costTemp = []
            #y_encoded = self.oneHot_encoding(X, y)
            yOvsA = np.where(y == i, 1 ,0,)
            for j in range(self.maxIter):
                wTemp = self.gradientDescent(X, yOvsA)
                costTemp.append(self.logLoss(X, yOvsA))
            self.w.append((i,wTemp))
            self.cost.append((i,costTemp))
            

    def predictOvA(self, X):
        X = self.addBias(X)
        y_hat = [max((self.sigmoid(i.dot(W)), c) for c, W in self.w)[1] for i in X ]
        return y_hat

    def accScore(self,X, y):
        score = sum(self.predict_OvA(X) == y) / len(y)
        return score

    def plot_cost_ovr(self): 
        for c,cost in self.cost:
            plt.plot(range(self.maxIter), cost, label=str(c)+" vs All")
        plt.xlabel('Iterations')
        plt.ylabel('Cost')
        plt.title('Convergence Graph of Cost Function')
        plt.legend()
        plt.show()




    

    
        
    


    



In [41]:
print(type(X_train))
X_train.shape[1]

<class 'pandas.core.frame.DataFrame'>


64

In [45]:
model = LogisticRegressionOneVsAll(learningRate=0.2, maxIter=20000, tolerance=0.0005)
model.fitOvA(np.array(X_train), y_train)


AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [46]:

# 2-D List
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
  
# Nested List Comprehension to flatten a given 2-D matrix
[print(val) for sublist in matrix for val in sublist]
  


1
2
3
4
5
6
7
8
9


[None, None, None, None, None, None, None, None, None]

In [121]:
X = np.insert(X, 0, 1, axis=1)

AssertionError: Number of manager items must equal union of block items
# manager items: 65, # tot_items: 66

In [132]:
new[new[4]!=0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
3688,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3720,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3800,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3953,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4049,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4399,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4408,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4409,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4417,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4441,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
