## Algorithm
Here, I implement (multinomial) logistic regression with TensorFlow. TensorFlow serves as the tool that performs gradient descent to find the parameters $\beta$ that minimize the loss function.

TensorFlow's SGD optimizer is used to estimate the parameters $\beta$; each epoch performs one gradient step computed on the whole data set. The tricky part is the calculation of the loss function:

1. One-hot encode the outcome, so that Y becomes \[\[1,0,0,...\],...,\[0,...,1,...,0\],...\]
2. Apply the softmax function to the linear scores to obtain $\hat{p}(y|x)$, giving predictions of the form \[...,\[$\hat{p}_0(y|x)$,$\hat{p}_1(y|x)$,...\],...\]
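3. The loss is then the cross-entropy: for each sample, take the negative dot product of its one-hot row with the log of its predicted probabilities, $-\sum_k y_k \log \hat{p}_k(y|x)$, and average over all samples.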

In [1]:
import tensorflow as tf   
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
class train_action():
    """Mixin providing one gradient-descent step and weight persistence.

    The methods here read ``self.predict``, ``self.loss``, ``self.optimizer``
    and ``self.trainable_variables`` but define none of them; the concrete
    subclass is expected to supply all four.
    """

    def train_one_step(self, X, Y):
        """Run one forward/backward pass on (X, Y) and update the weights.

        Args:
            X: model input, passed unchanged to ``self.predict``.
            Y: targets, passed unchanged to ``self.loss``.
        """
        # Only the forward pass has to be recorded on the tape.
        with tf.GradientTape() as tape:
            prediction = self.predict(X)
            loss = self.loss(prediction, Y)

        # Differentiate and apply the update once recording has stopped.
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

    def save_model(self, filename):
        """Save the current values of ``self.trainable_variables`` with ``np.save``.

        Variables that all share one shape are stacked into a single numeric
        array (first axis = variable index). Variables of differing shapes
        cannot be stacked, so they are stored as a 1-D object array holding
        one ndarray per variable; read that form back with
        ``np.load(filename, allow_pickle=True)``.

        Args:
            filename: target path handed to ``np.save``.
        """
        arrays = [variable.numpy() for variable in self.trainable_variables]

        if len({array.shape for array in arrays}) <= 1:
            stored_variables = np.array(arrays)
        else:
            # np.array() raises on ragged input in current NumPy releases, so
            # build the object container explicitly and fill it item by item.
            stored_variables = np.empty(len(arrays), dtype=object)
            for index, array in enumerate(arrays):
                stored_variables[index] = array

        np.save(filename, stored_variables, allow_pickle=True, fix_imports=True)
        print('model has been saved')
        



In [3]:
## Linear model trained by gradient descent.
## Classification (default): softmax / multinomial logistic regression on one-hot targets.
## Regression (regre=True): the same linear map trained with mean squared error.
class model(train_action):
    """Softmax (multinomial logistic) regression with an optional MSE mode.

    After ``fit`` the model holds a single weight matrix ``beta`` of shape
    ``(n_features + 1, n_outputs)``; its last row is the intercept, because a
    column of ones is appended as the last column of the inputs.

    Attributes set by ``fit``:
        beta: the trainable ``tf.Variable``.
        trainable_variables: ``[beta]``, consumed by the training step.
        encoder: the fitted ``OneHotEncoder`` (``None`` in regression mode).
        regre: whether the model was fitted as a regression.
    """

    def __init__(self, optimizer=None):
        """Create an unfitted model.

        Args:
            optimizer: a Keras optimizer instance. When omitted, a fresh
                ``SGD(learning_rate=0.1)`` is created for this instance.
        """
        super().__init__()
        # Build the default here rather than in the signature: a default
        # argument is evaluated once at class-definition time and would be
        # shared (together with its internal state) by every model instance.
        if optimizer is None:
            optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
        self.optimizer = optimizer

        # Filled in by fit().
        self.regre = False
        self.encoder = None
        self.beta = None
        self.trainable_variables = []

    def fit(self, X, Y, epochs=10, regre=False, save_name=None):
        """Fit ``beta`` by gradient descent, one update on all of X per epoch.

        Prints the loss measured before each update.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            Y: targets with n_samples rows; a 1-D Y is treated as one column.
                For classification the values are class labels, for
                regression they are numeric targets.
            epochs: number of gradient steps.
            regre: ``True`` to minimise mean squared error against Y itself;
                ``False`` to minimise softmax cross-entropy against the
                one-hot encoding of Y.
            save_name: if truthy, the fitted weights are saved to this path
                via ``save_model``.

        Raises:
            ValueError: if X and Y do not have the same number of rows.
        """
        if np.shape(X)[0] != np.shape(Y)[0]:
            raise ValueError(r'shape does not conforms to \(sample,feature \)')
        if len(np.shape(Y)) == 1:
            Y = np.expand_dims(Y, axis=1)

        self.regre = regre
        if regre:
            # Regression targets are used as they are; one-hot encoding only
            # makes sense for the cross-entropy loss.
            self.encoder = None
            targets = np.asarray(Y, dtype=np.float32)
        else:
            self.encoder = OneHotEncoder(handle_unknown='ignore')
            targets = self.encoder.fit_transform(Y).toarray()

        n_features = np.shape(X)[1]
        n_outputs = targets.shape[1]
        targets = tf.constant(targets, dtype=tf.float32)

        # One extra row in beta holds the intercept.
        self.beta = tf.Variable(
            tf.random.truncated_normal(shape=(n_features + 1, n_outputs))
        )
        self.trainable_variables = [self.beta]

        X = tf.constant(X, dtype=tf.float32)
        # Append the column of ones that multiplies the intercept row.
        X = tf.concat([X, tf.ones(shape=(X.shape[0], 1), dtype=tf.float32)], axis=1)

        for epoch in range(epochs):
            output = self.predict(X)
            print('for epoch {}, loss is {}'.format(epoch, self.loss(output, targets)))
            self.train_one_step(X, targets)
        if save_name:
            self.save_model(save_name)

    def predict(self, X=None):
        """Return the linear scores ``X @ beta``.

        In classification mode these are the logits; pass them through a
        softmax to obtain class probabilities.

        Args:
            X: 2-D inputs, either with the trailing column of ones already
                appended (as ``fit`` does internally) or as raw features, in
                which case the column of ones is appended here.

        Raises:
            ValueError: if X is None.
            RuntimeError: if the model has not been fitted yet.
        """
        if X is None:
            raise ValueError('X must be provided')
        if self.beta is None:
            raise RuntimeError('fit must be called before predict')

        X = tf.constant(X, dtype=tf.float32)
        # beta has exactly one more row than there are features, so a missing
        # bias column can be recognised from the width of X alone.
        if X.shape[-1] == self.beta.shape[0] - 1:
            X = tf.concat([X, tf.ones(shape=(X.shape[0], 1), dtype=tf.float32)], axis=1)

        return tf.linalg.matmul(X, self.beta)

    def loss(self, predict, Y_onehot):
        """Return the scalar loss minimised with respect to ``beta``.

        Args:
            predict: output of ``predict`` (logits in classification mode).
            Y_onehot: targets with the same shape as ``predict``: one-hot rows
                for classification, raw numeric targets for regression.

        Returns:
            The mean over samples of the squared error (regression) or of the
            softmax cross-entropy (classification).
        """
        if self.regre:  # regression problem
            return tf.math.reduce_mean(tf.keras.losses.MSE(Y_onehot, predict))

        # classification problem
        return tf.math.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=Y_onehot, logits=predict, axis=-1
            )
        )
     

In [4]:
from sklearn.datasets import load_iris

In [5]:
# Iris data restricted to its last two measurements.
iris = load_iris()
feature_names = iris.feature_names[2:]
# presumably 'c' tags each feature as continuous — TODO confirm; unused in the visible cells
feature_name_type = [[name, 'c'] for name in feature_names]
X = iris.data[:, 2:]  # petal length and width
# Class labels as a single column, the 2-D layout OneHotEncoder is given below.
Y = np.expand_dims(iris.target, axis=1)
Y_onehot = OneHotEncoder(handle_unknown='ignore').fit_transform(Y).toarray()

In [6]:
# Train the model on the petal features with its default settings;
# fit() prints the loss measured before every update.
testmodel = model()
testmodel.fit(X, Y)

for epoch 0, loss is 9.255451202392578
for epoch 1, loss is 7.4199090003967285
for epoch 2, loss is 5.625100612640381
for epoch 3, loss is 3.914612293243408
for epoch 4, loss is 2.439427375793457
for epoch 5, loss is 1.6804108619689941
for epoch 6, loss is 1.4050358533859253
for epoch 7, loss is 1.239380955696106
for epoch 8, loss is 1.1745182275772095
for epoch 9, loss is 1.1530745029449463


In [7]:
# Display the fitted weights held by the model.
testmodel.trainable_variables

[<tf.Variable 'Variable:0' shape=(3, 3) dtype=float32, numpy=
 array([[-0.31379506, -0.3368296 , -0.3491938 ],
        [ 0.36098498,  1.3118964 ,  0.6221312 ],
        [-0.22255819, -0.7897023 ,  0.6636022 ]], dtype=float32)>]

In [None]:
# Scratch check of OneHotEncoder on a single categorical column.
# NOTE: this rebinds the notebook-level name X, which earlier held the iris features.
enc = OneHotEncoder(handle_unknown='ignore')  # was never defined in the visible cells
X = [['Male'], ['Female'], ['Tran'],['Les']]
enc.fit_transform(X).toarray()

In [None]:
# Re-fit the encoder on X and display the dense one-hot matrix.
# Relies on `enc` and `X` already existing in the notebook session.
enc.fit_transform(X).toarray()

In [None]:
# NOTE(review): each row here has two columns. If `enc` was last fitted on a
# one-column X, this call is expected to fail on the column-count mismatch — confirm intent.
enc.transform([['Female', 1], ['Male', 2]]).toarray()

In [None]:
# Sanity check of the cross-entropy op on two hand-written samples:
# the first label row is one-hot, the second is a soft distribution (0.8 / 0.2).
labels = [[1.0, 0.0, 0.0], [0.0, 0.8, 0.2]]
logits = [[4.0, 2.0, 1.0], [0.0, 5.0, 1.0]]
tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)