## Scratch Logistic Regression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.datasets import load_iris
import seaborn as sns
import pandas as pd
import pickle

In [15]:
class ScratchLogisticRegression():
    """
    Scratch implementation of logistic regression
    Parameters
    ----------
    num_iter : int
      Number of iterations
    lr : float
      Learning rate
    no_bias : bool
      True if no bias term is included
    verbose : bool
      True to output the learning process
    Attributes
    ----------
    self.coef_ : The following form of ndarray, shape (n_features,)
      Parameters
    self.loss : The following form of ndarray, shape (self.iter,)
      Record losses on training data
    self.val_loss : The following form of ndarray, shape (self.iter,)
      Record loss on validation data
    """
    def __init__(self, num_iter, lr, bias, verbose):
        # Record hyperparameters as attributes
        self.iter = num_iter
        self.lr = lr
        self.bias = bias
        self.verbose = verbose
        # Prepare an array to record the loss
        self.coef = np.zeros(2)
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)
    def fit(self, X, y, X_val=None, y_val=None):
        """
        Learn logistic regression. If validation data is entered, the loss and accuracy for it are also calculated for each iteration.
        Parameters
        ----------
        X : The following forms of ndarray, shape (n_samples, n_features)
            Features of training data
        y : The following form of ndarray, shape (n_samples,)
            Correct answer value of training data
        X_val : The following forms of ndarray, shape (n_samples, n_features)
            Features of verification data
        y_val : The following form of ndarray, shape (n_samples,)
            Correct value of verification data
        """
        
        self.coef = np.zeros(X.shape[1])
        for i in range(self.iter):
            loss = self.gradient_descent(X,y)
#             print(loss)
            sum_loss = np.sum(loss)
            self.loss[i] = sum_loss

            if X_val is not None and y_val is not None:
                val_loss = self.compute_cost(X_val,y_val)
                sum_val_loss = np.sum(val_loss)
                self.val_loss[i] = sum_val_loss
                if self.verbose and i%10000 ==0:
                   print("Loss train iteration {} is {}".format(i,sum_loss)) 
                   print("Loss val iteration {} is {}".format(i,sum_val_loss)) 
                if self.early_stopping(self.loss,i,self.val_loss):
                    break
            else:
                if self.verbose and i%10000 ==0:
                    print("Loss train iteration {} is {}".format(i,sum_loss) )
                if self.early_stopping(self.loss,i):
                    break
     
    
    
    def early_stopping(self,train_loss,iteration,val_loss =None):
        
        if iteration ==0:
            return False
        
#         if val_loss is not None:
#             if val_loss[int(iteration)] > val_loss[int(iteration) -1 ]:
#                 return True
#             else:
#                 return False
#         else:

        if train_loss[int(iteration)] > train_loss[int(iteration) -1 ]:
            return True
        else:
            return False
    
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))


    def compute_cost(self,X, y):
        m = len(y)
#         print(self.coef)
        h = self.sigmoid(X @ self.coef )
        epsilon = 1e-5
        cost = (1/m)*(((-y).T @ np.log(h + epsilon))-((1-y).T @ np.log(1-h + epsilon)))
        return cost


    def gradient_descent(self,X, y):
        m = len(y)

        self.coef = self.coef - (self.lr/m) * (X.T @ (self.sigmoid(X @ self.coef ) - y)) 


        return self.compute_cost(X, y)

    def predict(self, X):
        """
        Estimate the label using logistic regression.
        Parameters
        ----------
        X : The following forms of ndarray, shape (n_samples, n_features)
            sample
        Returns
        -------
            The following form of ndarray, shape (n_samples, 1)
            Estimated result by logistic regression
        """
        return np.round(self.sigmoid(X @  self.coef))
       
    def predict_proba(self, X):
        """
        Estimate the probability using logistic regression.
        Parameters
        ----------
        X : The following forms of ndarray, shape (n_samples, n_features)
            sample
        Returns
        -------
            The following form of ndarray, shape (n_samples, 1)
            Estimated result by logistic regression
        """
        
        return self.sigmoid(X.T @  self.coef)
    def save(self,name, type="numpy"):
        if type == "numpy":
            np.savez(name,self.coef)
        elif type == "pickle":
            with open(str(name) +'.pickle', 'wb') as handle:
                pickle.dump(self.coef, handle)
                
    def load(self,name, type="numpy"):
        if type == "numpy":
            self.coef = np.load(name+".npz")
        elif type == "pickle":
            with open(str(name) +'.pickle', 'rb') as handle:
                self.coef = pickle.load(handle)

In [3]:
iris = load_iris()

In [4]:
X = iris.data 
target = iris.target 
names = iris.target_names


In [5]:
df = pd.DataFrame(X, columns=iris.feature_names)

In [6]:
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
df['species'] = iris.target
# df['species'] = df['species'].replace(to_replace= [0, 1, 2], value = ['setosa', 'versicolor', 'virginica'])

In [8]:
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [9]:
# df['species'] = df['species'].replace(to_replace= ['setosa', 'versicolor', 'virginica'], value = [0, 1, 2])

In [10]:
data = df[df['species'] != 0]

In [11]:
data['species'] = data['species'].replace(to_replace= [1, 2], value = [0, 1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [12]:
data.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'species'],
      dtype='object')

In [17]:
model = ScratchLogisticRegression(num_iter =100000000 , lr = 0.03, bias = 2, verbose = True)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)']].values , data['species'].values , test_size=0.1, random_state=12)

In [None]:
model.fit(X_train,y_train,X_test,y_test)

Loss train iteration 0 is 0.6878195105044322
Loss val iteration 0 is 0.6992790256894046
Loss train iteration 10000 is 0.1389776655425346
Loss val iteration 10000 is 0.07210334705969468
Loss train iteration 20000 is 0.1257055546534399
Loss val iteration 20000 is 0.05767893899976478
Loss train iteration 30000 is 0.12139841489034132
Loss val iteration 30000 is 0.0514250824515271
Loss train iteration 40000 is 0.11943382039847449
Loss val iteration 40000 is 0.0478858300074512
Loss train iteration 50000 is 0.11838660944962358
Loss val iteration 50000 is 0.04564482757074729
Loss train iteration 60000 is 0.117775164120842
Loss val iteration 60000 is 0.04413069548661182
Loss train iteration 70000 is 0.1173961420827946
Loss val iteration 70000 is 0.04306160056063588
Loss train iteration 80000 is 0.11715110410780684
Loss val iteration 80000 is 0.04228149513440546
Loss train iteration 90000 is 0.11698771709528673
Loss val iteration 90000 is 0.04169722036283995


In [None]:
y_predict = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test, y_predict, target_names=["a","b"]))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure()
sns.set_style('white')
plt.plot(range(len(model.loss)), model.loss, 'r')
plt.title("Convergence Graph of Cost Function")
plt.xlabel("Number of Iterations")
plt.ylabel("Cost")
plt.show()

In [None]:

score = float(sum(y_predict == y_test))/ float(len(y_test))

print(score)

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

# X_Train_reduced = TruncatedSVD(n_components=50, random_state=0).fit_transform(X_train)
X_Train_embedded = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_train)

#some convert lists of lists to 2 dataframes (df_train_neg, df_train_pos) depending on the label - 



In [None]:
plt.scatter(X_Train_embedded[:,0], X_Train_embedded[:,1], marker='o', c='red')
# plt.scatter(X_Train_embedded[:,0], X_Train_embedded[:,1], marker='o', c='blue')
# plt.scatter(X_train, y_train, marker='o', c='red')

In [None]:
y_train_new = np.expand_dims(y_train,axis = 1)

In [None]:
data_concat = np.concatenate([X_Train_embedded,y_train_new],axis = 1)

In [None]:
df_concat = pd.DataFrame(data_concat)

In [None]:
df_concat[2] = df_concat[2].astype("int64")

In [None]:
plt.scatter(df_concat[df_concat[2] == 0][0], df_concat[df_concat[2] == 0][1], marker='o', c='red')
plt.scatter(df_concat[df_concat[2] == 1][0], df_concat[df_concat[2] == 1][1], marker='o', c='blue')

In [None]:
model.save("test",type="numpy")
model.save("test",type="pickle")