In [5]:
import pandas as pd
import numpy as np
from numpy.linalg import eig
from sklearn.model_selection import train_test_split

In [46]:
df = pd.read_csv("cleaned_K8.csv", header = None, low_memory = False)  # process the data with pandas

In [47]:
# Take a peek at the data:

In [48]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5399,5400,5401,5402,5403,5404,5405,5406,5407,5408
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,-0.015,...,0.006,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,0
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,-0.002,...,0.002,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,0
2,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.019,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,0
3,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,-0.019,...,0.051,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,0
4,-0.154,0.005,-0.011,-0.013,-0.002,-0.115,0.005,0.002,-0.003,0.002,...,-0.011,0.012,0.009,0.003,-0.001,0.002,-0.006,0.009,0.013,0


In [49]:
# get the X and y from the data

In [85]:
feature_cols = [x for x in range(5408)]
X = df[feature_cols]
y = df[5408]
print("dimensions of X", X.shape)

dimensions of X (16592, 5408)


In [86]:
# perform PCA on X, finding PCs that explains "percent" of data
# Output: X_reduced, X with reduced dimension
def PCA_X(X, percent):
    feature_cols = [x for x in range(5408)]     # store the features by their indexes
    feature_cols_np = np.array(feature_cols)

    mean_center_X = X - np.mean(X, axis = 0)    # get the mean centered X from X

    # calculate the covar_matrix
    covar_matrix = mean_center_X.T @ mean_center_X / (len(mean_center_X) - 1)

    # perform eigendecomposition, getting eig_val and eig_vector
    eig_val, eig_vector = eig(covar_matrix)
    print("eig_val", eig_val)
    print("eig_vector", eig_vector)
    print("len eig_val", len(eig_val))
    print("len eig_vector", len(eig_vector))

    # sort through eigen_val, creating "indexes"
    sorted_indexes = eig_val.argsort()[::-1][:len(eig_val)]
    print("sorted_indexes", sorted_indexes)
    eig_val = eig_val[sorted_indexes]

    eig_vector = eig_vector[:,sorted_indexes]   # sort the eig_vector based on sorted_indexes
    feature_cols_np = feature_cols_np[sorted_indexes] # sort the feature_cols based on sorted_indexes

    sum_eig = sum(eig_val)                      # sum over all eig_val for determining percent of variability

    # up toward what number of principle components does 95% of data's variability get explained
    count = 0
    sum_eig_sofar = 0
    for i in range(len(eig_val)):
        if sum_eig_sofar < (percent * sum_eig):
            sum_eig_sofar += eig_val[i]
            count += 1

    print("count", count)
    print("feature_cols_np", feature_cols_np)
    #for i in range(count):
    #    print(feature_cols_np[i])

    # get eig_vectors that explains "percent" of data
    eig_vector_reduced = eig_vector[:, 0:count]
    
    # get X_reduced by projecting each data point in X to the M dimensions described by M eigenvectors
    # Note: M here is the amount of eigenvectors that explains "percent" of data
    X_reduced = mean_center_X @ eig_vector_reduced
    return X_reduced


In [87]:
X_reduced = PCA_X(X, 0.99)
print("X_reduced", X_reduced)
print("X_reduced.shape", X_reduced.shape)
print("X.shape", X.shape)

eig_val [2.26180309e+04 7.15492481e+03 4.84945490e+03 ... 5.51324826e-07
 6.35419047e-07 6.24391237e-07]
eig_vector [[ 8.54825847e-05 -5.04448186e-05 -1.56810934e-04 ... -2.56352334e-04
   2.14088887e-05  1.47862667e-04]
 [ 5.04186112e-05 -1.24204596e-04 -2.10201415e-04 ... -3.00425200e-04
   7.32882954e-04  3.15934092e-04]
 [ 2.57219310e-05  5.44578611e-05  3.39219570e-05 ... -1.35169558e-03
  -3.04687593e-03 -2.77420575e-04]
 ...
 [-6.85638827e-06  7.28126254e-05  8.82171058e-05 ... -2.20932467e-02
  -9.11774535e-02 -4.26462957e-02]
 [-3.26821800e-06  8.61183584e-06  1.79260291e-05 ... -2.71030260e-02
  -7.54592015e-02 -2.65584967e-02]
 [ 1.92276476e-05 -1.58473137e-04 -1.95015608e-04 ...  6.91038245e-02
   4.27699587e-02  9.76849976e-02]]
len eig_val 5408
len eig_vector 5408
sorted_indexes [   0    1    2 ... 4919 4897 4916]
count 449
feature_cols_np [   0    1    2 ... 4919 4897 4916]
X_reduced               0           1           2          3          4          5    \
0      -99

-----------------------------------------------------


In [88]:
class LogisticRegression:
    def __init__(self, learn_rate = 0.001, num_iters=1000):
        self.learn_rate = learn_rate
        self.num_iters = num_iters
        self.W = None 
        self.bias = None
    
    # X is num_samples by num_features 
    # y is 1D row vector for each training sample
    def fit(self, X, y):
        # init params (as zeros)
        num_samples, num_features = X.shape
        self.W = np.zeros(num_features)
        self.bias = 0
        #print("num_samples, num_features", num_samples, num_features)
        #print("self.W.shape", self.W.shape)
        
        # gradient descent
        for i in range(self.num_iters):
            linear_model = np.dot(X, self.W) + self.bias 
            
            y_predicted = self._sigmoid(linear_model)
            
            # derivatives
            dw = (1 / num_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / num_samples) * np.sum(y_predicted - y)
            
            # update weights and bias 
            self.W -= self.learn_rate * dw
            self.bias -= self.learn_rate * db
            
    def predict(self, X):
        linear_model = np.dot(X, self.W) + self.bias 
        y_predicted = self._sigmoid(linear_model)
        
        # based on y_predicted, get the predicted class label
        y_predicted_label = [1 if i > 0.5 else 0 for i in y_predicted]
        
        return y_predicted_label
    
    # sigmoid func
    def _sigmoid(self, x):
        sigmoid = 1 / (1 + np.exp(-x))
        return sigmoid

In [89]:
# a function to calculate accuracy 
def accuracy(y_observed, y_predicted):
    accuracy = np.sum(y_observed == y_predicted) / len(y_observed)
    return accuracy

In [90]:
def LogisticRegression_calc(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 1234)
    print("X_train.shape", X_train.shape)
    Logistic_regressor = LogisticRegression(learn_rate = 0.001, num_iters=1000)
    Logistic_regressor.fit(X_train, y_train)
    predictions = Logistic_regressor.predict(X_test)
    
    print("Logistic classification accurary:", accuracy(y_test, predictions))

In [91]:
# logistic regression on original data 
LogisticRegression_calc(X, y)

X_train.shape (14103, 5408)
Logistic classification accurary: 0.991562876657292


In [92]:
# logistic regression on dimensionally reduced data
LogisticRegression_calc(X_reduced, y)

X_train.shape (14103, 449)
Logistic classification accurary: 0.6593009240658899
