In [166]:
import numpy as np
import pandas as pd
from autograd import jacobian, hessian
import sklearn.preprocessing as pp
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

### Part 2.1: Getting Familiar with Linear Algebraic functions

In [167]:
# 1: Create x of size 10*10 with random integer numbers
x = np.array(np.random.randint(0, 1000, size=100)).reshape(10, 10)
x

array([[985, 405, 528, 776,  64, 446, 344, 182, 346, 110],
       [516, 901, 586, 846, 897, 496, 894, 765,  67,  26],
       [462, 835,  70, 902, 194, 133, 967, 650, 742, 294],
       [930, 711, 229, 741, 330, 994, 206, 936, 594, 562],
       [448, 486,  35, 150, 281, 734, 387, 722, 754,  28],
       [408, 596, 217, 825, 787, 256,  75, 481, 567,  39],
       [479, 765, 737, 794, 630, 290, 685, 546,  47, 813],
       [332, 411, 801, 844, 883, 951, 402, 638, 710, 195],
       [195,  89, 989, 301, 956, 720, 253, 135, 622, 396],
       [ 77,  99, 156, 764, 937, 884, 981, 108, 955, 351]])

In [168]:
# Compute the following linear algebric operations on the x 
# using built in functions supported in Numpy, Scipy etc.
# 2: Find inverse of the x and print it
invx = np.linalg.inv(x)
invx

array([[-5.50222466e-03,  1.04299999e-02,  1.17272194e-02,
         1.54707952e-02, -1.73298846e-02, -5.16585281e-03,
        -1.77514923e-02, -9.71596372e-03,  1.56322706e-02,
        -2.80747212e-03],
       [ 1.50090713e-02, -2.38313157e-02, -2.87248568e-02,
        -3.63313142e-02,  4.24551313e-02,  1.35739381e-02,
         4.20960566e-02,  1.89893967e-02, -3.57262946e-02,
         6.65066070e-03],
       [ 6.61251042e-04, -7.95862540e-04, -3.69554996e-04,
        -1.51037244e-03,  1.09929347e-03, -1.68367332e-04,
         1.19447637e-03,  1.40298974e-03, -4.37319660e-04,
        -5.42149016e-04],
       [-1.60150282e-05, -2.45727817e-05,  4.93810421e-04,
         5.50944227e-04, -1.59158236e-03,  7.04531190e-06,
        -5.37240352e-04,  1.07770204e-03, -5.78510434e-04,
         1.35594885e-04],
       [-3.91947047e-03,  6.32795060e-03,  6.34489208e-03,
         8.77633240e-03, -1.00229514e-02, -2.25840753e-03,
        -1.00895504e-02, -5.85312861e-03,  9.17401782e-03,
        -1.

In [169]:
# 3: Calculate dot product of the x with same x in transpose A.AT
dotprodx = np.dot(x, x.T)
dotprodx

array([[2501498, 2590501, 2341911, 2872937, 1646834, 1876394, 2397284,
        2573519, 1736566, 1971564],
       [2590501, 4577420, 3453931, 3625219, 2382139, 2879569, 3803352,
        3888878, 2611088, 3178389],
       [2341911, 3453931, 3779087, 3317557, 2313901, 2449577, 3079832,
        2999296, 1696710, 2948268],
       [2872937, 3625219, 3317557, 4656091, 2922803, 2802770, 3379662,
        3857808, 2495809, 2999459],
       [1646834, 2382139, 2313901, 2922803, 2318695, 1817753, 1838676,
        2606284, 1682952, 2302344],
       [1876394, 2879569, 2449577, 2802770, 1817753, 2494295, 2408758,
        2936109, 1984262, 2398992],
       [2397284, 3803352, 3079832, 3379662, 1838676, 2408758, 3899790,
        3381619, 2538654, 2742077],
       [2573519, 3888878, 2999296, 3857808, 2606284, 2936109, 3381619,
        4427945, 3383096, 3713841],
       [1736566, 2611088, 1696710, 2495809, 1682952, 1984262, 2538654,
        3383096, 3172938, 2936105],
       [1971564, 3178389, 2948268, 29

In [170]:
# 4: Decompose the original x using eigen decomposition 
#    and print the eigen values and eigen vectors
eigval, eigvec = np.linalg.eig(x)
eigval

array([ 5190.42726996  +0.j        , -1229.74802493  +0.j        ,
        -791.07483811  +0.j        ,  -738.87009417  +0.j        ,
         928.96969957  +0.j        ,   691.6049118   +0.j        ,
         605.26940224+320.13296227j,   605.26940224-320.13296227j,
         -16.763823    +0.j        ,   284.9160944   +0.j        ])

In [171]:
eigvec

array([[ 0.25203553+0.j        , -0.01707083+0.j        ,
        -0.03662833+0.j        ,  0.07405516+0.j        ,
         0.40941736+0.j        ,  0.16735857+0.j        ,
         0.11655453-0.26394393j,  0.11655453+0.26394393j,
        -0.3077813 +0.j        ,  0.10450618+0.j        ],
       [ 0.3743759 +0.j        , -0.17395632+0.j        ,
         0.25991996+0.j        , -0.21613845+0.j        ,
         0.45451766+0.j        , -0.73851927+0.j        ,
         0.2215555 -0.25803085j,  0.2215555 +0.25803085j,
         0.69250312+0.j        ,  0.70476301+0.j        ],
       [ 0.33486628+0.j        ,  0.48732895+0.j        ,
         0.0845219 +0.j        , -0.11038044+0.j        ,
        -0.07302582+0.j        ,  0.02369766+0.j        ,
         0.36906691-0.10347301j,  0.36906691+0.10347301j,
         0.03163525+0.j        , -0.11374763+0.j        ],
       [ 0.37106687+0.j        ,  0.23589764+0.j        ,
        -0.49714389+0.j        ,  0.43393999+0.j        ,
         0.

In [172]:
# 5: Calculate jacobian matrix
# 6: Calculate hessian matrix
def cost(x):
    return x**3

jac = jacobian(cost)
hes = hessian(cost)

### Part 2.2: Logistic Regression using newton method

In [173]:
# 1: Download, load the data and print first 5 and last 5 rows
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/00529/diabetes_data_upload.csv')
df.head(5)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [174]:
df.tail(5)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,Positive
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative
519,42,Male,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Negative


In [175]:
# 2: Transform categorical features into numerical features. 
#    Use label encoding or any other suitable preprocessing technique
le = pp.LabelEncoder()
for column in df.columns:
    if df[column].dtype != 'int64':
        df[column] = le.fit_transform(df[column])
df.tail()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
515,39,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,48,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,58,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,32,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0
519,42,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [176]:
# 3: Since the age feature is in larger range, age column can be 
#    normalized into smaller scale (like 0 to 1) using different methods such
#    as scaling, standardizing or any other suitable preprocessing technique
ss = pp.MinMaxScaler()
df['Age'] = ss.fit_transform(np.array(df['Age']).reshape(-1, 1))
df.tail()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
515,0.310811,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,0.432432,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,0.567568,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,0.216216,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0
519,0.351351,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [177]:
# Addition checking step for missing values
df.isna().sum().sum()

0

In [178]:
# 3.1: Define X matrix (independent features) and y vector (target feature)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [179]:
# 4: Split the dataset into 60% for training and rest 40% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [188]:
# 5: Train Logistic Regression Model on the training set
model = LogisticRegression(verbose=1, max_iter=25)
model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


LogisticRegression(max_iter=25, verbose=1)

In [181]:
# 6: Use the trained model to predict on testing set
y_pred = model.predict(X_test)
y_pred

array([0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0])

In [182]:
# 7: Print 'Accuracy' obtained on the testing dataset
print(f"Accuracy of the model is: {accuracy_score(y_test, y_pred)*100:.3f}%")

Accuracy of the model is: 92.788%
