In [8]:
# LIBRARIES

import numpy as np
import pandas as pd

In [9]:
# MAKE DATA
# we will use np.array function

a = np.array([1,3,4])


# argument is a list supplied [1,3,4]
# that results in 1 dimension array
# lets visualise and think about making a 2d array

print(a)

[1 3 4]


$
\begin{bmatrix}
1 \\
3 \\
4
\end{bmatrix}
$

This is the same as 'a' but python writes horizontally. 

In [10]:
# 2d array
b = np.array([
    [1,3,4],
    [3,5,6]
])

# note np treat tuple marker() vs. list () same 
# lets check

c = np.array([
    (1,3,4),
    (3,5,6)
])

print(b)

[[1 3 4]
 [3 5 6]]


In [11]:
# check equivalence
print(c == b)

[[ True  True  True]
 [ True  True  True]]


# Pandas and Numpy 

- Pandas uses a series 
- Numpy is an array
- Pandas series is very similar to numpy array
- Pandas series can be a column 


In [12]:

ages = np.array([13,25,19])
series1 = pd.Series(ages)
print(series1)

0    13
1    25
2    19
dtype: int64


# Evaluate model : regression

- recall MAE, MSE, RMSE, R^2

In [None]:
#data
true_y = [5,10,15,20,25,30]
predicted_y = [7,12,17,19,18,37]
# print the mean
print(np.mean(true_y))

In [15]:
residuals = true_y - predicted_y
print(residuals)

TypeError: unsupported operand type(s) for -: 'list' and 'list'

In [None]:
# what happened here? how do we fix this?
# Python does not support element wise operation. Need to go through a for loop

residualspython = [(i-j) for i,j in zip(true_y,predicted_y)]
print(residualspython)

# it is quite a complex python code. It is the same as 
x = []

# Use a for loop with zip to iterate over both lists
for i, j in zip(true_y, predicted_y):
    x.append(i - j)

[-2, -2, -2, 1, 7, -7]


In [19]:
y = np.array(true_y)
y_hat = np.array(predicted_y)
resid = y - y_hat
print(resid)

[-2 -2 -2  1  7 -7]


# Lets calculate metrics 
- MAE
- MSE
- RMSE

In [None]:
# AE
abs = np.abs(resid)
print(f'Absolute error is {abs}')

Absolute error is [2 2 2 1 7 7]


In [24]:
# MAE
mae = sum(abs)/len(resid)
print(f"Mean asbolute error is {mae}")

Mean asbolute error is 3.5


In [25]:
# MSE
mse = sum(np.square(resid))/len(resid)
print(f"Mean Squared Errorr is {mse}")

Mean Squared Errorr is 18.5


In [27]:
# Using sklearn

from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error,r2_score

print(f"MAE with sklearn is {mean_absolute_error(y,y_hat)}")

MAE with sklearn is 3.5


In [28]:
print(f"MSE with sklearn is {mean_squared_error(y,y_hat)}")

MSE with sklearn is 18.5


In [29]:
print(f"RMSE with sklearn is {root_mean_squared_error(y,y_hat)}")

RMSE with sklearn is 4.301162633521313


In [30]:
print(f"Rsquared with sklearn is {r2_score(y,y_hat)}")

Rsquared with sklearn is 0.7462857142857142


In [31]:
# Lets compare this with null model where all you take a mean

naive = [17.5,17.5,17.5,17.5,17.5,17.5]
naive = np.array(naive)

print(f"Naive model rmse is {root_mean_squared_error(y,naive)}")

Naive model rmse is 8.539125638299666


In [40]:
# Classification
# we will use sklearn iris dataset

from sklearn import datasets
breast = datasets.load_breast_cancer()


In [41]:
# Lets look at what iris looks like
breast

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [42]:
# to make it easier for us
breastdf = pd.DataFrame(data=breast.data,columns = breast.feature_names)

In [47]:
breastdf.dtypes

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                 float64
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      

In [52]:
#its all in an array, so can seperate X and Y
X = breast.data
# what does [:,:2] mean?
# iris.data is being used to subset using square brakcets iris.data[]
# inside square bracket notation is in row:column
# [:, blank before column means all rows, and then ,:2] means all columns up to 2.

y = breast.target

In [49]:
X

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [None]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [53]:
# lets split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.25, random_state= 1)

In [58]:
from sklearn.linear_model import LogisticRegression
# we will use logistic regression as discussed

glm = LogisticRegression(C=1, max_iter=100000) #dont worry about C for now

glm.fit(X_train,y_train)

In [59]:
# lets generate predictions
predictions = glm.predict(X_test)

In [61]:
# thats it model is trained.

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test,predictions)

In [63]:
tn, fp, tn, tp = cm.ravel()
print(f"true pos is {tp}")
print(f"fp is {fp}")
print(f"tn is {tn}")
print(f"tp is {tp}")

true pos is 85
fp is 5
tn is 3
tp is 85
