# Capstone - Partial Discharge
## Julian Sweet DSI-LA-6
## Notebook 4 - Logistic Regression Modeling, Unbalanced Class

In [14]:
import numpy as np
import pandas as pd

from scipy.signal import resample, stft
from sys import getsizeof
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score
from scipy.fftpack import fft

In [45]:
# Load the pre-split signal arrays and their labels from disk,
# grouped by split (train first, then test).
X_train = np.load('./npy_datasets/X_train_data.npy')
y_train = np.load('./npy_datasets/y_train.npy')

X_test = np.load('./npy_datasets/X_test_data.npy')
y_test = np.load('./npy_datasets/y_test.npy')

In [18]:
%%time
n_fft = 256
X_train_fft = fft(np.abs(X_train_data), n_fft)
X_test_fft  = fft(np.abs(X_test_data), n_fft)

  x = x[index]


CPU times: user 19.1 s, sys: 51.4 s, total: 1min 10s
Wall time: 1min 50s


In [19]:
# One-time cache of the training FFT features (np.save appends the .npy suffix).
# Presumably left disabled so that re-running the notebook does not rewrite the
# file that is loaded back further down.
# np.save('./npy_datasets/X_train_unbal_fft', X_train_fft)

In [20]:
# One-time cache of the test FFT features (np.save appends the .npy suffix).
# Presumably left disabled so that re-running the notebook does not rewrite the
# file that is loaded back further down.
# np.save('./npy_datasets/X_test_unbal_fft', X_test_fft)

This is a considerably bigger dataset, and running the FFT on it routinely crashes the kernel. We now have both the balanced and unbalanced FFT data saved to disk as NumPy array files, so the transform does not need to be recomputed.

In [48]:
# Sanity check: (n_samples, n_fft) for the train and test feature matrices.
tuple(features.shape for features in (X_train_fft, X_test_fft))

((6969, 256), (1743, 256))

In [46]:
# Reload the cached FFT features from disk, replacing any in-memory copies.
X_train_fft, X_test_fft = (
    np.load('./npy_datasets/X_train_unbal_fft.npy'),
    np.load('./npy_datasets/X_test_unbal_fft.npy'),
)

In [70]:
# Regularization penalties to compare in the grid search.
params3 = {
    'penalty' : ['l1', 'l2']
    }

# 5-fold grid search that picks the penalty by accuracy.
# The solver is pinned to 'liblinear' because it supports both 'l1' and 'l2';
# on scikit-learn >= 0.22 the default solver ('lbfgs') rejects the 'l1'
# penalty, so the 'l1' candidate could not be fitted. 'liblinear' was the
# default in earlier releases, so this keeps the original behaviour there.
logr3 = GridSearchCV(LogisticRegression(solver = 'liblinear', max_iter = 35000),
                           n_jobs = 6,
                           verbose = 2,
                           scoring = 'accuracy',
                           param_grid = params3,
                           cv = 5)

In [71]:
# Run the accuracy-scored grid search and show the winning penalty.
# fit() returns the fitted search object, so the lookup can be chained.
logr3.fit(X_train_fft, y_train).best_params_

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   5 out of  10 | elapsed:   10.8s remaining:   10.8s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:   17.6s finished


{'penalty': 'l1'}

In [72]:
# Training-set score of the refit best estimator (accuracy, per the search's `scoring`).
logr3.score(X=X_train_fft, y=y_train)

0.944181374659205

In [73]:
# Held-out test-set score of the refit best estimator (accuracy, per the search's `scoring`).
logr3.score(X=X_test_fft, y=y_test)

0.9409064830751578

In [53]:
# Class proportions in the training labels; the largest share is the
# accuracy a majority-class guesser would achieve.
train_labels = pd.Series(y_train)
train_labels.value_counts(normalize=True)

0    0.939733
1    0.060267
dtype: float64

In [55]:
# Gap, in percentage points, between the model's test accuracy and the naive
# majority-class baseline (largest class share in y_train).
# Computed from the live objects rather than from numbers copied out of
# earlier cell outputs, so the figure cannot go stale when the notebook is re-run.
baseline_accuracy = pd.Series(y_train).value_counts(normalize=True).max()
100 * (logr3.score(X_test_fft, y_test) - baseline_accuracy)

0.11734830751578329

The logistic regression test accuracy beats the naive majority-class baseline by 0.117 percentage points.

In [74]:
# Regularization penalties to compare in the grid search.
params4 = {
    'penalty' : ['l1', 'l2']
    }

# 5-fold grid search that picks the penalty by ROC AUC, which is less
# dominated by the majority class than accuracy.
# The solver is pinned to 'liblinear' because it supports both 'l1' and 'l2';
# on scikit-learn >= 0.22 the default solver ('lbfgs') rejects the 'l1'
# penalty, so the 'l1' candidate could not be fitted. 'liblinear' was the
# default in earlier releases, so this keeps the original behaviour there.
logr4 = GridSearchCV(LogisticRegression(solver = 'liblinear', max_iter = 35000),
                           n_jobs = 6,
                           verbose = 2,
                           scoring = 'roc_auc',
                           param_grid = params4,
                           cv = 5)

In [75]:
# Run the ROC-AUC-scored grid search and show the winning penalty.
# fit() returns the fitted search object, so the lookup can be chained.
logr4.fit(X_train_fft, y_train).best_params_

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   5 out of  10 | elapsed:    3.3s remaining:    3.3s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:   13.0s finished


{'penalty': 'l1'}

It is worth noting that the model performs better with Lasso (L1) regularization on this larger dataset.

In [76]:
# Training-set score of the refit best estimator (ROC AUC, per the search's `scoring`).
logr4.score(X=X_train_fft, y=y_train)

0.8417784612699867

In [77]:
# Held-out test-set score of the refit best estimator (ROC AUC, per the search's `scoring`).
logr4.score(X=X_test_fft, y=y_test)

0.7862724577010292

In [78]:
# Side-by-side table of predicted and true test labels.
# Fix: `logr2` is never defined in this notebook (it only existed as leftover
# kernel state), so the cell failed on a fresh run. Use the model fitted
# above instead.
d = {'predictions': logr4.predict(X_test_fft), 'actual': y_test}
con = pd.DataFrame(data = d)
con.head(10)

Unnamed: 0,predictions,actual
0,0,1
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [80]:
# Flatten the 2x2 confusion matrix (rows = actual, columns = predicted)
# into true negatives, false positives, false negatives, true positives.
tn, fp, fn, tp = confusion_matrix(y_true=con['actual'], y_pred=con['predictions']).ravel()

In [81]:
# Display the counts in (tn, fp, fn, tp) order.
tn, fp, fn, tp

(1633, 5, 98, 7)

In [82]:
# Per-sample class probabilities on the test set; one column per class,
# in the order of the estimator's classes_.
# Fix: `logr2` is never defined in this notebook (it only existed as leftover
# kernel state), so the cell failed on a fresh run. Use the model fitted
# above instead.
df = pd.DataFrame(logr4.predict_proba(X_test_fft))
df.head()

Unnamed: 0,0,1
0,0.728975,0.271025
1,0.998651,0.001349
2,0.952584,0.047416
3,0.855568,0.144432
4,0.935349,0.064651


In [83]:
# Flag a sample as positive when column 1 of the probability table is at
# least 0.40, i.e. a lower bar than the 0.5 cut-off used by predict().
biased_guess = df[1].ge(.40)

In [84]:
# Confusion-matrix counts for the lowered-threshold predictions,
# unpacked as true negatives, false positives, false negatives, true positives.
tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=biased_guess).ravel()

In [85]:
# Display the counts in (tn, fp, fn, tp) order.
tn, fp, fn, tp

(1623, 15, 94, 11)

Again, lowering the decision threshold to 0.40 increases false positives (5 → 15) while false negatives decrease only slightly (98 → 94).