In [1]:
import pandas as pd 
import numpy as np 
import sys
from glob import glob 
import json
import gzip
from tqdm import tqdm 
from pickle import dump
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# sys.path.append('/Users/jerald/Documents/GaTechOMSA/CAPSTONE/repo')
# Glob pattern for the gzip-compressed data files; it is handed to the
# util/setup/model/workflow constructors below, which expand it with glob().
# The path is relative, so it resolves against the current working directory.
files = 'models/data/*.gz'

class util(object):
    """Discover gzip-compressed CSV data files and load them by integer index.

    Attributes:
        file_names: Sorted list of dataset names (file base name up to the
            first '.').
        file_path_dict: Mapping of dataset name -> path returned by glob().
        name_map: Mapping of integer index -> dataset name, following the
            sorted order of ``file_names``.
        RandomState: Seed (28) shared by subclasses for reproducibility.
    """

    def __init__(self, files):
        """Index every file matching the glob pattern ``files``.

        Args:
            files: Glob pattern, e.g. ``'models/data/*.gz'``.
        """
        paths = glob(files)
        names = [p.split('/')[-1].split('.')[0] for p in paths]
        # Pair each name with its own path BEFORE sorting. glob() makes no
        # ordering guarantee, so zipping a sorted name list with the raw
        # path list would associate names with the wrong files.
        self.file_path_dict = dict(zip(names, paths))
        self.file_names = sorted(names)
        self.name_map = dict(enumerate(self.file_names))
        print(f"Data Files: {len(self.file_names)}")
        self.RandomState = 28

    def load(self, num):
        """Read the ``num``-th data file into a DataFrame.

        Args:
            num: Integer key of ``name_map``.

        Returns:
            pandas.DataFrame parsed from the gzip-compressed CSV, using the
            first column as the index.

        Raises:
            KeyError: If ``num`` is not a key of ``name_map``.
        """
        file = self.file_path_dict[self.name_map[num]]
        with gzip.open(file, 'rb') as f:
            data = pd.read_csv(f, index_col=0)
        return data

In [2]:
class setup(util):
    """Data preparation on top of ``util``: load, label, split and decompose.

    Attributes set by the methods below:
        df, X, y, feature_names: Populated by ``_get_data``.
        scaler, pca, ev: Populated by ``_pca``.
    """

    def __init__(self, files):
        """Index the data files matching the glob pattern ``files``."""
        super().__init__(files)

    def _get_data(self, num=0):
        """Load data file ``num`` and derive features and binary labels.

        Sets ``self.df`` (raw frame), ``self.X`` (every column except
        'FaultCode'), ``self.feature_names`` and ``self.y`` (0 where
        'FaultCode' equals 0, otherwise 1).
        """
        self.df = self.load(num)
        self.X = self.df.drop('FaultCode', axis=1)
        self.feature_names = self.X.columns
        # Vectorised equivalent of mapping 0 -> 0 and anything else -> 1.
        self.y = (self.df['FaultCode'] != 0).astype(int)

    def _split(self, X, y, size = .20):
        """Split ``X``/``y`` into train and test parts.

        Args:
            X: Feature matrix.
            y: Labels aligned with ``X``.
            size: Fraction of rows held out for testing.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.
        """
        # Seed the shuffle so repeated runs produce the same partition.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=size, random_state=self.RandomState
        )
        return X_train, X_test, y_train, y_test

    def _pca(self, X):
        """Standardise ``X`` and project it with an RBF KernelPCA.

        The fitted scaler and decomposition are kept on ``self.scaler`` and
        ``self.pca``; ``self.ev`` holds the first five normalised eigenvalues.

        Args:
            X: Feature matrix to fit and transform.

        Returns:
            The transformed feature array.
        """
        # Keep the fitted scaler: the projection is only meaningful for data
        # standardised the same way.
        # NOTE(review): _save_pca persists self.pca only, not the scaler.
        self.scaler = StandardScaler()
        self.pca = KernelPCA(kernel = 'rbf',random_state=self.RandomState, n_jobs=-1)
        X = self.pca.fit_transform(self.scaler.fit_transform(X))

        # KernelPCA does not expose explained_variance_ratio_ (that attribute
        # belongs to linear PCA). Use each retained eigenvalue's share of the
        # retained eigenvalue total instead.
        eigenvalues = getattr(self.pca, 'eigenvalues_', None)
        if eigenvalues is None:
            # Attribute name used by scikit-learn releases before 1.0.
            eigenvalues = self.pca.lambdas_
        eigenvalues = np.asarray(eigenvalues, dtype=float)
        total = eigenvalues.sum()
        if total > 0:
            ratio = eigenvalues / total
        else:
            ratio = np.zeros_like(eigenvalues)
        self.ev = ratio[:5]
        return X

    def _save_pca(self, num):
        """Pickle ``self.pca`` to ``models/decomp/<dataset name>.pkl``.

        Args:
            num: Integer key of ``name_map`` identifying the dataset.
        """
        with open(f'models/decomp/{self.name_map[num]}.pkl', 'wb') as f:
            dump(self.pca, f)

# Build a setup instance; the constructor globs `files` and prints how many
# data files were found. `data` is not referenced by the later cells shown here.
data = setup(files)

Data Files: 43


In [3]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

class model(setup):
    """Isolation Forest training and evaluation for a single data file.

    NOTE(review): when ``pc`` is enabled the decomposition is fitted on the
    full feature matrix before the train/test split, so the test rows
    influence the projection. Confirm this is intended.
    """

    def __init__(self, files):
        """Index the data files matching the glob pattern ``files``."""
        super().__init__(files)
        self.decompose = False

    def data_init(self, num, pc = False):
        """Load data file ``num``, optionally decompose it, then split it.

        Args:
            num: Integer key of ``name_map``.
            pc: When truthy, replace ``self.X`` with its ``_pca`` projection
                and pickle the fitted decomposition.
        """
        self._get_data(num)
        # Track the mode of THIS call. Only ever setting the flag to True made
        # a later pc=False run save its model under models/pca/.
        self.decompose = bool(pc)
        if self.decompose:
            self.X = self._pca(self.X)
            self._save_pca(num)
        self.model_name = self.name_map[num]
        self.x_train, self.x_test, self.y_train, self.y_test = self._split(self.X, self.y)

    def save_iso(self):
        """Pickle ``self.iso_model``.

        Writes to ``models/pca/<name>.pkl`` when the current data was
        decomposed, otherwise to ``models/reg/<name>.pkl``.
        """
        outpath = f'models/reg/{self.model_name}.pkl'

        if self.decompose:
            outpath = f'models/pca/{self.model_name}.pkl'

        with open(outpath, 'wb') as f:
            dump(self.iso_model, f)

    def _run_model(self):
        """Grid-search an Isolation Forest on the training split and save it.

        The 0/1 labels are cast to the estimator's output convention
        (1 = normal, -1 = anomaly) so the 'accuracy' scorer can compare them
        with the estimator's predictions.
        """
        # Named `forest` so the local does not shadow the enclosing class name.
        forest = IsolationForest(random_state=self.RandomState)
        params = {'contamination': [.2, .3,], 'bootstrap': [True, False]}
        self.grid = GridSearchCV(forest, params, cv=3, n_jobs=-1, scoring='accuracy')
        self.grid.fit(self.x_train, np.where(self.y_train == 0, 1, -1))
        self.iso_model = self.grid.best_estimator_
        self.save_iso()

    def _get_predictions(self):
        """Score the fitted model on the train and test splits.

        Predictions are cast from 1/-1 back to 0/1 for the accuracy columns.
        The AUC columns are computed from the continuous anomaly score
        (negated ``decision_function``, so larger means more anomalous)
        rather than from thresholded labels, which would reduce the ROC curve
        to a single operating point.

        Returns:
            One-row DataFrame indexed by ``self.model_name`` with columns
            'Testing Accuracy', 'Training Accuracy', 'Testing AUC' and
            'Training AUC'.
        """
        def _evaluate(features, labels):
            predicted = np.where(self.iso_model.predict(features) == 1, 0, 1)
            anomaly_score = -self.iso_model.decision_function(features)
            return accuracy_score(labels, predicted), roc_auc_score(labels, anomaly_score)

        test_accuracy, test_auc = _evaluate(self.x_test, self.y_test)
        train_accuracy, train_auc = _evaluate(self.x_train, self.y_train)
        results = pd.DataFrame({
            'Testing Accuracy': test_accuracy,
            'Training Accuracy': train_accuracy,
            'Testing AUC': test_auc,
            'Training AUC': train_auc
        }, index = [self.model_name])
        return results

    def run(self, num, pc=True):
        """Prepare data file ``num``, fit the model and return its metrics.

        Args:
            num: Integer key of ``name_map``.
            pc: Forwarded to ``data_init``.

        Returns:
            The one-row metrics DataFrame from ``_get_predictions``.
        """
        self.data_init(num, pc)
        self._run_model()
        res = self._get_predictions()
        return res


# Single-file trial: fit on data file index 0 with the PCA step enabled.
# run() returns a one-row metrics DataFrame, which is the cell's last expression.
m = model(files)
m.run(0, pc=True)

Data Files: 43


KeyboardInterrupt: 

In [4]:
class workflow(model):
    """Run the modelling pipeline over every indexed data file."""

    def __init__(self, files):
        """Index the data files matching the glob pattern ``files``."""
        super().__init__(files)

    def pca_workflow(self, num):
        """Run data file ``num`` with the PCA step enabled; return its metrics."""
        pred = self.run(num, pc=True)
        return pred

    def reg_workflow(self, num):
        """Run data file ``num`` on the raw features; return its metrics."""
        pred = self.run(num, pc=False)
        return pred

    def main(self, decompose = True):
        """Fit one model per data file and stack the per-file metrics.

        Args:
            decompose: When truthy use ``pca_workflow``, otherwise
                ``reg_workflow``.

        Returns:
            DataFrame with one row per data file, or an empty DataFrame when
            no files are indexed.
        """
        step = self.pca_workflow if decompose else self.reg_workflow
        iter_list = list(self.name_map.keys())
        # Collect the per-file frames and concatenate once; growing a frame
        # inside the loop re-copies all previous rows on every iteration.
        frames = [step(i) for i in tqdm(iter_list, desc='Model Progress')]
        if not frames:
            # pd.concat raises on an empty list; keep the empty-frame result.
            return pd.DataFrame()
        return pd.concat(frames)
    
# PCA model: run the decomposed workflow over every data file, then write the
# per-file metrics (rounded to 4 decimals) to models/results/pca_results.csv.
w = workflow(files)
pca_results = w.main(True)
pca_results.round(4).to_csv('models/results/pca_results.csv')

Data Files: 43


Model Progress: 100%|██████████| 43/43 [07:36<00:00, 10.62s/it]


In [5]:
# Regular model (no PCA step): uncomment the lines below to run it and save the results.
# w = workflow(files)
# reg_results = w.main(False)
# reg_results.round(4).to_csv('models/results/reg_results.csv')