In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import os
import itertools
import time
from collections import Counter
import hdbscan
## get that here: https://github.com/scikit-learn-contrib/hdbscan

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics
from sklearn.metrics import classification_report

from sqlalchemy import create_engine
import datetime 
import matplotlib.dates as mdates
from matplotlib.lines import Line2D
import matplotlib.cm as cm
import matplotlib
pd.options.display.max_columns=300
pd.options.display.max_rows=100
# from TurbineTimeSeries.storage import MachineDataStore
plt.rcParams["figure.figsize"] = (14,8)
#from TurbineTimeSeries.transformations import PCA, StandardScaler, DropCols, DropSparseCols, LeftJoin
# %matplotlib inline

In [2]:
from sklearn.cluster import AffinityPropagation, MeanShift, SpectralClustering, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
## for agglomerative clustering, give linkage : {“ward”, “complete”, “average”}

n_clusts = 3
kmeans = KMeans(init='k-means++', n_clusters=n_clusts, n_init=10)
AffinityProp = AffinityPropagation()
Meanshift = MeanShift(n_jobs=7)
Spectral = SpectralClustering(n_clusters=n_clusts, affinity='nearest_neighbors',n_jobs=7)
Agglom_ward = AgglomerativeClustering(n_clusters=n_clusts, linkage='ward')
Agglom_complete = AgglomerativeClustering(n_clusters=n_clusts, linkage='complete')
Agglom_avg = AgglomerativeClustering(n_clusters=n_clusts, linkage='average')
Dbscan = DBSCAN(eps=5, min_samples=n_clusts)
GMM_spherical = GaussianMixture(n_components=n_clusts, covariance_type='spherical' )
GMM_diag = GaussianMixture(n_components=n_clusts, covariance_type='diag' )
GMM_tied = GaussianMixture(n_components=n_clusts, covariance_type='tied' )
GMM_full = GaussianMixture(n_components=n_clusts, covariance_type='full' )



clustering_algo_dict = {
                        'kmeans':kmeans, 
#                         'AffinityProp': AffinityProp, #mem errors out. affinity propogation calculates full distance matrix so it's quadratic memory required. 10k samples would use ~80gb ram.
                        'Meanshift':Meanshift,  #runs forever
                        'Spectral':Spectral, ## runs forever
#                         'Agglom_ward':Agglom_ward, ## uses too much ram
#                         'Agglom_complete':Agglom_complete,
#                         'Agglom_avg':Agglom_avg,
                       'Dbscan':Dbscan,
                        'GMM_spherical':GMM_spherical,
                        'GMM_diag':GMM_diag, 
                        'GMM_tied':GMM_tied,
                        'GMM_full':GMM_full
                       }

In [3]:
# model_number = 2

# store = MachineDataStore('.config')

# # model_data_hr = (store.query(model_number,'1hr')
# #                  .not_null(['timestamp','psn'])
# # #                  .exclude_psn([44,52,54,70])
# #                  .execute())

# model_data_min = (store.query(model_number,'10min')
#                   .not_null(['timestamp','psn'])
#                   .exclude_psn([44,52,54,70])
#                   .execute())

# model_data_min.head()
# model1_1hr = pd.read_csv('../../../data/raw_data_model1.csv',index_col=0)
# model2_1hr = pd.read_csv('../../../data/raw_data_model2.csv',index_col=0)

# model1_10min = pd.read_csv('../../../data/raw_data_model1_10min.csv',index_col=0)
model2_10min = pd.read_csv('../../../data/raw_data_model2_10min.csv')#,index_col=0)

## convert all to timestamps
# model1_1hr['timestamp'] = model1_1hr['timestamp'].apply(lambda x: pd.Timestamp(x))
# model2_1hr['timestamp'] = model2_1hr['timestamp'].apply(lambda x: pd.Timestamp(x))
# model1_10min['timestamp'] = model1_10min['timestamp'].apply(lambda x: pd.Timestamp(x))
model2_10min['timestamp'] = model2_10min['timestamp'].apply(lambda x: pd.Timestamp(x))


# print('Shape of model1 1hr data: ', model1_1hr.shape)
# print('Shape of model2 1hr data: ', model2_1hr.shape)
# print('Shape of model1 10min data: ', model1_10min.shape)
print('Shape of model2 10min data: ', model2_10min.shape)

Shape of model2 10min data:  (1602326, 76)


In [15]:
model2_10min=model2_10min.sort_values(by=['psn','timestamp'],ascending=(True,True))
model2_10min=model2_10min.dropna()

In [16]:
data_dictionary = pd.read_csv('data_dictionary_model2.csv')
# data_dictionary

In [17]:
# [type(model2_10min.head()['timestamp'][0]) ## should be timestamp]
[i for i in data_dictionary['COLUMN_NAME'].str.lower().values if i not in model2_10min.columns.unique()]

['lo_c_brg1']

In [18]:
subsystem_dict = {str(i).lower(): list(data_dictionary[data_dictionary['SUBSYSTEM']==i]['COLUMN_NAME'].str.lower().values) for i in data_dictionary['SUBSYSTEM'].unique()}
del subsystem_dict['summary'] ## this group pretty useless

In [19]:
print(list(subsystem_dict.keys()))

['gas path', 'fuel', 'generator', 'vibration', 'lube oil system', 'enclosure', 'package equipment']


In [20]:
for i in subsystem_dict.keys():
    print(i,len(subsystem_dict[i]))

gas path 29
fuel 12
generator 12
vibration 6
lube oil system 8
enclosure 2
package equipment 1


In [21]:
# model2_10min.isnull().sum().sort_values()

In [24]:
def subsystem_pca(df,subsystem_dict,subsystems = [], psns = 'all'):
    
    subset = []
    if isinstance(subsystems,list)==True:
        if len(subsystems) > 0:
            for i in subsystems:
                subset = subset + subsystem_dict[i] ## populate subset with list of columns
        else:
            print('Empty list of subsystems detected. Using all subsystems for PCA')
            subset = [s for L in subsystem_dict.keys() for s in subsystem_dict[L]]
    else: 
        raise Exception('Please ensure subsystems parameter is a list')
        
    if psns == 'all' or isinstance(psns, (list, int))==True:
        pass
    else:
        raise Exception('Please provide a list of psns, single psn, or "all"')
    


    available_subset = [i for i in subset if i in df.columns.values]
    if len(available_subset)<len(subset):
        print('excluding columns: ', list(set(subset)-set(available_subset)))
    available_subset =  available_subset + ['id','timestamp','psn']
    model_data = df[available_subset]
    
    
    
    skipped_cols = ['sum_esn','sum_eng_st', 'sum_eng_h']
    index_cols = ['id','timestamp','psn']
    data_cols = [c for c in model_data.columns if (c not in index_cols) and (c not in skipped_cols)]
    
    missing_values = model_data.isnull().sum().sort_values()
    sparse_cols = [x for x in missing_values.index if missing_values[x] > 30000]
    clean_data_cols = [x for x in data_cols if x not in sparse_cols]
    data = model_data[index_cols + clean_data_cols].dropna()#.reset_index()
    print(data.shape)
    clean_data = StandardScaler().fit_transform(data[clean_data_cols])

    pca =  PCA().fit(clean_data)
    reduced = pca.transform(clean_data)
    reduced_df = pd.DataFrame(reduced)
    reduced_df['psn'] = data.psn.values
    reduced_df['timestamp'] = data.timestamp.values
    return(reduced_df)

In [25]:
def plot_eigs_subplots(reduced_df, n_eigs_x, n_eigs_y, psns, savefig = False,path=None,figname = None):
    if isinstance(n_eigs_x,int) == True:
        pass
    else:
        raise Exception('n_eigs_x must be an integer')
        
    if isinstance(n_eigs_y,int) == True:
        pass
    else:
        raise Exception('n_eigs_y must be an integer')
    

    
    if psns == 'all':
#         fig_base_title = 'All psns'
        pass
    elif isinstance(psns, list)==True:
        model2_10min[model2_10min['psn'].isin(psns)]
#         fig_base_title = 'PSN ' + str(psns)
        
    elif isinstance(psns,int) == True:
        reduced_df = reduced_df[reduced_df['psn']==psns]
#         fig_base_title = 'PSN ' + str(psns)
    else:
        raise Exception('Please provide a list of psns, single psn, or "all"')
        
    if path == None:
        path = ''
    else:
        pass

        
    f, axarr = plt.subplots(n_eigs_x, n_eigs_y)
    f.set_figheight(40)
    f.set_figwidth(40)
    f.suptitle(path + fname,fontsize=16)
    for i in range(n_eigs_x):
        for j in range(n_eigs_y):

            if i==j:
                continue
            axarr[i, j].scatter(reduced_df[i].values,reduced_df[j].values,3,alpha=0.5)
            axarr[i, j].set_title('Eig '+str(i) + ' vs Eig '+ str(j))
    if savefig == False:
        plt.show()
    else:
        
        f.savefig(path+str('\\') + fname +'.png')      
#         f.suptitle(path + fig_base_title + ' ' + str(n_eigs_x) + 'x' + str(n_eigs_y) + ' subplots',fontsize=16)
#         f.savefig(path + fig_base_title + ' ' + str(n_eigs_x) + 'x' + str(n_eigs_y) + ' subplots.png')
#         plt.show()
        plt.close()

## looping through potential subsystems/tags to look for clusters in eigs

In [26]:
color_pal = ['#33a02c', '#1f78b4', '#ff7f00', '#a6cee3','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#cab2d6','#6a3d9a']

In [27]:
model2_10min.dropna().shape

(1602281, 76)

In [28]:
rd_df = subsystem_pca(model2_10min,subsystem_dict, [], psns = 'all')
print(rd_df.shape)
rd_df['id'] = model2_10min['id']

Empty list of subsystems detected. Using all subsystems for PCA
excluding columns:  ['lo_c_brg1']
(1602281, 72)
(1602281, 71)


In [29]:
for i in sorted(rd_df['psn'].unique()):
    print(i, rd_df[rd_df['psn']==i].shape )

34 (103217, 72)
35 (71208, 72)
36 (70287, 72)
37 (65650, 72)
38 (30025, 72)
39 (22333, 72)
40 (14024, 72)
41 (15484, 72)
42 (95461, 72)
45 (48741, 72)
46 (7592, 72)
47 (10941, 72)
48 (92989, 72)
49 (89536, 72)
50 (2516, 72)
51 (5462, 72)
53 (36714, 72)
55 (77340, 72)
56 (80596, 72)
57 (65672, 72)
58 (58907, 72)
59 (66452, 72)
60 (27075, 72)
61 (17242, 72)
62 (27185, 72)
63 (1331, 72)
64 (56204, 72)
65 (51089, 72)
66 (59654, 72)
67 (57318, 72)
68 (82581, 72)
69 (12866, 72)
71 (22779, 72)
72 (55810, 72)


In [30]:
def hdbscan_eig_clusterer(reduced_df, cols, cluster_algo, cluster_params, psn='all'):
    """
    reduced_df: dataframe output from a PCA
    doesn't matter what index comes in, that's what will go out anywaysaaaaaa
    
    """
    ## todo: index by 
    returndf = pd.DataFrame()
    psns = []
    if psn == 'all':
        psns = sorted(reduced_df['psn'].unique())
    else:
        psns = psn
    
    for pkg in psns:
        tempdf = reduced_df[reduced['psn']==pkg]
        min_clust_size = int(len(newdf)/70.3)+1
        clusterer = cluster_algo(min_cluster_size=min_clust_size)
        clusterer = clusterer.fit(tempdf[cols])
        results = clusterer.predict(tempdf[cols])
        tempdf['cluster'] = results
        returndf = returndf.append(tempdf)
    return(reduced_df)

def plot_clusters(cluster_df, x, y, colorpal, save_fig = False, directory=None, title=''):
    plt.figure(figsize=(28,16))
    for i in sorted(cluster_df['cluster'].unique()):
        tempdf = cluster_df[cluster_df['cluster']==i]
        plt.scatter(tempdf[x].values,tempdf[y].values,s=3,alpha=0.5,c=color_pal[i],label = 'cluster '+str(i))
    plt.xlabel('Eigenvector '+str(x))
    plt.ylabel('Eigenvector '+str(y))
    fulltitle = 'Eig '+str(x) + ' vs Eig ' + str(y)+' '+ str(title)#' Clustering using HDBSCAN 20eigs')
    plt.title(title)
    plt.legend()
    if save_fig == True:
        if directory != None:
            plt.savefig(directory + title + 'png')
        else:
            plt.savefig(title+'.png')
        plt.show()
        plt.close()
    else:
        plt.show()


### make sure final code returns df with psn,timestamp index and cluster num col

In [31]:
# num_eigs_toclust = 20 ## 20 eigenvectors cover 90% variance

# current_directory = os.getcwd()
# final_directory = os.path.join(current_directory,r'Hdbscan Clustering mc lendiv70')
# if not os.path.isdir(final_directory):
#     os.mkdir(final_directory)

# # for psn in sorted(rd_df['psn'].unique()):
# # def clusterandplot(rd_df, psn, num_eigs_toclust):
# # for psn in sorted(rd_df['psn'].unique()):
# for psn in [35,37,40,45,46,47,49,55,57,58,62]:
#     print('started: ',psn)
#     nao = time.time()
#     newdf = rd_df[rd_df['psn']==psn]
#     min_clust_size = int(len(newdf)/70.3)+1
# #     hdbscan = hdbscan.HDBSCAN(min_cluster_size=min_clust_size) ## 70.3 chosen arbitrarily
#     print('    clustering with min_cluster_size={} min_samples={}.....'.format(min_clust_size,int(min_clust_size)),end="",flush=True)
# #     cluster_results = cluster_eigs(reduced_df=rd_df, psn=psn, cols=list(range(num_eigs_toclust)), cluster_algo=hdbscan)
#     clusterer = hdbscan.HDBSCAN(min_cluster_size=min_clust_size, min_samples=int(min_clust_size))
#     clusterer_results = clusterer.fit_predict(newdf[list(range(num_eigs_toclust))])
#     print(newdf.shape, len(clusterer_results))
#     print('finished in {} seconds'.format(time.time()-nao))
#     newdf['cluster'] = clusterer_results
#     figname = 'PSN '+str(psn)+'HDBSCAN Clustering 20eigs minsize='+str(min_clust_size)
#     print('    saving figure.....',end="",flush=True)
#     plot_clusters(newdf, x=0, y=1, colorpal = color_pal, save_fig=True, directory=final_directory+'\\', title = figname)
#     print('finished in total {} seconds'.format(time.time()-nao))


In [32]:
def find_power_step (df,powercol,jump = 0.2):
    ## function will return another dataframe where the power columns is replace with 1s and 0s. 1 represents an outlier.
    ## jumps is a percentage
    ## assumes data coming in is sorted by psn by timestamp.

    
    df = df[[powercol,'timestamp','psn']]

    df = df.where((pd.notnull(df)),np.nan) ## replaces nulls with nans for math stuffs.
    df2 = pd.DataFrame(index = df.index.values[1:],columns = df.columns.values)
    df2['timestamp'] = df['timestamp'].values[1:]
    ## np.divide will divide the first parameter by the second parameter so the resulting series starts
    ## from the original dataframe's 1st item, not the 0th item.
#     shifted = pd.Series(np.subtract(df[powercol].values[1:],df[powercol].values[:-1]),index=df.index.values[:-1])

    shifted = pd.Series(np.divide(df[powercol].values[:-1],df[powercol].values[1:]),index=df.index.values[1:])-1

    ## take all values and subtract from previous values. if unchanging, then result will be 0
    ## create numpy array of all False

    tomap = np.zeros(len(df2),dtype=int)  
    
    
    ## find where jumps in data are greater than given jump parameter, and set numpy array equal to True in those positions
    for j in shifted[abs(shifted)>=jump].index.values:
        ## for now just doing where power jumps.
        ## does not catch all in betweens since transients are only 10-20 mins long.
        if (df.loc[j]['timestamp'] - df.loc[j-1]['timestamp']) > pd.Timedelta('12 minutes'): ## if data is not continuous
            continue
                        
        else:
            tomap[j] = True


    
    ## map back numpy array to df2
    df2[powercol] = pd.Series(tomap).loc[1:]

    df2.loc[0] = [0,df.loc[0]['timestamp'],0] ## assume not outlier for row 0. this also 0-indexes the dataframe again
#     df2 = df2.sort_index()
    df2['psn'] = df['psn']
    return(df2)


### validate different cluster parameters using powerstep function 

In [33]:
model2_10min_powerstep = find_power_step(model2_10min,'perf_pow',jump=0.3)
model2_10min['powerjump'] = model2_10min_powerstep['perf_pow']
model2_10min['powerjump'] = model2_10min['powerjump'].fillna(0).astype(int)
model2_10min_powerstep[model2_10min_powerstep['perf_pow']==1].shape

(5376, 3)

In [34]:
len(model2_10min_powerstep)

1602280

In [35]:
Counter(model2_10min['powerjump'])

Counter({0: 1596905, 1: 5376})

In [36]:
c =model2_10min[model2_10min['psn']==35]['powerjump']
Counter(c)

Counter({0: 70573, 1: 635})

In [37]:
newdf = rd_df[rd_df['psn']==35]
min_clust_size = int(len(newdf)/10)+1
clusterer = hdbscan.HDBSCAN(min_cluster_size=min_clust_size, min_samples=int(min_clust_size*0.1))
clusterer_results = clusterer.fit_predict(newdf[list(range(3))])

In [38]:
newdf['cluster'] = clusterer_results 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
Counter(newdf['cluster'].values)

Counter({-1: 71208})

In [42]:
cluster_sizes = [40,50,60,70,80,90,100,110]#[10,20,30,40,50,60,70,80,90,100,110]
min_sample_sizes = [0.2,0.4,0.6,0.7,0.8,0.9,1,1.1,1.2]
psns = [35,37,40,45,46,47,49,55,57,58,62]

mindex0 = list(sorted(itertools.chain(*[range(len(cluster_sizes))for i in min_sample_sizes])))
mindex1 = list(itertools.chain(*[range(len(min_sample_sizes))for i in cluster_sizes]))

my_index = pd.MultiIndex(levels=[cluster_sizes,min_sample_sizes],
                             labels=[mindex0,mindex1],
                             names=[u'min_cluster_size', u'min_samples'])


transients_scores_df = pd.DataFrame(index=my_index,columns=psns)
normals_scores_df = pd.DataFrame(index=my_index,columns=psns)
## slice df like this:  testdf.loc[10].loc[0.1][35]




num_eigs_toclust = 20

for clust_sizer in cluster_sizes:
    for sample_sizer in min_sample_sizes:
        for psn in psns:
            print('clust_sizer: {}, sample_sizer: {}, psn: {}'.format(clust_sizer,sample_sizer,psn))

            powerjump_validation = model2_10min[model2_10min['psn']==psn]['powerjump'].values
            newdf = rd_df[rd_df['psn']==psn]
            min_clust_size = int(len(newdf)/clust_sizer)+1
            clusterer = hdbscan.HDBSCAN(min_cluster_size=min_clust_size, min_samples=int(min_clust_size*sample_sizer))
            clusterer_results = clusterer.fit_predict(newdf[list(range(num_eigs_toclust))])
            preds = [1 if i==-1 else 0 for i in clusterer_results]
            
            idxs_of_ones = []
            for i,j in enumerate(powerjump_validation):
                if j == 1:
                    idxs_of_ones.append(i)
            transient_validation = [powerjump_validation[i] for i in idxs_of_ones]
            transient_predictions = [preds[i] for i in idxs_of_ones]
            print('number of powerjumps: ', len(transient_validation))
            print('number of noise by cluster: ', Counter(transient_predictions)[1])
            f1score = metrics.f1_score(transient_validation, transient_predictions,average='micro')
            ## use micro to take into account false pos/neg
            transients_scores_df.loc[clust_sizer].loc[sample_sizer][psn]= f1score
            
            idxs_of_zeroes = []
            for i,j in enumerate(powerjump_validation):
                if j == 0:
                    idxs_of_zeroes.append(i)
            normals_validation = [powerjump_validation[i] for i in idxs_of_zeroes]
            normals_predictions = [preds[i] for i in idxs_of_zeroes]
            f1score_norm = metrics.f1_score(normals_validation, normals_predictions,average='micro')
            normals_scores_df.loc[clust_sizer].loc[sample_sizer][psn]= f1score_norm
            
            
            
            
# newdf['cluster'] = clusterer_results
## min_clust size/70 gives 256 "noise" datapoints
## min clust size/40 gives  "noise" datapoints

clust_sizer: 40, sample_sizer: 0.2, psn: 35
number of powerjumps:  635
number of noise by cluster:  635
clust_sizer: 40, sample_sizer: 0.2, psn: 37
number of powerjumps:  94
number of noise by cluster:  77
clust_sizer: 40, sample_sizer: 0.2, psn: 40
number of powerjumps:  52
number of noise by cluster:  5
clust_sizer: 40, sample_sizer: 0.2, psn: 45
number of powerjumps:  36
number of noise by cluster:  35
clust_sizer: 40, sample_sizer: 0.2, psn: 46
number of powerjumps:  14
number of noise by cluster:  2
clust_sizer: 40, sample_sizer: 0.2, psn: 47
number of powerjumps:  27
number of noise by cluster:  3
clust_sizer: 40, sample_sizer: 0.2, psn: 49
number of powerjumps:  206
number of noise by cluster:  48
clust_sizer: 40, sample_sizer: 0.2, psn: 55
number of powerjumps:  90
number of noise by cluster:  87
clust_sizer: 40, sample_sizer: 0.2, psn: 57
number of powerjumps:  10
number of noise by cluster:  10
clust_sizer: 40, sample_sizer: 0.2, psn: 58
number of powerjumps:  5
number of noi

number of powerjumps:  14
number of noise by cluster:  3
clust_sizer: 40, sample_sizer: 1.1, psn: 47
number of powerjumps:  27
number of noise by cluster:  5
clust_sizer: 40, sample_sizer: 1.1, psn: 49
number of powerjumps:  206
number of noise by cluster:  76
clust_sizer: 40, sample_sizer: 1.1, psn: 55
number of powerjumps:  90
number of noise by cluster:  88
clust_sizer: 40, sample_sizer: 1.1, psn: 57
number of powerjumps:  10
number of noise by cluster:  10
clust_sizer: 40, sample_sizer: 1.1, psn: 58
number of powerjumps:  5
number of noise by cluster:  4
clust_sizer: 40, sample_sizer: 1.1, psn: 62
number of powerjumps:  440
number of noise by cluster:  440
clust_sizer: 40, sample_sizer: 1.2, psn: 35
number of powerjumps:  635
number of noise by cluster:  635
clust_sizer: 40, sample_sizer: 1.2, psn: 37
number of powerjumps:  94
number of noise by cluster:  82
clust_sizer: 40, sample_sizer: 1.2, psn: 40
number of powerjumps:  52
number of noise by cluster:  9
clust_sizer: 40, sample_

number of powerjumps:  10
number of noise by cluster:  10
clust_sizer: 50, sample_sizer: 0.9, psn: 58
number of powerjumps:  5
number of noise by cluster:  4
clust_sizer: 50, sample_sizer: 0.9, psn: 62
number of powerjumps:  440
number of noise by cluster:  440
clust_sizer: 50, sample_sizer: 1, psn: 35
number of powerjumps:  635
number of noise by cluster:  635
clust_sizer: 50, sample_sizer: 1, psn: 37
number of powerjumps:  94
number of noise by cluster:  84
clust_sizer: 50, sample_sizer: 1, psn: 40
number of powerjumps:  52
number of noise by cluster:  8
clust_sizer: 50, sample_sizer: 1, psn: 45
number of powerjumps:  36
number of noise by cluster:  35
clust_sizer: 50, sample_sizer: 1, psn: 46
number of powerjumps:  14
number of noise by cluster:  3
clust_sizer: 50, sample_sizer: 1, psn: 47
number of powerjumps:  27
number of noise by cluster:  6
clust_sizer: 50, sample_sizer: 1, psn: 49
number of powerjumps:  206
number of noise by cluster:  49
clust_sizer: 50, sample_sizer: 1, psn:

number of powerjumps:  94
number of noise by cluster:  26
clust_sizer: 60, sample_sizer: 0.8, psn: 40
number of powerjumps:  52
number of noise by cluster:  6
clust_sizer: 60, sample_sizer: 0.8, psn: 45
number of powerjumps:  36
number of noise by cluster:  35
clust_sizer: 60, sample_sizer: 0.8, psn: 46
number of powerjumps:  14
number of noise by cluster:  3
clust_sizer: 60, sample_sizer: 0.8, psn: 47
number of powerjumps:  27
number of noise by cluster:  6
clust_sizer: 60, sample_sizer: 0.8, psn: 49
number of powerjumps:  206
number of noise by cluster:  49
clust_sizer: 60, sample_sizer: 0.8, psn: 55
number of powerjumps:  90
number of noise by cluster:  90
clust_sizer: 60, sample_sizer: 0.8, psn: 57
number of powerjumps:  10
number of noise by cluster:  10
clust_sizer: 60, sample_sizer: 0.8, psn: 58
number of powerjumps:  5
number of noise by cluster:  4
clust_sizer: 60, sample_sizer: 0.8, psn: 62
number of powerjumps:  440
number of noise by cluster:  67
clust_sizer: 60, sample_siz

number of powerjumps:  27
number of noise by cluster:  9
clust_sizer: 70, sample_sizer: 0.6, psn: 49
number of powerjumps:  206
number of noise by cluster:  29
clust_sizer: 70, sample_sizer: 0.6, psn: 55
number of powerjumps:  90
number of noise by cluster:  87
clust_sizer: 70, sample_sizer: 0.6, psn: 57
number of powerjumps:  10
number of noise by cluster:  10
clust_sizer: 70, sample_sizer: 0.6, psn: 58
number of powerjumps:  5
number of noise by cluster:  4
clust_sizer: 70, sample_sizer: 0.6, psn: 62
number of powerjumps:  440
number of noise by cluster:  64
clust_sizer: 70, sample_sizer: 0.7, psn: 35
number of powerjumps:  635
number of noise by cluster:  635
clust_sizer: 70, sample_sizer: 0.7, psn: 37
number of powerjumps:  94
number of noise by cluster:  25
clust_sizer: 70, sample_sizer: 0.7, psn: 40
number of powerjumps:  52
number of noise by cluster:  6
clust_sizer: 70, sample_sizer: 0.7, psn: 45
number of powerjumps:  36
number of noise by cluster:  35
clust_sizer: 70, sample_

number of powerjumps:  5
number of noise by cluster:  4
clust_sizer: 80, sample_sizer: 0.2, psn: 62
number of powerjumps:  440
number of noise by cluster:  69
clust_sizer: 80, sample_sizer: 0.4, psn: 35
number of powerjumps:  635
number of noise by cluster:  635
clust_sizer: 80, sample_sizer: 0.4, psn: 37
number of powerjumps:  94
number of noise by cluster:  26
clust_sizer: 80, sample_sizer: 0.4, psn: 40
number of powerjumps:  52
number of noise by cluster:  5
clust_sizer: 80, sample_sizer: 0.4, psn: 45
number of powerjumps:  36
number of noise by cluster:  35
clust_sizer: 80, sample_sizer: 0.4, psn: 46
number of powerjumps:  14
number of noise by cluster:  2
clust_sizer: 80, sample_sizer: 0.4, psn: 47
number of powerjumps:  27
number of noise by cluster:  9
clust_sizer: 80, sample_sizer: 0.4, psn: 49
number of powerjumps:  206
number of noise by cluster:  48
clust_sizer: 80, sample_sizer: 0.4, psn: 55
number of powerjumps:  90
number of noise by cluster:  87
clust_sizer: 80, sample_s

number of powerjumps:  52
number of noise by cluster:  7
clust_sizer: 80, sample_sizer: 1.2, psn: 45
number of powerjumps:  36
number of noise by cluster:  35
clust_sizer: 80, sample_sizer: 1.2, psn: 46
number of powerjumps:  14
number of noise by cluster:  3
clust_sizer: 80, sample_sizer: 1.2, psn: 47
number of powerjumps:  27
number of noise by cluster:  12
clust_sizer: 80, sample_sizer: 1.2, psn: 49
number of powerjumps:  206
number of noise by cluster:  49
clust_sizer: 80, sample_sizer: 1.2, psn: 55
number of powerjumps:  90
number of noise by cluster:  88
clust_sizer: 80, sample_sizer: 1.2, psn: 57
number of powerjumps:  10
number of noise by cluster:  10
clust_sizer: 80, sample_sizer: 1.2, psn: 58
number of powerjumps:  5
number of noise by cluster:  4
clust_sizer: 80, sample_sizer: 1.2, psn: 62
number of powerjumps:  440
number of noise by cluster:  71
clust_sizer: 90, sample_sizer: 0.2, psn: 35
number of powerjumps:  635
number of noise by cluster:  606
clust_sizer: 90, sample_

number of powerjumps:  206
number of noise by cluster:  44
clust_sizer: 90, sample_sizer: 1, psn: 55
number of powerjumps:  90
number of noise by cluster:  90
clust_sizer: 90, sample_sizer: 1, psn: 57
number of powerjumps:  10
number of noise by cluster:  10
clust_sizer: 90, sample_sizer: 1, psn: 58
number of powerjumps:  5
number of noise by cluster:  4
clust_sizer: 90, sample_sizer: 1, psn: 62
number of powerjumps:  440
number of noise by cluster:  65
clust_sizer: 90, sample_sizer: 1.1, psn: 35
number of powerjumps:  635
number of noise by cluster:  635
clust_sizer: 90, sample_sizer: 1.1, psn: 37
number of powerjumps:  94
number of noise by cluster:  26
clust_sizer: 90, sample_sizer: 1.1, psn: 40
number of powerjumps:  52
number of noise by cluster:  6
clust_sizer: 90, sample_sizer: 1.1, psn: 45
number of powerjumps:  36
number of noise by cluster:  35
clust_sizer: 90, sample_sizer: 1.1, psn: 46
number of powerjumps:  14
number of noise by cluster:  3
clust_sizer: 90, sample_sizer: 1

number of powerjumps:  5
number of noise by cluster:  4
clust_sizer: 100, sample_sizer: 0.8, psn: 62
number of powerjumps:  440
number of noise by cluster:  65
clust_sizer: 100, sample_sizer: 0.9, psn: 35
number of powerjumps:  635
number of noise by cluster:  635
clust_sizer: 100, sample_sizer: 0.9, psn: 37
number of powerjumps:  94
number of noise by cluster:  26
clust_sizer: 100, sample_sizer: 0.9, psn: 40
number of powerjumps:  52
number of noise by cluster:  6
clust_sizer: 100, sample_sizer: 0.9, psn: 45
number of powerjumps:  36
number of noise by cluster:  35
clust_sizer: 100, sample_sizer: 0.9, psn: 46
number of powerjumps:  14
number of noise by cluster:  3
clust_sizer: 100, sample_sizer: 0.9, psn: 47
number of powerjumps:  27
number of noise by cluster:  9
clust_sizer: 100, sample_sizer: 0.9, psn: 49
number of powerjumps:  206
number of noise by cluster:  33
clust_sizer: 100, sample_sizer: 0.9, psn: 55
number of powerjumps:  90
number of noise by cluster:  87
clust_sizer: 100

number of powerjumps:  94
number of noise by cluster:  26
clust_sizer: 110, sample_sizer: 0.7, psn: 40
number of powerjumps:  52
number of noise by cluster:  4
clust_sizer: 110, sample_sizer: 0.7, psn: 45
number of powerjumps:  36
number of noise by cluster:  35
clust_sizer: 110, sample_sizer: 0.7, psn: 46
number of powerjumps:  14
number of noise by cluster:  2
clust_sizer: 110, sample_sizer: 0.7, psn: 47
number of powerjumps:  27
number of noise by cluster:  11
clust_sizer: 110, sample_sizer: 0.7, psn: 49
number of powerjumps:  206
number of noise by cluster:  28
clust_sizer: 110, sample_sizer: 0.7, psn: 55
number of powerjumps:  90
number of noise by cluster:  0
clust_sizer: 110, sample_sizer: 0.7, psn: 57
number of powerjumps:  10
number of noise by cluster:  0
clust_sizer: 110, sample_sizer: 0.7, psn: 58
number of powerjumps:  5
number of noise by cluster:  4
clust_sizer: 110, sample_sizer: 0.7, psn: 62
number of powerjumps:  440
number of noise by cluster:  62
clust_sizer: 110, s

In [43]:
transients_scores_df.to_csv('transients_scores2.csv')

In [44]:
normals_scores_df.to_csv('normals_scores.csv')

In [47]:
transients_scores_df

Unnamed: 0_level_0,Unnamed: 1_level_0,35,37,40,45,46,47,49,55,57,58,62
min_cluster_size,min_samples,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
40,0.2,1.0,0.819149,0.0961538,0.972222,0.142857,0.111111,0.23301,0.966667,1,0.8,1.0
40,0.4,1.0,0.851064,0.115385,0.972222,0.214286,0.148148,0.174757,0.988889,1,0.8,1.0
40,0.6,1.0,0.882979,0.134615,0.972222,0.214286,0.0,0.237864,0.977778,1,0.8,1.0
40,0.7,1.0,0.893617,0.153846,0.972222,0.214286,0.0,0.237864,0.977778,1,0.8,1.0
40,0.8,1.0,0.893617,0.153846,0.972222,0.214286,0.0,0.237864,0.977778,1,0.8,1.0
40,0.9,1.0,0.893617,0.173077,0.972222,0.214286,0.185185,0.237864,0.977778,1,0.8,1.0
40,1.0,1.0,0.851064,0.173077,0.972222,0.214286,0.185185,0.257282,0.977778,1,0.8,1.0
40,1.1,1.0,0.87234,0.173077,0.972222,0.214286,0.185185,0.368932,0.977778,1,0.8,1.0
40,1.2,1.0,0.87234,0.173077,0.972222,0.0,0.185185,0.378641,0.977778,1,0.8,1.0
50,0.2,1.0,0.276596,0.0576923,0.972222,0.5,0.111111,0.18932,0.966667,1,0.8,0.754545


In [78]:
mylist = []
for col in transients_scores_df.columns:
    print(col,transients_scores_df[col].max())
    ## grabs the rows where f1 score is max
    mylist.append(transients_scores_df[transients_scores_df[col]==transients_scores_df[col].max()][col])
print(Counter([x[1] for ii in mylist for x in ii.index.values ])) ## counts number of times that the min_samples multipler appears
print(Counter([x[0] for ii in mylist for x in ii.index.values ]))

35 1.0
37 0.8936170212765957
40 0.17307692307692307
45 0.9722222222222222
46 0.5714285714285714
47 0.48148148148148145
49 0.3786407766990291
55 1.0
57 1.0
58 0.8000000000000002
62 1.0
Counter({1.1: 42, 1.2: 42, 1.0: 40, 0.9: 39, 0.8: 36, 0.7: 35, 0.4: 34, 0.6: 34, 0.2: 24})
Counter({40: 53, 50: 50, 60: 43, 70: 38, 90: 38, 80: 37, 100: 37, 110: 30})


In [46]:
normals_scores_df

Unnamed: 0_level_0,Unnamed: 1_level_0,35,37,40,45,46,47,49,55,57,58,62
min_cluster_size,min_samples,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
40,0.2,0.0,0.711087,0.996994,0.743189,0.992478,0.971963,0.880925,0.877282,0.927386,0.985382,0.0
40,0.4,0.0,0.620081,0.996421,0.613838,0.986408,0.961151,0.983667,0.753631,0.87192,0.976605,0.0
40,0.6,0.0,0.501388,0.995563,0.656729,0.982449,0.999542,0.97001,0.71433,0.833161,0.964025,0.0
40,0.7,0.0,0.470895,0.995133,0.61698,0.979282,0.999542,0.968947,0.684595,0.812662,0.957896,0.0
40,0.8,0.0,0.460965,0.994561,0.614927,0.975851,0.999725,0.967547,0.673618,0.790823,0.951071,0.0
40,0.9,0.0,0.473473,0.994417,0.586511,0.971628,0.996427,0.966193,0.656,0.772334,0.942566,0.0
40,1.0,0.0,0.532934,0.993988,0.549779,0.96569,0.994686,0.964614,0.638084,0.749977,0.933619,0.0
40,1.1,0.0,0.494203,0.993773,0.530151,0.966086,0.994502,0.937031,0.605877,0.725854,0.924094,0.0
40,1.2,0.0,0.474632,0.99363,0.537275,0.997493,0.994502,0.934087,0.592647,0.710822,0.926607,0.0
50,0.2,0.0740793,0.99338,0.998068,0.767375,0.966614,0.985065,0.910724,0.881864,0.945037,0.986724,0.741559


In [77]:
mylist = []
for col in normals_scores_df.columns:
    print(col,normals_scores_df[col].max())
    ## grabs the rows where f1 score is max
    mylist.append(normals_scores_df[normals_scores_df[col]==normals_scores_df[col].max()][col])
print(Counter([x[1] for ii in mylist for x in ii.index.values ])) ## counts number of times that the min_samples multipler appears
print(Counter([x[0] for ii in mylist for x in ii.index.values ]))

35 0.5008289289104898
37 0.9973305265727012
40 0.9989264242771256
45 0.9984806488040242
46 0.9974927421483241
47 0.9997251236943375
49 0.9945818873838576
55 0.9996504854368932
57 0.9999086229478237
58 0.9892024039930731
62 0.9928584782202281
Counter({0.2: 10, 0.4: 2, 0.6: 2, 1.2: 1, 0.8: 1, 0.7: 1})
Counter({110: 8, 100: 5, 80: 2, 40: 2})


In [229]:
normals_predictions

[]

In [230]:
normals_validation

[]