In [1]:
import random
import numpy as np
import pandas as pd
import manage_data as md
import preprocessing as pp
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import metrics

In [2]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues, filename='cm'):
    """Draw a confusion matrix as an annotated heat map and save it as a PNG.

    Parameters
    ----------
    cm : 2-D numpy array
        Matrix to display, read as ``cm[row][col]``. Every cell is multiplied
        by 100, rounded and written as a whole-number percentage, so the
        values are presumably fractions (e.g. a row-normalised matrix) --
        TODO confirm against the callers.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap handed to ``imshow``.
    filename : str
        Base name (without extension) of the image written to
        ``./images/cm_one/``.

    Notes
    -----
    * Tick labels are hard-coded to the two classes 'Sticking' and 'Fine'.
    * The output directory is not created here; ``savefig`` fails if it is
      missing.
    * The figure is left open so the notebook can still display it inline.
    """
    plt.figure(figsize=(8, 8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=20, fontstyle='oblique')
    # Axes created by imshow above; annotations are attached to it.
    ax = plt.gca()

    tick_marks = np.arange(2)
    labels = ['Sticking', 'Fine']
    plt.xticks(tick_marks, labels, rotation=45, fontsize=20)
    plt.yticks(tick_marks, labels, fontsize=20)
    plt.ylabel('True label', fontsize=20)
    plt.xlabel('Predicted label', fontsize=20)

    n_rows, n_cols = cm.shape
    # `range` works on both Python 2 and 3 (the matrix is tiny, so the
    # list built on Python 2 is irrelevant).
    for row in range(n_rows):
        for col in range(n_cols):
            valeur = str(int(np.round(100 * cm[row][col], 0))) + '%'
            # imshow puts columns on the x axis and rows on the y axis.
            ax.annotate(valeur, xy=(col, row),
                        horizontalalignment='center',
                        verticalalignment='center',
                        size=30)

    # Run the layout pass only once labels and annotations exist, so that
    # they are taken into account and not clipped in the saved image.
    plt.tight_layout()
    plt.savefig('./images/cm_one/' + filename + '.png')

def concat_df(coil_list, filename='peaks.h5'):
    """Load the peak table of every coil and stack them into one DataFrame.

    Parameters
    ----------
    coil_list : iterable
        Coil identifiers, each passed to ``md.importPeak``.
    filename : str
        File name forwarded to ``md.importPeak`` for every coil.

    Returns
    -------
    pandas.DataFrame
        For a single coil, the frame exactly as returned by
        ``md.importPeak``; for several coils, their row-wise concatenation
        with a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If ``coil_list`` is empty (same exception type as before).
    """
    frames = [md.importPeak(coil, filename) for coil in coil_list]
    if not frames:
        raise IndexError('coil_list is empty: no peak table to load')
    if len(frames) == 1:
        # Nothing to stack: hand the frame back untouched (index included).
        return frames[0]
    # One concat over all frames instead of re-copying a growing frame on
    # every loop iteration.
    return pd.concat(frames, ignore_index=True)

def create_DB(df):
    """Split a peak table into a feature matrix and a boolean label vector.

    Returns
    -------
    (X, Y)
        ``X`` holds the values of the eight feature columns, in the order
        listed below; ``Y`` is the ``sticking`` column cast to a boolean
        numpy array.
    """
    feature_names = [
        'Ximf0', 'Ximf1', 'Ximf2',
        'Yimf0', 'Yimf1', 'Yimf2',
        'thickness', 'speed',
    ]
    features = df[feature_names].values
    # np.array (not asarray) so the labels are always a fresh boolean copy.
    labels = np.array(df['sticking'].values, dtype=bool)
    return features, labels

def random_sample(coil_list, filename='peaks.h5'):
    """Return a class-balanced subset of the peaks of ``coil_list``.

    All rows flagged ``sticking`` are kept and the same number of
    non-sticking rows is drawn at random, without replacement, using the
    global ``random`` generator (seed it beforehand for reproducible draws).

    Parameters
    ----------
    coil_list : iterable
        Coils to load through ``concat_df``.
    filename : str
        Peak file forwarded to ``concat_df``; defaults to the same file as
        before.

    Returns
    -------
    pandas.DataFrame
        Sticking rows first, followed by the sampled non-sticking rows, with
        their original index labels.
    """
    df = concat_df(coil_list, filename)
    sticking_index = list(df[df.sticking == True].index)
    non_sticking_index = list(df[df.sticking == False].index)
    # random.sample raises ValueError when asked for more items than the
    # population holds; cap the draw so a sticking-heavy data set returns
    # every non-sticking row instead of crashing.
    num_to_select = min(len(sticking_index), len(non_sticking_index))
    sampled_index = random.sample(non_sticking_index, num_to_select)
    return df.loc[sticking_index + sampled_index]

def random_coils_selection(coil_list, ratio=0.5):
    """Shuffle the coils and split them into two disjoint groups.

    A copy of ``coil_list`` is shuffled with the global ``random`` generator,
    so the caller's sequence is left untouched. The first group receives
    ``int(ratio * len(coil_list)) + 1`` coils (all of them if that exceeds
    the number available); the second group receives the remainder.

    Returns
    -------
    (list, list)
        First group, second group.
    """
    shuffled = list(coil_list)
    random.shuffle(shuffled)
    cut = 1 + int(ratio * len(shuffled))
    first_group = shuffled[:cut]
    second_group = shuffled[cut:]
    return first_group, second_group

def result_per_coil(coil_list, estimator):
    """Compare, coil by coil, the summed true labels with the summed predictions.

    Each coil is loaded on its own via ``concat_df``, turned into features
    and labels by ``create_DB`` and scored with ``estimator.predict``.

    Parameters
    ----------
    coil_list : sequence
        Coils to evaluate; also used as the index of the summary table.
    estimator : object
        Anything exposing ``predict(X)``.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray, numpy.ndarray)
        A table with columns ``truth`` and ``prediction`` indexed by coil,
        followed by the same two series of sums as arrays.
    """
    truth_totals = []
    prediction_totals = []
    for coil in coil_list:
        features, labels = create_DB(concat_df([coil]))
        predicted = estimator.predict(features)
        # Labels are booleans, so their sum is the number of sticking peaks.
        truth_totals.append(sum(labels))
        prediction_totals.append(sum(predicted))
    summary = pd.DataFrame(
        {'truth': truth_totals, 'prediction': prediction_totals},
        index=coil_list,
    )
    return summary, np.array(truth_totals), np.array(prediction_totals)

def sticking_coils_finder(coil_list):
    """Separate coils that contain at least one sticking peak from the others.

    Every coil is loaded individually (``concat_df`` + ``create_DB``) and its
    boolean labels are summed; a positive sum marks the coil as sticking.

    Returns
    -------
    (list, list)
        Sorted coils with sticking peaks, then the sorted remaining coils
        (the latter without duplicates).
    """
    sticking_counts = []
    for coil in coil_list:
        _, labels = create_DB(concat_df([coil]))
        sticking_counts.append(sum(labels))

    with_sticking = [
        coil
        for coil, count in zip(coil_list, sticking_counts)
        if count > 0
    ]
    without_sticking = set(coil_list) - set(with_sticking)
    return sorted(with_sticking), sorted(without_sticking)

def compare(df):
    """Print mean / standard deviation of selected columns, sticking vs. not.

    Each printed pair is (rows where ``sticking == 1``, rows where
    ``sticking == 0``). Columns are addressed by POSITION: 0-2 are reported
    under the "peak frequency" headings, 3-5 under "peak amplitude", 7 under
    "speed" and 9 under "thickness" -- this assumes the column order of the
    peak tables loaded in this notebook; verify before using another layout.

    Returns None; the report is written to stdout only.
    """
    def report(position, stat_name):
        # Same evaluation order as a direct print(sticking_stat, fine_stat).
        sticking_value = getattr(df.iloc[:, position][df.sticking == 1], stat_name)()
        fine_value = getattr(df.iloc[:, position][df.sticking == 0], stat_name)()
        print(sticking_value, fine_value)

    print('     COLLANT            NON COLLANT  ')
    print(' ')

    # (heading, column positions, statistic) for the three-column groups.
    grouped_sections = (
        ('Fréquence moyenne du pic', (0, 1, 2), 'mean'),
        ('Ecart type', (0, 1, 2), 'std'),
        ('Amplitude moyenne du pic', (3, 4, 5), 'mean'),
        ('Ecart type', (3, 4, 5), 'std'),
    )
    for heading, positions, stat_name in grouped_sections:
        print(heading)
        for position in positions:
            report(position, stat_name)
        print(' ')

    print('Vitesse')
    report(7, 'mean')
    report(7, 'std')
    print(' ')

    # Last section: no trailing blank line.
    print('Epaisseur')
    report(9, 'mean')
    report(9, 'std')

# About DB

In [3]:
# Peak file to analyse. The commented-out names are alternative files kept
# here so the active one can be switched by hand.
#peakDB = 'peaks_1s_1_20Hz.h5'
peakDB = 'peaks_1s_1_100Hz.h5'
#peakDB = 'peaks_1s_2_6Hz.h5'
#peakDB = 'peaks_0.25t_16_200.h5'
#peakDB = 'peaks.h5'
# Coil ids 0..87 with 31 and 49 left out (reason not stated in this
# notebook -- TODO document why these two are excluded).
all_coils = list( set(range(88)) - set([49, 31]) )
df = concat_df(all_coils, peakDB)
# NOTE(review): k is not used in any cell visible here -- confirm it is
# needed further down, and say what 1027 stands for.
k = 1027
# Number of peak rows loaded, then the first row to show the column layout.
print(len(df))
df.head(1)

9937


Unnamed: 0,Ximf0,Ximf1,Ximf2,Yimf0,Yimf1,Yimf2,coil,speed,sticking,thickness,time_begin,time_end
0,1.283685,6.097503,0.962764,24.797989,6.12363,3.701913,0,697.881497,False,0.6,61.0,62.0


# Statistics

## Sans normalisation

In [4]:
# Same peak file as loaded in the previous code cell; reloaded here so this
# section does not depend on which file that cell had active.
peakDB = 'peaks_1s_1_100Hz.h5'
df = concat_df(all_coils, peakDB)
print(len(df))
# Sticking vs. non-sticking summary statistics (printed, nothing returned).
compare(df)

9937
     COLLANT            NON COLLANT  
 
Fréquence moyenne du pic
(18.833744588582061, 16.273223929852943)
(23.537472666091563, 25.213059889125326)
(22.014424688119526, 25.439759624925731)
 
Ecart type
(25.160367660781922, 22.978675420675572)
(28.384086244110296, 28.174745066633871)
(27.699922237876009, 28.467917263184447)
 
Amplitude moyenne du pic
(6.8267701909455525, 7.4534204463463691)
(7.0544199737749453, 4.1889993957405895)
(8.5705320160956902, 3.540900698567548)
 
Ecart type
(11.533437502515227, 15.782628498109318)
(11.092538296421248, 7.6679442034126071)
(13.954412452214806, 5.9219791982112397)
 
Vitesse
(852.60887498034469, 917.30464788796996)
(379.81345103853556, 371.27490305180493)
 
Epaisseur
(0.84643250688705229, 0.88865059930403401)
(0.2881564966292019, 0.40298713382719098)


In [5]:
# First row only: shows the column order that compare() indexes by position.
df.head(1)

Unnamed: 0,Ximf0,Ximf1,Ximf2,Yimf0,Yimf1,Yimf2,coil,speed,sticking,thickness,time_begin,time_end
0,1.283685,6.097503,0.962764,24.797989,6.12363,3.701913,0,697.881497,False,0.6,61.0,62.0


## Avec normalisation

In [6]:
# Peak file used for the "with normalisation" comparison.
# NOTE(review): this rebinds the global `df` to a different data set than the
# cells above -- re-run from the top before trusting earlier outputs.
peakDB = 'peaks_0.25t_16_100.h5'
df = concat_df(all_coils, peakDB)
print(len(df))
compare(df)

8528
     COLLANT            NON COLLANT  
 
Fréquence moyenne du pic
(37.343024267529415, 37.060113477543439)
(40.960497395162854, 41.553634757942376)
(39.909713650311701, 40.34248036390013)
 
Ecart type
(26.366198954000055, 25.164464932993535)
(27.253639660931391, 26.511198732041432)
(26.822964776423415, 26.149644074610535)
 
Amplitude moyenne du pic
(5.6819804332918906, 4.1261867705328461)
(5.3453469317396705, 3.3795521071626857)
(6.7204039328224905, 4.3584300856573641)
 
Ecart type
(11.729238060509223, 7.6300932237835601)
(7.787596548553064, 4.7169144706544879)
(10.402782304076148, 7.012182838742838)
 
Vitesse
(1066.6135888636147, 1038.2446624728168)
(183.26061926624848, 132.48343508148645)
 
Epaisseur
(0.81905351614329935, 0.79148875059837243)
(0.27658839305031113, 0.29778258336862995)


In [7]:
# NOTE(review): this renders the whole frame; consider df.head(10) to keep
# the saved notebook output small.
df

Unnamed: 0,Ximf0,Ximf1,Ximf2,Yimf0,Yimf1,Yimf2,coil,speed,sticking,thickness,time_begin,time_end
0,84.193660,40.020305,17.933627,1.724065,1.255141,1.431057,0,,False,0.60,61.000000,61.250000
1,60.879945,13.025477,20.387702,1.952982,9.510135,1.550076,0,,False,0.60,61.250000,61.500000
2,13.025477,13.025477,13.025477,15.025931,5.537041,3.460636,0,,False,0.60,61.500000,61.750000
3,17.933627,13.025477,17.933627,8.676157,1.258239,1.623738,0,,False,0.60,61.750000,62.000000
4,13.025477,13.025477,13.025477,29.541907,3.703991,6.057327,0,,False,0.60,62.000000,62.250000
5,25.295853,33.885117,38.793267,2.516540,0.565598,0.949207,0,,False,0.60,62.250000,62.500000
6,20.387702,38.793267,17.933627,3.359280,2.016791,2.560979,0,,False,0.60,62.500000,62.750000
7,38.793267,33.885117,22.841778,1.767751,1.025966,1.168663,0,,False,0.60,62.750000,63.000000
8,16.706590,24.068815,25.295853,3.209237,1.141496,1.299013,0,,False,0.60,63.000000,63.250000
9,41.247342,44.928455,20.387702,2.807807,1.047453,1.093043,0,,False,0.60,63.250000,63.500000
