In [24]:
import pandas as pd
import numpy as np
from collections import Counter, OrderedDict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Read the signal table of each tool (A-E). The files have no header row, so
# pandas numbers the columns positionally.
A, B, C, D, E = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "../assets/data/data_A.csv",
        "../assets/data/data_B.csv",
        "../assets/data/data_C.csv",
        "../assets/data/data_D.csv",
        "../assets/data/data_E.csv",
    )
)

In [3]:
# Stack the five per-tool tables into one frame; ignore_index gives the result
# a fresh 0..n-1 row index directly instead of reset_index() + drop('index').
data = pd.concat([A, B, C, D, E], axis=0, ignore_index=True)
# One integer label per row (0 for A, 1 for B, ... 4 for E), in stacking order.
Y = [label for label, frame in enumerate((A, B, C, D, E)) for _ in range(len(frame))]
# Labels and rows must stay aligned for the evaluation further down.
assert len(Y) == len(data)
len(Y)

18000

In [4]:
# Preview the first rows of tool A's table.
A.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.944581,0.07347,0.595152,0.031151,0.665257,0.637385,0.862465,0.941638,0.445627,0.669958,...,0.291457,0.957417,0.405958,0.946556,0.857191,0.688928,0.003288,0.900103,0.919865,0.005909
1,0.526433,0.241855,0.456214,0.116482,0.599807,0.836635,0.912332,0.950627,0.581529,0.51469,...,0.154393,0.566702,0.509103,0.453779,0.991011,0.900147,0.151426,0.607675,0.478204,0.828624
2,0.362481,0.397287,0.29446,0.614796,0.176636,0.208382,0.216288,0.561427,0.103087,0.311326,...,0.115245,0.447862,0.526769,0.896337,0.920253,0.053284,0.909701,0.689073,0.574441,0.918327
3,0.672451,0.825486,0.816008,0.560363,0.55969,0.858013,0.205535,0.715741,0.815872,0.119277,...,0.938751,0.947538,0.586773,0.813136,0.251255,0.641132,0.16578,0.253323,0.931356,0.897332
4,0.867482,0.259081,0.329302,0.554069,0.601874,0.470497,0.791504,0.541178,0.662337,0.078658,...,0.25662,0.999742,0.859287,0.475762,0.998712,0.403417,0.43493,0.688447,0.264142,0.441126


## Check the distribution of signal values at each timestamp for each tool

In [5]:
# Per-timestamp statistics for tool A:
# distr_A maps column index -> (np.mean, np.std) of that column.
distr_A = {}
for idx in range(A.shape[1]):  # one entry per column instead of a hard-coded 24
    signal = A.iloc[:, idx].values
    # Compute each statistic once and reuse it for the log line and the table.
    col_mean, col_std = np.mean(signal), np.std(signal)
    print(f"{idx}: Mean: {col_mean} and Std: {col_std}")
    distr_A[idx] = (col_mean, col_std)

0: Mean: 0.5025964681498825 and Std: 0.2902587022827323
1: Mean: 0.5085746308847962 and Std: 0.2887392666222625
2: Mean: 0.5114946562000392 and Std: 0.2902455184681676
3: Mean: 0.5137520414903997 and Std: 0.2864925165052686
4: Mean: 0.5221034833156347 and Std: 0.28883676021802995
5: Mean: 0.5228762810352664 and Std: 0.28829650187444134
6: Mean: 0.5270778986933777 and Std: 0.2856581214323996
7: Mean: 0.5270183830348746 and Std: 0.28672614095253873
8: Mean: 0.5279824944380879 and Std: 0.2899841840375673
9: Mean: 0.532977562843652 and Std: 0.28518432803378574
10: Mean: 0.5374322380001959 and Std: 0.28625893144470166
11: Mean: 0.5465282161246081 and Std: 0.28580586967655003
12: Mean: 0.5438167393248433 and Std: 0.28543393344869794
13: Mean: 0.5502900546201018 and Std: 0.2841836456640366
14: Mean: 0.5516557479674765 and Std: 0.2856830930586056
15: Mean: 0.5467370230985501 and Std: 0.28812178915419345
16: Mean: 0.5579806419106583 and Std: 0.2807457813228525
17: Mean: 0.5695508264465126 and S

In [6]:
# Per-timestamp statistics for tool B:
# distr_B maps column index -> (np.mean, np.std) of that column.
distr_B = {}
for idx in range(B.shape[1]):  # one entry per column instead of a hard-coded 24
    signal = B.iloc[:, idx].values
    # Compute each statistic once and reuse it for the log line and the table.
    col_mean, col_std = np.mean(signal), np.std(signal)
    print(f"{idx}: Mean: {col_mean} and Std: {col_std}")
    distr_B[idx] = (col_mean, col_std)

0: Mean: 0.5034235366047746 and Std: 0.29197728227445324
1: Mean: 0.5082137542921561 and Std: 0.2878537675891718
2: Mean: 0.48908428120197034 and Std: 0.29221362969442954
3: Mean: 0.5077695509124669 and Std: 0.28086441508672266
4: Mean: 0.4987136986748768 and Std: 0.2881082569213527
5: Mean: 0.5061950300128837 and Std: 0.2951605218811823
6: Mean: 0.5123362297499053 and Std: 0.283844800407551
7: Mean: 0.5073783535619554 and Std: 0.2904626473669558
8: Mean: 0.503855797546798 and Std: 0.290036988910617
9: Mean: 0.5092538906654035 and Std: 0.28959494608073094
10: Mean: 0.5258310789352026 and Std: 0.289717444910802
11: Mean: 0.5214013241845395 and Std: 0.2907496273547873
12: Mean: 0.5255809047514209 and Std: 0.28484453932931564
13: Mean: 0.5203845506290262 and Std: 0.28637759208965247
14: Mean: 0.5242150645471769 and Std: 0.2882716257636771
15: Mean: 0.5199546780083365 and Std: 0.28998839959183675
16: Mean: 0.5225422036146268 and Std: 0.2771686041958546
17: Mean: 0.522933600392194 and Std: 

In [7]:
# Per-timestamp statistics for tool C:
# distr_C maps column index -> (np.mean, np.std) of that column.
distr_C = {}
for idx in range(C.shape[1]):  # one entry per column instead of a hard-coded 24
    signal = C.iloc[:, idx].values
    # Compute each statistic once and reuse it for the log line and the table.
    col_mean, col_std = np.mean(signal), np.std(signal)
    print(f"{idx}: Mean: {col_mean} and Std: {col_std}")
    distr_C[idx] = (col_mean, col_std)

0: Mean: 0.48849711494186054 and Std: 0.2901848576735514
1: Mean: 0.4974505645930233 and Std: 0.29356613041375273
2: Mean: 0.4840250793807063 and Std: 0.28834922983736244
3: Mean: 0.492498402086994 and Std: 0.2909581509377182
4: Mean: 0.5031035547962963 and Std: 0.28950031284848055
5: Mean: 0.5090416896093884 and Std: 0.28797002672745015
6: Mean: 0.5016439589069768 and Std: 0.2888879510402921
7: Mean: 0.49980414467097334 and Std: 0.2887132083328212
8: Mean: 0.4880481830159346 and Std: 0.28425519251114234
9: Mean: 0.5118264301864772 and Std: 0.287590253212707
10: Mean: 0.4911321914599483 and Std: 0.29053882367841866
11: Mean: 0.49952458681739875 and Std: 0.291229923269559
12: Mean: 0.5054025957140396 and Std: 0.29356000861295256
13: Mean: 0.5027890987368647 and Std: 0.2863659340028686
14: Mean: 0.4936173099216193 and Std: 0.2893207477232978
15: Mean: 0.4952441609651163 and Std: 0.28682849884007333
16: Mean: 0.4905439209069768 and Std: 0.2795609157693484
17: Mean: 0.4941995742911284 and 

In [8]:
# Per-timestamp statistics for tool D:
# distr_D maps column index -> (np.mean, np.std) of that column.
distr_D = {}
for idx in range(D.shape[1]):  # one entry per column instead of a hard-coded 24
    signal = D.iloc[:, idx].values
    # Compute each statistic once and reuse it for the log line and the table.
    col_mean, col_std = np.mean(signal), np.std(signal)
    print(f"{idx}: Mean: {col_mean} and Std: {col_std}")
    distr_D[idx] = (col_mean, col_std)

0: Mean: 0.5002339113573844 and Std: 0.2908093351935116
1: Mean: 0.5018149657373168 and Std: 0.2915497685268972
2: Mean: 0.4898121082762119 and Std: 0.2884390854273117
3: Mean: 0.4978901596798196 and Std: 0.2874117203299099
4: Mean: 0.49230434793761746 and Std: 0.2867116244977272
5: Mean: 0.49240419874257796 and Std: 0.2851371114328267
6: Mean: 0.49076283098910184 and Std: 0.2908436402683718
7: Mean: 0.4982383876486284 and Std: 0.2912654264714067
8: Mean: 0.47555727698384065 and Std: 0.2907592904704774
9: Mean: 0.4865374627846674 and Std: 0.2859719856156414
10: Mean: 0.48092166874032316 and Std: 0.29141418941218633
11: Mean: 0.48422505182487785 and Std: 0.28616425023633224
12: Mean: 0.4808971230796693 and Std: 0.28972514079470835
13: Mean: 0.48499367964787665 and Std: 0.2903960519104757
14: Mean: 0.47651632998196164 and Std: 0.28500010751986593
15: Mean: 0.47740672916384824 and Std: 0.28974713351178033
16: Mean: 0.46346849435362647 and Std: 0.2841229451434594
17: Mean: 0.47241772716459

In [9]:
# Per-timestamp statistics for tool E:
# distr_E maps column index -> (np.mean, np.std) of that column.
distr_E = {}
for idx in range(E.shape[1]):  # one entry per column instead of a hard-coded 24
    signal = E.iloc[:, idx].values
    # Compute each statistic once and reuse it for the log line and the table.
    col_mean, col_std = np.mean(signal), np.std(signal)
    print(f"{idx}: Mean: {col_mean} and Std: {col_std}")
    distr_E[idx] = (col_mean, col_std)

0: Mean: 0.4915175417956011 and Std: 0.28943285578429834
1: Mean: 0.4968950822629883 and Std: 0.28919539675178063
2: Mean: 0.4892313882887751 and Std: 0.2853111335749019
3: Mean: 0.48138247823492614 and Std: 0.2862945498256753
4: Mean: 0.4867929547700037 and Std: 0.28636562236669566
5: Mean: 0.49058170251971933 and Std: 0.2882261884117892
6: Mean: 0.4805407321245733 and Std: 0.2868318319909901
7: Mean: 0.46911243097629884 and Std: 0.286025788292963
8: Mean: 0.4730912093164581 and Std: 0.2857683443894855
9: Mean: 0.46243615339874855 and Std: 0.2877400340958106
10: Mean: 0.46178059321255216 and Std: 0.28656915676810274
11: Mean: 0.4649612332119833 and Std: 0.2860278105929251
12: Mean: 0.4611804478553281 and Std: 0.2875579944259108
13: Mean: 0.4551184138257489 and Std: 0.2810610836743929
14: Mean: 0.44641179965718614 and Std: 0.2827585658543393
15: Mean: 0.44387849037713306 and Std: 0.28217671611150724
16: Mean: 0.44637082463822525 and Std: 0.28714413018697993
17: Mean: 0.4380789554876754

<b><u> A simple generative model based on a Gaussian kernel: </u></b><br/>
Use the per-timestamp means and standard deviations of each tool to majority-vote the tool for a sample signal

In [10]:
# Per-tool statistics keyed by integer label: 0..4 stand for tools A..E.
distr = dict(enumerate((distr_A, distr_B, distr_C, distr_D, distr_E)))

In [11]:
from sklearn.metrics.pairwise import rbf_kernel
from scipy.stats import mode

In [13]:
def gaussianVote(sample, distr):
    """Label one signal by a per-timestamp majority vote over Gaussian kernels.

    At every timestamp t each class k is scored with
    exp(-(sample[t] - mean_k[t])**2 / std_k[t]**2), i.e. an RBF kernel with
    gamma = std**-2, and the best-scoring class casts that timestamp's vote.

    Parameters
    ----------
    sample : sequence of float
        Signal values, one per timestamp. Values beyond the number of
        timestamps described by `distr` are ignored.
    distr : mapping
        class -> {timestamp -> (mean, std)}. Votes are positions in the
        iteration order of `distr`, which equal the keys when the keys are
        0..n-1 in order.

    Returns
    -------
    (str, list of int)
        The letter ('A'..'E') of the most-voted class and the per-timestamp
        votes. Ties go to the lowest class position.

    Raises
    ------
    ValueError
        If `sample` has fewer values than `distr` has timestamps.
    """
    priors = list(distr.values())
    # Take the timestamp count from the priors rather than hard-coding 24.
    n_steps = len(priors[0])
    signal = np.asarray(sample, dtype=float)
    if signal.shape[0] < n_steps:
        raise ValueError(
            f"sample has {signal.shape[0]} values but distr describes {n_steps} timestamps"
        )
    signal = signal[:n_steps]

    means = np.array([[prior[t][0] for t in range(n_steps)] for prior in priors], dtype=float)
    stds = np.array([[prior[t][1] for t in range(n_steps)] for prior in priors], dtype=float)

    # One vectorised evaluation, shape (n_classes, n_steps), instead of one
    # rbf_kernel call per (class, timestamp) pair.
    scores = np.exp(-((signal - means) ** 2) / stds ** 2)
    vote = scores.argmax(axis=0).tolist()

    # Majority vote; bincount + argmax picks the smallest label on ties and
    # does not depend on the return shape of scipy.stats.mode.
    winner = int(np.bincount(vote).argmax())
    return ['A', 'B', 'C', 'D', 'E'][winner], vote

In [14]:
def predictSample(dataset, distr):
    """Draw one random row of `dataset` and return its per-timestamp votes.

    Parameters
    ----------
    dataset : indexable of signals
        Rows to sample from; one row is picked uniformly at random.
    distr : mapping
        Per-class statistics, passed through to gaussianVote.

    Returns
    -------
    list
        The vote list produced by gaussianVote for the drawn row.
    """
    # np.random.randint excludes `high`, so the upper bound is the full length.
    # The previous `len - 1` never drew the last row and failed on one row.
    row = np.random.randint(low=0, high=len(dataset))
    _, vote = gaussianVote(dataset[row], distr)
    return vote

## Indices of clarity

<b> For each tool, return the timestamps at which its signals are most often voted as that tool, estimated using <u>bootstrap sampling</u></b>

In [16]:
# Per-tool tables keyed by integer label: 0..4 stand for tools A..E.
dataS = dict(enumerate((A, B, C, D, E)))

In [17]:
def indicesOfClarity(idx, dataS, distr, size=None):
    """Collect the timestamps at which random samples of class `idx` vote for `idx`.

    Parameters
    ----------
    idx : int
        Class label whose table is sampled and whose votes are counted.
    dataS : mapping
        class -> table exposing `.values` and `len()`.
    distr : mapping
        Per-class statistics, passed through to predictSample.
    size : int, optional
        Number of random draws. A falsy value (None or 0) means one draw per
        row of the class table.

    Returns
    -------
    list of int
        Timestamp positions whose vote equalled `idx`, concatenated over all
        draws; a timestamp appears once per draw in which it voted for `idx`.
    """
    samples = dataS[idx].values
    if not size:
        size = len(dataS[idx])
    indices = []
    for _ in range(size):
        votes = predictSample(samples, distr)
        # Single pass over the votes: no repeated count()/index() scans and no
        # overwriting of matched entries with a sentinel value.
        indices.extend(stamp for stamp, voted in enumerate(votes) if voted == idx)
    return indices

In [18]:
# For each of the five classes, tally which timestamps voted for that class
# across the random draws and keep the ten most frequent ones, most frequent first.
indices = {}
for idx in tqdm(range(5)):
    tally = Counter(indicesOfClarity(idx, dataS, distr))
    indices[idx] = [stamp for stamp, _ in tally.most_common(10)]

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [06:28<00:00, 77.80s/it]


In [19]:
# Show the selected timestamps per class label.
indices

{0: [23, 19, 18, 17, 14, 11, 22, 16, 7, 21],
 1: [0, 2, 16, 23, 17, 19, 22, 13, 18, 21],
 2: [0, 1, 2, 23, 20, 22, 16, 18, 21, 14],
 3: [8, 23, 18, 21, 20, 22, 17, 9, 19, 15],
 4: [20, 22, 23, 21, 19, 15, 17, 9, 16, 18]}

In [20]:
def predictClearerSample(X, distr, indices, threshold=10):
    """Predict a class for every sample using only each class's selected timestamps.

    Each sample is voted on by gaussianVote. A class then earns one point for
    every one of its first `threshold` timestamps in `indices` whose vote names
    that class. The class with the most points is predicted (np.argmax, so
    ties go to the lowest label).

    Parameters
    ----------
    X : iterable of signals
        Samples to classify.
    distr : mapping
        Per-class statistics, passed through to gaussianVote.
    indices : mapping
        class label -> ordered list of timestamp positions; labels are used
        as positions into the score list.
    threshold : int
        How many leading timestamps per class take part in the scoring.

    Returns
    -------
    list
        One predicted label (the np.argmax result) per sample.
    """
    n_classes = len(indices)
    predicted = []
    for sample in X:
        _, vote = gaussianVote(sample, distr)
        clarity = [0] * n_classes
        for label, stamps in indices.items():
            hits = sum(1 for stamp in stamps[:threshold] if vote[stamp] == label)
            if hits:
                clarity[label] += hits
        predicted.append(np.argmax(clarity))
    return predicted

## Let's check how well these perform

In [25]:
# Hold out 10% of the rows for evaluation. random_state pins the split so the
# report below can be reproduced on a re-run.
# NOTE(review): `distr` and `indices` were computed from every row of A-E,
# including the rows that end up in X_test, so this is not a clean held-out score.
_, X_test, _, y_test = train_test_split(data, Y, test_size=0.1, random_state=42)
# Score each class on its 5 highest-ranked timestamps only.
predicted = predictClearerSample(X_test.values, distr, indices, 5)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.42      0.83      0.56       514
           1       0.11      0.01      0.03       279
           2       0.17      0.13      0.14       214
           3       0.12      0.00      0.01       276
           4       0.51      0.58      0.54       517

    accuracy                           0.42      1800
   macro avg       0.27      0.31      0.26      1800
weighted avg       0.32      0.42      0.34      1800



In [26]:
# Confusion matrix of the true labels (y_test) against the predictions.
print(confusion_matrix(y_test, predicted))

[[425   6  29   1  53]
 [196   4  14   1  64]
 [129   3  27   1  54]
 [117   8  36   1 114]
 [145  15  53   4 300]]
