In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [3]:
from sklearn.decomposition import PCA
from sklearn.svm  import OneClassSVM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [4]:
PATH_CARDS = '/home/jcscabral/Studies/UspEsalq/Tcc/projeto/pesquisa/code/R/swipecards'

pathfiles = [p for p in Path(PATH_CARDS).iterdir()]    

In [5]:
def norm_data(X):
    
    Xn = (X.iloc[:,3:] - X.iloc[:,3:].mean())/X.iloc[:,3:].std()
    Xn.fillna(0, inplace=True)
    
    #rearrange columns
    Xn["user_id"] = X["user_id"] 
    Xn["swipe_number"] = X["swipe_number"] 
    Xn["action_number"] = X["action_number"] 
    cols = np.concatenate([Xn.columns.values[-3:], Xn.columns.values[:-3]])
    Xn = Xn[cols] 
       
    return Xn

In [101]:
def split_train_test(X):
    
    start = 1
    end = 6

    for _ in range(2):    
        train = X[(X["action_number"] >= start) &
                  (X["action_number"] < end)]
        test = X[X["action_number"] == end]

        start += 1        
        end += 1
        yield train.iloc[:,3:], test.iloc[:,3:]
        

In [11]:
def template_data(X, n = 2):
    
    start = 1
    end = 6

    for _ in range(n):    
        data = X[(X["action_number"] >= start) &
                  (X["action_number"] < end)]        

        start += 1        
        end += 1
        yield data.iloc[:,3:]

In [8]:
def session_attack_data(X):
    
    for i in range(7):        
        action_number = i +1
        yield X[X["action_number"] == action_number]

In [190]:
def pca_data(Xbase, Xtest, n_components):
    pca = PCA(n_components = n_components)
    Xpca = pca.fit_transform(Xbase)
    Xtransf  = pca.transform(Xtest)
    return Xpca, Xtransf

### GridSearch "vanilla"

<p>Scenario in real world:</p>

<p>Train with five sessions. Next session validates the tempĺate</p>
<p>DAKOTA transforms 126 features into 30-70 new ones.</p>
<p>Here with 280 features, stands from 20 to 160.</p> 

In [12]:
def grid_search_svm_one(X):
    
    '''
    Receives PCA data train and test
    '''
    gamma = 1.0e-7
    nus = np.arange(0.001, 1, step = 0.01)

    df = pd.DataFrame(columns=["nu", "score"])
    for i, nu in enumerate(nus):
        svm =  OneClassSVM(gamma = gamma, kernel = "rbf", nu = nu)
        model = svm.fit(X)
        results = model.predict(X)
        score = results[results==1].shape[0]/results.shape[0] # true positives
        df.loc[i] = {"nu": nu, "score": score}
    
    index = df["score"].idxmax()
    dfbest = df.iloc[index]
    score = dfbest["score"]
    best_nu = dfbest["nu"]
    svm =  OneClassSVM(gamma = gamma, kernel = "rbf", nu = best_nu)
    model = svm.fit(X)

    return model, best_nu, score  

In [10]:
def grid_search_svm(Xtrain, Xtest):
    
    '''
    Receives PCA data train and test
    '''
    gamma = 1.0e-7
    nus = np.arange(0.001, 1, step = 0.01)

    df = pd.DataFrame(columns=["nu", "score"])
    for i, nu in enumerate(nus):
        svm =  OneClassSVM(gamma = gamma, kernel = "rbf", nu = nu)
        model = svm.fit(Xtrain)
        results = model.predict(Xtest)
        score = results[results==1].shape[0]/results.shape[0] # true positives
        df.loc[i] = {"nu": nu, "score": score}
    
    index = df["score"].idxmax()
    dfbest = df.iloc[index]
    score = dfbest["score"]
    best_nu = dfbest["nu"]
    svm =  OneClassSVM(gamma = gamma, kernel = "rbf", nu = best_nu)
    model = svm.fit(Xtrain)

    return model, best_nu, score  
    

### User x user

##### 1) Data from template w/ 5 sessions

In [None]:
results = pd.DataFrame(columns=["id", "cmp", "tpl", "nu", "score"])
models = {}

df_id = 0
for path in pathfiles:
    
    df = pd.read_csv(path, index_col=0)    
    id = np.unique(df["user_id"].values)[0]

    dfnorm = norm_data(df)

    tpl = 1 # template number
    templates = list()
    for Xtrain in template_data(dfnorm):

        n_m = Xtrain.shape[0]
        max_c = 160 + 1
        end_c = min(n_m, max_c)
        n_components = np.arange(10, end_c)

        tpl_score = 0.0
        tpl_cmp = 0        
        template = {}
        for n_cmp in n_components:
            pca = PCA(n_components = n_cmp)
            Xpca = pca.fit_transform(Xtrain)
            
            model, nu, score = grid_search_svm_one(Xpca)
            if score > tpl_score:                        
                template = {"tpl": tpl, "cmp": n_cmp, "model": model}
                tpl_score =  score

            results.loc[df_id] = {"id": id, "tpl": tpl, "cmp": n_cmp,
                                "nu": nu, "score": score}
            df_id += 1    
        templates.append(template)
        tpl += 1    

    models[id] = templates

In [92]:
np.unique(results["id"].values)

array([ 1,  2,  3,  6,  7,  8,  9, 10, 11, 12, 13, 15, 17, 18, 19, 20, 24,
       25, 26, 27, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
       55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
       74])

In [93]:
group_results = results.groupby(["id", "tpl"])
group_results

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f232bc84340>

In [94]:
indxmax =  group_results["score"].idxmax()

In [95]:
best_results = results.iloc[indxmax]
best_results.head(5)

Unnamed: 0,id,cmp,tpl,nu,score
1523,1,10,1,0.581,1.0
1539,1,10,2,0.411,1.0
1676,2,10,1,0.521,0.945946
1703,2,10,2,0.361,0.911765
904,3,10,1,0.031,0.970588


In [96]:
best_results["cmp"].min(), best_results["cmp"].max()

(10, 22)

In [97]:
# It's not a real scenario. Overfitting data
THRESHOLD = 0.5
TP = best_results[best_results['score']>THRESHOLD].shape[0]/best_results.shape[0]
FN = 1 - TP
TP, FN

(1.0, 0.0)

In [98]:
best_results.head(3)

Unnamed: 0,id,cmp,tpl,nu,score
1523,1,10,1,0.581,1.0
1539,1,10,2,0.411,1.0
1676,2,10,1,0.521,0.945946


In [99]:
models[1]

[{'tpl': 1, 'cmp': 10, 'model': OneClassSVM(gamma=1e-07, nu=0.581)},
 {'tpl': 2,
  'cmp': 10,
  'model': OneClassSVM(gamma=1e-07, nu=0.4109999999999999)}]

##### 2) Scenario: Real authentication

In [None]:
auth_results = pd.DataFrame(columns=["id", "cmp", "tpl", "score"])

j = 0
for path in pathfiles:
    
    df = pd.read_csv(path, index_col=0)    
    id = np.unique(df["user_id"].values)[0]    
    dic_model = models[id]
    dfnorm = norm_data(df)   
    
    i = 0
    for Xtrain, Xtest in split_train_test(dfnorm):        

        tpl = i + 1
        n_cmp = dic_model[i]['cmp']
        model = dic_model[i]['model']

        _ , Xtest_pca = pca_data(Xtrain, Xtest, n_cmp)
        
        results = model.predict(Xtest_pca)
        score = results[results==1].shape[0]/results.shape[0] # true positives
        auth_results.loc[j] = {"id": id, "tpl": tpl,
                               "cmp": n_cmp, "score": score}         
        i += 1
        j += 1
   

True Positives
False Negatives

In [122]:
THRESHOLD = 0.5
TP = auth_results[auth_results['score']>THRESHOLD].shape[0]/auth_results.shape[0]
FN = 1 - TP
TP, FN

(0.9519230769230769, 0.04807692307692313)

In [123]:
auth_results.to_csv('swipe_userxuser.csv')

##### 1) All x user by all sections

In [None]:
results_all = pd.DataFrame(columns=["id", "id_attack", "tpl", 
                                    "section", "tn", "sz"])

df_id = 0
for path in pathfiles:    
    
    df = pd.read_csv(path, index_col=0)
    id = np.unique(df["user_id"].values)[0]
    dic_model = models[id]
    dfnorm = norm_data(df)

    npfiles = np.array(pathfiles)
    npfiles_attack = npfiles[npfiles != path]

    # two template's scenarios
    for t, Xuser in enumerate(template_data(dfnorm)):

        tpl = t + 1
        n_cmp = dic_model[t]['cmp']
        model = dic_model[t]['model']
    
        for file_attack in npfiles_attack:

            df_attack = pd.read_csv(file_attack, index_col=0)
            id_attack = np.unique(df_attack["user_id"].values)[0]
            dfnorm_attack = norm_data(df_attack)

            for s, Xattack in enumerate(session_attack_data(dfnorm_attack)):

                section = s + 1
                Xattacknorm = Xattack.iloc[:,3:]
                _ , Xattack_pca = pca_data(Xuser, Xattacknorm, n_cmp)                                  
                
                results = model.predict(Xattack_pca)
                sz = results.shape[0]
                tn = results[results==-1].shape[0] # true negative

                results_all.loc[df_id] = {"id": id, 
                                        "id_attack": id_attack,
                                        "tpl": tpl, 
                                        "section": section,
                                        "tn": tn,
                                        "sz": sz}
                df_id += 1

In [None]:
df = pd.read_csv(path, index_col=0)
id = np.unique(df["user_id"].values)[0]
dic_model = models[id]
dfnorm = norm_data(df)

npfiles = np.array(pathfiles)
npfiles_attack = npfiles[npfiles != path]

In [180]:
file_attack = npfiles_attack[0]

In [181]:
df_attack = pd.read_csv(file_attack, index_col=0)
id_attack = np.unique(df_attack["user_id"].values)[0]
dfnorm_attack = norm_data(df_attack)

  Xn["user_id"] = X["user_id"]
  Xn["swipe_number"] = X["swipe_number"]
  Xn["action_number"] = X["action_number"]


In [143]:
t = 0
Xuser = next(template_data(dfnorm))

In [None]:
tpl = t + 1
n_cmp = dic_model[t]['cmp']
model = dic_model[t]['model']

In [155]:
df_attack[df_attack.action_number == 1]

Unnamed: 0,user_id,action_number,swipe_number,ps_fs,ps_ls,ps_md,ps_fq,ps_tq,ps_ir,ps_am,...,ay_rg,ay_sk,ay_ku,dp_mg,dp_ec,dp_ang,dp_avang,dp_ra,tm_dr,tm_fl
1,49,1,1,0.072527,0.035653,0.086935,0.074359,0.096398,0.022039,0.081258,...,0.404341,3.519046,11.718555,1887.187007,607.896373,0.340479,0.0,0.322118,9,0
2,49,1,2,0.075702,0.040293,0.092308,0.078388,0.095238,0.01685,0.084284,...,0.224731,2.308413,5.404416,1995.637092,516.705912,0.437679,0.0,0.258918,8,0
3,49,1,3,0.072772,0.024908,0.088645,0.080586,0.096581,0.015995,0.084232,...,0.125529,2.914169,7.909749,2550.352277,424.925876,0.414162,-0.360919,0.166615,15,0
4,49,1,4,0.072772,0.031746,0.031746,0.031746,0.052259,0.020513,0.045421,...,0.0,0.0,0.0,1064.658161,1.0,0.0,0.0,0.000939,16,0
5,49,1,5,0.059585,0.025397,0.081074,0.073382,0.093162,0.01978,0.078008,...,0.249969,1.516548,0.978127,2143.05287,643.580609,0.502571,0.0,0.30031,16,0
6,49,1,6,0.073993,0.023199,0.048596,0.035897,0.061294,0.025397,0.048596,...,0.0,0.0,0.0,789.361768,0.0,0.0,0.0,0.0,81,0
7,49,1,38,0.072527,0.035653,0.086935,0.074359,0.096398,0.022039,0.081258,...,0.404341,3.519046,11.718555,1887.187007,607.896373,0.340479,0.0,0.322118,9,0
8,49,1,39,0.075702,0.040293,0.092308,0.078388,0.095238,0.01685,0.084284,...,0.224731,2.308413,5.404416,1995.637092,516.705912,0.437679,0.0,0.258918,8,0
9,49,1,40,0.072772,0.024908,0.088645,0.080586,0.096581,0.015995,0.084232,...,0.125529,2.914169,7.909749,2550.352277,424.925876,0.414162,-0.360919,0.166615,15,0
10,49,1,41,0.072772,0.031746,0.031746,0.031746,0.052259,0.020513,0.045421,...,0.0,0.0,0.0,1064.658161,1.0,0.0,0.0,0.000939,16,0


In [150]:
s = 0
Xattack = next(session_attack_data(dfnorm_attack))

In [154]:
dfnorm_attack[dfnorm_attack.action_number == 1]

Unnamed: 0,user_id,swipe_number,action_number,ps_fs,ps_ls,ps_md,ps_fq,ps_tq,ps_ir,ps_am,...,ay_rg,ay_sk,ay_ku,dp_mg,dp_ec,dp_ang,dp_avang,dp_ra,tm_dr,tm_fl
1,49,1,1,-0.497697,-0.150163,1.705109,1.680542,2.079918,0.212534,1.835615,...,1.525148,2.951563,3.479965,1.936041,1.525207,0.596956,0.060553,-0.038805,-0.604132,0.0
2,49,2,1,-0.06085,0.251223,1.988684,1.93938,1.993322,-0.436415,2.086738,...,0.676348,1.906834,1.594544,2.146136,1.139595,0.848418,0.060553,-0.215616,-0.639953,0.0
3,49,3,1,-0.464093,-1.079687,1.795337,2.080565,2.093591,-0.5433,2.082395,...,0.207538,2.429577,2.342644,3.220761,0.751489,0.787578,-5.190404,-0.473848,-0.3892,0.0
4,49,4,1,-0.464093,-0.488172,-1.207981,-1.056871,-1.215308,0.021666,-1.138057,...,-0.385687,-0.085234,-0.019232,0.342593,-1.041144,-0.283877,0.060553,-0.937349,-0.353378,0.0
5,49,5,1,-2.278689,-1.037436,1.395754,1.617793,1.838359,-0.06995,1.565997,...,0.795618,1.223486,0.27284,2.431718,1.676103,1.016295,0.060553,-0.099814,-0.353378,0.0
6,49,6,1,-0.296075,-1.227566,-0.318587,-0.790189,-0.540766,0.632442,-0.874631,...,-0.385687,-0.085234,-0.019232,-0.190726,-1.045373,-0.283877,0.060553,-0.939977,1.975045,0.0
7,49,38,1,-0.497697,-0.150163,1.705109,1.680542,2.079918,0.212534,1.835615,...,1.525148,2.951563,3.479965,1.936041,1.525207,0.596956,0.060553,-0.038805,-0.604132,0.0
8,49,39,1,-0.06085,0.251223,1.988684,1.93938,1.993322,-0.436415,2.086738,...,0.676348,1.906834,1.594544,2.146136,1.139595,0.848418,0.060553,-0.215616,-0.639953,0.0
9,49,40,1,-0.464093,-1.079687,1.795337,2.080565,2.093591,-0.5433,2.082395,...,0.207538,2.429577,2.342644,3.220761,0.751489,0.787578,-5.190404,-0.473848,-0.3892,0.0
10,49,41,1,-0.464093,-0.488172,-1.207981,-1.056871,-1.215308,0.021666,-1.138057,...,-0.385687,-0.085234,-0.019232,0.342593,-1.041144,-0.283877,0.060553,-0.937349,-0.353378,0.0


In [132]:
results_all.shape

(37128, 6)

In [189]:
results_all['tn'].sum() / results_all["sz"].sum()


0.02148747224865208

### All against template user

In [None]:
scores = ['precision', 'recall']
for score in scores: 
    search  = GridSearchCV(OneClassSVM(), tuned_parameters, cv=10,
                           scoring='%s_macro' % score, return_train_score=True)
    search.fit(Xpca)
    
    #resultDf = pd.DataFrame(search.cv_results_)
    #print(resultDf[["mean_test_score", "std_test_score", "params"]].sort_values(by=["mean_test_score"], ascending=False).head())

    print("Best parameters set found on development set:")
    print()
    print(search.cv_results_)

In [11]:
pipe = Pipeline(steps=[("pca", pca), "svm", svm])


In [12]:
param_grid = {
    "pca__n_components": [30, 40, 50, 60, 70]
}

In [20]:
search  = GridSearchCV(pipe, param_grid= param_grid, n_jobs= 2)

TypeError: GridSearchCV.__init__() got an unexpected keyword argument 'tuned_parameters'

In [None]:
splits = KFold(n_splits=5).split(X)

In [19]:
search.fit(X, X)

TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator Pipeline(steps=[('pca', PCA(n_components='mle')), 'svm',
                OneClassSVM(gamma='auto')]) does not.

In [285]:
pathfile = pathfiles[0]
print(pathfile)

/home/jcscabral/Studies/UspEsalq/Tcc/projeto/pesquisa/code/R/swipecards/card49.csv


In [167]:
X =  pd.read_csv(pathfile, index_col=0)
X.shape

(52, 283)

In [134]:
X.head()

Unnamed: 0,user_id,action_number,swipe_number,ps_fs,ps_ls,ps_md,ps_fq,ps_tq,ps_ir,ps_am,...,ay_rg,ay_sk,ay_ku,dp_mg,dp_ec,dp_ang,dp_avang,dp_ra,tm_dr,tm_fl
1,55,1,1,0.059585,0.037851,0.059585,0.057631,0.065446,0.007814,0.059604,...,0.012787,0.377864,-1.286558,1706.729543,504.892068,0.059454,0.0,0.295824,15,0
2,55,1,2,0.060562,0.035165,0.064103,0.061111,0.069353,0.008242,0.060073,...,0.025736,0.521633,-1.393238,1321.616549,399.405809,0.045082,-0.080144,0.30221,15,0
3,55,1,3,0.055433,0.040781,0.066911,0.048107,0.068254,0.020147,0.058538,...,0.019428,0.359178,-1.557358,976.216914,330.945615,0.075613,-0.116659,0.339008,15,0
4,55,1,4,0.060806,0.035409,0.048107,0.041758,0.054457,0.012698,0.048107,...,0.0,0.0,0.0,880.160213,0.0,0.0,0.0,0.0,97,0
5,55,1,5,0.052259,0.038584,0.063004,0.059829,0.065568,0.005739,0.061775,...,0.093586,4.423592,20.103405,2816.120776,539.334775,0.035236,0.0,0.191517,17,0


In [138]:
Xtrain = X[X["swipe_number"] < 6]
Xtrain["swipe_number"].unique

<bound method Series.unique of 1    1
2    2
3    3
4    4
5    5
Name: swipe_number, dtype: int64>

In [139]:
Xtest = X[X["swipe_number"] == 6]
Xtest["swipe_number"].unique

<bound method Series.unique of 6    6
Name: swipe_number, dtype: int64>

In [140]:
Xval = X[X["swipe_number"] == 7]
Xval["swipe_number"].unique

<bound method Series.unique of 7    7
Name: swipe_number, dtype: int64>

In [7]:
# Normalization
Xn = (X.iloc[:,2:] - X.iloc[:,2:].mean())/X.iloc[:,2:].std()

In [291]:
# Standardization (worst perfomance)
#Xn  = (X.iloc[:,2:] -  (X.iloc[:,2:]).min()) / ((X.iloc[:,2:]).max() - (X.iloc[:,2:]).min())

In [9]:
Xn['user_id'] = X["user_id"] 
Xn['swipe_number'] = X["swipe_number"] 

  Xn['user_id'] = X["user_id"]
  Xn['swipe_number'] = X["swipe_number"]


In [10]:
cols = np.concatenate([Xn.columns.values[-2:], Xn.columns.values[:-2]])

In [11]:
Xn = Xn[cols]

In [12]:
Xn.fillna(0, inplace=True)

In [13]:
# first five sessions

Xtrain  = Xn[Xn["swipe_number"] <=41]

In [14]:
Xtrain = Xtrain.iloc[:,2:] 

In [50]:
Xtrain.shape

(41, 280)

In [16]:
pca = PCA()
Xpca = pca.fit_transform(Xtrain)

In [51]:
Xpca.shape

(41, 41)

In [17]:
Xtest  = Xn[(Xn["swipe_number"] >41) & (Xn["swipe_number"] <48)]

In [18]:
Xtest = Xtest.iloc[:,2:] 

In [49]:
Xtest.shape

(6, 280)

In [19]:
Xtransf  = pca.transform(Xtest)

In [48]:
Xtransf.shape

(6, 41)

In [20]:
svm =  OneClassSVM(gamma = 1.0e-7, kernel="rbf", nu=0.1)

In [111]:
model = svm.fit(Xpca)
model

In [22]:
results = model.predict(Xpca)
results

array([ 1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1, -1])

In [23]:
results[results==1].shape[0], results[results==-1].shape[0]

(32, 9)

In [24]:
svm.score_samples(Xpca)

array([4.09991406, 4.09992503, 4.09992009, 4.09983713, 4.09981922,
       4.09987028, 4.09982766, 4.09989281, 4.09986248, 4.09985626,
       4.09988872, 4.09982903, 4.09985487, 4.09989683, 4.09980536,
       4.09988387, 4.09984858, 4.09982408, 4.09986194, 4.09982349,
       4.09975219, 4.09985403, 4.09986727, 4.09982284, 4.0998931 ,
       4.09992491, 4.09979999, 4.09980347, 4.09955329, 4.09981595,
       4.09985165, 4.09974829, 4.09990628, 4.09989999, 4.09983396,
       4.09983102, 4.09988856, 4.099897  , 4.09990428, 4.09990186,
       4.09978332])

In [29]:
results = model.predict(Xtransf)
results

array([-1,  1,  1,  1,  1, -1])

In [28]:
svm.score_samples(Xtransf)

array([4.09981798, 4.09985134, 4.09983998, 4.09988365, 4.09989666,
       4.09976309])

In [97]:
nus = np.arange(0.001, 1, step = 0.01)
tuned_parameters = {'kernel' : ['rbf'], 'nu': nus}