## Feature Analysis

In [1]:
import pandas as pd
from scipy.stats import spearmanr
df_1688 = pd.read_csv("/Volumes/JJ_Media/Data/RobustPrediction/wo_noise/feature_list_1688.tsv", delimiter="\t")
df_1688['lxpath'] = df_1688.xpath.apply(lambda v:len(v.split("/")))
df_1688.changed = df_1688.changed.apply(lambda v:int(v))
df_1688.siblings = df_1688.siblings.apply(lambda v:eval(v))
# correlation 
features = ['position', 'depth', 'nr_siblings', 'nr_children', 'lxpath']
for ftype in features:
    print (df_1688[[ftype, 'changed']].corr(method = 'spearman'))
    print ()
print ()
for ftype in features:
    print (ftype)
    print (spearmanr(df_1688[ftype].values, df_1688.changed.values))

          position  changed
position   1.00000 -0.02941
changed   -0.02941  1.00000

            depth   changed
depth    1.000000  0.071256
changed  0.071256  1.000000

             nr_siblings   changed
nr_siblings     1.000000 -0.035157
changed        -0.035157  1.000000

             nr_children   changed
nr_children     1.000000 -0.036122
changed        -0.036122  1.000000

          lxpath  changed
lxpath   1.00000  0.06755
changed  0.06755  1.00000


position
SpearmanrResult(correlation=-0.029410063191568844, pvalue=2.3000111685560715e-73)
depth
SpearmanrResult(correlation=0.07125577114381565, pvalue=0.0)
nr_siblings
SpearmanrResult(correlation=-0.0351573988458202, pvalue=4.830554136126205e-104)
nr_children
SpearmanrResult(correlation=-0.036121728857713295, pvalue=1.0067965595987004e-109)
lxpath
SpearmanrResult(correlation=0.06754966070500934, pvalue=0.0)


In [2]:
df_1688.timestamp.unique().shape

(98,)

In [3]:
import pandas as pd
from scipy.stats import spearmanr
df_104 = pd.read_csv("/Volumes/JJ_Media/Data/RobustPrediction/wo_noise/feature_list_104.tsv", delimiter="\t")
df_104['lxpath'] = df_104.xpath.apply(lambda v:len(v.split("/")))
df_104.changed = df_104.changed.apply(lambda v:int(v))
df_104.siblings = df_104.siblings.apply(lambda v:eval(v))
# correlation 
features = ['position', 'depth', 'nr_siblings', 'nr_children', 'lxpath']
for ftype in features:
    print (df_104[[ftype, 'changed']].corr(method = 'spearman'))
    print ()
print ()
for ftype in features:
    print (ftype)
    print (spearmanr(df_104[ftype].values, df_104.changed.values))

KeyboardInterrupt: 

### Comparing the results of 104 vs 1688

In [4]:
import numpy as np

In [5]:
for ftype in features:
    print (ftype)
    print (spearmanr(df_104[ftype].values, df_104.changed.values))

print ("\n=====\n")
for ftype in features:
    print (ftype)
    print (spearmanr(df_1688[ftype].values, df_1688.changed.values))


position
SpearmanrResult(correlation=-0.16956587398648945, pvalue=0.0)
depth
SpearmanrResult(correlation=0.2299133514118026, pvalue=0.0)
nr_siblings
SpearmanrResult(correlation=-0.0873611231845423, pvalue=0.0)
nr_children
SpearmanrResult(correlation=-0.13211054221728108, pvalue=0.0)
lxpath
SpearmanrResult(correlation=0.20859463006879164, pvalue=0.0)

=====

position
SpearmanrResult(correlation=-0.029410063191568844, pvalue=2.3000111685560715e-73)
depth
SpearmanrResult(correlation=0.07125577114381565, pvalue=0.0)
nr_siblings
SpearmanrResult(correlation=-0.0351573988458202, pvalue=4.830554136126205e-104)
nr_children
SpearmanrResult(correlation=-0.036121728857713295, pvalue=1.0067965595987004e-109)
lxpath
SpearmanrResult(correlation=0.06754966070500934, pvalue=0.0)


it seems that currently, for 104, we have some potentials in terms of position (not sure), (nr_sibling, may too wek), depth, nr_children, and lxpath.

Actually, for 1688, the trend itself is the same: depth > lxpath > nr_children > (this signal becomes weaker) nr_sibilings > position (actully, this revesed with nr_siblings)

In [6]:
print(df_104.timestamp.unique().shape, len(df_104))
print(df_1688.timestamp.unique().shape, len(df_1688))

(98,) 443920
(98,) 379351


In [7]:
# the number of changed
print (len(df_104), df_104.changed.sum(), 100*df_104.changed.sum()/len(df_104))
print (len(df_1688), df_1688.changed.sum(), 100*df_1688.changed.sum()/len(df_1688))

443920 16670 3.755181113714183
379351 13058 3.442194695677618


so, it is not about the ratio of changed ones that affect the results .. 

### data balancing

In [8]:
def resample(data, feature_names, oversamp = True):   
    X = data[feature_names].values
    y = data.changed.values

    if oversamp:
        from imblearn.over_sampling import SMOTE
        sm = SMOTE(random_state=0)
        X_res, y_res = sm.fit_resample(X, y)
    else:
        from imblearn.under_sampling import RandomUnderSampler 
        centroid = RandomUnderSampler(random_state = 0)
        X_res, y_res = centroid.fit_resample(X, y)
    
    return X_res, y_res 

def resample_vectors(X, y, oversamp = True):   
    if oversamp:
        from imblearn.over_sampling import SMOTE
        sm = SMOTE(random_state=0)
        X_res, y_res = sm.fit_resample(X, y)
    else:
        from imblearn.under_sampling import RandomUnderSampler 
        centroid = RandomUnderSampler(random_state = 0)
        X_res, y_res = centroid.fit_resample(X, y)
    
    return X_res, y_res 

def compute_corr(X, y, ftypes):
    for idx, ftype in enumerate(ftypes):
        print (ftype)
        print ("\t", spearmanr(X[:,idx], y))
    


In [9]:
oversamp = True
print ("=== Oversampling ===")
over_X_104_res, over_y_104_res  = resample(df_104, features, oversamp = oversamp)
compute_corr(over_X_104_res, over_y_104_res, features)

oversamp = False
print ("=== Undersampling ===")
under_X_104_res, under_y_104_res  = resample(df_104, features, oversamp = oversamp)
compute_corr(under_X_104_res, under_y_104_res, features)

=== Oversampling ===
position
	 SpearmanrResult(correlation=-0.5451968732354452, pvalue=0.0)
depth
	 SpearmanrResult(correlation=0.6270237843895687, pvalue=0.0)
nr_siblings
	 SpearmanrResult(correlation=-0.23025238796447, pvalue=0.0)
nr_children
	 SpearmanrResult(correlation=-0.44329421304211303, pvalue=0.0)
lxpath
	 SpearmanrResult(correlation=0.5770531360381647, pvalue=0.0)
=== Undersampling ===
position
	 SpearmanrResult(correlation=-0.5425495894402638, pvalue=0.0)
depth
	 SpearmanrResult(correlation=0.6254013532252595, pvalue=0.0)
nr_siblings
	 SpearmanrResult(correlation=-0.2291494219058163, pvalue=0.0)
nr_children
	 SpearmanrResult(correlation=-0.44227273757814173, pvalue=0.0)
lxpath
	 SpearmanrResult(correlation=0.5766459404080755, pvalue=0.0)


In [10]:
oversamp = True
print ("=== Oversampling ===")
over_X_1688_res, over_y_1688_res  = resample(df_1688, features, oversamp = oversamp)
compute_corr(over_X_1688_res, over_y_1688_res, features)

oversamp = False
print ("=== Undersampling ===")
under_X_1688_res, under_y_1688_res  = resample(df_1688, features, oversamp = oversamp)
compute_corr(under_X_1688_res, under_y_1688_res, features)

=== Oversampling ===
position
	 SpearmanrResult(correlation=-0.08608750513789196, pvalue=0.0)
depth
	 SpearmanrResult(correlation=0.19603700740261298, pvalue=0.0)
nr_siblings
	 SpearmanrResult(correlation=-0.09647232025702354, pvalue=0.0)
nr_children
	 SpearmanrResult(correlation=-0.10668040995042985, pvalue=0.0)
lxpath
	 SpearmanrResult(correlation=0.1854184563693287, pvalue=0.0)
=== Undersampling ===
position
	 SpearmanrResult(correlation=-0.08230722206139451, pvalue=1.6970689739336358e-40)
depth
	 SpearmanrResult(correlation=0.18908196267141328, pvalue=9.309685167692483e-209)
nr_siblings
	 SpearmanrResult(correlation=-0.09481733336529916, pvalue=3.190472921859148e-53)
nr_children
	 SpearmanrResult(correlation=-0.10540193060491397, pvalue=2.084051061384027e-65)
lxpath
	 SpearmanrResult(correlation=0.17892078277553766, pvalue=8.725556961382762e-187)


Not much, but it seems like the signals increase slightly 

### simple classifier 

In [11]:
under_X_1688_res.shape, under_X_104_res.shape
under_y_104_res.shape, over_y_104_res.shape

((33340,), (854500,))

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 

data = df_104 
X = data[features].values
y = data.changed.values 
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.33, random_state=0)
# resample 
## undersample
res_train_X, res_train_y  = resample_vectors(train_X, train_y, oversamp=False) 
clf = RandomForestClassifier()
clf.fit(res_train_X, res_train_y)


In [13]:
#clf.score(train_X, train_y)
train_predc_prob = clf.predict_proba(res_train_X)
test_predc_prob = clf.predict_proba(test_X)
train_predc = np.argmax(train_predc_prob, axis = 1)
test_predc = np.argmax(test_predc_prob, axis = 1)

print (np.sum(train_predc == res_train_y)/len(res_train_y))
print (np.sum(test_predc == test_y)/len(test_y))

0.9133840842701603
0.8543421573579806


In [14]:
from sklearn.metrics import balanced_accuracy_score

print('train', balanced_accuracy_score(res_train_y, train_predc))
print('test', balanced_accuracy_score(test_y, test_predc))

train 0.9133840842701602
test 0.9140144919716668


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 

data = df_1688
X = data[features].values
y = data.changed.values 
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.33, random_state=0)
# resample 
## undersample
res_train_X, res_train_y  = resample_vectors(train_X, train_y, oversamp=False) 
clf = RandomForestClassifier()
clf.fit(res_train_X, res_train_y)

#clf.score(train_X, train_y)
train_predc_prob = clf.predict_proba(res_train_X)
test_predc_prob = clf.predict_proba(test_X)
train_predc = np.argmax(train_predc_prob, axis = 1)
test_predc = np.argmax(test_predc_prob, axis = 1)

print (np.sum(train_predc == res_train_y)/len(res_train_y))
print (np.sum(test_predc == test_y)/len(test_y))

print('train', balanced_accuracy_score(res_train_y, train_predc))
print('test', balanced_accuracy_score(test_y, test_predc))


0.6696590909090909
0.6159394820507086
train 0.6696590909090909
test 0.6442996650235517


## Time sequence

currently, don't care about the timestamp granularity during the training (collect all & train) (meaning, currenlty the batch (if we use DNN terms) is the entire training dataset)

For prediction, can do whatever you want: either as the entire dataset or per timestamp

In [19]:
df = df_104
print(df_104.columns)

Index(['node', 'timestamp', 'position', 'depth', 'nr_siblings', 'nr_children',
       'xpath', 'siblings', 'changed', 'lxpath'],
      dtype='object')


In [27]:
# get unique timestamps
uniq_timestamps = df.timestamp.unique()
print (uniq_timestamps.shape)
uniq_timestamps.sort() # from the oldest to the latest

(98,)


In [35]:
# divide the collected timstamps to train (older) and test (more recents)
indices = np.arange(len(uniq_timestamps))
train_indices, test_indices = train_test_split(indices, test_size=0.3)
print (train_indices.shape, test_indices.shape)
ts_for_train = uniq_timestamps[train_indices]
ts_for_test = uniq_timestamps[test_indices]

(68,) (30,)


In [51]:
grouped = df.groupby('timestamp') # grouping
# set train and test data
train_X, train_y = None,None 
for i, t in enumerate(ts_for_train):
    adf = grouped.get_group(t)
    if i == 0:
        train_X = adf[features].values
        train_y = adf.changed.values
    else:
        train_X = np.append(train_X, adf[features].values, axis = 0)
        train_y = np.append(train_y, adf.changed.values)

test_X, test_y = None,None 
for i, t in enumerate(ts_for_test):
    adf = grouped.get_group(t)
    if i == 0:
        test_X = adf[features].values
        test_y = adf.changed.values
    else:
        test_X = np.append(test_X, adf[features].values, axis = 0)
        test_y = np.append(test_y, adf.changed.values)

# resample (the same as before)
## undersample
res_train_X, res_train_y  = resample_vectors(train_X, train_y, oversamp=False) 
clf = RandomForestClassifier()
clf.fit(res_train_X, res_train_y)

#clf.score(train_X, train_y)
train_predc_prob = clf.predict_proba(res_train_X)
test_predc_prob = clf.predict_proba(test_X)
train_predc = np.argmax(train_predc_prob, axis = 1)
test_predc = np.argmax(test_predc_prob, axis = 1)

print ('train acc', np.sum(train_predc == res_train_y)/len(res_train_y))
print ('test acc', np.sum(test_predc == test_y)/len(test_y))

print('train balacc', balanced_accuracy_score(res_train_y, train_predc))
print('test balacc', balanced_accuracy_score(test_y, test_predc))


0.9134648817802503
0.8586875658316331
train 0.9134648817802503
test 0.8854613640018452


In [54]:
# per timestamp prediction for test data

accs, balaccs = [], []
for i, t in enumerate(ts_for_test):
    adf = grouped.get_group(t)
    test_X = adf[features].values
    test_y = adf.changed.values

    test_predc_prob = clf.predict_proba(test_X)
    test_predc = np.argmax(test_predc_prob, axis = 1)

    acc = np.sum(test_predc == test_y)/len(test_y)
    balacc = balanced_accuracy_score(test_y, test_predc) # can raise a warning when nothing has been changed in the curent timestamp
    print (f"For {t}, \
        \n\tacc: {acc}\
        \n\tbalacc:{balacc}")
    accs.append(acc)
    balaccs.append(balacc)

print (f"On average:\n\t{np.mean(accs)},\n\t{np.mean(balaccs)}")


For 20180203045438,         
	acc: 0.8593333333333333        
	balacc:0.9240782878248253
For 20180411221320,         
	acc: 0.857550482879719        
	balacc:0.9231877679756213
For 20180223191215,         
	acc: 0.8575530035335689        
	balacc:0.9231717193531841
For 20180120144503,         
	acc: 0.8610184567489437        
	balacc:0.9249491616894732
For 20180305082408,         
	acc: 0.8570797437596642        
	balacc:0.9201332336528449
For 20180326191449,         
	acc: 0.8570797437596642        
	balacc:0.9201332336528449
For 20180303063840,         
	acc: 0.8570797437596642        
	balacc:0.9229252453793084
For 20180227220815,         
	acc: 0.8199690744422354        
	balacc:0.8199690744422354
For 20180429194224,         
	acc: 0.8577699736611062        
	balacc:0.9178471357409713




For 20180313004000,         
	acc: 0.8573006406008394        
	balacc:0.9202646692563612
For 20180413235719,         
	acc: 0.857550482879719        
	balacc:0.9231877679756213
For 20180214002214,         
	acc: 0.85726891683212        
	balacc:0.9202351178255468
For 20180321110434,         
	acc: 0.8570797437596642        
	balacc:0.9229252453793084
For 20180328224122,         
	acc: 0.8570797437596642        
	balacc:0.9229252453793084
For 20180228004844,         
	acc: 0.8557543627126132        
	balacc:0.9058778285033744
For 20180330230209,         
	acc: 0.8570797437596642        
	balacc:0.9229252453793084
For 20180205071524,         
	acc: 0.8590455049944506        
	balacc:0.9239318104461061
For 20180206113733,         
	acc: 0.8590455049944506        
	balacc:0.9239318104461061
For 20180126210808,         
	acc: 0.8613089937666963        
	balacc:0.9250970595279872
For 20180428172319,         
	acc: 0.857550482879719        
	balacc:0.9231877679756213
For 20180322122921,      