## Feature Analysis

In [1]:
import pandas as pd
from scipy.stats import spearmanr
# Load the tab-separated feature table for the 1688 dataset.
df_1688 = pd.read_csv(
    "/Volumes/JJ_Media/Data/RobustPrediction/wo_noise/feature_list_1688.tsv",
    sep="\t",
)

# lxpath: number of "/"-separated pieces in each xpath string.
df_1688['lxpath'] = df_1688['xpath'].map(lambda path: len(path.split("/")))
# changed: coerced to int so it can serve as a numeric label below.
df_1688['changed'] = df_1688['changed'].map(lambda flag: int(flag))
# siblings: each cell is text that is evaluated back into a Python object.
# NOTE(review): eval() runs whatever expression the TSV contains; if the cells
# are plain literals, ast.literal_eval would be the safer choice -- confirm.
df_1688['siblings'] = df_1688['siblings'].map(lambda text: eval(text))

# correlation
features = ['position', 'depth', 'nr_siblings', 'nr_children', 'lxpath']

# Pass 1: pandas' 2x2 Spearman matrix of each feature against `changed`.
for ftype in features:
    pair_1688 = df_1688[[ftype, 'changed']]
    print (pair_1688.corr(method = 'spearman'))
    print ()
print ()

# Pass 2: scipy's spearmanr on the same pairs, which also reports a p-value.
changed_1688 = df_1688['changed'].values
for ftype in features:
    print (ftype)
    print (spearmanr(df_1688[ftype].values, changed_1688))

          position  changed
position   1.00000 -0.02941
changed   -0.02941  1.00000

            depth   changed
depth    1.000000  0.071256
changed  0.071256  1.000000

             nr_siblings   changed
nr_siblings     1.000000 -0.035157
changed        -0.035157  1.000000

             nr_children   changed
nr_children     1.000000 -0.036122
changed        -0.036122  1.000000

          lxpath  changed
lxpath   1.00000  0.06755
changed  0.06755  1.00000


position
SpearmanrResult(correlation=-0.029410063191568844, pvalue=2.3000111685560715e-73)
depth
SpearmanrResult(correlation=0.07125577114381565, pvalue=0.0)
nr_siblings
SpearmanrResult(correlation=-0.0351573988458202, pvalue=4.830554136126205e-104)
nr_children
SpearmanrResult(correlation=-0.036121728857713295, pvalue=1.0067965595987004e-109)
lxpath
SpearmanrResult(correlation=0.06754966070500934, pvalue=0.0)


In [2]:
import pandas as pd
from scipy.stats import spearmanr
# Load the tab-separated feature table for the 104 dataset.
df_104 = pd.read_csv(
    "/Volumes/JJ_Media/Data/RobustPrediction/wo_noise/feature_list_104.tsv",
    sep="\t",
)

# lxpath: number of "/"-separated pieces in each xpath string.
df_104['lxpath'] = df_104['xpath'].map(lambda path: len(path.split("/")))
# changed: coerced to int so it can serve as a numeric label below.
df_104['changed'] = df_104['changed'].map(lambda flag: int(flag))
# siblings: each cell is text that is evaluated back into a Python object.
# NOTE(review): eval() runs whatever expression the TSV contains; if the cells
# are plain literals, ast.literal_eval would be the safer choice -- confirm.
df_104['siblings'] = df_104['siblings'].map(lambda text: eval(text))

# correlation
features = ['position', 'depth', 'nr_siblings', 'nr_children', 'lxpath']

# Pass 1: pandas' 2x2 Spearman matrix of each feature against `changed`.
for ftype in features:
    pair_104 = df_104[[ftype, 'changed']]
    print (pair_104.corr(method = 'spearman'))
    print ()
print ()

# Pass 2: scipy's spearmanr on the same pairs, which also reports a p-value.
changed_104 = df_104['changed'].values
for ftype in features:
    print (ftype)
    print (spearmanr(df_104[ftype].values, changed_104))

          position   changed
position  1.000000 -0.169566
changed  -0.169566  1.000000

            depth   changed
depth    1.000000  0.229913
changed  0.229913  1.000000

             nr_siblings   changed
nr_siblings     1.000000 -0.087361
changed        -0.087361  1.000000

             nr_children   changed
nr_children     1.000000 -0.132111
changed        -0.132111  1.000000

           lxpath   changed
lxpath   1.000000  0.208595
changed  0.208595  1.000000


position
SpearmanrResult(correlation=-0.16956587398648945, pvalue=0.0)
depth
SpearmanrResult(correlation=0.2299133514118026, pvalue=0.0)
nr_siblings
SpearmanrResult(correlation=-0.0873611231845423, pvalue=0.0)
nr_children
SpearmanrResult(correlation=-0.13211054221728108, pvalue=0.0)
lxpath
SpearmanrResult(correlation=0.20859463006879164, pvalue=0.0)


### Comparing the results of 104 vs 1688

In [3]:
import numpy as np

In [4]:
# Print the per-feature Spearman results for 104 first, then for 1688,
# with a "=====" banner between the two listings.
for listing_no, frame in enumerate((df_104, df_1688)):
    if listing_no > 0:
        print ("\n=====\n")
    changed_values = frame['changed'].values
    for ftype in features:
        print (ftype)
        print (spearmanr(frame[ftype].values, changed_values))


position
SpearmanrResult(correlation=-0.16956587398648945, pvalue=0.0)
depth
SpearmanrResult(correlation=0.2299133514118026, pvalue=0.0)
nr_siblings
SpearmanrResult(correlation=-0.0873611231845423, pvalue=0.0)
nr_children
SpearmanrResult(correlation=-0.13211054221728108, pvalue=0.0)
lxpath
SpearmanrResult(correlation=0.20859463006879164, pvalue=0.0)

=====

position
SpearmanrResult(correlation=-0.029410063191568844, pvalue=2.3000111685560715e-73)
depth
SpearmanrResult(correlation=0.07125577114381565, pvalue=0.0)
nr_siblings
SpearmanrResult(correlation=-0.0351573988458202, pvalue=4.830554136126205e-104)
nr_children
SpearmanrResult(correlation=-0.036121728857713295, pvalue=1.0067965595987004e-109)
lxpath
SpearmanrResult(correlation=0.06754966070500934, pvalue=0.0)


It seems that, for 104, there is currently some potential signal in position (not sure), nr_siblings (may be too weak), depth, nr_children, and lxpath.

Actually, for 1688, the trend itself is the same: depth > lxpath > nr_children > (from here the signal becomes weaker) nr_siblings > position (actually, position is reversed with nr_siblings compared to 104).

In [5]:
# For each dataset: shape of the array of distinct timestamps, then the row count.
for frame in (df_104, df_1688):
    distinct_timestamps = frame['timestamp'].unique()
    print(distinct_timestamps.shape, len(frame))

(98,) 443920
(98,) 379351


In [6]:
# Per dataset: total rows, rows flagged as changed, and changed rows as a percentage.
for frame in (df_104, df_1688):
    n_rows = len(frame)
    n_changed = frame['changed'].sum()
    print (n_rows, n_changed, 100*n_changed/n_rows)

443920 16670 3.755181113714183
379351 13058 3.442194695677618


So, it is not the ratio of changed ones that affects the results.

### data balancing

In [10]:
def resample_vectors(X, y, oversamp = True, random_state = 0):
    """Balance the classes of ``(X, y)`` with an imbalanced-learn sampler.

    Parameters
    ----------
    X : feature matrix, handed unchanged to the sampler's ``fit_resample``.
    y : class labels, handed unchanged to the sampler's ``fit_resample``.
    oversamp : bool, default True
        True selects ``SMOTE``; False selects ``RandomUnderSampler``.
    random_state : default 0
        Seed forwarded to the chosen sampler. The default reproduces the
        previously hard-coded ``random_state=0``.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as produced by ``fit_resample``.
    """
    # imblearn is imported lazily, so it is only required once a resampling
    # call is actually made.
    if oversamp:
        from imblearn.over_sampling import SMOTE
        sampler = SMOTE(random_state=random_state)
    else:
        from imblearn.under_sampling import RandomUnderSampler
        sampler = RandomUnderSampler(random_state=random_state)

    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res


def resample(data, feature_names, oversamp = True, random_state = 0):
    """Balance a feature table on its ``changed`` column.

    Parameters
    ----------
    data : table supporting ``data[feature_names].values`` and
        ``data.changed.values`` (e.g. a pandas DataFrame).
    feature_names : column names used to build the feature matrix.
    oversamp : bool, default True
        Forwarded to ``resample_vectors`` (True: SMOTE, False: random
        undersampling).
    random_state : default 0
        Forwarded to ``resample_vectors``.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` from ``resample_vectors``.
    """
    X = data[feature_names].values
    y = data.changed.values
    # Single implementation of the sampler choice lives in resample_vectors.
    return resample_vectors(X, y, oversamp = oversamp, random_state = random_state)

def compute_corr(X, y, ftypes):
    """Print the Spearman correlation of each labelled column of ``X`` with ``y``.

    ``ftypes`` supplies one label per column, in column order. For the i-th
    label the label itself is printed, followed on the next line by a tab and
    the ``spearmanr`` result for ``X[:, i]`` against ``y``. Nothing is returned.
    """
    for column_index, label in enumerate(ftypes):
        print (label)
        column_values = X[:, column_index]
        outcome = spearmanr(column_values, y)
        print ("\t", outcome)
    


In [8]:
# Balance df_104 both ways and re-run the Spearman check on each balanced sample.

# Oversampled copy (resample uses SMOTE when oversamp is True).
print ("=== Oversampling ===")
oversamp = True
over_X_104_res, over_y_104_res = resample(df_104, features, oversamp)
compute_corr(X = over_X_104_res, y = over_y_104_res, ftypes = features)

# Undersampled copy (resample uses RandomUnderSampler when oversamp is False).
print ("=== Undersampling ===")
oversamp = False
under_X_104_res, under_y_104_res = resample(df_104, features, oversamp)
compute_corr(X = under_X_104_res, y = under_y_104_res, ftypes = features)

=== Oversampling ===
position
	 SpearmanrResult(correlation=-0.5451968732354452, pvalue=0.0)
depth
	 SpearmanrResult(correlation=0.6270237843895687, pvalue=0.0)
nr_siblings
	 SpearmanrResult(correlation=-0.23025238796447, pvalue=0.0)
nr_children
	 SpearmanrResult(correlation=-0.44329421304211303, pvalue=0.0)
lxpath
	 SpearmanrResult(correlation=0.5770531360381647, pvalue=0.0)
=== Undersampling ===
position
	 SpearmanrResult(correlation=-0.5425495894402638, pvalue=0.0)
depth
	 SpearmanrResult(correlation=0.6254013532252595, pvalue=0.0)
nr_siblings
	 SpearmanrResult(correlation=-0.2291494219058163, pvalue=0.0)
nr_children
	 SpearmanrResult(correlation=-0.44227273757814173, pvalue=0.0)
lxpath
	 SpearmanrResult(correlation=0.5766459404080755, pvalue=0.0)


In [9]:
# Balance df_1688 both ways and re-run the Spearman check on each balanced sample.

# Oversampled copy (resample uses SMOTE when oversamp is True).
print ("=== Oversampling ===")
oversamp = True
over_X_1688_res, over_y_1688_res = resample(df_1688, features, oversamp)
compute_corr(X = over_X_1688_res, y = over_y_1688_res, ftypes = features)

# Undersampled copy (resample uses RandomUnderSampler when oversamp is False).
print ("=== Undersampling ===")
oversamp = False
under_X_1688_res, under_y_1688_res = resample(df_1688, features, oversamp)
compute_corr(X = under_X_1688_res, y = under_y_1688_res, ftypes = features)

=== Oversampling ===
position
	 SpearmanrResult(correlation=-0.08608750513789196, pvalue=0.0)
depth
	 SpearmanrResult(correlation=0.19603700740261298, pvalue=0.0)
nr_siblings
	 SpearmanrResult(correlation=-0.09647232025702354, pvalue=0.0)
nr_children
	 SpearmanrResult(correlation=-0.10668040995042985, pvalue=0.0)
lxpath
	 SpearmanrResult(correlation=0.1854184563693287, pvalue=0.0)
=== Undersampling ===
position
	 SpearmanrResult(correlation=-0.08230722206139451, pvalue=1.6970689739336358e-40)
depth
	 SpearmanrResult(correlation=0.18908196267141328, pvalue=9.309685167692483e-209)
nr_siblings
	 SpearmanrResult(correlation=-0.09481733336529916, pvalue=3.190472921859148e-53)
nr_children
	 SpearmanrResult(correlation=-0.10540193060491397, pvalue=2.084051061384027e-65)
lxpath
	 SpearmanrResult(correlation=0.17892078277553766, pvalue=8.725556961382762e-187)


Not much, but it seems that the signals increase slightly.

### simple classifier 

In [12]:
# Sizes of the balanced label vectors for df_104: undersampled vs. oversampled.
# A notebook cell echoes only its final expression, so everything meant to be
# shown has to be part of this one tuple.
under_y_104_res.shape, over_y_104_res.shape

((33340,), (854500,))

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 

# Train a random forest on df_104 using the hand-picked structural features.
data = df_104
X = data[features].values
y = data.changed.values
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.33, random_state=0)

# Undersample the training split only; the test split keeps its original
# class ratio.
res_train_X, res_train_y = resample_vectors(train_X, train_y, oversamp=False)

# Seeded like the split and the resampler above, so a re-run reproduces the
# same forest.
clf = RandomForestClassifier(random_state=0)
clf.fit(res_train_X, res_train_y)


In [26]:
# Class probabilities for the (undersampled) training split and the untouched
# test split.
train_predc_prob = clf.predict_proba(res_train_X)
test_predc_prob = clf.predict_proba(test_X)

# argmax gives the *column index* of the most probable class; map it through
# clf.classes_ so the predictions are actual labels even when the labels are
# not exactly 0..k-1.
train_predc = clf.classes_[np.argmax(train_predc_prob, axis = 1)]
test_predc = clf.classes_[np.argmax(test_predc_prob, axis = 1)]

# Plain accuracy: fraction of predictions equal to the true label.
print (np.mean(train_predc == res_train_y))
print (np.mean(test_predc == test_y))

0.9133840842701603
0.8543558097942577


In [25]:
from sklearn.metrics import balanced_accuracy_score

# Balanced accuracy on the undersampled training split and on the test split.
for split_name, truth, guess in (
    ('train', res_train_y, train_predc),
    ('test', test_y, test_predc),
):
    print(split_name, balanced_accuracy_score(truth, guess))

train 0.9133840842701602
test 0.9140215780892538


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 

# Same experiment as for df_104, now on df_1688: split, undersample the
# training split, fit a random forest, then score both splits.
data = df_1688
X = data[features].values
y = data.changed.values
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.33, random_state=0)

# Undersample the training split only; the test split keeps its original
# class ratio.
res_train_X, res_train_y = resample_vectors(train_X, train_y, oversamp=False)

# Seeded like the split and the resampler above, so a re-run reproduces the
# same forest.
clf = RandomForestClassifier(random_state=0)
clf.fit(res_train_X, res_train_y)

train_predc_prob = clf.predict_proba(res_train_X)
test_predc_prob = clf.predict_proba(test_X)
# argmax gives the *column index* of the most probable class; map it through
# clf.classes_ so the predictions are actual labels even when the labels are
# not exactly 0..k-1.
train_predc = clf.classes_[np.argmax(train_predc_prob, axis = 1)]
test_predc = clf.classes_[np.argmax(test_predc_prob, axis = 1)]

# Plain accuracy: fraction of predictions equal to the true label.
print (np.mean(train_predc == res_train_y))
print (np.mean(test_predc == test_y))

# Balanced accuracy, which is not dominated by the majority class of the
# imbalanced test split.
print('train', balanced_accuracy_score(res_train_y, train_predc))
print('test', balanced_accuracy_score(test_y, test_predc))


0.6696590909090909
0.6204847187385171
train 0.6696590909090909
test 0.644952934662409
