In [1]:
import pandas as pd
import numpy as np
from dataloader import *

In [2]:
import multiprocessing as mp

In [3]:
# Load the empirical-study table and preview its first rows.
# NOTE(review): load_empirical_study_data is presumably provided by the
# `from dataloader import *` star import — confirm.
empirical_data = load_empirical_study_data()
empirical_data.head(n=5)

Unnamed: 0,TC,TM,CUT,D,L,ABS,INT,JDK,ICB,DEP,...,TUAPI,UINT,SYNC,CALLSITES,AFPR,RBFA,EXPCAT,CONDCALL,PROJ,IS_MOCK
0,org.apache.hadoop.resourceestimator.translator...,testGetContainerSpec,org.apache.hadoop.resourceestimator.translator...,org.apache.hadoop.yarn.api.records.Resource,real,True,False,False,False,15,...,0,False,0,0,0,0,0,0,Hadoop,False
1,org.apache.hadoop.resourceestimator.translator...,testGetJobSize,org.apache.hadoop.resourceestimator.translator...,org.apache.hadoop.yarn.api.records.Resource,real,True,False,False,False,15,...,0,False,0,0,0,0,0,0,Hadoop,False
2,org.apache.hadoop.resourceestimator.translator...,testGetRecurrenceeId,org.apache.hadoop.resourceestimator.translator...,org.apache.hadoop.yarn.api.records.Resource,real,True,False,False,False,15,...,0,False,0,0,0,0,0,0,Hadoop,False
3,org.apache.hadoop.resourceestimator.translator...,testStringToUnixTimestamp,org.apache.hadoop.resourceestimator.translator...,org.apache.hadoop.yarn.api.records.Resource,real,True,False,False,False,15,...,0,False,0,0,0,0,0,0,Hadoop,False
4,org.apache.hadoop.resourceestimator.translator...,testResourceSkyline,org.apache.hadoop.resourceestimator.translator...,org.apache.hadoop.yarn.api.records.Resource,real,True,False,False,False,15,...,0,False,0,0,0,0,0,0,Hadoop,False


In [4]:
# Number of non-null entries in the 'TC' column.
empirical_data.loc[:, 'TC'].count()

353935

In [5]:
def _project_summary(project):
    """Return one summary row for ``project``: all rows vs. rows flagged IS_MOCK."""
    in_project = empirical_data['PROJ'] == project
    mocked_in_project = in_project & empirical_data['IS_MOCK']
    return {
        # NOTE(review): 'PRIOJ' looks like a typo for 'PROJ'; kept as-is because
        # the label is part of the rendered table.
        'PRIOJ': project,
        'TOTAL': empirical_data[in_project]['PROJ'].count(),
        'MOCK': empirical_data[mocked_in_project]['PROJ'].count(),
    }


# One row per studied project, in the order given by empirical_study_projects.
data_statistics = pd.DataFrame(
    [_project_summary(project) for project in empirical_study_projects]
)
data_statistics

Unnamed: 0,PRIOJ,TOTAL,MOCK
0,Hadoop,325335,14771
1,Camel,12962,1864
2,HBase,11990,1093
3,Storm,3648,377


In [6]:
# Overall number of rows flagged IS_MOCK and their share of all non-null flags.
mock_total = empirical_data['IS_MOCK'].sum()
flag_total = empirical_data['IS_MOCK'].count()
"%d mocks(%.2f%%)" % (mock_total, mock_total/flag_total*100)

'18105 mocks(5.12%)'

In [7]:
# Ordered rule identifiers; the order here is the row order of the result tables.
# Each line holds one rule family (1.x, 2.x, ...).
rule_names = [
    'Rule 1.1', 'Rule 1.2', 'Rule 1.3',
    'Rule 2.1', 'Rule 2.2',
    'Rule 3.1',
    'Rule 4.1', 'Rule 4.2',
    'Rule 5.1', 'Rule 5.2',
]

## Validation study

In [8]:
def calculate_mock_ratio(dataset, cond):
    """Summarise how often the rows selected by ``cond`` are flagged as mocks.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table that contains an ``IS_MOCK`` column.
    cond : callable
        Called with ``dataset``; its result is used to index ``dataset``
        (typically a boolean mask with one entry per row).

    Returns
    -------
    dict
        ``'Mock Ratio'``: mean of ``IS_MOCK`` over the selected rows, times 100;
        ``'Matches'``: number of selected rows;
        ``'Mock'``: sum of ``IS_MOCK`` over the selected rows.
    """
    selected = dataset[cond(dataset)]
    mock_flags = selected['IS_MOCK']

    # Keys are inserted in the order the result table expects its columns.
    summary = {}
    summary['Mock Ratio'] = mock_flags.mean()*100
    summary['Matches'] = len(selected)
    summary['Mock'] = mock_flags.sum()
    return summary

In [9]:
def _exceeds_mean(values):
    """Flag the entries of ``values`` that are strictly greater than its own mean."""
    return values > values.mean()


# Each rule maps the data frame to a per-row selector (boolean mask).
# Rules 1.1, 1.3, 2.1 and 2.2 compare a metric against its mean over the frame
# passed in; rules 4.x and 5.x select rows where the metric is positive.
rules = {
    'Rule 1.1': lambda frame: _exceeds_mean(frame['UAPI'] + frame['TUAPI']),
    'Rule 1.2': lambda frame: frame['UINT'],
    'Rule 1.3': lambda frame: _exceeds_mean(frame['SYNC']),
    'Rule 2.1': lambda frame: _exceeds_mean(frame['FIELD']),
    'Rule 2.2': lambda frame: _exceeds_mean(frame['TDEP']),
    'Rule 3.1': lambda frame: frame['INT'] | frame['ABS'],
    'Rule 4.1': lambda frame: frame['RBFA'] > 0,
    'Rule 4.2': lambda frame: frame['EXPCAT'] > 0,
    'Rule 5.1': lambda frame: frame['CONDCALL'] > 0,
    'Rule 5.2': lambda frame: frame['AFPR'] > 0,
}

In [12]:
# Evaluate every rule on the full data set, one summary row per rule,
# in the order given by rule_names.
rule_rows = []
for rule_name in rule_names:
    rule_summary = calculate_mock_ratio(empirical_data, rules[rule_name])
    rule_rows.append({'Rule': rule_name, **rule_summary})

# Only the 'Mock Ratio' column is rounded (to one decimal).
results = pd.DataFrame(rule_rows).round({'Mock Ratio': 1})
results

Unnamed: 0,Rule,Mock Ratio,Matches,Mock
0,Rule 1.1,11.8,49789,5874
1,Rule 1.2,7.4,5669,419
2,Rule 1.3,13.8,33356,4611
3,Rule 2.1,8.8,86469,7620
4,Rule 2.2,7.9,104416,8204
5,Rule 3.1,8.3,122471,10197
6,Rule 4.1,10.9,74510,8119
7,Rule 4.2,12.6,60193,7565
8,Rule 5.1,12.5,64488,8089
9,Rule 5.2,11.4,56080,6380


In [31]:
# Baseline mock ratio (in percent) that every rule's ratio is compared against.
# NOTE(review): presumably the data-set-wide mock share rounded to one decimal — confirm.
BASELINE_MOCK_RATIO = 5.1

# NOTE(review): `result_comp` starts as an alias of `results`, so the numeric
# 'Comp' column is also added to `results`; `.round(...)` below returns a new
# frame, so the string formatting only affects `result_comp`.
result_comp = results
# Relative change of each rule's mock ratio versus the baseline, in percent.
result_comp['Comp'] = (result_comp['Mock Ratio'] / BASELINE_MOCK_RATIO - 1) * 100
result_comp = result_comp.round({'Comp': 1})
result_comp['Mock Ratio'] = result_comp['Mock Ratio'].apply(lambda it: f"{it}%")
# The '+' format spec writes an explicit sign: non-negative values still get a
# leading '+', while negative values render as '-x%' instead of '+-x%'.
result_comp['Comp'] = result_comp['Comp'].apply(lambda it: f"{it:+}%")
result_comp

Unnamed: 0,Rule,Mock Ratio,Matches,Mock,Comp
0,Rule 1.1,11.8%,49789,5874,+131.4%
1,Rule 1.2,7.4%,5669,419,+45.1%
2,Rule 1.3,13.8%,33356,4611,+170.6%
3,Rule 2.1,8.8%,86469,7620,+72.5%
4,Rule 2.2,7.9%,104416,8204,+54.9%
5,Rule 3.1,8.3%,122471,10197,+62.7%
6,Rule 4.1,10.9%,74510,8119,+113.7%
7,Rule 4.2,12.6%,60193,7565,+147.1%
8,Rule 5.1,12.5%,64488,8089,+145.1%
9,Rule 5.2,11.4%,56080,6380,+123.5%


In [32]:
# Emit the comparison table as LaTeX (no header row) with the columns in paper order.
latex_columns = ['Rule', 'Matches', 'Mock', 'Mock Ratio', 'Comp']
latex_table = result_comp[latex_columns].to_latex(header=False)
print(latex_table)

\begin{tabular}{llrrll}
\toprule
0 &  Rule 1.1 &   49789 &   5874 &  11.8\% &  +131.4\% \\
1 &  Rule 1.2 &    5669 &    419 &   7.4\% &   +45.1\% \\
2 &  Rule 1.3 &   33356 &   4611 &  13.8\% &  +170.6\% \\
3 &  Rule 2.1 &   86469 &   7620 &   8.8\% &   +72.5\% \\
4 &  Rule 2.2 &  104416 &   8204 &   7.9\% &   +54.9\% \\
5 &  Rule 3.1 &  122471 &  10197 &   8.3\% &   +62.7\% \\
6 &  Rule 4.1 &   74510 &   8119 &  10.9\% &  +113.7\% \\
7 &  Rule 4.2 &   60193 &   7565 &  12.6\% &  +147.1\% \\
8 &  Rule 5.1 &   64488 &   8089 &  12.5\% &  +145.1\% \\
9 &  Rule 5.2 &   56080 &   6380 &  11.4\% &  +123.5\% \\
\bottomrule
\end{tabular}

