In [1]:
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Plot configuration: seaborn's white-grid theme for every figure drawn below.
sns.set_style(style='whitegrid')
# (width, height) tuple; presumably handed to plotting calls later in the
# notebook -- it is not consumed anywhere in the cells shown here.
figsize = (8, 6)

from utils import mem_usage, save_dtypes, cache_dtypes, save_df, \
typecast_ints, typecast_floats, typecast_objects

# Load the raw measurements and print the baseline memory footprint.
gl = pd.read_csv('../datasets/ExcessiveMethodCalls.csv')
gl.info(memory_usage='deep')

# downcast integer columns
# 'integer' matches every integer width. Plain 'int' resolves to the platform's
# C long, which on Windows with NumPy 1.x is 32-bit and would silently skip the
# int64 columns that read_csv produces.
converted_int = typecast_ints(gl.select_dtypes(include=['integer']))

# downcast float columns
converted_float = typecast_floats(gl.select_dtypes(include=['float']))

# normalise object (string) columns: trim surrounding whitespace, then lowercase
gl_obj = gl.select_dtypes(include=['object']).apply(
    lambda x: x.str.strip().str.lower()
)

# convert object to category columns
# when unique values < 50% of total
converted_obj = typecast_objects(gl_obj)

# write the optimised columns back over the originals (gl is reused by later cells)
gl[converted_int.columns] = converted_int
gl[converted_float.columns] = converted_float
gl[converted_obj.columns] = converted_obj

# blank lines to separate the "before" and "after" reports
print('\n')

# memory footprint after the dtype optimisation
gl.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7125 entries, 0 to 7124
Data columns (total 37 columns):
EGAP(s)                 7125 non-null object
# EGAPs fixed           7125 non-null int64
# Hits on fixed code    7125 non-null int64
Energy (before fix)     7125 non-null float64
Energy (after fix)      7125 non-null float64
Time (before fix)       7125 non-null float64
Time (after fix)        7125 non-null float64
# Total method calls    7125 non-null int64
# Diff. method calls    7125 non-null int64
nosi (before)           5125 non-null float64
nom (before)            5125 non-null float64
wmc (before)            5125 non-null float64
noc (before)            5125 non-null float64
nopf (before)           5125 non-null float64
cbo (before)            5125 non-null float64
dit (before)            5125 non-null float64
loc (before)            5125 non-null float64
rfc (before)            5125 non-null float64
nosf (before)           5125 non-null float64
lcom (before)           5125

In [2]:
# Preview five random rows; the fixed seed keeps the preview identical across
# "Restart & Run All" so the rendered output stays reproducible.
gl.sample(n=5, random_state=42)

Unnamed: 0,EGAP(s),# EGAPs fixed,# Hits on fixed code,Energy (before fix),Energy (after fix),Time (before fix),Time (after fix),# Total method calls,# Diff. method calls,nosi (before),...,noc (after),rfc (after),nof (after),dit (after),wmc (after),loc (after),cbo (after),nosf (after),lcom (after),nosm (after)
1662,excessivemethodcalls,1,0,25901.503906,40459.0,25819.710938,40485.0,47,16,,...,,,,,,,,,,
5142,excessivemethodcalls,1,0,24933.603516,43412.0,24960.734375,43384.0,22,7,2.75,...,0.0,9.25,239.875,1.5,5.25,764.875,4.375,239.5,7.625,1.375
4393,excessivemethodcalls,1,0,20886.978516,43173.0,20616.890625,43108.0,6,6,0.8,...,0.0,6.866667,5.0,1.466667,5.8,47.866665,4.466667,2.8,12.266666,0.2
1588,excessivemethodcalls,2,0,27355.136719,49804.0,24630.857422,50012.0,54,45,1.989362,...,0.021277,16.436171,30.617022,1.670213,14.287234,170.212769,7.255319,28.425531,37.446808,0.989362
962,excessivemethodcalls,0,0,30691.316406,49597.0,31300.662109,49687.0,55,20,1.736842,...,0.0,15.684211,71.052635,1.526316,12.473684,236.473679,7.894737,68.894737,17.263159,1.421053


In [3]:
# Summary statistics restricted to the numeric columns
# ('number' is the string alias pandas accepts for np.number).
gl.describe(include=['number'])

Unnamed: 0,# EGAPs fixed,# Hits on fixed code,Energy (before fix),Energy (after fix),Time (before fix),Time (after fix),# Total method calls,# Diff. method calls,nosi (before),nom (before),...,noc (after),rfc (after),nof (after),dit (after),wmc (after),loc (after),cbo (after),nosf (after),lcom (after),nosm (after)
count,7125.0,7125.0,7125.0,7125.0,7125.0,7125.0,7125.0,7125.0,5125.0,5125.0,...,5125.0,5125.0,5125.0,5125.0,5125.0,5125.0,5125.0,5125.0,5125.0,5125.0
mean,1.964912,11.57193,25026.927734,40932.554688,24476.277344,40098.097656,480.354386,29.364912,2.174864,5.836714,...,0.100286,13.573089,77.279442,1.769074,12.910982,307.013611,6.781427,74.856728,41.500999,0.658209
std,2.255483,186.336412,7294.243652,14800.394531,7293.128418,15555.564453,1378.137049,34.091442,2.196381,3.017144,...,0.149516,5.591204,90.709679,0.409458,8.933576,259.452362,2.140282,91.34726,50.076595,0.690645
min,0.0,0.0,7542.529297,4968.0,7549.152832,5018.0,0.0,0.0,0.142857,2.0,...,0.0,3.733333,5.0,1.25,3.166667,47.866665,2.411765,2.8,2.166667,0.0
25%,1.0,0.0,22198.990234,40490.0,21375.730469,40399.0,12.0,6.0,0.541667,3.545455,...,0.0,9.166667,12.929961,1.521739,6.8,120.721054,5.238095,10.066148,11.083333,0.16
50%,1.0,0.0,25498.695312,42676.0,25356.908203,42389.0,39.0,16.0,1.65625,5.52,...,0.019608,13.722628,51.717392,1.666667,11.940298,236.473679,6.766423,47.391304,19.365854,0.5
75%,3.0,0.0,29145.708984,49477.0,28806.128906,49472.0,196.0,43.0,2.761905,6.853659,...,0.146341,16.878788,98.099998,1.84,15.176471,352.320007,7.826087,95.190475,52.390244,0.933333
max,12.0,3151.0,81310.726562,67147.0,68018.148438,75616.0,11569.0,153.0,10.689655,16.24,...,0.536965,29.040001,378.333344,3.37931,52.84,1157.166626,11.447369,377.5,203.058395,3.586207


In [4]:
# Short snake_case names for the first nine columns of gl, in positional order.
cols = ['pattern', 'patterns_fixed', 'hits', 'energy_before',
        'energy_after', 'time_before', 'time_after', 'total_calls', 'diff_calls']
# .copy() detaches df from gl: later cells add columns to df, and doing that on
# a bare slice raises SettingWithCopyWarning and leaves it ambiguous whether gl
# is affected.
df = gl.iloc[:, 0:9].copy()
df.columns = cols
df.head()

Unnamed: 0,pattern,patterns_fixed,hits,energy_before,energy_after,time_before,time_after,total_calls,diff_calls
0,excessivemethodcalls,5,0,22601.039062,43258.0,11029.81543,8237.0,68,25
1,excessivemethodcalls,5,0,22849.322266,43259.0,11061.456055,8336.0,68,25
2,excessivemethodcalls,5,0,22963.242188,43264.0,13220.598633,8338.0,68,25
3,excessivemethodcalls,5,0,23147.324219,43264.0,13285.56543,8342.0,68,25
4,excessivemethodcalls,5,0,23413.677734,43277.0,13307.464844,8358.0,68,25


In [7]:
# Number of rows for each distinct value of patterns_fixed.
df.loc[:, 'patterns_fixed'].value_counts()

1     2500
0     1750
2      875
3      625
4      500
5      375
7      250
6      125
12     125
Name: patterns_fixed, dtype: int64

In [5]:
# energy_diff is "before minus after", so a positive value means the measured
# energy went down after the fix and a negative value means it went up.
df['energy_diff'] = df['energy_before'].sub(df['energy_after'])
# has_hits flags rows where the hits counter is non-zero.
df['has_hits'] = df['hits'].gt(0)
# fixed seed so the preview shows the same rows on every run
df.sample(n=5, random_state=42)

Unnamed: 0,pattern,patterns_fixed,hits,energy_before,energy_after,time_before,time_after,total_calls,diff_calls,energy_diff,has_hits
5521,excessivemethodcalls,2,0,26856.474609,43411.0,28487.03125,43408.0,701,81,-16554.525391,False
2870,excessivemethodcalls,12,4,34251.882812,64020.0,16291.270508,14563.0,10514,103,-29768.117188,True
6698,excessivemethodcalls,2,0,34244.433594,42467.0,34131.867188,42397.0,31,23,-8222.566406,False
7041,excessivemethodcalls,0,0,30737.689453,40872.0,30649.107422,40706.0,835,55,-10134.310547,False
3456,excessivemethodcalls,0,0,12675.426758,9739.0,12560.029297,9747.0,14,8,2936.426758,False


In [6]:
# True where energy_after is strictly lower than energy_before.
df['has_energy_gain'] = df['energy_before'].gt(df['energy_after'])
# fixed seed so the preview shows the same rows on every run
df.sample(n=5, random_state=42)

Unnamed: 0,pattern,patterns_fixed,hits,energy_before,energy_after,time_before,time_after,total_calls,diff_calls,energy_diff,has_hits,has_energy_gain
3825,excessivemethodcalls,0,0,22628.833984,49248.0,23102.654297,49180.0,43,21,-26619.166016,False,False
3113,excessivemethodcalls,0,0,29298.650391,59465.0,29278.302734,59508.0,153,51,-30166.349609,False,False
902,excessivemethodcalls,0,0,26113.294922,40271.0,26499.765625,40263.0,66,20,-14157.705078,False,False
1273,excessivemethodcalls,1,0,24693.507812,43401.0,24734.013672,43555.0,1365,71,-18707.492188,False,False
6722,excessivemethodcalls,2,0,32845.527344,49894.0,34024.164062,49925.0,23,20,-17048.472656,False,False
