# INTERPRETATION

In [1]:
import pandas as pd
import math
from statsmodels.miscmodels.ordinal_model import OrderedModel

## Imports Data

In [2]:
FEATURES_PATH = "../data/output/features.csv"
Y_PATH = "../data/output/y.csv"

SPARK_3_0_PREVIEW_RELEASE  = pd.to_datetime("Oct 29, 2019") 
SPARK_3_0_PREVIEW2_RELEASE = pd.to_datetime("Dec 16, 2019") 
SPARK_3_0_RELEASE          = pd.to_datetime("Jun 05, 2020") 
SPARK_3_0_1_RELEASE        = pd.to_datetime("Aug 27, 2020")

In [3]:
features = pd.read_csv(FEATURES_PATH, index_col="File")
features.head()

Unnamed: 0_level_0,LinesCount,LinesAvgLength,LineCodeProportion,LineBlankProportion,TypesCount,ClassProportion,TraitProportion,MaxChildren,MethodCount,PrivateMethodProportion,ProtectedMethodProportion,MethodOverrideProportion,FinalMethodProportion,VarLocalCount,VarFieldCount,MagicNumbersCount
File,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala,134,34,0.634328,0.253731,1,1.0,0.0,1,4,0.25,0.0,0.0,0.0,0,0,17
common/sketch/src/test/scala/org/apache/spark/util/sketch/CountMinSketchSuite.scala,152,31,0.657895,0.243421,1,1.0,0.0,1,4,0.25,0.0,0.0,0.0,0,0,14
core/src/test/scala/org/apache/spark/AccumulatorSuite.scala,205,33,0.702439,0.146341,3,0.666667,0.0,1,12,0.0,0.0,0.333333,0.0,0,3,3
core/src/test/scala/org/apache/spark/BarrierStageOnSubmittedSuite.scala,262,35,0.843511,0.087786,1,1.0,0.0,1,2,1.0,0.0,0.0,0.0,0,0,35
core/src/test/scala/org/apache/spark/CheckpointSuite.scala,645,41,0.784496,0.130233,7,0.714286,0.142857,2,24,0.166667,0.416667,0.125,0.0,0,1,8


In [4]:
y = pd.read_csv(Y_PATH, index_col="File")
y.head()

Unnamed: 0_level_0,DTCreated
File,Unnamed: 1_level_1
common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala,3
common/sketch/src/test/scala/org/apache/spark/util/sketch/CountMinSketchSuite.scala,3
core/src/test/scala/org/apache/spark/AccumulatorSuite.scala,3
core/src/test/scala/org/apache/spark/BarrierStageOnSubmittedSuite.scala,2
core/src/test/scala/org/apache/spark/CheckpointSuite.scala,3


## Training the model
Lets train our model using the entire dataset and retrieve the estimated coefficients.

In [5]:
mod_log = OrderedModel(y, features, distr='logit')
res_log = mod_log.fit(method='bfgs')
summary_log = pd.DataFrame(res_log.summary(alpha=0.05).tables[1].data).set_index(0)
summary_log.columns = summary_log.iloc[0]
summary_log = summary_log.iloc[1:, :]
summary_log.index.name = None
summary_log = summary_log.applymap(lambda x : float(x))

summary_log

Optimization terminated successfully.
         Current function value: 0.624511
         Iterations: 102
         Function evaluations: 110
         Gradient evaluations: 110


Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975]
LinesCount,-0.0017,0.0,-6.619,0.0,-0.002,-0.001
LinesAvgLength,-0.0266,0.021,-1.245,0.213,-0.068,0.015
LineCodeProportion,-4.6666,1.164,-4.01,0.0,-6.948,-2.385
LineBlankProportion,2.1147,2.833,0.746,0.455,-3.438,7.668
TypesCount,-0.0268,0.03,-0.883,0.377,-0.086,0.033
ClassProportion,0.1088,0.345,0.316,0.752,-0.567,0.785
TraitProportion,-0.1548,0.625,-0.247,0.805,-1.381,1.071
MaxChildren,-0.0751,0.095,-0.787,0.431,-0.262,0.112
MethodCount,0.0111,0.011,0.984,0.325,-0.011,0.033
PrivateMethodProportion,-0.2115,0.283,-0.748,0.454,-0.766,0.342


## ANOVA

Let conduct an analysis of variance for our model.

In [6]:
summary_log["Wald chi-sqr"] = pd.NA
summary_log["Wald chi-sqr"].iloc[:-3] = summary_log.index[:-3].map(lambda index : res_log.wald_test(r_matrix="{} = 0".format(index)).statistic[0][0])

summary_log


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_log["Wald chi-sqr"].iloc[:-3] = summary_log.index[:-3].map(lambda index : res_log.wald_test(r_matrix="{} = 0".format(index)).statistic[0][0])


Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975],Wald chi-sqr
LinesCount,-0.0017,0.0,-6.619,0.0,-0.002,-0.001,43.815074
LinesAvgLength,-0.0266,0.021,-1.245,0.213,-0.068,0.015,1.548925
LineCodeProportion,-4.6666,1.164,-4.01,0.0,-6.948,-2.385,16.077091
LineBlankProportion,2.1147,2.833,0.746,0.455,-3.438,7.668,0.557069
TypesCount,-0.0268,0.03,-0.883,0.377,-0.086,0.033,0.779362
ClassProportion,0.1088,0.345,0.316,0.752,-0.567,0.785,0.099551
TraitProportion,-0.1548,0.625,-0.247,0.805,-1.381,1.071,0.061225
MaxChildren,-0.0751,0.095,-0.787,0.431,-0.262,0.112,0.619438
MethodCount,0.0111,0.011,0.984,0.325,-0.011,0.033,0.967703
PrivateMethodProportion,-0.2115,0.283,-0.748,0.454,-0.766,0.342,0.560053


In [7]:
significant_features = ["LinesCount", "LineCodeProportion", "VarFieldCount", "ProtectedMethodProportion", "MethodOverrideProportion"]
features.corr().loc[significant_features, significant_features]


Unnamed: 0,LinesCount,LineCodeProportion,VarFieldCount,ProtectedMethodProportion,MethodOverrideProportion
LinesCount,1.0,0.402058,0.334456,-0.037925,-0.109857
LineCodeProportion,0.402058,1.0,0.178948,-0.060035,-0.205886
VarFieldCount,0.334456,0.178948,1.0,-0.076503,0.09819
ProtectedMethodProportion,-0.037925,-0.060035,-0.076503,1.0,0.157351
MethodOverrideProportion,-0.109857,-0.205886,0.09819,0.157351,1.0
