In [1]:
import numpy as np
import pandas as pd

from scipy.io import arff
from sklearn import svm, naive_bayes
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

In [2]:
data_class_raw = arff.loadarff('data/data-class.arff')
feature_envy_raw = arff.loadarff('data/feature-envy.arff')
god_class_raw = arff.loadarff('data/god-class.arff')
long_method_raw = arff.loadarff('data/long-method.arff')

feature_envy_data = feature_envy_raw[0]
god_class_data = god_class_raw[0]
long_method_data = long_method_raw[0]

data_class_df = pd.DataFrame(data_class_raw[0])
feature_envy_df = pd.DataFrame(feature_envy_data).dropna(axis=0)
god_class_df = pd.DataFrame(god_class_data).dropna(axis=0)
long_method_df = pd.DataFrame(long_method_data).dropna(axis=0)

In [3]:
feature_envy_df.drop('is_feature_envy', axis=1).to_numpy().shape

(776, 82)

In [4]:
feature_envy_df.loc[:, 'is_feature_envy']

0      b'false'
1      b'false'
2      b'false'
3      b'false'
4      b'false'
         ...   
835    b'false'
836    b'false'
837    b'false'
838    b'false'
839    b'false'
Name: is_feature_envy, Length: 776, dtype: object

In [5]:
data_class_df.shape

(840, 62)

In [6]:
feature_envy_df.shape

(776, 83)

In [7]:
god_class_df.shape

(784, 62)

In [8]:
long_method_df.shape

(776, 83)

In [9]:
data_class_df.columns

Index(['NOII_type', 'NOAM_type', 'NOCS_type', 'NOM_type', 'NMO_type',
       'ATFD_type', 'FANOUT_type', 'NOMNAMM_type', 'NOA_type', 'NIM_type',
       'DIT_type', 'LOC_type', 'LOCNAMM_type', 'CFNAMM_type', 'TCC_type',
       'CBO_type', 'RFC_type', 'NOC_type', 'WMC_type', 'LCOM5_type',
       'WOC_type', 'WMCNAMM_type', 'AMW_type', 'AMWNAMM_type', 'NOCS_package',
       'NOMNAMM_package', 'NOI_package', 'LOC_package', 'NOM_package',
       'NOPK_project', 'NOCS_project', 'NOI_project', 'NOM_project',
       'NOMNAMM_project', 'LOC_project', 'isStatic_type', 'NOPA_type',
       'number_private_visibility_attributes',
       'number_protected_visibility_attributes',
       'number_package_visibility_attributes', 'num_final_attributes',
       'num_static_attributes', 'num_final_static_attributes',
       'num_not_final_not_static_attributes',
       'num_final_not_static_attributes', 'num_static_not_final_attributes',
       'number_public_visibility_methods', 'number_private_visibility

In [10]:
feature_envy_df.columns

Index(['NOP_method', 'CC_method', 'ATFD_method', 'FDP_method', 'CM_method',
       'MAXNESTING_method', 'LOC_method', 'CYCLO_method', 'NMCS_method',
       'NOLV_method', 'MaMCL_method', 'NOAV_method', 'LAA_method',
       'FANOUT_method', 'CFNAMM_method', 'ATLD_method', 'CLNAMM_method',
       'CINT_method', 'MeMCL_method', 'CDISP_method', 'NOII_type', 'NOAM_type',
       'NOCS_type', 'NOM_type', 'NMO_type', 'ATFD_type', 'FANOUT_type',
       'NOMNAMM_type', 'NOA_type', 'NIM_type', 'DIT_type', 'LOC_type',
       'LOCNAMM_type', 'CFNAMM_type', 'TCC_type', 'NOPA_type', 'CBO_type',
       'RFC_type', 'NOC_type', 'WMC_type', 'LCOM5_type', 'WOC_type',
       'WMCNAMM_type', 'AMW_type', 'AMWNAMM_type', 'NOCS_package',
       'NOMNAMM_package', 'NOI_package', 'LOC_package', 'NOM_package',
       'NOPK_project', 'NOCS_project', 'NOI_project', 'NOM_project',
       'NOMNAMM_project', 'LOC_project', 'isStatic_type',
       'number_private_visibility_attributes',
       'number_protected_visibil

In [11]:
data_class_df.dropna(axis=0)

Unnamed: 0,NOII_type,NOAM_type,NOCS_type,NOM_type,NMO_type,ATFD_type,FANOUT_type,NOMNAMM_type,NOA_type,NIM_type,...,number_not_abstract_not_final_methods,number_static_methods,number_final_static_methods,number_final_not_static_methods,number_not_final_static_methods,number_not_final_not_static_methods,number_standard_design_methods,number_constructor_DefaultConstructor_methods,number_constructor_NotDefaultConstructor_methods,is_data_class
0,0.0,0.0,0.0,6.0,0.0,3.0,1.0,6.0,2.0,32.0,...,6.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,0.0,b'false'
1,0.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0,4.0,30.0,...,2.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,b'false'
2,0.0,0.0,0.0,7.0,0.0,2.0,1.0,7.0,3.0,32.0,...,7.0,0.0,0.0,0.0,0.0,7.0,7.0,0.0,0.0,b'false'
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,32.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,b'false'
4,0.0,0.0,0.0,4.0,0.0,0.0,2.0,4.0,1.0,32.0,...,4.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,b'false'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,b'false'
836,1.0,1.0,0.0,395.0,0.0,44.0,77.0,394.0,142.0,0.0,...,394.0,387.0,1.0,0.0,386.0,8.0,392.0,0.0,2.0,b'false'
837,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,15.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,b'false'
838,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,15.0,...,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,b'false'


In [12]:
def cast_byte_to_bool(y):
    return [yy == b'true' for yy in y]

In [13]:
X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(
    feature_envy_df.drop('is_feature_envy', axis=1).to_numpy(),
    cast_byte_to_bool(feature_envy_df.loc[:, 'is_feature_envy'].to_numpy()),
    test_size=0.2, random_state=1)

In [23]:
svc = svm.SVC()
svc.fit(X_train_fe, y_train_fe)
print(svc.score(X_test_fe, y_test_fe))

0.8397435897435898


In [17]:
svc = svm.SVC(kernel='sigmoid', degree=5)
svc.fit(X_train_fe, y_train_fe)
print(svc.score(X_test_fe, y_test_fe))

0.6858974358974359


In [28]:
svc = svm.SVC()
clf = GridSearchCV(svc, param_grid={
    'C': [1, 10],
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'degree': list(range(1, 5)),
}, n_jobs=-1)
clf.fit(X_train_fe, y_train_fe)
print(f'Best score: {clf.best_score_}')
print('Best params:')
print(clf.best_params_)

Best score: 0.8338709677419356
Best params:
{'C': 1, 'degree': 1, 'kernel': 'poly'}


In [29]:
clf.score(X_test_fe, y_test_fe)    

0.8397435897435898

In [33]:
nb = naive_bayes.GaussianNB()
nb.fit(X_train_fe, y_train_fe)
print(f'Test score: {nb.score(X_test_fe, y_test_fe)}')

Test score: 0.7948717948717948


In [40]:
pd.Series(clf.predict(X_test_fe)).value_counts()

False    156
dtype: int64

In [36]:
np.mean(clf.cv_results_['mean_fit_time'])


0.052894888321558635

In [39]:
# feature_envy_df.loc[:, 'is_feature_envy'].count()
print(data_class_df.is_data_class.value_counts())
print(feature_envy_df.is_feature_envy.value_counts())
print(god_class_df.is_god_class.value_counts())
print(long_method_df.is_long_method.value_counts())


b'false'    700
b'true'     140
Name: is_data_class, dtype: int64
b'false'    648
b'true'     128
Name: is_feature_envy, dtype: int64
b'false'    654
b'true'     130
Name: is_god_class, dtype: int64
b'false'    647
b'true'     129
Name: is_long_method, dtype: int64
