In [7]:
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest,f_classif

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

def create_base_train_data(x,y):
    selector=SelectKBest(score_func=f_classif,k=5).fit(x,y)
    x_select=selector.transform(x)
    select_features=selector.get_support(1)
    new_x,new_y=SMOTE(sampling_strategy=0.2,random_state=SEED).fit_resample(x_select,y)
    new_x=PolynomialFeatures(degree=2).fit_transform(new_x)
    return new_x,new_y,select_features

def create_base_test_data(x,feature):
    new_x=x[x.columns[feature]]
    new_x=PolynomialFeatures(degree=2).fit_transform(new_x)
    return new_x

def predict_score(model,x_train,y_train,x_test,y_test,name):
    model.fit(x_train,y_train)
    y_predict=model.predict(x_test)
    f1=f1_score(y_test,y_predict)
    auc_score=roc_auc_score(y_test,y_predict)
    return f1,auc_score
    
def baseDataSet(godclassFile):
    df=pd.read_csv(godclassFile,encoding_errors="replace")
    df=df[["dit_x","fanin_x","fanout_x","tcc_x","lcom*_x","loc_x","noc_x","rfc_x","innerClassesQty_x","totalFieldsQty_x","totalMethodsQty_x","wmc_x","GodClass"]]
    df=df.dropna(how="any")
    return df

def addMetricsDataSet(godclassFile):
    df=pd.read_csv(godclassFile,encoding_errors="replace")
    df=df[["dit_x","fanin_x","fanout_x","tcc_x","lcom*_x","noc_x","rfc_x","innerClassesQty_x","totalFieldsQty_x","totalMethodsQty_x","wmc_x","NprotM_x","BOvR_x","ATFD_x","ATLD_x","LAA_x","BUR_x","FDP_x","GodClass"]]
    df=df.dropna(how="any")
    return df

def addOldVersionDataSet(godclassFile):
    df=pd.read_csv(godclassFile,encoding_errors="replace")
    df=df.drop(columns=["file","class","type","line_x","line_y"])
    df=df.dropna(how="any")
    return df

def normalization(df):
    for column in df.columns:
        df[column]=(df[column]-df[column].min())/(df[column].max()-df[column].min())
    return df

#オーバーサンプリング，多項式化データ
def create_train_test_data(x,y):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=SEED)
    x_train_base,y_train_base,features=create_base_train_data(x_train,y_train)
    x_test_base=create_base_test_data(x_test,features)
    return x_train_base,y_train_base,x_test_base,y_test,features

SEED=1

In [9]:
GodClassFile="C:\\Users\\sugii syuji\\machinelearning\\dataset\\GodClass.csv"
baseDF=baseDataSet(GodClassFile)
x_base=baseDF.drop(columns=["GodClass"])
y_base=baseDF[["GodClass"]]
x_base=normalization(x_base)
x_train_base,y_train_base,x_test_base,y_test_base,features_base=create_train_test_data(x_base,y_base)

addMetricsDF=addMetricsDataSet(GodClassFile)
x_add=addMetricsDF.drop(columns=["GodClass"])
y_add=addMetricsDF[["GodClass"]]
x_add=normalization(x_add)
x_train_add,y_train_add,x_test_add,y_test_add,features_add=create_train_test_data(x_add,y_add)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [10]:
name="DT"
DT_prameter={
    "criterion":["gini","entropy"],
    "max_depth":list(range(1,11)),
    "min_samples_split":list(range(2,25)),
    "min_samples_leaf":list(range(1,20)),
    "max_features":["sqrt","log2",None]
    }
DT=DecisionTreeClassifier(random_state=SEED)

DT_model=RandomizedSearchCV(DT, param_distributions=DT_prameter,cv=StratifiedKFold(n_splits=10),random_state=SEED)
f1_base,auc_score_base=predict_score(DT_model,x_train_base,y_train_base,x_test_base,y_test_base,name)
f1_add,auc_score_add=predict_score(DT_model,x_train_add,y_train_add,x_test_add,y_test_add,name)

print("base")
print("f1 : ",f1_base)
print("auc_score : ",auc_score_base)
print(baseDF.columns[features_base])
print("\naddMetrics")
print("f1 : ",f1_add)
print("auc_score : ",auc_score_add)
print(addMetricsDF.columns[features_add])

base
f1 :  0.7193675889328063
auc_score :  0.9384272538970625
Index(['fanout_x', 'lcom*_x', 'loc_x', 'rfc_x', 'wmc_x'], dtype='object')

addMetrics
f1 :  0.7024029574861368
auc_score :  0.9559340640835137
Index(['fanout_x', 'rfc_x', 'wmc_x', 'ATFD_x', 'FDP_x'], dtype='object')
