In [None]:
import re
import gc
from datetime import datetime, timedelta

from tqdm import trange
import numpy as np
import pandas as pd
from matplotlib import pyplot
import networkx as nx

import gensim
import tensorflow as tf
import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score as R2

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Input,LSTM,Embedding
from keras.layers import Dropout,Activation
from keras.layers import Bidirectional,GlobalMaxPool1D
from keras.models import Model

from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Fall back to the TF1 graph-mode API when TensorFlow 2.x (or newer) is installed.
# `tf.__version__` is a string, so the major component has to be converted
# before it is compared with a number.
if int(tf.__version__.split('.')[0]) >= 2:
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

## Environment Definition

In [None]:
# Input tables: `data` is merged with the feature frames on `Index` below;
# `codefeature` carries the `Index`, `ChangeType`, `Delta` and `TotalLines`
# columns used by the CDF section.
data = pd.read_csv('data/eclipse-total.csv')
codefeature = pd.read_csv('data/eclipse-codefeature.csv')
# Paths of the artifacts this notebook writes: cleaned text, the word2vec
# model and the final merged feature table.
cutted_file = 'data/cutted.csv'
EMBEDDING_FILE = 'data/embedding.model'
FEATURE_RESULT = 'data/eclipse-all-features.csv'
# Single 10-split time-ordered splitter shared by every evaluation below.
kfold = TimeSeriesSplit(n_splits=10)
data.info()

# Feature Engineering

## CDF ( Generated via Java )

In [None]:
# Per-patch counts of each ChangeType value (one column per value, 0 when absent).
changetype = codefeature.groupby('Index')['ChangeType'].value_counts().unstack().fillna(0).reset_index()
# Per-patch ratio: summed Delta divided by summed TotalLines.
changeratio = (codefeature.groupby('Index')['Delta'].sum() / codefeature.groupby('Index')['TotalLines'].sum()).to_frame()
changeratio.columns = ['ChangeRatio']
# Left-join onto every patch in `data`; patches without code-feature rows get 0.
cdf = pd.merge(data.Index, changetype, on='Index', how='left')
cdf = pd.merge(cdf, changeratio, on='Index', how='left').fillna(0)
cdf.head()

## TF ( w2v feature )

In [None]:
# Work on an explicit copy so the assignment below cannot hit (or silently miss)
# a view of `data` (SettingWithCopyWarning).
cm = data[['Index', 'changedesc']].copy()
# Replace brackets, quotes, dots, tabs, line breaks, colons and slashes with a
# space. The pattern is a raw string and `regex=True` is explicit because the
# default of `Series.str.replace` differs between pandas versions.
cm['changedesc'] = cm['changedesc'].str.replace(r"\[|\]|'|\.|\t|\r|\n|:|/", ' ', regex=True)
cm.info()

In [None]:
# Dump the cleaned descriptions and stream the file back as whitespace-split
# sentences of at most 20 tokens.
# NOTE(review): the CSV also contains the header row and the `Index` column, so
# those end up in the token stream as well - confirm this is intended.
cm.to_csv(cutted_file, index=False)
tmp = gensim.models.word2vec.LineSentence(cutted_file, max_sentence_length=20)
# NOTE(review): `iter` and `size` look like the gensim 3.x keyword names -
# confirm the installed gensim version accepts them.
model= Word2Vec(workers=20,sentences=tmp,window=3, min_count=5,iter = 5,size=50)
print(len(model.wv.vocab))
model.save(EMBEDDING_FILE) 

In [None]:
max_features = 4000  # `num_words` cap handed to the Keras tokenizer
maxlen = 25          # every description is padded / truncated to this many token ids
batch_size = 50      # rows per batch when getvec averages the embeddings

list_sentences_train = cm['changedesc'].astype(str)
# Fit the vocabulary on the cleaned descriptions. (An earlier tokenizer that was
# fitted on the word2vec vocabulary and then immediately overwritten has been
# removed; it never influenced the result.)
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train.tolist())
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
# Fixed-width integer matrix: one row per description, `maxlen` columns.
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

In [None]:
def loadEmbeddingMatrix(typeToLoad,EMBEDDING_FILE,tokenizer):
    """Build an embedding matrix aligned with ``tokenizer.word_index``.

    Parameters
    ----------
    typeToLoad : str
        Kind of embedding stored in ``EMBEDDING_FILE``. Only ``'word2vec'``
        is supported.
    EMBEDDING_FILE : str
        Path of the saved gensim model.
    tokenizer
        Fitted tokenizer exposing a ``word_index`` mapping (word -> row id).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tokenizer.word_index) + 1, embedding_dim)``.
        Rows of words found in the embedding hold the trained vector; all other
        rows keep a random draw from a normal distribution with the mean and
        standard deviation of the trained vectors.

    Raises
    ------
    ValueError
        If ``typeToLoad`` is not ``'word2vec'`` (previously this surfaced as a
        NameError further down).
    """
    if typeToLoad != 'word2vec':
        raise ValueError("unsupported embedding type: %r (expected 'word2vec')" % (typeToLoad,))

    word2vecDict = gensim.models.KeyedVectors.load(EMBEDDING_FILE)
    # word -> trained vector
    embedding_index = dict()
    for word in word2vecDict.wv.vocab:
        embedding_index[word] = word2vecDict.wv.word_vec(word)
    print('Load %s word vectors.' % len(embedding_index))
    gc.collect()

    all_embs = np.stack(list(embedding_index.values()))  # (vocab_size, embedding_dim)
    emb_mean,emb_std = all_embs.mean(),all_embs.std()
    nb_words = len(tokenizer.word_index)  # number of words known to the tokenizer
    # Random initialisation; the width follows the loaded vectors instead of a
    # hard-coded 50 so a model trained with another size still works.
    embedding_matrix = np.random.normal(emb_mean,emb_std,(nb_words+1,all_embs.shape[1]))
    gc.collect()

    embeddedCount = 0
    for word,i in tokenizer.word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            embeddedCount += 1
    print('total_embedded:',embeddedCount,'common words')
    del embedding_index
    gc.collect()
    return embedding_matrix
def getvec(X_t,embedding_matrix):
    """Average the embedding vectors of each padded token sequence.

    Builds a TF1 graph with an embedding lookup followed by a mean over the
    sequence axis, then feeds ``X_t`` through it in batches of the module-level
    ``batch_size``.

    Parameters
    ----------
    X_t
        Integer matrix with ``maxlen`` columns (module-level constant).
    embedding_matrix
        Lookup table indexed by the ids stored in ``X_t``.

    Returns
    -------
    numpy.ndarray
        One averaged vector per row of ``X_t``, stacked vertically.

    NOTE(review): every position of a row takes part in the mean, including any
    padding ids - confirm that is intended.
    """
    tf.reset_default_graph()

    train_inputs1 = tf.placeholder(tf.int32, shape=(None,maxlen),name = 'x')
    embeddings = tf.Variable(embedding_matrix,name = 'embed')
    embed = tf.nn.embedding_lookup(embeddings, train_inputs1)
    print('shape of embed1 : \t', str(embed.get_shape()))

    # Mean over axis 1 (the token positions of each sequence).
    vec = tf.reduce_mean(embed, axis = 1)
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        init.run()
        print('inited')
        vecs = []
        # The "+1" covers the trailing partial batch; when len(X_t) is a multiple
        # of batch_size the last batch is simply empty.
        nstep=int(len(X_t)/batch_size)+1

        for step in trange(nstep):
            x1=generate_batch(X_t,step)
            feed_dict = {train_inputs1 : x1}
            vecblock = session.run([vec], feed_dict=feed_dict)
            vecs += vecblock
        vecs = np.vstack(vecs)
    return vecs
def generate_batch(X_t,step):
    """Return batch number ``step`` of ``X_t``.

    Batches hold ``batch_size`` (module-level) rows; the final batch is shorter,
    or empty, when fewer rows remain.
    """
    start = batch_size * step
    # Slicing clips at the end of the data, so the last batch needs no special case.
    return X_t[start:start + batch_size]

In [None]:
# Turn every description into the mean of its word vectors.
embedding_matrix = loadEmbeddingMatrix('word2vec', EMBEDDING_FILE, tokenizer)
vecs = getvec(X_t, embedding_matrix)
# Pair each averaged vector (rounded to 5 decimals, stored as a list) with the
# patch Index. This relies on `vecs` being in the same row order as `data`.
yyuid = data.Index
w2v = pd.DataFrame(yyuid,columns = ['Index'])
w2v['vec'] = np.round(vecs,decimals=5).tolist()

In [None]:
# Expand the list-valued `vec` column into one float column per embedding
# dimension (integer column names 0..dim-1). The frame is built straight from
# the lists instead of round-tripping every float through str/split/float.
w2v = pd.concat([w2v['Index'], pd.DataFrame(w2v['vec'].tolist(), index=w2v.index).astype('float')], axis=1)
w2v.head()

## PMF ( Meta )

In [None]:
# Assign the filled columns back explicitly: `data.Branch.fillna(..., inplace=True)`
# mutates an intermediate object and is not guaranteed to update `data`.
data['Branch'] = data['Branch'].fillna('')
data['Project'] = data['Project'].fillna('')
data['Owner'] = data['Owner'].map(lambda x: x.lower())
data['Author'] = data['Author'].map(lambda x: x.lower())
# Leading field of `Time` before the first ':'.
data['hour'] = data['Time'].apply(lambda x: int(x.split(':')[0]))

# Integer ids for the categorical columns (each column gets its own encoder).
data['OwnerLabel'] = LabelEncoder().fit_transform(data['Owner'])
data['AuthorLabel'] = LabelEncoder().fit_transform(data['Author'])
data['ProjectLabel'] = LabelEncoder().fit_transform(data['Project'])
data['BranchLabel'] = LabelEncoder().fit_transform(data['Branch'])

# Number of whitespace-separated pieces in the description (raw-string pattern
# avoids the invalid-escape warning of '\s').
data['MessageLength'] = data['changedesc'].fillna("").apply(lambda x: len(re.split(r'\s', x)))
# Third '-'-separated field of `starttime`, up to the first space.
data['SubmitDay'] = data['starttime'].apply(lambda x: int(x.split('-')[2].split(' ')[0]))

# Columns that make up the PMF (patch meta) feature group.
meta = ['OwnerLabel', 'AuthorLabel', 'ProjectLabel', 'hour', 'MessageLength',
        'SubmitDay', 'relatedNum', 'JavaFileNum', 'BranchLabel', 'ReviewerNum']

## PEF

In [None]:
# Parse the stringified reviewer list of every patch into a list of
# lower-cased names with surrounding spaces, '|' and quote characters stripped.
data['Reviewerword'] = data['Reviewer'].apply(lambda x: x.strip('[]').split(','))
data['Reviewerword'] = data['Reviewerword'].apply(lambda x: [i.lower().strip(" |  |'").strip(' ') for i in x])
# One shared encoder over all owner and reviewer names, so both roles live in
# the same id space.
owners = list(data['Owner'].unique())
reviewers=[]
for i in data['Reviewerword']:
    reviewers += i
reviewers = list(set(reviewers))
encoder = LabelEncoder()
encoder.fit(owners + reviewers)
# Long table with one (owner, reviewer) row per reviewer of every patch.
tmp = data['Reviewer'].copy()
tmp.index = data['Owner']
tmp = tmp.astype(str).str.strip('[]').str.lower().str.split(',', expand=True).stack().reset_index().drop('level_1',axis = 1)
tmp.columns=['OwnerLabel','ReviewerLabel']
tmp['ReviewerLabel'] = tmp['ReviewerLabel'].map(lambda x: x.strip(" |  |'").strip(' '))
# NOTE(review): these ids come from `encoder`, not from the per-column encoder
# that produced data['OwnerLabel'] - the two id spaces are not interchangeable.
tmp['OwnerLabel'] = encoder.transform(tmp['OwnerLabel'])
tmp['ReviewerLabel'] = encoder.transform(tmp['ReviewerLabel'])

In [None]:
# Directed owner -> reviewer graph; node ids are the labels produced by `encoder`.
# (Two throw-away DiGraph constructions that were immediately overwritten
# have been removed.)
edges = list(zip(tmp['OwnerLabel'], tmp['ReviewerLabel']))
G = nx.DiGraph(edges)
# nx.draw(G)

dc = nx.degree_centrality(G)
cc = nx.closeness_centrality(G)
bc= nx.betweenness_centrality(G)
absdegree = dict(nx.degree(G))
indc = nx.in_degree_centrality(G)
outdc = nx.out_degree_centrality(G)

# Look the owner up under the SAME id space as the graph nodes.
# data['OwnerLabel'] was produced by a separate LabelEncoder fitted on owners
# only, so using it here would read the centrality of unrelated nodes.
owner_node = pd.Series(encoder.transform(data['Owner']), index=data.index)

CNfea1 = data.Index.to_frame()
CNfea1['degree_centrality'] = owner_node.map(dc)
CNfea1['closeness_centrality'] = owner_node.map(cc)
CNfea1['betweenness_centrality'] = owner_node.map(bc)
CNfea1['degree'] = owner_node.map(absdegree)
CNfea1['in_degree_centrality'] = owner_node.map(indc)
CNfea1['out_degree_centrality'] = owner_node.map(outdc)

In [None]:
# Long table with one (patch Index, reviewer) row per reviewer of every patch.
tmp = data['Reviewer'].copy()
tmp.index = data['Index']
tmp = tmp.astype(str).str.strip('[]').str.lower().str.split(',', expand=True).stack().reset_index().drop('level_1',axis = 1)
tmp.columns=['Index','ReviewerLabel']
tmp['ReviewerLabel'] = tmp['ReviewerLabel'].map(lambda x: x.strip(" |  |'").strip(' '))

# Reviewer node id (shared `encoder`), then that node's graph statistics.
tmp['ReviewerLabel'] = encoder.transform(tmp['ReviewerLabel'])
tmp['degree_centrality'] = tmp.ReviewerLabel.map(dc)
tmp['closeness_centrality'] = tmp.ReviewerLabel.map(cc)
tmp['betweenness_centrality'] = tmp.ReviewerLabel.map(bc)
tmp['degree'] = tmp.ReviewerLabel.map(absdegree)
tmp['in_degree_centrality'] = tmp.ReviewerLabel.map(indc)
tmp['out_degree_centrality'] = tmp.ReviewerLabel.map(outdc)

# Per-patch mean and sum of the reviewer statistics.
# NOTE(review): 'betweenness_centrality' is computed above but is not in the
# aggregated column list - confirm whether the omission is deliberate.
CNfea2 = tmp[['Index','degree_centrality','closeness_centrality','degree','in_degree_centrality',
              'out_degree_centrality']].groupby('Index').agg(['mean','sum']).add_prefix('review_')

In [None]:
# Owner-level and reviewer-level network features side by side, one row per patch.
# NOTE(review): CNfea2 comes from agg(['mean','sum']) and so presumably has
# two-level columns while CNfea1 has one level - confirm the installed pandas
# accepts this merge.
CN = CNfea1.merge(CNfea2,on = 'Index',how = 'left')

In [None]:
# Lookup tables for the experience features.
ORE={}
# Despite the key name, this is the mean PatchTime over each owner's patches.
ORE['ownerpassratio'] = data.groupby('OwnerLabel').PatchTime.agg(np.mean).to_dict()

In [None]:
# Long table with one row per (patch Index, reviewer), joined back to all
# columns of `data`.
tmp = data['Reviewer'].copy()
tmp.index = data['Index']
tmp = tmp.astype(str).str.strip('[]').str.lower().str.split(',', expand=True).stack().reset_index().drop('level_1',axis = 1)
tmp.columns = ['Index', 'Reviewer']
tmp['ReviewerLabel'] = tmp['Reviewer'].map(lambda x: x.strip(" |  |'").strip(' '))
tmp['ReviewerLabel'] = encoder.transform(tmp['ReviewerLabel'])
# NOTE(review): `data` also has a 'Reviewer' column, so the merge presumably
# suffixes the clashing names - verify the resulting column names.
tmp = tmp.merge(data,on='Index',how='left')

In [None]:
# Mean PatchTime over the patches each reviewer took part in.
ORE['reviewerpassratio'] = tmp.groupby('ReviewerLabel')['PatchTime'].agg(np.mean).to_dict()
OREfea1 = data['Index'].to_frame()
OREfea1['ownerpassratio'] = data['OwnerLabel'].map(ORE['ownerpassratio'])

# Per-patch mean / sum / max / min of its reviewers' values.
tmp['reviewerpassratio'] = tmp['ReviewerLabel'].map(ORE['reviewerpassratio'])
OREfea2 = tmp.groupby('Index').agg({'reviewerpassratio':[np.mean,sum,max,min]}).reset_index()
OREfea = OREfea1.merge(OREfea2,on='Index',how='left')
# NOTE(review): both lookup tables are computed from PatchTime over the whole
# dataset, including rows later used as test folds - confirm this is acceptable.

# PEF = network features + experience features, one row per patch.
PEF = CN.merge(OREfea, on='Index', how='left')
PEF.drop_duplicates(keep='first', inplace=True)
PEF.head()

# Model Definition

In [None]:
class ResultSet:
    """Per-fold classification scores collected under a display name."""

    def __init__(self, _name):
        self.name = _name
        # One independent list per metric: accuracy, precision, recall, F1.
        for metric in ('acc', 'prec', 'rec', 'f1'):
            setattr(self, metric, [])

def deal_score(result, y_true, pred, mask=False):
    """Append accuracy, precision, recall and F1 of one fold to ``result``.

    Parameters
    ----------
    result : ResultSet
        Collector whose ``acc`` / ``prec`` / ``rec`` / ``f1`` lists are extended.
    y_true : pandas.Series
        True labels of the fold.
    pred : array-like
        Predicted labels, positionally aligned with ``y_true``.
    mask : False, None or array-like, optional
        Positions (or a boolean selector) of the rows to score. ``False`` or
        ``None`` scores the whole fold. With a mask, NaN scores are stored as 0.

    The mask check is an identity test: ``not mask`` raises for a pandas Index
    or a NumPy array, which made the masked path unusable.
    """
    masked = not (mask is None or mask is False)
    if masked:
        positions = np.asarray(mask)
        if positions.dtype != bool:
            # Also covers an empty list, which NumPy would otherwise type as float.
            positions = positions.astype(int)
        y_true = y_true.iloc[positions]
        pred = np.asarray(pred)[positions]

    scorers = ((result.acc, accuracy_score), (result.prec, precision_score),
               (result.rec, recall_score), (result.f1, f1_score))
    for bucket, scorer in scorers:
        value = scorer(y_true, pred)
        bucket.append(np.nan_to_num(value) if masked else value)

def print_score(result):
    """Print one markdown table row: name, then mean acc / prec / recall / F1 as percentages."""
    per_metric = (result.acc, result.prec, result.rec, result.f1)
    cells = ["{}".format(result.name)]
    cells.extend("{:.2%}".format(np.mean(scores)) for scores in per_metric)
    print("|" + "|".join(cells) + "|")

In [None]:
def model_evaluate(X, Y, name, clf, kfold):
    """Cross-validate ``clf`` with SMOTE-resampled training folds and print one result row.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, rows in evaluation order.
    Y : pandas.Series
        Binary target (0/1), positionally aligned with ``X``.
    name : str
        Label printed in the first column of the result row.
    clf
        Estimator exposing ``fit`` and ``predict``; it is refitted on every fold.
    kfold
        Splitter exposing ``split(X)``.
    """
    result = ResultSet(name)
    for trainindex,testindex in kfold.split(X):
        x_train = X.iloc[trainindex,:]
        x_test = X.iloc[testindex,:]
        y_train = Y.iloc[trainindex]
        y_test = Y.iloc[testindex]
        # Aim for a 1:2 positive:negative ratio. When positives are under a third
        # of the fold they are raised to half the negative count; otherwise the
        # negatives are raised to twice the positive count.
        if y_train.sum() * 3 < y_train.shape[0]:
            pos_cnt = (y_train.shape[0] - y_train.sum()) // 2
            neg_cnt = y_train.shape[0] - y_train.sum()
        else:
            pos_cnt = y_train.sum()
            neg_cnt = pos_cnt * 2
        smo = SMOTE(sampling_strategy={0: neg_cnt, 1 : pos_cnt}, random_state=42)
        # `fit_resample` is the supported spelling; the `fit_sample` alias has been
        # removed from imbalanced-learn.
        x_train, y_train = smo.fit_resample(np.array(x_train), np.array(y_train))
        clf.fit(x_train, y_train)
        prediction = clf.predict(x_test)
        deal_score(result, y_test, prediction)
    print_score(result)

In [None]:
def random_guess(X, Y, kfold):
    """Baseline: predict 1 with the positive rate of the training fold, then print one row.

    ``X`` is only used to generate the splits; the unused per-fold feature
    slices of the earlier version have been dropped. Predictions are drawn from
    NumPy's global random state, so results vary between runs unless it is seeded.
    """
    result = ResultSet("RandomGuess")
    for trainindex,testindex in kfold.split(X):
        y_train = Y.iloc[trainindex]
        y_test = Y.iloc[testindex]
        prob = y_train.sum() / y_train.shape[0]
        prediction = np.random.choice([0, 1], size=y_test.shape[0], p=[1 - prob, prob])
        deal_score(result, y_test, prediction)
    print_score(result)

In [None]:
def light_gbm(X, Y, kfold):
    """Cross-validate a LightGBM binary classifier on SMOTE-resampled folds and print one row.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, rows in evaluation order.
    Y : pandas.Series
        Binary target (0/1), positionally aligned with ``X``.
    kfold
        Splitter exposing ``split(X)``.

    NOTE(review): the test fold is also the early-stopping validation set, so
    the number of boosting rounds is tuned on the data being scored.
    """
    result = ResultSet("LightGBM")
    for trainindex,testindex in kfold.split(X):
        x_train = X.iloc[trainindex,:]
        x_test = X.iloc[testindex,:]
        y_train = Y.iloc[trainindex]
        y_test = Y.iloc[testindex]
        # Same 1:2 positive:negative resampling target as model_evaluate.
        if y_train.sum() * 3 < y_train.shape[0]:
            pos_cnt = (y_train.shape[0] - y_train.sum()) // 2
            neg_cnt = y_train.shape[0] - y_train.sum()
        else:
            pos_cnt = y_train.sum()
            neg_cnt = pos_cnt * 2
        smo = SMOTE(sampling_strategy={0: neg_cnt, 1 : pos_cnt}, random_state=42)
        # `fit_resample` replaces the removed `fit_sample` alias.
        x_train, y_train = smo.fit_resample(np.array(x_train), np.array(y_train))
        x_tr = lgb.Dataset(x_train, label=y_train)
        x_te = lgb.Dataset(x_test, label=y_test)
        params = {
            'task': 'train',
            'boosting_type': 'gbdt',  # boosting algorithm
            'objective': 'binary', # objective function
            'metric': {'l2', 'auc'},  # evaluation metrics
            'num_leaves': 31,   # leaves per tree
            'learning_rate': 0.05,  # learning rate
            'feature_fraction': 0.9, # share of features sampled per tree
            'bagging_fraction': 0.9, # share of rows sampled per tree
            'bagging_freq': 5,  # run bagging every k iterations
            'verbose': -1, # <0 fatal only, =0 errors/warnings, >0 info
        }
        bst = lgb.train(params, x_tr, 3000, valid_sets=[x_te], early_stopping_rounds=10, verbose_eval=False)
        prediction = bst.predict(x_test)
        # Predicted scores are thresholded at 0.5 to obtain class labels.
        deal_score(result, y_test, prediction > 0.5)
    print_score(result)

# Model Evaluate

## Regression

In [None]:
def regression(X, Y, name, model, kfold):
    """Cross-validate a regressor and print one row with mean MSE, MAE and R2.

    ``model`` is refitted on every training fold produced by ``kfold.split(X)``
    and scored on the matching test fold.
    """
    fold_scores = {'mse': [], 'mae': [], 'r2': []}
    for train_pos, test_pos in kfold.split(X):
        model.fit(X.iloc[train_pos, :], Y.iloc[train_pos])
        y_test = Y.iloc[test_pos]
        prediction = model.predict(X.iloc[test_pos, :])
        fold_scores['mse'].append(MSE(y_test, prediction))
        fold_scores['mae'].append(MAE(y_test, prediction))
        fold_scores['r2'].append(R2(y_test, prediction))
    # Mean over folds, four significant digits, one cell per metric.
    cells = ["{:.4}|".format(np.mean(fold_scores[key])) for key in ('mse', 'mae', 'r2')]
    print(f"|{name}|", *cells)

In [None]:
def lgb_regression(X, Y, kfold):
    """Cross-validate a LightGBM regressor and print one row with mean MSE, MAE and R2.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, rows in evaluation order.
    Y : pandas.Series
        Numeric target, positionally aligned with ``X``.
    kfold
        Splitter exposing ``split(X)``.

    The objective is ``'regression'``. The earlier ``'binary'`` objective with
    an ``auc`` metric is a classification set-up and cannot model a numeric target.

    NOTE(review): the test fold is also the early-stopping validation set, so
    the number of boosting rounds is tuned on the data being scored.
    """
    mse_scores=[]
    mae_scores=[]
    r2_scores=[]
    for trainindex,testindex in kfold.split(X):
        x_train = X.iloc[trainindex,:]
        x_test = X.iloc[testindex,:]
        y_train = Y.iloc[trainindex]
        y_test = Y.iloc[testindex]
        x_tr = lgb.Dataset(x_train, label=y_train)
        x_te = lgb.Dataset(x_test, label=y_test)
        params = {
            'task': 'train',
            'boosting_type': 'gbdt',  # boosting algorithm
            'objective': 'regression', # squared-error regression
            'metric': {'l2', 'l1'},  # evaluation metrics (MSE and MAE)
            'num_leaves': 31,   # leaves per tree
            'learning_rate': 0.05,  # learning rate
            'feature_fraction': 0.9, # share of features sampled per tree
            'bagging_fraction': 0.9, # share of rows sampled per tree
            'bagging_freq': 5,  # run bagging every k iterations
            'verbose': -1, # <0 fatal only, =0 errors/warnings, >0 info
        }
        bst = lgb.train(params, x_tr, 3000, valid_sets=[x_te], early_stopping_rounds=10, verbose_eval=False)
        prediction = bst.predict(x_test)
        mse_scores.append(MSE(y_test, prediction))
        mae_scores.append(MAE(y_test, prediction))
        r2_scores.append(R2(y_test, prediction))
    print("\n|LGB|",
          "{:.4}|".format(np.mean(mse_scores)),
          "{:.4}|".format(np.mean(mae_scores)),
          "{:.4}|".format(np.mean(r2_scores)))

In [None]:
# Order the patches by start time so the TimeSeriesSplit folds train on earlier
# rows and test on later ones.
data.sort_values(by='starttime', inplace=True)
# Join the feature groups onto the meta columns by patch Index.
total = pd.merge(data[meta + ['Index', 'PatchTime']], cdf, on='Index', how='left')
total = pd.merge(total, w2v, on='Index', how='left')
total = pd.merge(total, PEF, on='Index', how='left')
# Regression target is the raw PatchTime; missing features are filled with 0.
X = total.drop(['Index', 'PatchTime'], axis=1).fillna(0)
Y = total['PatchTime']

In [None]:
print("||MSE|MAE|R2|\n|-|-|-|-|")
# scikit-learn regressors with default settings, reported in this order.
for row_name, model in (
    ("SVR", SVR()),
    ("MLP", MLPRegressor()),
    ("DT", DecisionTreeRegressor()),
    ("RF", RandomForestRegressor()),
):
    regression(X, Y, row_name, model, kfold)
lgb_regression(X, Y, kfold)

## Classification

1. one-time merged: 1
2. short-time merged: 2 - 6
3. long-time merged: > 6

In [None]:
def label1(x):
    """1 when the value is exactly 1 (one-time), else 0."""
    return 1 if x == 1 else 0
def label2(x):
    """1 when the value lies in 2..6 inclusive (short-time), else 0."""
    return 1 if 2 <= x <= 6 else 0
def label3(x):
    """1 when the value is greater than 6 (long-time), else 0."""
    return 1 if x > 6 else 0

# (row prefix, labelling function) pairs iterated by every classification cell.
labels = list(zip(('one-time', 'short-time', 'long-time'), (label1, label2, label3)))

In [None]:
# Chronological order with a fresh 0..n-1 index, so positional fold indices and
# boolean masks built from `data` line up with frames merged from it.
# `drop=True` discards the old index directly instead of materialising an
# 'index' column and dropping it afterwards.
data.sort_values(by='starttime', inplace=True)
data.reset_index(drop=True, inplace=True)

In [None]:
# Rebuild the full feature table from the re-indexed `data` and persist it
# (missing values written as 0). `total` itself keeps its NaNs.
total = pd.merge(data[meta + ['Index', 'PatchTime']], cdf, on='Index', how='left')
total = pd.merge(total, w2v, on='Index', how='left')
total = pd.merge(total, PEF, on='Index', how='left')
total.fillna(0).to_csv(FEATURE_RESULT, index=False)

### TOTAL

In [None]:
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    # Binary target for this duration bucket; the column is rewritten on every pass.
    total['label'] = total['PatchTime'].apply(tolabel)
    Y = total['label']
    X = total.drop(columns=['Index', 'PatchTime', 'label']).fillna(0)

    random_guess(X, Y, kfold)
    # (row suffix, estimator) pairs, evaluated in this order.
    for suffix, clf in (
        ("LR", LogisticRegression(n_jobs=16)),
        ("SVM", SVC(gamma=0.1)),
        ("MLP", MLPClassifier()),
        ("DT", DecisionTreeClassifier()),
        ("RF", RandomForestClassifier(n_jobs=16)),
    ):
        model_evaluate(X, Y, f"{lb_type}-{suffix}", clf, kfold)
    light_gbm(X, Y, kfold)

### The performance of PMCost for the patches submitted by new developers

In [None]:
# Flag as "new" the least active owners whose patches together stay within one
# tenth of the dataset.
data['NewDeveloper'] = 0
# One row per owner holding per-column non-null counts, least active first.
df = data.groupby('Owner').count().sort_values(by='Author')
cnt = 0
test_size = data.shape[0] // 10 # 10-folds
for owner, row in df.iterrows():
    # First column's count, taken by position explicitly: `row[0]` on a
    # label-indexed Series relies on a deprecated integer-position fallback.
    cnt += row.iloc[0]
    if cnt > test_size:
        break
    data.loc[data['Owner'] == owner, 'NewDeveloper'] = 1

#### Train on all patches, then score each subgroup separately

In [None]:
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    total['label'] = total['PatchTime'].apply(tolabel)
    X = total.drop(['Index', 'PatchTime', 'label'], axis=1).fillna(0)
    Y = total['label']

    clf = RandomForestClassifier(n_jobs=-1)
    # The original f-strings opened with " and closed with ', a SyntaxError.
    merged_new = ResultSet(f"{lb_type}-merged_new")
    merged_exp = ResultSet(f"{lb_type}-merged_exp")
    abandoned_new = ResultSet(f"{lb_type}-abandoned_new")
    abandoned_exp = ResultSet(f"{lb_type}-abandoned_exp")

    for trainindex,testindex in kfold.split(X):
        x_train = X.iloc[trainindex,:]
        x_test = X.iloc[testindex,:]
        y_train = Y.iloc[trainindex]
        y_test = Y.iloc[testindex]
        # Resample towards a 1:2 positive:negative ratio.
        if y_train.sum() * 3 < y_train.shape[0]:
            pos_cnt = (y_train.shape[0] - y_train.sum()) // 2
            neg_cnt = y_train.shape[0] - y_train.sum()
        else:
            pos_cnt = y_train.sum()
            neg_cnt = pos_cnt * 2
        smo = SMOTE(sampling_strategy={0: neg_cnt, 1 : pos_cnt}, random_state=42)
        # `fit_resample` replaces the removed `fit_sample` alias.
        x_train, y_train = smo.fit_resample(np.array(x_train), np.array(y_train))
        clf.fit(x_train, y_train)
        prediction = clf.predict(x_test)

        # Meta data of the test fold, re-indexed 0..k-1 so positions match
        # `y_test` and `prediction`. Relies on `total` and `data` sharing row order.
        df = data.iloc[testindex].reset_index(drop=True)
        is_merged = df['status'] == 'MERGED'
        is_abandoned = df['status'] == 'ABANDONED'
        is_new = df['NewDeveloper'] == 1
        is_exp = df['NewDeveloper'] == 0
        subgroups = (
            (merged_new, is_merged & is_new),
            (merged_exp, is_merged & is_exp),
            (abandoned_new, is_abandoned & is_new),
            (abandoned_exp, is_abandoned & is_exp),
        )
        for subgroup_result, selector in subgroups:
            # Plain list of row positions; folds with no row in the subgroup
            # are skipped instead of being scored on an empty selection.
            mask = np.flatnonzero(selector.values).tolist()
            if mask:
                deal_score(subgroup_result, y_test, prediction, mask)
    print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
    print_score(merged_new)
    print_score(merged_exp)
    print_score(abandoned_new)
    print_score(abandoned_exp)

#### Patches from new developers only

In [None]:
# Train and test on the new-developer patches only.
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    total['label'] = total['PatchTime'].apply(tolabel)
    # The boolean selector comes from `data`; it lines up with `total` only
    # while both frames share the same index and row order.
    X = total.drop(['Index', 'PatchTime', 'label'], axis=1).fillna(0)[data['NewDeveloper'] ==  1]
    Y = total['label'][data['NewDeveloper'] ==  1]
    clf = RandomForestClassifier(n_jobs=-1)
    model_evaluate(X, Y, lb_type, clf, kfold)

#### All features except PEF

In [None]:
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    total['label'] = total['PatchTime'].apply(tolabel)
    # Drop the PEF columns (the list includes 'Index') together with the label
    # AND the raw PatchTime. The label is derived from PatchTime, so leaving it
    # in the features leaks the target.
    X = total.drop(list(PEF.columns) + ['PatchTime', 'label'], axis=1).fillna(0)
    Y = total['label']
    clf = RandomForestClassifier(n_jobs=-1)
    # The original f-strings opened with " and closed with ', a SyntaxError.
    new = ResultSet(f"{lb_type}-new")
    exp = ResultSet(f"{lb_type}-exp")

    for trainindex,testindex in kfold.split(X):
        x_train = X.iloc[trainindex,:]
        x_test = X.iloc[testindex,:]
        y_train = Y.iloc[trainindex]
        y_test = Y.iloc[testindex]
        # Resample towards a 1:2 positive:negative ratio.
        if y_train.sum() * 3 < y_train.shape[0]:
            pos_cnt = (y_train.shape[0] - y_train.sum()) // 2
            neg_cnt = y_train.shape[0] - y_train.sum()
        else:
            pos_cnt = y_train.sum()
            neg_cnt = pos_cnt * 2
        smo = SMOTE(sampling_strategy={0: neg_cnt, 1 : pos_cnt}, random_state=42)
        # `fit_resample` replaces the removed `fit_sample` alias.
        x_train, y_train = smo.fit_resample(np.array(x_train), np.array(y_train))
        clf.fit(x_train, y_train)
        prediction = clf.predict(x_test)

        # Test-fold meta data re-indexed 0..k-1 so positions match the predictions.
        df = data.iloc[testindex].reset_index(drop=True)
        for subgroup_result, flag in ((new, 1), (exp, 0)):
            # Plain list of row positions; folds without such rows are skipped.
            mask = np.flatnonzero((df['NewDeveloper'] == flag).values).tolist()
            if mask:
                deal_score(subgroup_result, y_test, prediction, mask)
    # The collected scores were never reported before.
    print_score(new)
    print_score(exp)

### CDF

In [None]:
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    # Join the CDF features, derive the target, then keep only the joined-in columns.
    joined = pd.merge(data, cdf, on='Index', how='left')
    Y = joined['PatchTime'].apply(tolabel)
    X = joined.drop(columns=data.columns)
    random_guess(X, Y, kfold)
    # (row suffix, estimator) pairs, evaluated in this order.
    for suffix, clf in (
        ("LR", LogisticRegression(n_jobs=16)),
        ("SVM", SVC(gamma=0.1)),
        ("MLP", MLPClassifier()),
        ("DT", DecisionTreeClassifier()),
        ("RF", RandomForestClassifier(n_jobs=16)),
    ):
        model_evaluate(X, Y, f"{lb_type}-{suffix}", clf, kfold)
    light_gbm(X, Y, kfold)

### TF

In [None]:
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    # Join the text-embedding features, derive the target, keep only the joined-in columns.
    joined = pd.merge(data, w2v, on='Index', how='left')
    Y = joined['PatchTime'].apply(tolabel)
    X = joined.drop(columns=data.columns)
    random_guess(X, Y, kfold)
    # (row suffix, estimator) pairs, evaluated in this order.
    for suffix, clf in (
        ("LR", LogisticRegression(n_jobs=16)),
        ("SVM", SVC(gamma=0.1)),
        ("MLP", MLPClassifier()),
        ("DT", DecisionTreeClassifier()),
        ("RF", RandomForestClassifier(n_jobs=16)),
    ):
        model_evaluate(X, Y, f"{lb_type}-{suffix}", clf, kfold)
    light_gbm(X, Y, kfold)

### PMF

In [None]:
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    # Meta columns only; the label is stored on `data` and rewritten on every pass.
    data['label'] = data['PatchTime'].apply(tolabel)
    X, Y = data[meta], data['label']
    random_guess(X, Y, kfold)
    # (row suffix, estimator) pairs, evaluated in this order.
    for suffix, clf in (
        ("LR", LogisticRegression()),
        ("SVM", SVC(gamma=0.1)),
        ("MLP", MLPClassifier()),
        ("DT", DecisionTreeClassifier()),
        ("RF", RandomForestClassifier(n_jobs=-1)),
    ):
        model_evaluate(X, Y, f"{lb_type}-{suffix}", clf, kfold)
    light_gbm(X, Y, kfold)

### PEF

In [None]:
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    # Join the PEF features, derive the target, keep only the joined-in columns.
    joined = pd.merge(data, PEF, on='Index', how='left')
    Y = joined['PatchTime'].apply(tolabel)
    X = joined.drop(columns=data.columns)
    random_guess(X, Y, kfold)
    # (row suffix, estimator) pairs, evaluated in this order.
    for suffix, clf in (
        ("LR", LogisticRegression(n_jobs=16)),
        ("SVM", SVC(gamma=0.1)),
        ("MLP", MLPClassifier()),
        ("DT", DecisionTreeClassifier()),
        ("RF", RandomForestClassifier(n_jobs=16)),
    ):
        model_evaluate(X, Y, f"{lb_type}-{suffix}", clf, kfold)
    light_gbm(X, Y, kfold)

### CDF + TF

In [None]:
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    # CDF and text-embedding features joined onto `data`; only the joined-in columns are kept.
    joined = pd.merge(pd.merge(data, cdf, on='Index', how='left'), w2v, on='Index', how='left')
    Y = joined['PatchTime'].apply(tolabel)
    X = joined.drop(columns=data.columns)
    random_guess(X, Y, kfold)
    # (row suffix, estimator) pairs, evaluated in this order.
    for suffix, clf in (
        ("LR", LogisticRegression(n_jobs=16)),
        ("SVM", SVC(gamma=0.1)),
        ("MLP", MLPClassifier()),
        ("DT", DecisionTreeClassifier()),
        ("RF", RandomForestClassifier(n_jobs=16)),
    ):
        model_evaluate(X, Y, f"{lb_type}-{suffix}", clf, kfold)
    light_gbm(X, Y, kfold)

### CDF + TF + PMF

In [None]:
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    # CDF and text-embedding features joined onto `data`; of the original
    # `data` columns only the meta ones are kept.
    joined = pd.merge(pd.merge(data, cdf, on='Index', how='left'), w2v, on='Index', how='left')
    Y = joined['PatchTime'].apply(tolabel)
    non_meta = [col for col in data.columns if col not in meta]
    X = joined.drop(columns=non_meta)
    random_guess(X, Y, kfold)
    # (row suffix, estimator) pairs, evaluated in this order.
    for suffix, clf in (
        ("LR", LogisticRegression()),
        ("SVM", SVC(gamma=0.1)),
        ("MLP", MLPClassifier()),
        ("DT", DecisionTreeClassifier()),
        ("RF", RandomForestClassifier(n_jobs=-1)),
    ):
        model_evaluate(X, Y, f"{lb_type}-{suffix}", clf, kfold)
    light_gbm(X, Y, kfold)

### ID Feature

In [None]:
# Identifier-style columns only.
ID = ['OwnerLabel','AuthorLabel','ProjectLabel','BranchLabel']
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    X, Y = data[ID], data['PatchTime'].apply(tolabel)
    random_guess(X, Y, kfold)
    # (row suffix, estimator) pairs, evaluated in this order.
    for suffix, clf in (
        ("LR", LogisticRegression()),
        ("SVM", SVC(gamma=0.1)),
        ("MLP", MLPClassifier()),
        ("DT", DecisionTreeClassifier()),
        ("RF", RandomForestClassifier(n_jobs=-1)),
    ):
        model_evaluate(X, Y, f"{lb_type}-{suffix}", clf, kfold)
    light_gbm(X, Y, kfold)

### Owner Experience

In [None]:
print("||acc|prec|recall|f1|\n|-|-|-|-|-|")
for lb_type, tolabel in labels:
    # Experience features plus the target column, joined by patch Index.
    joined = OREfea.merge(data[['Index', 'PatchTime']], on='Index')
    Y = joined['PatchTime'].apply(tolabel)
    X = joined.drop(columns=['Index', 'PatchTime'])
    random_guess(X, Y, kfold)
    # (row suffix, estimator) pairs, evaluated in this order.
    for suffix, clf in (
        ("LR", LogisticRegression(n_jobs=16)),
        ("SVM", SVC(gamma=0.1)),
        ("MLP", MLPClassifier()),
        ("DT", DecisionTreeClassifier()),
        ("RF", RandomForestClassifier(n_jobs=16)),
    ):
        model_evaluate(X, Y, f"{lb_type}-{suffix}", clf, kfold)
    light_gbm(X, Y, kfold)