In [17]:
### 1. 諸設定

import sys
import os
sys.path.append(os.path.abspath(".."))
import settings.modeling_settings as ms

# ランダムシードを何回振るか
N_ITERATIONS = 1

# 設定の表示
print('API Key:', ms.DR_API_KEY)
print('Endpoint:', ms.DR_END_POINT)
print('Input:', ms.INPUT_FILE_PATH)
print('Target:', ms.COL_TARGET)
print('metrics:', ms.METRICS)
print('number of cv-folds:', ms.N_FOLDS)

API Key: NWY4MTJmZGQwOGQ5Njk1NTI4YTY5MTFlOnlsc3FzOEJ0UzVLckNhbzU0blhPcHFZVkl3MEV4bFFpL1ZsNnpSTmdNclk9
Endpoint: https://app.datarobot.com/api/v2
Input: /Users/mon/Desktop/aia_test_02/input/lending_club_5000.csv
Target: bad_loan
metrics: AUC
number of cv-folds: 5


In [18]:
### 2. 環境初期化

import pandas as pd
import numpy as np
import datarobot as dr
import datetime as dt
import os
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from pylab import rcParams
from colour import Color
import wordcloud
pd.set_option('display.max_rows', 500)
%matplotlib inline

dr_dark_blue = '#08233F'
dr_blue = '#1F77B4'
dr_orange = '#FF7F0E'
dr_red = '#BE3C28'

def __init_datarobot_env(token):
    c = dr.Client(endpoint=ms.DR_END_POINT, token=ms.DR_API_KEY)
    print("DataRobot API バージョンは", dr.__version__, "を利用しています。")
    print("エンドポイントは:", c.endpoint, "TOKENは", c.token, "を利用して接続しています。")
    print("接続判定:", c.verify)
    
__init_datarobot_env(ms.DR_API_KEY)

DataRobot API バージョンは 2.22.1 を利用しています。
エンドポイントは: https://app.datarobot.com/api/v2 TOKENは NWY4MTJmZGQwOGQ5Njk1NTI4YTY5MTFlOnlsc3FzOEJ0UzVLckNhbzU0blhPcHFZVkl3MEV4bFFpL1ZsNnpSTmdNclk9 を利用して接続しています。
接続判定: True


In [23]:
### 3. データ読み込み

def __read_inputs():
    def read_file_autohandle(input_path):
        _, ext = os.path.splitext(input_path)
        if ext == ".csv":
            result = pd.read_csv(
                input_path,encoding="utf-8-sig")
            return result
        elif (ext == ".xls") or (ext == ".xlsx"):
            result = pd.read_excel(input_path, encoding="utf-8-sig")
            return result

    df_x = read_file_autohandle(ms.INPUT_FILE_PATH)
    df_y = df_x[ms.COL_TARGET]
    df_id = []
    for col in df_x.columns:
        df_x.rename(columns={col:col.replace("{","_").replace("-","_").replace("$","_").replace(".","_").replace("}","_").replace("\n","_").replace('"',"_")},inplace=True)
    return (df_id, df_x, df_y)

df_id,df_x,df_y=__read_inputs()
dict_log_input = {
    'input_file_path': ms.INPUT_FILE_PATH,
    'n_records': len(df_x),
    'rate_target': sum(df_x[ms.COL_TARGET])/len(df_x)
}
pd.to_pickle(dict_log_input, "../output/intermediate_files/input_info.pkl")

In [38]:
### 4. オートパイロット・交差検定実行

def __run_autopilot():
    print("Start creating projects...")
    project_id_list = []
    for i in range(1, N_ITERATIONS+1):
        project_name = str(dt.date.today()).replace('-', '') + '_' + ms.INPUT_FILE_PATH[ms.INPUT_FILE_PATH.rfind('/')+1:] + '_Seed' + str(i)
        project = dr.Project.create(df_x, project_name=project_name)
        project.set_worker_count(-1)
        ao = dr.AdvancedOptions(seed=i)
        pm = dr.StratifiedCV(holdout_pct=0, reps=ms.N_FOLDS, seed=i)
        project.set_target(ms.COL_TARGET, metric=ms.METRICS, partitioning_method=pm, advanced_options=ao, mode=dr.AUTOPILOT_MODE.FULL_AUTO)
        print("The new project has ID for seed "+str(i)+" is:", project.id)
        project_id_list.append(project.id)
        random_seeds.append(i)
        project_ids.append(project.id)
        project_names.append(project_name)
    print("project IDs are " + str(project_id_list))
    return project_id_list

def __get_model_scores(project):
    return pd.DataFrame(
        [[model.metrics[project.metric]['crossValidation'],
          model.metrics[project.metric]['validation'],
          model.model_type,
          model.id,
          model.sample_pct,
          model.model_category,
          model,
          model.blueprint_id
          ] for model in project.get_models(with_metric=project.metric)],
        columns=['cv', 'v', 'type', 'model_id', 'sample_pct',
                 'category', 'model', 'blueprint_id']
    ).sort_values(['cv', 'v'], na_position='last')

def __run_cross_validation(project_id_list):
    for i in range(0, N_ITERATIONS):
        project = dr.Project.get(project_id=project_id_list[i])
        project.wait_for_autopilot(check_interval=60.0)
        print("Autopilot of the project for seed "+str(i+1)+" is completed")
        print("Confirming CV status....")
        jobs_list = project.get_all_jobs()
        for job in jobs_list:
            job.wait_for_completion(max_wait=60000)
        df_model = __get_model_scores(project)
        if max(df_model["sample_pct"])==100:
            df_model = df_model[df_model["sample_pct"] == sorted(list(set(df_model["sample_pct"])))[-2]]
            df_model = df_model[df_model["category"] == "model"]
            for model in (df_model[df_model["cv"].isnull()]["model"]):
                print("Seed"+str(i+1)+" "+model.model_type+" started CV")
                model.cross_validate()
        else:
            print("Seed "+str(i+1)+" looks error occured. Ignored")
            project_id_list[i]="error"
            continue
            
def __wait_for_cv(project_id_list):
    for i in range(0, N_ITERATIONS):
        project = dr.Project.get(project_id=project_id_list[i])
        jobs_list = project.get_all_jobs()
        for job in jobs_list:
            job.wait_for_completion(max_wait=60000)
        print("completed CV of in seed"+str(i+1))

#random_seeds = []
#project_ids = []
#project_names = []

#project_id_list=__run_autopilot()
#__run_cross_validation(project_id_list)
#__wait_for_cv(project_id_list)
#dict_log_project = {
#    'random_seed': random_seeds,
#    'project_id': project_ids,
#    'project_name': project_names
#}
#pd.to_pickle(dict_log_project, '../output/intermediate_files/project_info.pkl')

project_id_list=['5f85e73425132a0427978946']

In [27]:
### 5. 特徴量のインパクトを算出

N_BEST_MODELS = 3 # 上位いくつのモデルを参照して重要な特徴量を決めるか
N_TOP_FEATURES = 10 # 複数モデル共通で重要度topNの特徴量は、重要とみなす。その時のN

def __check_impact(project_id_list, save_histgrams=True):
    i=0
    feature_impacts = [{} for j in range(N_ITERATIONS)]
    project = dr.Project.get(project_id=project_id_list[i])
    jobs_list = project.get_all_jobs()
    for job in jobs_list:
        job.wait_for_completion(max_wait=60000)
    print("Completed seed"+str(i+1))
    df_model = __get_model_scores(project)
    df_model['type'] = df_model['type'].str.replace(' / ', ', ')
    target_pct = sorted(df_model['sample_pct'].unique())[::-1][1]
    df_model = df_model[df_model['sample_pct'] == target_pct]
    if ms.METRICS == 'AUC':
        df_model = df_model.sort_values('cv', ascending=False)
    df_model = df_model[:N_BEST_MODELS]
    print("Request and get feature impact for seed"+str(i+1))
    for m, model_id in enumerate(df_model['model_id']):
        model = dr.Model.get(project_id_list[i], model_id)
        feature_impact = model.get_or_request_feature_impact(max_wait=60000, row_count=100000)
        feature_impacts[i][model_id] = feature_impact
        if save_histgrams:
            percent_tick_fmt = mtick.PercentFormatter(xmax=1.0)
            impact_df = pd.DataFrame(feature_impact)
            impact_df.sort_values(by='impactNormalized', ascending=True, inplace=True)
            bar_colors = impact_df.impactNormalized.apply(lambda x: dr_red if x < 0
                                                          else dr_blue)
            ax = impact_df.plot.barh(x='featureName', y='impactNormalized',
                                     legend=False,
                                     color=bar_colors,
                                     figsize=(16, 8))
            ax.xaxis.set_major_formatter(percent_tick_fmt)
            ax.xaxis.set_tick_params(labeltop=True)
            ax.xaxis.grid(True, alpha=0.2)
            ax.set_facecolor(dr_dark_blue)
            plt.ylabel('')
            plt.xlabel('Effect')
            plt.xlim((None, 1))  # Allow for negative impact
            plt.title('Feature Impact', y=1.04)
            plt.savefig('../output/intermediate_files/feature_impacts_rank_{rank}_{model_name}.png'.format(rank=m+1, model_name=df_model['type'].iloc[m].replace(' ', '_')))
            plt.close()
    return feature_impacts

def __get_common_top_features(feature_impacts, n_top=10):
    commonly_important_features = [[] for i in range(0, N_ITERATIONS)]
    for i in range(0, N_ITERATIONS):
        best_models_important_features = None
        for m, (model_id, important_features) in enumerate(feature_impacts[i].items()):
            important_features = [feature['featureName'] for feature in important_features][:n_top]
            commonly_important_features[i] += important_features
            if m == 0:
                best_models_important_features = important_features
        commonly_important_features[i] = list(set(commonly_important_features[i])) # 重複削除
        commonly_important_features[i] = [f for f in best_models_important_features if f in commonly_important_features[i]] # ソート
    return commonly_important_features
            
feature_impacts=__check_impact(project_id_list)
commonly_important_features=__get_common_top_features(feature_impacts, n_top=N_TOP_FEATURES)
pd.to_pickle(commonly_important_features, "../output/intermediate_files/commonly_important_features.pkl") # 共通で重要な特徴量の一覧を保存しておく。読み出し：arr_tmp = pd.read_pickle("commonly_important_features.pkl")

Completed seed1
Request and get feature impact for seed1


In [34]:
### 6. 最も精度の良いモデルの特徴量ごとの作用を算出

def __get_file_name(feature_name, graph_type):
    if graph_type == 'partial_dependence':
        return '../output/intermediate_files/partial_dependence_{}.png'.format(feature_name)
    elif graph_type == 'histgram':
        return '../output/intermediate_files/partial_dependence_hist_{}.png'.format(feature_name)
    else:
        return 
    
def __get_feature_effects(project_id_list):
    best_model_feature_effects = [None]*N_ITERATIONS
    # request
    for i in range(0, N_ITERATIONS):
        if project_id_list[i]=="error":
            best_model_feature_effects[i]="error"
            continue
        project = dr.Project.get(project_id=project_id_list[i])
        df_model = __get_model_scores(project)
        df_model['type'] = df_model['type'].str.replace(' / ', ', ')
        target_pct = sorted(df_model['sample_pct'].unique())[::-1][1]
        df_model = df_model[df_model['sample_pct'] == target_pct]
        if ms.METRICS == 'AUC':
            df_model = df_model.sort_values('cv', ascending=False)
        model = dr.Model.get(project_id_list[i], df_model.iloc[0, 3])
        model.get_feature_effect_metadata()
        model.request_feature_effect()
    # get results
    for i in range(0, N_ITERATIONS):
        if project_id_list[i]=="error":
            best_model_feature_effects[i]="error"
            continue
        project = dr.Project.get(project_id=project_id_list[i])
        jobs_list = project.get_all_jobs()
        print("Waiting feature effect seed "+str(i+1))
        for job in jobs_list:
            job.wait_for_completion(max_wait=60000)
        df_model = __get_model_scores(project)
        df_model['type'] = df_model['type'].str.replace(' / ', ', ')
        target_pct = sorted(df_model['sample_pct'].unique())[::-1][1]
        df_model = df_model[df_model['sample_pct'] == target_pct]
        if ms.METRICS == 'AUC':
            df_model = df_model.sort_values('cv', ascending=False)
        model = dr.Model.get(project_id_list[i], df_model.iloc[0, 3])
        best_model_feature_effects[i] = model.get_feature_effect("validation")

    return best_model_feature_effects

def __create_effect_plot(important_features, feature_effects):
    max_dependence=0
    min_dependence=100
    
    for x in important_features:
        effect=pd.DataFrame()
        for i in range(0,N_ITERATIONS):
            if project_id_list[i]=="error":
                continue
            df=pd.io.json.json_normalize(feature_effects[i])
            if x in list(df["feature_name"]):
                z=pd.DataFrame()
                z=list(df[df["feature_name"]==x]["partial_dependence.data"])

                def flatten_2d(data):
                    for block in data:
                        for elem in block:
                            yield elem

                z = list(flatten_2d(z))
                z=  pd.DataFrame(z)
                effect[["label","seed"+str(i+1)]]=z[["label","dependence"]]
                if max_dependence<z["dependence"].max():
                    max_dependence=z["dependence"].max() 
                if min_dependence>z["dependence"].min():
                    min_dependence=z["dependence"].min() 
                if list(df[df["feature_name"]==x]["feature_type"])==["numeric"]:
                    effect["label"]=effect["label"].astype("float")
                else:
                    def order(effect,df_x,x):
                        l_order = list(df_x[x].value_counts().index)
                        j=0
                        l_order_dic={}
                        for x in l_order:
                            l_order_dic[x]=j
                            j+=1
                        effect['order'] = effect['label'].map(l_order_dic)
                        effect["order"]=effect["order"].fillna(j)
                        effect=effect.sort_values("order")
                        effect.drop(columns="order",inplace=True)
                        return effect
                    
                    effect=order(effect,df_x,x)
                    
        if x in list(df["feature_name"]):
            effect.to_csv('../output/intermediate_files/partial_dependence_{}.csv'.format(x),index=False,encoding='utf-8-sig')
            
    for x in important_features:
        if x in list(df["feature_name"]):
            effect=pd.read_csv('../output/intermediate_files/partial_dependence_{}.csv'.format(x),encoding='utf-8-sig')
            
            if effect["label"].nunique()!=1:
                plt.rcParams['font.family'] = 'IPAexGothic'
                plt.rcParams['font.size'] = 15
                plt.rcParams['lines.linewidth'] = 1
                plt.rcParams['figure.figsize'] = 8,6
                plt.rcParams['axes.facecolor'] = dr_dark_blue
                if list(df[df["feature_name"]==x]["feature_type"])==["numeric"]:
                    for y in effect.columns[1:]:
                        if y == effect.columns[-1]:
                            plt.rcParams['lines.linewidth'] = 5
                        plt.plot(effect["label"], effect[y], label = y, color=dr_orange)
                else:
                    for y in effect.columns[1:]:
                        if y == effect.columns[-1]:
                            plt.scatter(effect["label"], effect[y], label = y, s=100, color=dr_orange)
                        else:
                            plt.scatter(effect["label"], effect[y], label = y, color=dr_orange)
                plt.xticks(rotation=90, size='small')
                plt.title(x+"の作用")
                plt.legend()
                plt.grid()
                plt.ylim([min_dependence,max_dependence])
                plt.savefig(__get_file_name(x, graph_type='partial_dependence'),bbox_inches="tight")
                #plt.show()
                plt.close()
                
                plt.rcParams['axes.facecolor'] = dr_dark_blue
                if list(df[df["feature_name"]==x]["feature_type"])==["numeric"]:
                    df_x[x].hist(bins=10)
                    print(effect["label"].min(),effect["label"].max())
                else:
                    df_x[x].value_counts().plot(kind="bar", color=dr_blue)
                plt.title(x+"のヒストグラム")
                plt.savefig(__get_file_name(x, graph_type='histgram'),bbox_inches="tight")
                #plt.show()
                plt.close()
            
    return

def __create_wordcloud(project_id_list):
    i=0
    
    def word_cloud_plot(wc, font_path=None):
        # Stopwords usually dominate any word cloud, so we will filter them out
        dict_freq = {wc_word['ngram']: wc_word['frequency']
                     for wc_word in wc.ngrams
                     if not wc_word['is_stopword']}
        dict_coef = {wc_word['ngram']: wc_word['coefficient']
                     for wc_word in wc.ngrams}

        def color_func(*args, **kwargs):
            word = args[0]
            palette_index = int(round(dict_coef[word] * 100)) + 100
            r, g, b = colors[palette_index].get_rgb()
            return 'rgb({:.0f}, {:.0f}, {:.0f})'.format(int(r * 255),
                                                        int(g * 255),
                                                        int(b * 255))

        wc_image = wordcloud.WordCloud(stopwords=set(),
                                       width=1024, height=1024,
                                       relative_scaling=0.5,
                                       prefer_horizontal=1,
                                       color_func=color_func,
                                       background_color=(0, 10, 29),
                                       font_path=font_path).fit_words(dict_freq)
        plt.imshow(wc_image, interpolation='bilinear')
        plt.axis('off')
        plt.savefig('../output/intermediate_files/word_cloud.png',bbox_inches='tight')
        plt.close()
        return 
        
    colors = [Color('#2458EB')]
    colors.extend(list(Color('#2458EB').range_to(Color('#31E7FE'), 81))[1:])
    colors.extend(list(Color('#31E7FE').range_to(Color('#8da0a2'), 21))[1:])
    colors.extend(list(Color('#a18f8c').range_to(Color('#ffad9e'), 21))[1:])
    colors.extend(list(Color('#ffad9e').range_to(Color('#d80909'), 81))[1:])
    webcolors = [c.get_web() for c in colors]
        
    from matplotlib.colors import LinearSegmentedColormap
    dr_cmap = LinearSegmentedColormap.from_list('DataRobot', webcolors, N=len(colors))
        
    project = dr.Project.get(project_id=project_id_list[i])
    models = project.get_models()
    model_with_word_cloud = None
    for model in models:
        if 'Auto-Tuned Word N-Gram Text Modeler' in model.model_type:
            try:
                model.get_word_cloud()
                model_with_word_cloud = model
                break
            except ClientError as e:
                if e.json['message'] and 'No word cloud data' in e.json['message']:
                    pass
                else:
                    raise
    wc = model_with_word_cloud.get_word_cloud(exclude_stop_words=True)
    #word_cloud_plot(wc, font_path='C:/Windows/Fonts/ipaexg.ttf')
    word_cloud_plot(wc, font_path='/Users/mon/Library/Fonts/ipaexg.ttf')
    return 

feature_effects=__get_feature_effects(project_id_list)
__create_effect_plot(commonly_important_features[0], feature_effects)
__create_wordcloud(project_id_list)

Waiting feature effect seed 1


  df=pd.io.json.json_normalize(feature_effects[i])


200000.0 3500000.0
2500000.0 15488000.000000013
234585.0 4198123.000000003
4.56 31.73
4010300.12 4989761.1499999985
0.0 6.0
4.0 24.0


In [39]:
### 7. Logファイルを作成・保存

def __create_log(project_id_list):    
    # 必要な情報の読み込み
    prj_ids, iterations, model_ranks, model_names, model_ids, sample_pcts = [], [], [], [], [], []
    metric_names, cv_scores, f1scores, precisions, recalls = [], [], [], [], []
    for i in range(0, N_ITERATIONS):
        if project_id_list[i]=="error":
            prj_ids.append("error")
            iterations.append("error")
            model_ranks.append("error")
            model_names.append("error")
            metric_names.append("error")
            cv_scores.append("error")
            model_ids.append("error")
            sample_pcts.append("error")
            f1scores.append("error")
            precisions.append("error")
            recalls.append("error")
            continue
        project = dr.Project.get(project_id=project_id_list[i])
        df_model = __get_model_scores(project)
        df_model['type'] = df_model['type'].str.replace(' / ', ', ')
        target_pct = sorted(df_model['sample_pct'].unique())[::-1][1]
        df_model = df_model[df_model['sample_pct'] == target_pct]
        if ms.METRICS == 'AUC':
            df_model = df_model.sort_values('cv', ascending=False)
        for m in range(N_BEST_MODELS):
            model = dr.Model.get(project=project_id_list[i], model_id=df_model['model_id'].iloc[m])
            roc = model.get_roc_curve('crossValidation')
            threshold = roc.get_best_f1_threshold()
            metrics = roc.estimate_threshold(threshold)
            
            prj_ids.append(project_id_list[i])
            iterations.append(i)
            model_ranks.append(m+1)
            model_names.append(df_model['type'].iloc[m])
            model_ids.append(df_model['model_id'].iloc[m])
            sample_pcts.append(df_model['sample_pct'].iloc[m])
            metric_names.append(project.metric)
            cv_scores.append(model.metrics[project.metric]["crossValidation"])
            f1scores.append(metrics['f1_score'])
            precisions.append(metrics['true_positive_score']/(metrics['true_positive_score']+metrics['false_positive_score']))
            recalls.append(metrics['true_positive_score']/(metrics['true_positive_score']+metrics['false_negative_score']))
    # DataFrameの作成
    df_log_model = pd.DataFrame({
        'project_id': prj_ids,
        'random_seed': iterations,
        'model_id': model_ids,
        'model_rank': model_ranks,
        'model_name': model_names,
        'sample_pct': sample_pcts,
        'metric': metric_names,
        'cv_score': cv_scores,
        'f1_score': f1scores,
        'precision': precisions,
        'recall': recalls
    })
    df_log_model['image_path_impact'] = '../output/intermediate_files/feature_impacts_rank_' + df_log_model['model_rank'].astype(str) + '_' + df_log_model['model_name'].str.replace(' ', '_') + '.png'
    return df_log_model
        
df_log_model = __create_log(project_id_list) 
df_log_model.to_csv('../output/intermediate_files/model_info.csv',index=False,encoding='utf-8-sig')