In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
def make_submitfile(submit_date = "0815",filename = "submission", version = "00", data = []):
    submit = pd.read_csv("data/sample_submit.csv", header=None)
    # 提出ファイルの2列目を予測値で置き換えます。
    submit[1] = data
    submit.to_csv(f"output/{submit_date}{filename}_{version}.csv", index=False, header=None)

In [3]:
from IPython.display import clear_output

# ライブラリのimportを行います
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
import json

import re
import unicodedata
from kanjize import kanji2number
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder


from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import lightgbm as lgb
import logging
logging.getLogger("lightgbm").setLevel(logging.WARNING)
from sklearn.metrics import accuracy_score

from imblearn.under_sampling import RandomUnderSampler
from tqdm.auto import tqdm

In [4]:
import src.preprocess as pre
import src.feature_selection as fs
import src.stacking as stacking
import src.classification_models as models

In [5]:
class RESULT_class:
    def __init__(self):
        self.RESULT = pd.DataFrame()

    def addResult(self, model_name,
                  feature_selection,
                  under_sampling,
                  PCAd,
                  columns,
                  skf_num,
                  skf_rand,
                  score,
                  oof_valid,
                  oof_test,
                  date = "0821"):

        tmp = pd.DataFrame([[model_name,feature_selection,under_sampling,PCAd,columns,skf_num,skf_rand,score,oof_valid,oof_test,date]])
        tmp.columns = ["model_name","feature_selection","under_sampling","PCA(d)","columns","skf_num","skf_rand","score","oof_valid","oof_test","date"]
        print(len(self.RESULT))
        if len(self.RESULT)>0:
            self.RESULT = pd.concat([self.RESULT, tmp])
        else:
            self.RESULT = tmp

Rc = RESULT_class()

# Dataload

In [6]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [7]:
df_dict = {"train":train.copy(),
           "synthetic":train.copy(),
          "test":test.copy()}

- id: 営業リストの顧客ID
- Age: 顧客の年齢
- TypeofContact: 顧客への連絡方法
- CityTier: 都市層(1>2>3)
- DurationOfPitch: 営業担当者による顧客への売り込み時間
- Occupation: 顧客のご職業
- Gender: 顧客の性別
- NumberOfPersonVisiting: 一緒に旅行を予定している人数の合計
- NumberOfFollowups: セールストーク後に営業担当者が行ったフォローアップの合計数
- ProductPitched: 営業担当者による商品の売り込み
- PreferredPropertyStar: 顧客によるホテル施設の優先評価
- NumberOfTrips: 顧客の年間旅行数
- Passport: パスポートの所持
- PitchSatisfactionScore: 売り込みの満足度スコア
- Designation: 現在の組織における顧客の指定
- MonthlyIncome: 顧客の月収
- customer_info: 顧客の情報のメモ(婚姻状況や車の有無、旅行の子どもの同伴の有無について記載されている)
- ProdTaken: 目的変数

In [8]:
print(train.shape)
print(test.shape)

(3489, 18)
(3489, 17)


# 基本変換

In [9]:
# 似た文字のマッピング辞書
char_map = {
    'ς': 'c',
    'а': 'a', 'А': 'A','α':'a',
    'е': 'e', 'Е': 'E',
    'о': 'o', 'О': 'O',
    'с': 'c', 'С': 'C',
    'р': 'p', 'Р': 'P',
    'υ': 'y', 'Υ': 'Y',
    'ꓢ': 'S', 'ꓤ': 'R',
    'Ѕ': 'S','ѕ':'s',
    'Ꭰ': 'D', 'ᗞ': 'D', 'ꭰ': 'D', '𝙳': 'D',
    'Տ': 'S',
    '𝖺': 'a', '𝘤': 'c', '𝔡': 'd',
    '𐊡': 'B', 'в':'B','β':'B',
    'μ':'m',
    'ε':'E',
    'K':'K',
    'ı':'i',
    'n':'n','ո':'n',
    'ｓ': 's', 'տ':'s','ι':'i',
    '×': 'x'
}

# 特定の文字列を正規化するための辞書
pattern_map = {
    r'(?i)basic': 'Basic',
    r'(?i)standard': 'Standard',
    r'(?i)super deluxe': 'Super Deluxe',
    r'(?i)deluxe': 'Deluxe',
    r'(?i)king': 'King',
    r'(?i)de\|uxe': 'Deluxe',
}

In [10]:
for kind in ["train","test"]:
    df_dict[kind]["Age"] = df_dict[kind]["Age"].map(lambda x:pre.Age(x))
    df_dict[kind]["Age"] = df_dict[kind]["Age"].map(lambda x: pre.age_distribution(df_dict[kind], x) if x < 0 else x)
    df_dict[kind]["DurationOfPitch"] = df_dict[kind]["DurationOfPitch"].map(lambda x:pre.DurationOfPitch(x))
    df_dict[kind]["Gender"] = df_dict[kind]["Gender"].map(lambda x:pre.Gender(x))
    df_dict[kind]["ProductPitched"] = df_dict[kind]["ProductPitched"].map(lambda x:pre.ProductPitched(x, char_map, pattern_map))
    df_dict[kind]["NumberOfTrips"] = df_dict[kind]["NumberOfTrips"].map(lambda x:pre.NumberOfTrips(x))
    df_dict[kind]["Designation"] = df_dict[kind]["Designation"].map(lambda x:pre.ProductPitched(x, char_map, pattern_map))
    df_dict[kind]["MonthlyIncome"] = df_dict[kind]["MonthlyIncome"].map(lambda x:pre.MonthlyIncome(x)).astype(float)
    df_dict[kind]["MonthlyIncome"] = df_dict[kind]["MonthlyIncome"].map(lambda x: pre.Income_distribution(df_dict[kind], x) if x % 1000 == 0 else x)
    df_dict[kind]["NumberOfFollowups"] = df_dict[kind]["NumberOfFollowups"].map(lambda x:pre.NumberOfFollowups(x)).astype(float)
    df_dict[kind] = pre.customer_info(df_dict[kind]).drop(["customer_info"],axis=1)

# Gans

In [11]:
# SDV
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
# 診断用
from sdv.evaluation.single_table import run_diagnostic
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot

In [12]:
# データフレームからメタデータを自動抽出
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df_dict["train"])
metadata.update_column(column_name="CityTier",sdtype="categorical")

In [13]:
# インスタンス生成
ctgan = CTGANSynthesizer(metadata,epochs=1000,verbose=True) 
# 学習
ctgan.fit(df_dict["train"])

Gen. (-1.27) | Discrim. (-0.25): 100%|█████████████████████████████████████████████| 1000/1000 [06:56<00:00,  2.40it/s]


In [14]:
synthetic_data = ctgan.sample(100000)
# 生成したデータを確認
synthetic_data.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,ProdTaken,marriage,car,child
0,807994768,60.0,Company Invited,2,9.0,Small Business,male,3.0,5.0,standard,3.0,2.0,0,1,senior manager,441593.0,0,未婚,車未所有,2.0
1,746439230,33.0,Company Invited,3,14.0,Large Business,female,1.0,3.0,basic,4.0,1.0,0,4,executive,257463.0,0,未婚,車未所有,0.0
2,201363792,42.0,Self Enquiry,2,13.0,Small Business,female,2.0,4.0,standard,4.0,2.0,0,1,senior manager,398940.0,1,未婚,車未所有,0.0
3,364823003,32.0,Company Invited,1,14.0,Large Business,female,2.0,3.0,standard,4.0,2.0,1,2,senior manager,337741.0,1,未婚,車未所有,0.0
4,726973888,41.0,Self Enquiry,2,15.0,Small Business,male,2.0,,deluxe,3.0,3.0,0,3,manager,303355.0,0,結婚済み,車未所有,1.0


In [15]:
diagnostic_report = run_diagnostic(
    real_data=train,
    synthetic_data=synthetic_data,
    metadata=metadata)

Generating report ...

(1/2) Evaluating Data Validity: |████████████████████████████████████████████████████| 20/20 [00:00<00:00, 512.38it/s]|
Data Validity Score: 85.8%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 998.88it/s]|
Data Structure Score: 80.95%

Overall Score (Average): 83.37%



In [16]:
#diagnostic_report.get_details(property_name='Coverage')

In [17]:
quality_report = evaluate_quality(
    real_data=df_dict["train"],
    synthetic_data=synthetic_data,
    metadata=metadata)

Generating report ...

(1/2) Evaluating Column Shapes: |████████████████████████████████████████████████████| 20/20 [00:00<00:00, 123.35it/s]|
Column Shapes Score: 90.55%

(2/2) Evaluating Column Pair Trends: |██████████████████████████████████████████████| 190/190 [00:05<00:00, 33.44it/s]|
Column Pair Trends Score: 85.33%

Overall Score (Average): 87.94%



In [18]:
quality_report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Score
0,Age,KSComplement,0.902094
1,TypeofContact,TVComplement,0.846306
2,CityTier,TVComplement,0.937822
3,DurationOfPitch,KSComplement,0.874748
4,Occupation,TVComplement,0.845442
5,Gender,TVComplement,0.909276
6,NumberOfPersonVisiting,TVComplement,0.96397
7,NumberOfFollowups,TVComplement,0.875719
8,ProductPitched,TVComplement,0.974126
9,PreferredPropertyStar,TVComplement,0.935466


In [19]:
fig = get_column_plot(
    real_data=df_dict["train"],
    synthetic_data=synthetic_data,
    column_name='Age',
    metadata=metadata
)
    
fig.show()

In [20]:
fig = get_column_plot(
    real_data=df_dict["train"],
    synthetic_data=synthetic_data,
    column_name='ProdTaken',
    metadata=metadata
)
    
fig.show()

In [21]:
df_dict["synthetic"] = synthetic_data