In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
val = pd.read_csv('PJT002_validation.csv')
train = pd.read_csv('PJT002_train.csv')
test = pd.read_csv('PJT002_test.csv')

train_id = set (train['id'].unique())
val_id = set (val['id'].unique())
val.shape, train.shape, test.shape

((6898, 180), (59199, 180), (2957, 180))

In [3]:
df = pd.concat([val,train,test])
df['fr_yn'] = df['fr_yn'].apply(lambda x: 0 if x == 'N' else 1 if x == 'Y' else -1)
df.shape

(69054, 180)

In [4]:
def fillna_linear(data,time_column,column_name):
    data[time_column] = pd.to_datetime(data[time_column])
    data = data.sort_values(by = time_column)
    data[column_name] = data[column_name].interpolate(method = 'linear')

for column_name in ['tmprtr','prcpttn','wnd_spd','wnd_drctn','hmdt']:
    fillna_linear(df,'dt_of_fr',column_name)

In [5]:
def drop_feature(data, column_name, drop_rate = 0.5):
    percent = data[column_name].isnull().sum() / len(data) 
    print ('{0}: {1}'.format(column_name,round(percent,2)))
    if percent >= drop_rate:
        del df[column_name]
        print ('---------->DELETE: {0}'.format(column_name))

In [6]:
for i in df:
    drop_feature(df, i, drop_rate = 0.5)
print (df.shape)

ahsm_dstnc: 0.0
bldng_ar: 0.0
bldng_ar_prc: 0.38
bldng_archtctr: 0.42
bldng_cnt: 0.0
bldng_cnt_in_50m: 0.0
bldng_us: 0.42
bldng_us_clssfctn: 0.46
blk_dngrs_thng_mnfctr_yn: 0.83
---------->DELETE: blk_dngrs_thng_mnfctr_yn
cctv_dstnc: 0.0
cctv_in_100m: 0.0
cltrl_hrtg_yn: 0.83
---------->DELETE: cltrl_hrtg_yn
dngrs_thng_yn: 0.83
---------->DELETE: dngrs_thng_yn
dt_of_athrztn: 0.42
dt_of_fr: 0.0
ele_engry_us_201401: 0.42
ele_engry_us_201402: 0.42
ele_engry_us_201403: 0.42
ele_engry_us_201404: 0.42
ele_engry_us_201405: 0.42
ele_engry_us_201406: 0.42
ele_engry_us_201407: 0.42
ele_engry_us_201408: 0.42
ele_engry_us_201409: 0.42
ele_engry_us_201410: 0.42
ele_engry_us_201411: 0.42
ele_engry_us_201412: 0.42
ele_engry_us_201501: 0.42
ele_engry_us_201502: 0.42
ele_engry_us_201503: 0.42
ele_engry_us_201504: 0.42
ele_engry_us_201505: 0.42
ele_engry_us_201506: 0.42
ele_engry_us_201507: 0.42
ele_engry_us_201508: 0.42
ele_engry_us_201509: 0.42
ele_engry_us_201510: 0.42
ele_engry_us_201511: 0.42
ele_eng

In [7]:
# str이 너무 다양하면 안맞음
def seperate_data_type(data,standard):
    binominal = []
    continuous = []
    for i in data.columns:
        if data[i].nunique() < standard:
            binominal.append(i)
        else:
            continuous.append(i)
    print ('binominal_data:', len(binominal), '개')
    print ('binominal_data:', binominal)
    print ('-------------------------------------------------------------------------------')
    #binominal_data = data[binominal]
    print ('continuous_data:', len(continuous), '개')
    print ('continuous_data:', continuous)
    #continuous_data = data[continuous]
    return binominal,continuous

In [8]:
binominal_list,continuous_list = seperate_data_type(df,50)

binominal_data: 15 개
binominal_data: ['bldng_archtctr', 'bldng_us', 'bldng_us_clssfctn', 'cctv_in_100m', 'fr_mn_cnt', 'fr_wthr_fclt_in_100m', 'fr_yn', 'jmk', 'lnd_us_sttn_nm', 'mlt_us_yn', 'rd_sd_nm', 'rgnl_ar_nm', 'rgnl_ar_nm2', 'ttl_dwn_flr', 'wnd_drctn']
-------------------------------------------------------------------------------
continuous_data: 142 개
continuous_data: ['ahsm_dstnc', 'bldng_ar', 'bldng_ar_prc', 'bldng_cnt', 'bldng_cnt_in_50m', 'cctv_dstnc', 'dt_of_athrztn', 'dt_of_fr', 'ele_engry_us_201401', 'ele_engry_us_201402', 'ele_engry_us_201403', 'ele_engry_us_201404', 'ele_engry_us_201405', 'ele_engry_us_201406', 'ele_engry_us_201407', 'ele_engry_us_201408', 'ele_engry_us_201409', 'ele_engry_us_201410', 'ele_engry_us_201411', 'ele_engry_us_201412', 'ele_engry_us_201501', 'ele_engry_us_201502', 'ele_engry_us_201503', 'ele_engry_us_201504', 'ele_engry_us_201505', 'ele_engry_us_201506', 'ele_engry_us_201507', 'ele_engry_us_201508', 'ele_engry_us_201509', 'ele_engry_us_201510

In [9]:
df_1 = df[binominal_list]

In [10]:
def imputer_cate_most_frequent(data,column_name):
    data[column_name] = data[column_name].fillna(data[column_name].value_counts().index[0])

In [11]:
for column_name in df_1:
    imputer_cate_most_frequent(df_1,column_name)

In [12]:
def dummy_data(data, columns):
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix = column)], axis=1)
        data = data.drop(column, axis=1)
    return data

In [13]:
dummy_columns = list (set(binominal_list) - set(['fr_yn']))
df_1 = dummy_data(df_1, dummy_columns)

In [14]:
df_2 = df[continuous_list]

In [15]:
def imputer_cont_trim_mean(data,column_name):
    from scipy import stats
    try:
        data[column_name] = data[column_name].fillna(stats.trim_mean(data[column_name].dropna(), 0.2))
        data[column_name] = round(data[column_name],2)
    except:
        print ('not_changed ->',column_name)

In [16]:
for column_name in df_2:
    imputer_cont_trim_mean(df_2,column_name)

not_changed -> dt_of_athrztn
not_changed -> dt_of_fr
not_changed -> emd_nm


In [17]:
df_2['year'] = df_2['dt_of_fr'].dt.year 
df_2['month'] = df_2['dt_of_fr'].dt.month
df_2['day'] = df_2['dt_of_fr'].dt.day
df_2['hour'] = df_2['dt_of_fr'].dt.hour

In [18]:
df_2 = df_2.drop(['dt_of_athrztn','dt_of_fr','emd_nm'],1)

In [19]:
da = pd.concat([df_1,df_2],1)
da = da.reset_index(drop = True)
dt = da.copy()
dt.shape

(69054, 438)

In [20]:
# 데이터 나누기
df_val = dt[dt['id'].isin(val_id)]
df_train = dt[dt['id'].isin(train_id)]
df_test = dt[dt['fr_yn'] == -1]
# 마무으리
df_test = df_test.drop(['fr_yn'],1)
df_val = df_val.drop('id',1)
df_train = df_train.drop('id',1)
df_test = df_test.drop('id',1)

df_val.shape, df_train.shape, df_test.shape

((6898, 437), (59199, 437), (2957, 436))

In [21]:
X = df_train.drop('fr_yn',1)
Y = df_train[['fr_yn']]
val_X = df_val.drop('fr_yn',1)
val_Y = df_val[['fr_yn']]

from sklearn.model_selection import *
from sklearn.metrics import *
Train_X,Test_X,Train_Y,Test_Y = train_test_split(X, Y, test_size = 0.1, random_state = 13)

In [22]:
import xgboost as xgb

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    xgb_model = xgb.XGBClassifier(objective="binary:logistic", learning_rate = 0.15, max_depth = 5,
                                 max_delta_step = 7, max_bin = 512, eval_metric = 'poisson-nloglik',
                                 random_state = 13)
    xgb_model.fit(Train_X, Train_Y,
                eval_set=[(Test_X, Test_Y)],
                verbose = False,
                early_stopping_rounds = 1000)
    y_pred = xgb_model.predict(Test_X)
    predicted = [round(value) for value in y_pred]
    predicted = xgb_model.predict(val_X)
    print ('val_set - precision: {0}'.format(precision_score(val_Y,predicted)))
    print ('val_set - recall: {0}'.format(recall_score(val_Y,predicted)))
    print ('val_set - fl: {0}'.format(f1_score(val_Y,predicted)))

val_set - precision: 0.46200980392156865
val_set - recall: 0.5927672955974843
val_set - fl: 0.5192837465564738


In [23]:
import lightgbm as lgb

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    lgb_model = lgb.LGBMClassifier(objective='binary', boosting_type='gbdt',learning_rate = 0.15, n_estimators = 60,
                                   max_bin = 225, metric='auc', num_leaves = 17,default = 'is_unbalance',
                                   random_state = 13)
    lgb_model.fit(Train_X, Train_Y,
                eval_set=[(Test_X, Test_Y)],
                verbose = False,
                early_stopping_rounds = 1000)
    y_pred = lgb_model.predict(Test_X)
    predicted = [round(value) for value in y_pred]
    predicted = lgb_model.predict(val_X)
    print ('val_set - precision: {0}'.format(precision_score(val_Y,predicted)))
    print ('val_set - recall: {0}'.format(recall_score(val_Y,predicted)))
    print ('val_set - fl: {0}'.format(f1_score(val_Y,predicted)))

val_set - precision: 0.46545226130653267
val_set - recall: 0.5825471698113207
val_set - fl: 0.5174581005586593


In [24]:
from sklearn.ensemble import VotingClassifier 

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    vo_model = VotingClassifier(estimators = [('XGB', xgb_model),('LGBM', lgb_model)],
                              voting = 'soft')
    vo_model.fit(Train_X, Train_Y)
    predicted = [round(value) for value in y_pred]
    predicted = vo_model.predict(val_X)
    print ('val_set - precision: {0}'.format(precision_score(val_Y,predicted)))
    print ('val_set - recall: {0}'.format(recall_score(val_Y,predicted)))
    print ('val_set - fl: {0}'.format(f1_score(val_Y,predicted)))

val_set - precision: 0.462640099626401
val_set - recall: 0.5841194968553459
val_set - fl: 0.5163307852675469


In [25]:
def sub(model,test):
    predicted = model.predict(test)
    sub = pd.DataFrame({'fr_yn': predicted})
    sub['fr_yn'] = sub['fr_yn'].apply(lambda x: 'Y' if x == 1 else 'N')
    sub.to_csv('화재예측과제_Submission.csv', index = False)
    print (sub['fr_yn'].value_counts())

In [26]:
sub(vo_model,df_test)

N    2295
Y     662
Name: fr_yn, dtype: int64
