# 導入資料

In [None]:
# Install google-drive-ocamlfuse (from the alessandro-strada PPA) together with FUSE.
# NOTE(review): `2>&1 > /dev/null` silences stdout only; stderr is still shown in the cell output.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the current Colab user and fetch the application-default credentials.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: run ocamlfuse headless and keep only the output line(s) containing "URL".
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it, then pipe it into ocamlfuse.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}


In [None]:
# Create a local directory named "drive" and mount Google Drive on it
# (this directory acts as the root of the cloud drive).
!mkdir -p drive
!google-drive-ocamlfuse drive

In [None]:
# Switch the working directory to the notebook folder on the mounted drive.
# "drive" is the mount root created earlier; the path is relative to the
# current working directory.
import os
notebook_dir = os.path.join("drive", "Colab Notebooks")
os.chdir(notebook_dir)

In [None]:
# Load the input CSV (UTF-8 encoded) into `data` and display it.
import pandas as pd
csv_path = "WY_fish_all_2019-12-29_wash.csv"
data = pd.read_csv(csv_path, encoding='utf-8')
data

# 資料理解、特徵提取


In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame

In [None]:
# Report the number of missing values per column. The counts are printed
# explicitly because a notebook cell only auto-displays its last expression.
print(data.isna().sum())
# Drop every row that contains a missing value, then rebuild a contiguous
# 0..n-1 index so later label-based lookups do not hit the gaps left behind
# by the dropped rows.
data = data.dropna(axis=0, how='any').reset_index(drop=True)

In [None]:
# Data cleaning: remove the next_outlay column.
# (Original note: the goal is to predict whether the next bet will increase.)
data = data.drop(columns=['next_outlay'])

In [None]:
# Show the dtype of every column
print(data.dtypes)

In [None]:
# Show summary statistics (distribution overview) of the columns
data.describe()

In [None]:
# 資料視覺化
df_for_plot = pd.concat([data['gameid'],data['next_outlay_will']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="gameid", y="next_outlay_will", data=df_for_plot)
fig.axis(ymin=0,ymax=3)

In [None]:
df_for_plot = pd.concat([data['gameid'],data['backvalue']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="gameid", y="backvalue", data=df_for_plot)
fig.axis(ymin=0,ymax=3)

In [None]:
# Add time-normalised versions of the cumulative features: each source column
# is divided by the `time` column and stored as float64 in "<column>_time".
#
# The division is done on whole columns instead of a per-row loop with
# data[col][i] = ...: that chained assignment looks rows up by index label
# (so it fails when the index has gaps), may write to a temporary copy, and
# is far slower.
for _source_col in (
    'add_addvalue',
    'add_outlay',
    'add_outlay_percentage',
    'add_addvalue_percentage',
):
    data[_source_col + '_time'] = (data[_source_col] / data['time']).astype('float64')

In [None]:
# Bucket the time value into 5-wide intervals
def time_cut(time):
    """Map a time value to the upper bound of its 5-wide bucket.

    Values up to 5 (including negatives) map to 5, values in (5, 10] map to
    10, and so on up to 55. Anything greater than 55 maps to the overflow
    bucket 56. A value that fails every comparison (e.g. NaN) yields None.
    """
    for upper in range(5, 60, 5):  # 5, 10, ..., 55
        if time <= upper:
            return upper
    if time >= 55:
        return 56

def time_add_cut(time):
    """Return the upper bound of the 5-wide bucket containing ``time``.

    The first bound in 5, 10, ..., 55 that is >= ``time`` is returned. Values
    above 55 fall into the overflow bucket 56. A value that fails every
    comparison (e.g. NaN) yields None.
    """
    bucket = next((bound for bound in range(5, 56, 5) if time <= bound), None)
    if bucket is not None:
        return bucket
    if 0 <= time:
        return 56

# Assign every row its time bucket. Series.map applies time_cut to each value
# and keeps the result aligned with the frame's index, avoiding the chained
# data[col][i] = ... assignment (label lookups that break on a gapped index
# and writes that may land on a temporary copy).
data['time_zone'] = data['time'].map(time_cut)


In [None]:
# 將 next_outlay_will 分級(以四分位距分級)
def will_level(data):
    """Grade a next_outlay_will value into levels 1-4.

    Cut points are 0.79, 1 and 1.22; each level covers [lower, upper), and
    level 4 covers everything from 1.22 upward. A value that fails every
    comparison (e.g. NaN) yields None.
    """
    cut_points = (0.79, 1, 1.22)
    for grade, upper in enumerate(cut_points, start=1):
        if data < upper:
            return grade
    if data >= cut_points[-1]:
        return len(cut_points) + 1

# Grade every row's next_outlay_will into the 4-level scale. Series.map keeps
# the result aligned with the frame's index and avoids the chained
# data[col][i] = ... assignment (label lookups that break on a gapped index
# and writes that may land on a temporary copy).
data['outlay_will_level'] = data['next_outlay_will'].map(will_level)

In [None]:
# Convert next_outlay_will into a binary label (0 below 0.51, otherwise 1)
def will_level_loss(data):
    """Binarise a next_outlay_will value at the 0.51 threshold.

    Returns 1 when the value is at least 0.51 and 0 when it is below.
    A value that fails both comparisons (e.g. NaN) yields None.
    """
    if data >= 0.51:
        return 1
    if data < 0.51:
        return 0

# Derive the binary outlay_will_loss label for every row. Series.map keeps the
# result aligned with the frame's index and avoids the chained
# data[col][i] = ... assignment (label lookups that break on a gapped index
# and writes that may land on a temporary copy).
data['outlay_will_loss'] = data['next_outlay_will'].map(will_level_loss)

In [None]:
# Grade next_outlay_will into 16 levels (cut points at the 16-quantiles)
def will_level_16(data):
    """Grade a next_outlay_will value into levels 1-16.

    Fifteen ascending cut points split the number line into 16 half-open
    ranges [lower, upper); level 16 covers everything from 2.24 upward.
    A value that fails every comparison (e.g. NaN) yields None.
    """
    cut_points = (
        0.18, 0.51, 0.68, 0.79, 0.84,
        0.91, 0.96, 1.00, 1.01, 1.05,
        1.13, 1.22, 1.34, 1.58, 2.24,
    )
    for grade, upper in enumerate(cut_points, start=1):
        if data < upper:
            return grade
    if data >= cut_points[-1]:
        return len(cut_points) + 1

# Grade every row's next_outlay_will into the 16-level scale. Series.map keeps
# the result aligned with the frame's index and avoids the chained
# data[col][i] = ... assignment (label lookups that break on a gapped index
# and writes that may land on a temporary copy).
data['outlay_will_level_16'] = data['next_outlay_will'].map(will_level_16)
  

In [None]:
# 看變數影響權重
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.figure(figsize=(24,20))
sns.heatmap(data.corr(), annot=True, cmap="RdBu_r")

# 時間觀察特徵_視覺化


In [None]:
# 遊戲時間切割視覺化
# next_outlay_will
df_for_plot = pd.concat([data['time_zone'],data['next_outlay_will']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="time_zone", y="next_outlay_will", data=df_for_plot)
fig.axis(ymin=0,ymax=2)

In [None]:
# backvalue
df_for_plot = pd.concat([data['time_zone'],data['backvalue']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="time_zone", y="backvalue", data=df_for_plot)
fig.axis(ymin=0,ymax=2)

In [None]:
# add_backvalue
df_for_plot = pd.concat([data['time_zone'],data['add_backvalue']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="time_zone", y="add_backvalue", data=df_for_plot)
fig.axis(ymin=0,ymax=2)


In [None]:
# add_addv_outlay_life
df_for_plot = pd.concat([data['time_zone'],data['add_addv_outlay_life']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="time_zone", y="add_addv_outlay_life", data=df_for_plot)
fig.axis(ymin=0,ymax=100)

In [None]:
# addvalue_percentage
df_for_plot = pd.concat([data['time_zone'],data['addvalue_percentage']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="time_zone", y="addvalue_percentage", data=df_for_plot)
fig.axis(ymin=-0.5,ymax=0.5)

In [None]:
# outlay_percentage
df_for_plot = pd.concat([data['time_zone'],data['addvalue_percentage']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="time_zone", y="addvalue_percentage", data=df_for_plot)
fig.axis(ymin=-0.5,ymax=0.5)

In [None]:
# add_addvalue_time
df_for_plot = pd.concat([data['time_zone'],data['add_addvalue_time']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="time_zone", y="add_addvalue_time", data=df_for_plot)
fig.axis(ymin=-500,ymax=500)

In [None]:
# add_outlay_time
df_for_plot = pd.concat([data['time_zone'],data['add_outlay_time']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="time_zone", y="add_outlay_time", data=df_for_plot)
fig.axis(ymin=0,ymax=10000)

In [None]:
# add_outlay_percentage_time
df_for_plot = pd.concat([data['time_zone'],data['add_outlay_percentage_time']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="time_zone", y="add_outlay_percentage_time", data=df_for_plot)
fig.axis(ymin=0,ymax=1)

In [None]:
# add_addvalue_percentage_time
df_for_plot = pd.concat([data['time_zone'],data['add_addvalue_percentage_time']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="time_zone", y="add_addvalue_percentage_time", data=df_for_plot)
fig.axis(ymin=-0.25,ymax=0.25)

# 分級觀察特徵


In [None]:
# oldvalue 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['oldvalue']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="oldvalue", data=df_for_plot)
fig.axis(ymin=0,ymax=1000)

In [None]:
# addvalue 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['addvalue']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="addvalue", data=df_for_plot)
fig.axis(ymin=-25,ymax=25)

In [None]:
# outlay 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['outlay']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="outlay", data=df_for_plot)
fig.axis(ymin=0,ymax=100)

In [None]:
# income 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['income']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="income", data=df_for_plot)
fig.axis(ymin=0,ymax=100)

In [None]:
# add_addvalue 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['add_addvalue']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="add_addvalue", data=df_for_plot)
fig.axis(ymin=-5000,ymax=5000)

In [None]:
# add_outlay 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['add_outlay']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="add_outlay", data=df_for_plot)
fig.axis(ymin=0,ymax=200000)

In [None]:
# add_income 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['add_income']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="add_income", data=df_for_plot)
fig.axis(ymin=0,ymax=200000)

In [None]:
# backvalue 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['backvalue']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="backvalue", data=df_for_plot)
fig.axis(ymin=0,ymax=2)

In [None]:
# add_backvalue 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['add_backvalue']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="add_backvalue", data=df_for_plot)
fig.axis(ymin=0.85,ymax=1.15)

In [None]:
# outlay_percentage 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['outlay_percentage']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="outlay_percentage", data=df_for_plot)
fig.axis(ymin=0,ymax=1)

In [None]:
# add_outlay_percentage 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['add_outlay_percentage']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="add_outlay_percentage", data=df_for_plot)
fig.axis(ymin=0,ymax=2500)

In [None]:
# addvalue_percentage 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['addvalue_percentage']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="addvalue_percentage", data=df_for_plot)
fig.axis(ymin=-0.5,ymax=0.5)

In [None]:
# add_addvalue_percentage 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['add_addvalue_percentage']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="add_addvalue_percentage", data=df_for_plot)
fig.axis(ymin=-100,ymax=100)

In [None]:
# time 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['time']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="time", data=df_for_plot)
fig.axis(ymin=0,ymax=5000)

In [None]:
# add_addv_outlay_life 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level'],data['add_addv_outlay_life']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level", y="add_addv_outlay_life", data=df_for_plot)
fig.axis(ymin=0,ymax=100)

# 易流失玩家行為分析


In [None]:
# Split the rows into one frame per outlay_will_level grade (1 to 4).
data1, data2, data3, data4 = (
    data[data["outlay_will_level"] == grade] for grade in (1, 2, 3, 4)
)

In [None]:
# Summary statistics of the full data set (baseline for comparison)
data.describe()

In [None]:
# Summary statistics of the rows with outlay_will_level == 1
data1.describe()

In [None]:
# Summary statistics of the rows with outlay_will_level == 2
data2.describe()

In [None]:
# Summary statistics of the rows with outlay_will_level == 3
data3.describe()

In [None]:
# Summary statistics of the rows with outlay_will_level == 4
data4.describe()

# 分級觀察特徵(16分位)

In [None]:
# oldvalue 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['oldvalue']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="oldvalue", data=df_for_plot)
fig.axis(ymin=0,ymax=1000)

In [None]:
# addvalue 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['addvalue']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="addvalue", data=df_for_plot)
fig.axis(ymin=-25,ymax=25)

In [None]:
# outlay 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['outlay']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="outlay", data=df_for_plot)
fig.axis(ymin=0,ymax=100)

In [None]:
# income 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['income']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="income", data=df_for_plot)
fig.axis(ymin=0,ymax=100)

In [None]:
# add_addvalue 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['add_addvalue']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="add_addvalue", data=df_for_plot)
fig.axis(ymin=-5000,ymax=5000)

In [None]:
# add_outlay 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['add_outlay']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="add_outlay", data=df_for_plot)
fig.axis(ymin=0,ymax=200000)

In [None]:
# add_income 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['add_income']/100], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="add_income", data=df_for_plot)
fig.axis(ymin=0,ymax=200000)

In [None]:
# backvalue 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['backvalue']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="backvalue", data=df_for_plot)
fig.axis(ymin=0,ymax=2)

In [None]:
# add_backvalue 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['add_backvalue']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="add_backvalue", data=df_for_plot)
fig.axis(ymin=0.85,ymax=1.15)

In [None]:
# outlay_percentage 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['outlay_percentage']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="outlay_percentage", data=df_for_plot)
fig.axis(ymin=0,ymax=1)

In [None]:
# add_outlay_percentage 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['add_outlay_percentage']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="add_outlay_percentage", data=df_for_plot)
fig.axis(ymin=0,ymax=2500)

In [None]:
# addvalue_percentage 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['addvalue_percentage']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="addvalue_percentage", data=df_for_plot)
fig.axis(ymin=-0.5,ymax=0.5)

In [None]:
# add_addvalue_percentage 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['add_addvalue_percentage']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="add_addvalue_percentage", data=df_for_plot)
fig.axis(ymin=-100,ymax=100)

In [None]:
# time 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['time']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="time", data=df_for_plot)
fig.axis(ymin=0,ymax=5000)

In [None]:
# add_addv_outlay_life 分級視覺化
df_for_plot = pd.concat([data['outlay_will_level_16'],data['add_addv_outlay_life']], axis=1)
fig,axes = plt.subplots(figsize=(6,12))
fig = sns.boxplot(x="outlay_will_level_16", y="add_addv_outlay_life", data=df_for_plot)
fig.axis(ymin=0,ymax=100)

# 模型前處理

In [None]:
# Convert column types (separating categorical from numeric columns):
# gameid is cast to the pandas category dtype.
data['gameid'] = data['gameid'].astype('category')

In [None]:
# One-hot encode the categorical columns, then build the modelling table by
# removing the player id, the continuous next_outlay_will value and the
# grade/bucket columns derived earlier.
data = pd.get_dummies(data)
columns_to_exclude = [
    'next_outlay_will',
    'playerid',
    'outlay_will_level',
    'outlay_will_level_16',
    'time_zone',
]
data1 = data.drop(columns=columns_to_exclude)
data1

In [None]:
# Split into training and test sets (10% held out, fixed seed).
from sklearn.model_selection import train_test_split
target_col = 'outlay_will_loss'          # column to predict
y = data1[target_col]
x = data1.drop(columns=[target_col])     # every remaining column is a feature
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.1
)

# 資料清洗

In [None]:
# 資料分布做圖
# oldvalue
sns.regplot(x=x_train["oldvalue"]/100,y=x_train["next_outlay_will"])

In [None]:
# addvalue
sns.regplot(x=x_train["addvalue"]/100,y=x_train["next_outlay_will"])

In [None]:
# outlay
sns.regplot(x=x_train["outlay"]/100,y=x_train["next_outlay_will"])

In [None]:
# add_addvalue
sns.regplot(x=x_train["add_addvalue"]/100,y=x_train["next_outlay_will"])

In [None]:
# add_outlay
sns.regplot(x=x_train["add_outlay"]/100,y=x_train["next_outlay_will"])

In [None]:
# backvalue
sns.regplot(x=x_train["backvalue"],y=x_train["next_outlay_will"])

In [None]:
# add_backvalue
sns.regplot(x=x_train["add_backvalue"],y=x_train["next_outlay_will"])

In [None]:
# outlay_percentage
sns.regplot(x=x_train["outlay_percentage"],y=x_train["next_outlay_will"])

In [None]:
# add_outlay_percentage
sns.regplot(x=x_train["add_outlay_percentage"]/1000,y=x_train["next_outlay_will"])

In [None]:
# addvalue_percentage
sns.regplot(x=x_train["addvalue_percentage"],y=x_train["next_outlay_will"])

In [None]:
# add_addvalue_percentage
sns.regplot(x=x_train["add_addvalue_percentage"],y=x_train["next_outlay_will"])

In [None]:
# time
sns.regplot(x=x_train["time"],y=x_train["next_outlay_will"])

In [None]:
# add_addv_outlay_life
sns.regplot(x=x_train["add_addv_outlay_life"]/10000,y=x_train["next_outlay_will"])

In [None]:
# add_addvalue_percentage_time
sns.regplot(x=x_train["time"],y=x_train["add_addvalue_percentage_time"])

In [None]:
# Remove outliers (placeholder cell: no outlier removal is implemented here)

In [None]:
# Summary statistics of the current `data` frame
data.describe()

# 資料不平衡(outlay_will_loss)

In [None]:
# Class imbalance handling: random over-sampling of the training set.
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros = RandomOverSampler(random_state=0)
# fit_resample is the supported API; the old fit_sample alias is no longer
# available in current imbalanced-learn releases.
x_train, y_train = ros.fit_resample(x_train, y_train)
# Show the number of samples per class after resampling.
sorted(Counter(y_train).items())

In [None]:
# Class imbalance handling: random under-sampling of the training set.
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
rus = RandomUnderSampler(random_state=0)
# fit_resample is the supported API; the old fit_sample alias is no longer
# available in current imbalanced-learn releases.
x_train, y_train = rus.fit_resample(x_train, y_train)

# Show the number of samples per class after resampling.
sorted(Counter(y_train).items())

# 決策樹

In [None]:
# Fit a depth-2 decision tree on the training set and print its accuracy on
# the test set.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

clf = DecisionTreeClassifier(max_depth=2)
clf.fit(x_train, y_train)
tree_predictions = clf.predict(x_test)
print("正確率:", accuracy_score(tree_predictions, y_test))

In [None]:
# Render the fitted decision tree with graphviz.
import graphviz
from sklearn.tree import export_graphviz
g = export_graphviz(
    clf,
    filled=True,
    # Use the feature matrix's own columns: the target column is not the last
    # column of data1, so data1.columns[:-1] mislabels the features.
    feature_names=list(x.columns),
    # Label the classes the tree was actually fitted on instead of a
    # hard-coded four-class list.
    class_names=[str(label) for label in clf.classes_],
)
graphviz.Source(g)

# 隨機森林

In [None]:
# 隨機森林
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

In [None]:
# Search for the best random-forest parameters: grid over n_estimators and
# max_depth with 3-fold cross-validation.
clf = RandomForestClassifier()
search_space = {
    "n_estimators": range(10, 20, 2),
    "max_depth": range(5, 20, 2),
}
cv = GridSearchCV(estimator=clf, param_grid=search_space, cv=3)
cv.fit(x_train, y_train)
cv.best_params_

In [None]:
# Random-forest model with fixed hyper-parameters; prints accuracy on the test set.
clf = RandomForestClassifier(max_depth=10, n_estimators=30)
# np.average(cross_val_score(clf, x_train, y_train, cv=5))
clf.fit(x_train, y_train)
forest_predictions = clf.predict(x_test)
print("正確率:", accuracy_score(forest_predictions, y_test))

In [None]:
# Print the random-forest feature importances, most important first.
importances = clf.feature_importances_
# Labels must follow the feature matrix's column order. data.columns[1:] still
# contains the dropped/target columns, so it pairs importances with the wrong
# names.
feat_labels = x.columns
indices = np.argsort(importances)[::-1]
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))

# XgBoost 

In [None]:
# XgBoost
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost import plot_importance
from pandas import DataFrame

In [None]:
# Search for the best XGBoost parameters: grid over n_estimators and
# max_depth with 5-fold cross-validation.
clf = XGBRegressor()
search_space = {
    "n_estimators": range(10, 20, 2),
    "max_depth": range(5, 20, 2),
}
cv = GridSearchCV(estimator=clf, param_grid=search_space, cv=5)
cv.fit(x_train, y_train)
cv.best_params_

In [None]:
# Train an XGBoost model with hand-picked hyper-parameters and print the
# accuracy of its rounded predictions on the test set.
# NOTE(review): this fits a regressor on the outlay_will_loss label and rounds
# the output before scoring; confirm a classifier is not what was intended.
x_test = x_test[x_train.columns]  # put the test columns in the training column order
xgb_params = dict(
    max_depth=30,
    n_estimators=10,
    min_child_weight=3,
    colsample_bylevel=0.7,
    colsample_bytree=0.75,
    gamma=0,
    learning_rate=0.033,
    max_delta_step=0,
    nthread=12,
)
clf_xg = xgb.XGBRegressor(**xgb_params)
clf_xg.fit(x_train, y_train, verbose=False)
predictions = clf_xg.predict(x_test)
rounded_predictions = predictions.round()
print("正確率:", accuracy_score(rounded_predictions, y_test))

In [None]:
# Print the XGBoost feature importances, most important first.
importances = clf_xg.feature_importances_
# Labels must follow the feature matrix's column order. data.columns[0:-1]
# still contains the dropped/target columns, so it pairs importances with the
# wrong names.
feat_labels = x.columns
indices = np.argsort(importances)[::-1]
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))