In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.rc('font', family='Malgun Gothic')
import matplotlib as mpl
mpl.rcParams['axes.unicode_minus'] = False
import seaborn as sns
import tqdm

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier            # Voting 기법 사용 Module
from sklearn.neighbors import KNeighborsClassifier 
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

Data Load

In [15]:
# Read the lotto CSV; index_col=0 makes the file's first column the row index.
df = pd.read_csv('./data/lotto.csv', index_col=0)
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0_level_0,1,2,3,4,5,6,bonus,1st,2nd,3rd,4th,5th
num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,10,23,29,33,37,40,16,0,143934100,5140500,113400,10000
2,9,13,21,25,32,42,2,2002006800,94866800,1842000,100800,10000
3,11,16,19,21,27,31,30,2000000000,0,1174100,54900,10000
4,14,27,30,31,40,42,2,0,211191200,7282400,152100,10000
5,16,24,29,40,41,42,3,0,0,6033800,166500,10000
...,...,...,...,...,...,...,...,...,...,...,...,...
1082,21,26,27,32,34,42,31,3720489643,70009214,1568126,50000,5000
1083,3,7,14,15,22,38,17,1713084525,59482102,1308098,50000,5000
1084,8,12,13,29,33,42,5,1738764600,40625342,1447523,50000,5000
1085,4,7,17,18,38,44,36,1073277473,51427879,1185316,50000,5000


In [13]:
# NOTE(review): the recorded RangeIndex output has a lower execution count than the
# load cell and looks like it predates `index_col=0` - re-run to confirm.
df.index

RangeIndex(start=0, stop=1086, step=1)

In [14]:
# NOTE(review): the recorded output lists 'num' as a column, which presumably comes
# from a run without `index_col=0` - re-run to confirm.
df.columns

Index(['num', '1', '2', '3', '4', '5', '6', 'bonus', '1st', '2nd', '3rd',
       '4th', '5th'],
      dtype='object')

One Hot Encoding

In [None]:
# one-hot encoding

# Convert winning numbers into a one-hot encoded vector (ohbin)
def numbers2ohbin(numbers):
    """Encode the first six entries of ``numbers`` as a 45-slot 0/1 vector.

    Each entry is cast with ``int`` and shifted down by one, because lotto
    numbers start at 1 while vector positions start at 0.

    Parameters
    ----------
    numbers : indexable
        Sequence whose positions 0..5 hold the drawn numbers.

    Returns
    -------
    numpy.ndarray
        Float vector of length 45 with 1 at every drawn position, 0 elsewhere.
    """
    # Zero-based slot of each of the six drawn numbers.
    hot_slots = [int(numbers[pos]) - 1 for pos in range(6)]

    ohbin = np.zeros(45)  # 45 empty slots, one per possible number
    ohbin[hot_slots] = 1

    return ohbin

# Convert a one-hot encoded vector (ohbin) back into numbers
def ohbin2numbers(ohbin):
    """Return the 1-based positions of every entry of ``ohbin`` equal to 1.0.

    Parameters
    ----------
    ohbin : sequence
        Vector to decode; only entries that compare equal to 1.0 are selected.

    Returns
    -------
    list of int
        Selected positions plus one, in ascending order.
    """
    # Position + 1 turns the zero-based slot back into the lotto number.
    return [pos + 1 for pos in range(len(ohbin)) if ohbin[pos] == 1.0]
     

Modeling

In [None]:
# Train / validation split
# TODO: X (features) and y (labels) are unfinished placeholders - this cell is a
# syntax error until both are assigned.

X = 
y = 

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=777)

# Print the shapes of both partitions as a quick sanity check.
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

In [None]:
# Model Load
# The boosted estimators are bound to `xgb_model` / `lgb_model` rather than `xgb` /
# `lgb`: reusing the module names overwrote the imported modules, so re-running this
# cell failed with AttributeError.
# NOTE(review): any later cell that used `xgb` / `lgb` as the fitted estimators must
# switch to the new names.
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
xgb_model = xgb.XGBClassifier()
lgb_model = lgb.LGBMClassifier()

# Training: fit every model on the same training partition.
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)

# Predict: predictions on the validation partition, one per model.
dt_pred = dt.predict(X_val)
rf_pred = rf.predict(X_val)
xgb_pred = xgb_model.predict(X_val)
lgb_pred = lgb_model.predict(X_val)

Evaluation

In [None]:
# accuracy, recall, precision, auc graph, confusion matrix

def eval_model(pred_list, name_list, y_test):
    """Print metrics and draw ROC / confusion-matrix plots for each model.

    For every (prediction, name) pair this prints accuracy, recall, precision
    and ROC AUC, and shows one figure with the ROC curve on the left and the
    confusion-matrix heatmap on the right.

    Parameters
    ----------
    pred_list : iterable
        Predictions, one entry per model.
    name_list : iterable of str
        Display names, paired positionally with ``pred_list``.
    y_test : array-like
        Ground-truth labels every prediction is scored against.
        Assumes binary labels with positive class 1 (the ROC curve uses
        ``pos_label=1``) - TODO confirm against the caller.

    Returns
    -------
    None
    """
    for pred, name in zip(pred_list, name_list):
        accuracy = accuracy_score(y_test, pred)
        recall = recall_score(y_test, pred)
        precision = precision_score(y_test, pred)
        # Named `auc_score` so the imported sklearn `auc` function is not shadowed.
        auc_score = roc_auc_score(y_test, pred)
        matrix = confusion_matrix(y_test, pred)

        fig, (roc_ax, cm_ax) = plt.subplots(1, 2, figsize=(8, 3))

        # Score against the `y_test` argument; the previous code read the notebook
        # global `y_val` here, ignoring the labels the caller passed in.
        # NOTE(review): if `pred` holds hard class labels rather than scores, the
        # curve has only a single operating point.
        fpr, tpr, _ = roc_curve(y_test, pred, pos_label=1)
        roc_ax.plot(fpr, tpr, color='darkorange')
        roc_ax.plot([0, 1], [0, 1], color='black', label='y = x')
        roc_ax.set_xlabel('False Positive Rate')
        roc_ax.set_ylabel('True Positive Rate')
        roc_ax.set_title('{0} Model ROC Curve'.format(name))
        roc_ax.legend()

        sns.heatmap(matrix, annot=True, fmt='g', ax=cm_ax)  # fmt='g': plain numbers instead of exponent notation
        cm_ax.set_title('{0} Model Confusion Matrix'.format(name))

        # Layout must be adjusted before the figure is shown to have any effect.
        fig.tight_layout()
        plt.show()

        print('{0} 정확도: {1:.4f}'.format(name, accuracy))
        print('{0} Recall: {1:.4f}'.format(name, recall))
        print('{0} Precision: {1:.4f}'.format(name, precision))
        print('{0} AUC: {1:.4f}'.format(name, auc_score))

        print('='* 50)
        print()