In [2]:
import random
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import GridSearchCV
import seaborn as sns

from math import pi
from math import log
from math import e
import math
import itertools
import csv

from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
# nan값 확인용
import collections
# 차원축소용
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
# Folder-creation helper
def createFolder(directory):
    """Create ``directory`` (including missing parent folders) if needed.

    The call is idempotent: an already existing directory is left untouched.
    Failures are reported on stdout instead of being raised, so a caller is
    never interrupted by this helper.

    Parameters
    ----------
    directory : str or path-like
        Path of the folder to create.

    Returns
    -------
    None
    """
    try:
        # exist_ok=True replaces the separate "exists?" check, so there is no
        # window between checking for the folder and creating it.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        # str() keeps the message building safe for pathlib.Path arguments.
        print ('Error: Creating directory. ' +  str(directory))

In [4]:
# Read the train / test CSV files from the current working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

# Snapshot copies of the freshly loaded frames; reset_index() also stores the
# previous index as an extra 'index' column.
train_org, test_org = (frame.reset_index().copy() for frame in (train, test))

# Preview the first rows of the training data.
display(train.head())

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# NAN 확인

In [5]:
# Number of missing values per column, in the positional order of train's columns
train_nan = list(train.isna().sum())

# Positions of the columns holding at least one NaN, and their NaN counts
index_nan = [pos for pos, count in enumerate(train_nan) if count >= 1]
num_nan = [train_nan[pos] for pos in index_nan]

# Summary table: column position, column name and NaN count
nan_df = pd.DataFrame({
    'index_in_train': index_nan,
    'column_name': train.iloc[:, index_nan].columns,
    'NAN_number': num_nan,
})
display(nan_df)

Unnamed: 0,index_in_train,column_name,NAN_number
0,1,HomePlanet,201
1,2,CryoSleep,217
2,3,Cabin,199
3,4,Destination,182
4,5,Age,179
5,6,VIP,203
6,7,RoomService,181
7,8,FoodCourt,183
8,9,ShoppingMall,208
9,10,Spa,183


In [7]:
# Name of the label column
target_feat = 'Transported'

# Feature frame: every column except the label
x = train.drop(columns = target_feat)
# Target series: the label column only
y = train[target_feat]

In [8]:
class cfg:
    """Column groups of the module-level feature frame ``x``, split by dtype.

    Both lists are computed once, when this class body executes, and keep the
    column order of ``x``. (Building them through ``set`` would make the order
    depend on string hashing and therefore change between kernel runs.)
    """
    # Categorical features: object, bool and category columns
    categorical_feats = x.select_dtypes(include = ["object", "bool", "category"]).columns.tolist()
    # Numerical features: int and float columns
    numerical_feats = x.select_dtypes(include = ["int", "float"]).columns.tolist()

In [9]:
def makePCA(data, n_components = 20, random_state = 42):
    '''
    One-hot encode, standardise and PCA-project the features of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training data. Must contain the 'Transported' target column. PCA does
        not accept missing values, so pass a frame without NaN.
    n_components : optional
        Forwarded to ``PCA(n_components=...)``; defaults to 20, the value that
        used to be hard-coded.
    random_state : optional
        Forwarded to ``PCA`` so that a randomized SVD solver gives the same
        projection on every run.

    Returns
    -------
    train_pca_df : pandas.DataFrame
        The projected features, one column per component, named
        'test0', 'test1', ... . The frame gets a fresh 0..n-1 index.
    y : pandas.Series
        The 'Transported' column of ``data``. It keeps the index of ``data``,
        so pair it with ``train_pca_df`` by position, not by label.
    '''
    target_feat = 'Transported'

    features = data.drop(columns = target_feat) # model inputs
    y = data[target_feat] # prediction target

    # One-hot encode the categorical columns listed in cfg. Columns that are
    # not present in `data` are skipped so a narrower frame does not raise.
    dummy_vars = [col for col in cfg.categorical_feats if col in features.columns]
    train_gd = pd.get_dummies(features, columns = dummy_vars, drop_first = True)

    # Standardise before PCA
    train_scaled = StandardScaler().fit_transform(train_gd)

    # Fit the projection and transform the scaled data in one step
    pca = PCA(n_components = n_components, random_state = random_state)
    train_pca = pca.fit_transform(train_scaled)

    # Name the columns after the number of components actually produced, so
    # the names always match the projected array.
    pca_columns = ['test' + str(i) for i in range(train_pca.shape[1])]

    train_pca_df = pd.DataFrame(train_pca, columns = pca_columns)

    return train_pca_df, y

In [10]:
def _write_csv_row(path, row):
    '''Overwrite the CSV file at ``path`` with the single row ``row``.'''
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerow(row)


def auto_features_selection(data, mint, maax, random_state = 42):
    '''
    Score a stacked regressor on every combination of PCA columns.

    The CSV named by ``data`` is read from the current directory, rows with
    NaN are dropped and the remainder is projected with ``makePCA``. For every
    subset size from ``mint`` to ``maax`` and every combination of that many
    PCA columns, a RandomForest + GradientBoosting stack (LinearRegression as
    final estimator) is fitted on a 70/30 split, its predictions are
    thresholded to booleans and recall / precision / F1 are computed.

    After each combination the accumulated results are rewritten to CSV files
    in 'test_folder', which keeps the files current if the run is interrupted.
    'pred.csv' holds the predictions of the latest combination only.

    Parameters
    ----------
    data : str
        File name of the training CSV, relative to the current directory.
    mint : int
        Minimum number of columns per combination.
    maax : int
        Maximum number of columns per combination (inclusive).
    random_state : optional
        Seed used for the train/validation split and for both base
        regressors, so repeated runs give the same scores.

    Returns
    -------
    tuple of five lists, one entry per evaluated combination:
        recall, precision, f1 scores, boolean prediction arrays and the
        column names of the combination.
    '''
    train = pd.read_csv('./' + data)
    train = train.dropna(axis = 0)

    recall = []
    precision = []
    f1_scores = []  # not named f1_score: that would shadow the imported metric
    pred_list = []
    feature = []

    x1, y = makePCA(train)
    print('PCA 데이터 생성완료')
    print()

    column_list = x1.columns # replace with any list of column names if desired

    # The output folder only has to be created once, not per combination.
    output_dir = 'test_folder'
    createFolder(output_dir)

    for i in range(mint, maax + 1):
        # Iterate lazily: the number of combinations can be very large.
        for combo in itertools.combinations(column_list, i):
            selected = list(combo)
            feature.append(selected)

            x = x1[selected].reset_index(drop = True)
            x_train, x_valid, y_train, y_valid = train_test_split(
                x, y, test_size=0.3, random_state=random_state, shuffle=True
            )

            models = {
                'rf': RandomForestRegressor(n_estimators = 195, random_state = random_state),
                'gb': GradientBoostingRegressor(learning_rate = 0.03, n_estimators = 145,
                                                random_state = random_state),
            }

            stacking = StackingRegressor(
                estimators=list(models.items()),
                final_estimator=LinearRegression(),
                cv=5
            )
            print(str(selected), '진행중')

            stacking.fit(x_train, y_train)

            pred = stacking.predict(x_valid)
            # Threshold the regression output at 0.5. Rounding and comparing
            # with 1 would label any prediction of 1.5 or more as False.
            pred_stacking = np.asarray(pred) > 0.5
            pred_list.append(pred_stacking)

            recall_sc = recall_score(y_valid, pred_stacking)
            precision_sc = precision_score(y_valid, pred_stacking)
            # F1 is defined as 0 when precision and recall are both 0; the
            # guard avoids a division by zero in that case.
            pr_sum = precision_sc + recall_sc
            f1_sc = 2 * precision_sc * recall_sc / pr_sum if pr_sum else 0.0

            print()
            print('recall:', recall_sc)
            print('precision:', precision_sc)
            print('f1_score:', f1_sc)
            print('=='*30)

            recall.append(recall_sc)
            precision.append(precision_sc)
            f1_scores.append(f1_sc)

            # Checkpoint everything gathered so far (each file is overwritten).
            _write_csv_row(os.path.join(output_dir, 'recall.csv'), recall)
            _write_csv_row(os.path.join(output_dir, 'feature_name.csv'), feature)
            _write_csv_row(os.path.join(output_dir, 'pred.csv'), pred_stacking)
            _write_csv_row(os.path.join(output_dir, 'precision.csv'), precision)
            _write_csv_row(os.path.join(output_dir, 'f1_score.csv'), f1_scores)

    return recall, precision, f1_scores, pred_list, feature

In [11]:
# Evaluate only combinations of exactly 20 columns (mint == maax == 20) on train.csv
rc, pc, f1, preds, feats = auto_features_selection(data = 'train.csv', mint = 20, maax = 20)

PCA 데이터 생성완료

['test0', 'test1', 'test2', 'test3', 'test4', 'test5', 'test6', 'test7', 'test8', 'test9', 'test10', 'test11', 'test12', 'test13', 'test14', 'test15', 'test16', 'test17', 'test18', 'test19'] 진행중

recall: 0.6385302879841113
precision: 0.7618483412322274
f1_score: 0.6947595894111291
