# 학습 데이터 불러오기

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

# csv형식으로 된 데이터 파일을 읽어옵니다.
train = pd.read_csv('data/train.csv')
train.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [2]:
test = pd.read_csv('data/test.csv')
test.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight
0,1,F,0.595,0.47,0.155,1.121,0.4515,0.178,0.155
1,2,M,0.58,0.45,0.15,0.927,0.276,0.1815,0.36
2,3,I,0.26,0.205,0.07,0.097,0.0415,0.019,0.0305
3,4,M,0.59,0.46,0.13,1.102,0.455,0.2055,0.33
4,5,F,0.595,0.465,0.14,1.113,0.5175,0.244,0.305


## 결측치 확인

In [4]:
def check_missing_col(dataframe):
    missing_col = []
    counted_missing_col = 0
    for i, col in enumerate(dataframe.columns):
        missing_values = sum(dataframe[col].isna())
        is_missing = True if missing_values >= 1 else False
        if is_missing:
            counted_missing_col += 1
            print(f'결측치가 있는 컬럼은: {col}입니다')
            print(f'해당 컬럼에 총 {missing_values}개의 결측치가 존재합니다.')
            missing_col.append([col, dataframe[col].dtype])
    if counted_missing_col == 0:
        print('결측치가 존재하지 않습니다')
    return missing_col

missing_col = check_missing_col(train)

결측치가 존재하지 않습니다


# 데이터 전처리

In [5]:

#라벨인코딩을 하기 위함 dictionary map 생성 함수
def make_label_map(dataframe):
    label_maps = {}
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            label_map = {'unknown':0}
            for i, key in enumerate(dataframe[col].unique()):
                label_map[key] = i+1  #새로 등장하는 유니크 값들에 대해 1부터 1씩 증가시켜 키값을 부여해줍니다.
            label_maps[col] = label_map
    print(label_maps)
    return label_maps

# 각 범주형 변수에 인코딩 값을 부여하는 함수
def label_encoder(dataframe, label_map):
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            dataframe[col] = dataframe[col].map(label_map[col])
            dataframe[col] = dataframe[col].fillna(label_map[col]['unknown']) #혹시 모를 결측값은 unknown의 값(0)으로 채워줍니다.
    return dataframe

In [8]:
train_le = make_label_map(train[['Gender']])
gender_df = label_encoder(train[['Gender']], train_le)
train['Gender'] = gender_df[['Gender']]

{'Gender': {'unknown': 0, 'M': 1, 'I': 2, 'F': 3}}


In [7]:
train_df.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,1,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,2,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,3,2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,4,1,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,5,2,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [11]:
train_x = train.drop(['id', 'Target'], axis=1)
train_y = train.Target

# RandomForest

## 배깅

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
model = RandomForestRegressor() # 모델을 객체에 할당

model.fit(train_x, train_y) # 모델 학습

RandomForestRegressor()

In [12]:
test_df = pd.read_csv('data/test.csv')
test_df.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight
0,1,F,0.595,0.47,0.155,1.121,0.4515,0.178,0.155
1,2,M,0.58,0.45,0.15,0.927,0.276,0.1815,0.36
2,3,I,0.26,0.205,0.07,0.097,0.0415,0.019,0.0305
3,4,M,0.59,0.46,0.13,1.102,0.455,0.2055,0.33
4,5,F,0.595,0.465,0.14,1.113,0.5175,0.244,0.305


In [13]:
gender_df = label_encoder(test_df[['Gender']], train_le)
test_df['Gender'] = gender_df[['Gender']]

test_df.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight
0,1,3,0.595,0.47,0.155,1.121,0.4515,0.178,0.155
1,2,1,0.58,0.45,0.15,0.927,0.276,0.1815,0.36
2,3,2,0.26,0.205,0.07,0.097,0.0415,0.019,0.0305
3,4,1,0.59,0.46,0.13,1.102,0.455,0.2055,0.33
4,5,3,0.595,0.465,0.14,1.113,0.5175,0.244,0.305


In [16]:
test_x = test_df.drop(['id'],axis=1)

In [17]:
# 전처리가 완료된 테스트 데이터셋을 통해 본격적으로 학습한 모델로 추론을 시작합니다.
prediction = model.predict(test_x)

In [11]:
# 딥러닝을 구동하는 데 필요한 케라스 함수를 불러옵니다.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# 필요한 라이브러리를 불러옵니다.
import numpy
import tensorflow as tf

# 실행할 때마다 같은 결과를 출력하기 위해 설정하는 부분입니다.
numpy.random.seed(3)
tf.random.set_seed(3)



# 모델을 설정합니다.#레이어 3개
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))#숫자는 몇을 줘야 가장 좋은 결과가 나오는지 알아내야되긴함
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# 모델을 컴파일합니다.
model.compile(loss='mae',
             optimizer='adam',
             metrics=['mae'])

In [13]:
# 모델을 실행합니다.
model.fit(train_x, train_y, epochs=200, batch_size=10)

# 결과를 출력합니다.
print("\n Accuracy: %.4f" % (model.accuracy(train_x,train_y)[1]))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 15

AttributeError: 'Sequential' object has no attribute 'accuracy'

In [18]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(train['Gender'])
gender_encoded_train = encoder.transform(train['Gender'])
gender_encoded_test = encoder.transform(test['Gender'])
train['Gender'] = gender_encoded_train
test['Gender'] = gender_encoded_test

X = train.drop('Target', axis=1)
y = train['Target']

In [35]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
import lightgbm

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1000)
skf.split(X, X['Gender'])

pred_list = []
mae_list = []


for fold,(train_index, val_index) in enumerate(skf.split(X,  X['Gender'])):

    print(f'***********{fold+1}th fold start***********')
    x_train, x_val, y_train, y_val = X.loc[train_index], X.loc[val_index], y.loc[train_index], y.loc[val_index]

    lgb = lightgbm.LGBMRegressor(boosting_type='dart',
        n_estimators=1000, random_state=1000, learning_rate=0.05, categorical_feature=[0])

    lgb.fit(x_train, y_train,
            eval_set=(x_val,y_val),
            eval_metric='l1', verbose=False, early_stopping_rounds=100)

    pred = lgb.predict(x_val)
    result = mean_absolute_error(pred,y_val)
    mae_list.append(result)
    
    pred_test = lgb.predict(test)
    pred_list.append(pred_test)
    
    print(f'mae : {result:.4f}', end='\n\n')
    
print(f'mean mae {np.mean(mae_list):.4f}')

***********1th fold start***********
mae : 1.6926

***********2th fold start***********
mae : 1.5487

***********3th fold start***********
mae : 1.3561

***********4th fold start***********
mae : 1.8642

***********5th fold start***********
mae : 1.8445

***********6th fold start***********
mae : 1.8161

***********7th fold start***********
mae : 1.7906

***********8th fold start***********
mae : 1.4795

***********9th fold start***********
mae : 1.3718

***********10th fold start***********
mae : 1.5530

mean mae 1.6317


In [27]:
from pycaret.classification import *

In [28]:
setup_clf = setup(data = train, target='Target', fold=5)

Unnamed: 0,Description,Value
0,session_id,3098
1,Target,Target
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(1253, 10)"
5,Missing Values,False
6,Numeric Features,8
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [29]:
top10 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.2713,0.0,0.1406,0.2538,0.2543,0.1722,0.1733,20.396
et,Extra Trees Classifier,0.2702,0.0,0.1565,0.2542,0.2542,0.1725,0.1737,0.14
rf,Random Forest Classifier,0.2634,0.0,0.1549,0.2383,0.2435,0.1626,0.1639,0.154
knn,K Neighbors Classifier,0.2418,0.0,0.1441,0.2033,0.2141,0.1343,0.1359,0.016
lda,Linear Discriminant Analysis,0.2406,0.0,0.1494,0.2155,0.2181,0.1403,0.142,0.008
gbc,Gradient Boosting Classifier,0.236,0.0,0.1467,0.2245,0.2239,0.1349,0.1356,1.604
lr,Logistic Regression,0.2327,0.0,0.0948,0.1649,0.1759,0.1058,0.1119,0.046
lightgbm,Light Gradient Boosting Machine,0.2315,0.0,0.1403,0.2112,0.216,0.1283,0.1291,1.02
ridge,Ridge Classifier,0.2304,0.0,0.0908,0.1656,0.1668,0.1013,0.1075,0.008
xgboost,Extreme Gradient Boosting,0.2303,0.0,0.1447,0.2208,0.2211,0.1301,0.1309,0.664


# 제출

In [14]:
submission = pd.read_csv('data/sample_submission.csv')
submission.head()

Unnamed: 0,id,Target
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [18]:
submission['Target'] = prediction

# 데이터가 잘 들어갔는지 확인합니다
submission

Unnamed: 0,id,Target
0,1,8.25
1,2,12.65
2,3,5.29
3,4,11.47
4,5,10.30
...,...,...
2919,2920,4.02
2920,2921,7.07
2921,2922,9.56
2922,2923,10.37


In [19]:
submission.to_csv('baseline_submit.csv', index=False)