# 학습 데이터 불러오기

In [3]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

# Read the training data from its CSV file and preview the first rows.
train_df = pd.read_csv('data/train.csv')
train_df.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


## 결측치 확인

In [4]:
def check_missing_col(dataframe):
    """Report which columns of ``dataframe`` contain missing values.

    For every column holding at least one missing value, two lines are
    printed: the column name and the number of missing entries. When no
    column has missing values a single notice is printed instead.

    Args:
        dataframe: pandas DataFrame to inspect.

    Returns:
        A list of ``[column_name, dtype]`` pairs, one per column that has
        missing values, in column order. Empty when nothing is missing.
    """
    missing_col = []
    for col in dataframe.columns:
        # Vectorised count; the builtin sum() would walk the Series element
        # by element in Python.
        missing_values = dataframe[col].isna().sum()
        if missing_values >= 1:
            print(f'결측치가 있는 컬럼은: {col}입니다')
            print(f'해당 컬럼에 총 {missing_values}개의 결측치가 존재합니다.')
            missing_col.append([col, dataframe[col].dtype])
    # The result list doubles as the "anything missing?" flag, so no separate
    # counter is needed.
    if not missing_col:
        print('결측치가 존재하지 않습니다')
    return missing_col

# Run the missing-value check on the training data and keep the returned list.
missing_col = check_missing_col(train_df)

결측치가 존재하지 않습니다


# 데이터 전처리

In [5]:

# Build the dictionaries used for label encoding.
def make_label_map(dataframe):
    """Create a value-to-integer mapping for every object-dtype column.

    Each mapping starts with the reserved entry ``'unknown': 0``; the
    column's unique values are then numbered 1, 2, 3, ... in the order
    ``unique()`` returns them. The complete result is printed before it is
    returned.

    Args:
        dataframe: pandas DataFrame whose object-dtype columns are scanned.

    Returns:
        A dict keyed by column name; each value is that column's
        ``{category: code}`` dict.
    """
    label_maps = {}
    object_cols = (
        name for name in dataframe.columns
        if dataframe[name].dtype == 'object'
    )
    for name in object_cols:
        categories = dataframe[name].unique()
        # Codes for observed categories start at 1 because 0 is reserved
        # for 'unknown'.
        label_maps[name] = {
            'unknown': 0,
            **{value: code for code, value in enumerate(categories, start=1)},
        }
    print(label_maps)
    return label_maps

# Replace every categorical column with its integer codes.
def label_encoder(dataframe, label_map):
    """Encode the object-dtype columns of ``dataframe`` in place.

    Each object-dtype column is mapped through ``label_map[column]``.
    Entries left as NaN by the mapping (values that have no key in it) are
    filled with that column's ``'unknown'`` code.

    Args:
        dataframe: pandas DataFrame to encode; it is modified in place.
        label_map: dict of per-column ``{category: code}`` dicts, each
            containing an ``'unknown'`` key.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    for name in dataframe.columns:
        if dataframe[name].dtype != 'object':
            continue
        mapping = label_map[name]
        encoded = dataframe[name].map(mapping)
        # Anything the mapping could not translate falls back to 'unknown'.
        dataframe[name] = encoded.fillna(mapping['unknown'])
    return dataframe

In [6]:
# Build the label map from the training Gender column; it is kept in
# train_le so the same codes can be applied to other data.
train_le = make_label_map(train_df[['Gender']])
# label_encoder writes into the frame it receives, so hand it an explicit
# copy instead of a selection taken from train_df.
gender_df = label_encoder(train_df[['Gender']].copy(), train_le)
# Assign the encoded column back as a Series.
train_df['Gender'] = gender_df['Gender']

{'Gender': {'unknown': 0, 'M': 1, 'I': 2, 'F': 3}}


In [7]:
# Preview the first rows after Gender has been encoded.
train_df.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,1,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,2,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,3,2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,4,1,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,5,2,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [8]:
# Features: every column except 'id' and 'Target'.
train_x = train_df.drop(['id', 'Target'], axis=1)
# Regression target.
train_y = train_df.Target

# RandomForest

## 배깅

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
# NOTE(review): no random_state is passed, so the fitted forest (and the
# predictions) will presumably differ between runs -- confirm whether
# reproducibility matters here.
model = RandomForestRegressor() # instantiate the model with default arguments

model.fit(train_x, train_y) # train the model

RandomForestRegressor()

In [12]:
# Read the test data from its CSV file and preview the first rows.
test_df = pd.read_csv('data/test.csv')
test_df.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight
0,1,F,0.595,0.47,0.155,1.121,0.4515,0.178,0.155
1,2,M,0.58,0.45,0.15,0.927,0.276,0.1815,0.36
2,3,I,0.26,0.205,0.07,0.097,0.0415,0.019,0.0305
3,4,M,0.59,0.46,0.13,1.102,0.455,0.2055,0.33
4,5,F,0.595,0.465,0.14,1.113,0.5175,0.244,0.305


In [13]:
# Encode the test Gender column with the map built from the training data.
# label_encoder writes into the frame it receives, so hand it an explicit
# copy instead of a selection taken from test_df.
gender_df = label_encoder(test_df[['Gender']].copy(), train_le)
# Assign the encoded column back as a Series.
test_df['Gender'] = gender_df['Gender']

test_df.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight
0,1,3,0.595,0.47,0.155,1.121,0.4515,0.178,0.155
1,2,1,0.58,0.45,0.15,0.927,0.276,0.1815,0.36
2,3,2,0.26,0.205,0.07,0.097,0.0415,0.019,0.0305
3,4,1,0.59,0.46,0.13,1.102,0.455,0.2055,0.33
4,5,3,0.595,0.465,0.14,1.113,0.5175,0.244,0.305


In [16]:
# Test features: every column except 'id'.
test_x = test_df.drop(['id'],axis=1)

In [17]:
# Run inference with the trained model on the preprocessed test features.
prediction = model.predict(test_x)

# 제출

In [14]:
# Load the sample submission file and preview the first rows.
submission = pd.read_csv('data/sample_submission.csv')
submission.head()

Unnamed: 0,id,Target
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [18]:
# Overwrite the Target column with the model's predictions.
submission['Target'] = prediction

# Check that the predictions were written correctly.
submission

Unnamed: 0,id,Target
0,1,8.25
1,2,12.65
2,3,5.29
3,4,11.47
4,5,10.30
...,...,...
2919,2920,4.02
2920,2921,7.07
2921,2922,9.56
2922,2923,10.37


In [19]:
# Write the submission to disk without the DataFrame index.
submission.to_csv('baseline_submit.csv', index=False)