<a href="https://colab.research.google.com/github/jodog0412/DACON/blob/main/TRAVEL_BOOL_ANALYSIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory; later cells read 'dataset/...' relative to it.
%cd /content/drive/MyDrive/Colab/TRAVEL_PRODUCT_ANALYSIS
# List the folder contents as a quick sanity check.
!ls

# 1. 데이터 로드

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the three competition CSV files (paths are relative to the working directory).
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv',
                     'dataset/test.csv',
                     'dataset/sample_submission.csv')
)
# Preview the first rows of the training frame.
train.head()
# train.isna().sum()

# 2. 데이터 결측치 처리

In [None]:
# Work on copies so the raw frames loaded earlier stay untouched.
train_na = train.copy()
test_na = test.copy()

# Columns filled with 0
train_na['DurationOfPitch'] = train_na['DurationOfPitch'].fillna(0)
test_na['DurationOfPitch'] = test_na['DurationOfPitch'].fillna(0)


# Columns filled with the mean
mean_cols = ['Age',
             'NumberOfFollowups',
             'PreferredPropertyStar',
             'NumberOfTrips',
             'NumberOfChildrenVisiting',
             'MonthlyIncome']

for col in mean_cols:
    # Use the TRAINING mean for both splits: imputing the test set with its
    # own mean would apply a different transformation to train and test.
    train_mean = train[col].mean()
    train_na[col] = train_na[col].fillna(train_mean)
    test_na[col] = test_na[col].fillna(train_mean)

# Columns filled with the placeholder category "Unknown"
train_na['TypeofContact'] = train_na['TypeofContact'].fillna("Unknown")
test_na['TypeofContact'] = test_na['TypeofContact'].fillna("Unknown")

# 3. 문자열 변수 전처리

In [None]:
# String-typed (dtype 'object') columns must be turned into integer codes.
object_columns = train.columns[train.dtypes == 'object']

from sklearn.preprocessing import LabelEncoder
train_enc = train_na.copy()
test_enc = test_na.copy()
for o_col in object_columns:
    encoder = LabelEncoder()
    # Fit on the labels of BOTH splits. LabelEncoder.transform raises
    # ValueError for a label it has not seen, so a category that only occurs
    # in the test set (e.g. an "Unknown" created by the test-side fill) would
    # otherwise crash this cell. When every test label also occurs in train,
    # the resulting codes are the same as fitting on train alone.
    encoder.fit(pd.concat([train_enc[o_col], test_enc[o_col]]))
    train_enc[o_col] = encoder.transform(train_enc[o_col])
    test_enc[o_col] = encoder.transform(test_enc[o_col])

train_enc.info()
train_enc.describe(include="number")
# train_enc["MonthlyIncome"].hist(bins=100)

# 4. 스케일링

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Columns passed through MinMaxScaler; every other column is left as it is.
scale_cols = ['Age', 'DurationOfPitch', 'MonthlyIncome']

train_scale = train_enc.copy()
test_scale = test_enc.copy()

# Learn the scaling on the training split only, then apply that same mapping
# to both splits.
scaler = MinMaxScaler()
scaler.fit(train_scale[scale_cols])
train_scale[scale_cols] = scaler.transform(train_scale[scale_cols])
test_scale[scale_cols] = scaler.transform(test_scale[scale_cols])

# Check the result.
train_scale.info()

# 5. 머신러닝

## 1) RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
# Features are every column except the row id and the target.
x_train = train_scale.drop(columns=['id','ProdTaken'])
y_train = train_scale[['ProdTaken']]
x_test = test_scale.drop(columns=['id'])

# Fixed seed so the forest, and therefore `prediction`, is the same on every run.
model = RandomForestClassifier(random_state=42)
# scikit-learn expects a 1-D target; handing it the (n, 1) frame directly
# triggers a DataConversionWarning, so flatten it explicitly.
model.fit(x_train, y_train.to_numpy().ravel())
prediction = model.predict(x_test)

## 2) DEEP LEARNING

### 1) 데이터 처리

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset # 학습 및 배치로 모델에 넣어주기 위한 툴
from sklearn.model_selection import train_test_split

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Inputs are all columns except the row id and the target; the target is kept
# as a one-column frame.
x_datas = train_scale.drop(['id', 'ProdTaken'], axis=1)
y_datas = train_scale.loc[:, ['ProdTaken']]
x_test = test_scale.drop('id', axis=1)
x_datas.info()

# Hold out 20% of the labelled rows for validation (fixed seed for a repeatable split).
x_train, x_vali, y_train, y_vali = train_test_split(
    x_datas, y_datas, test_size=0.2, random_state=42
)

In [None]:
# Training hyper-parameters, collected in one place.
CFG = dict(
    EPOCHS=50,           # number of training epochs
    LEARNING_RATE=5e-3,  # optimizer learning rate
    BATCH_SIZE=16,       # mini-batch size
    SEED=41,             # random seed
)

class CustomDataset(Dataset):
    """Map-style dataset that serves (features, target) pairs as float32 tensors.

    Parameters
    ----------
    x_data : DataFrame or array-like
        Feature table, one row per sample. Must be convertible to float32.
    y_data : DataFrame, Series or array-like
        Targets aligned row-by-row with ``x_data``. A 1-D target is reshaped
        to one column so every item has shape ``(1,)``.
    train_mode : bool, default True
        Stored on the instance; this class does not otherwise use it.
    """

    def __init__(self, x_data, y_data, train_mode=True):
        self.train_mode = train_mode
        self.x_data = x_data
        self.y_data = y_data
        # Convert once here instead of building a tensor from a pandas row on
        # every __getitem__ call: it is much faster and does not depend on how
        # torch iterates over a label-indexed Series. np.array copies, so the
        # tensors never alias the caller's data.
        self._x = torch.from_numpy(np.array(x_data, dtype=np.float32))
        self._y = torch.from_numpy(np.array(y_data, dtype=np.float32))
        if self._y.dim() == 1:
            # Series / 1-D targets: keep the (n, 1) layout a one-column frame gives.
            self._y = self._y.unsqueeze(1)

    def __len__(self):
        """Number of samples (rows of the feature table)."""
        return len(self._x)

    def __getitem__(self, idx):
        """Return the ``idx``-th (features, target) pair as float32 tensors."""
        return self._x[idx], self._y[idx]
    
# Training batches are reshuffled by the loader (shuffle=True).
train_dataset = CustomDataset(x_train, y_train, train_mode=True)
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
# Validation data is read in a fixed order and is flagged as non-training.
vali_dataset = CustomDataset(x_vali, y_vali, train_mode=False)
vali_loader = DataLoader(vali_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)


### 2) 모델링

In [None]:
class BinaryClassification(nn.Module):
    """Fully connected network (in_features -> 32 -> 64 -> 32 -> 1) for binary classification.

    ``forward`` returns one raw logit per sample and does NOT apply a sigmoid.
    The loss used in this notebook (``F.binary_cross_entropy_with_logits``)
    and ``binary_acc`` both apply the sigmoid themselves; squashing here as
    well would apply it twice, which distorts the loss and makes every rounded
    prediction equal to 1. Use ``torch.sigmoid(model(x))`` to get probabilities.

    Parameters
    ----------
    in_features : int, default 18
        Number of input features per sample.
    """

    def __init__(self, in_features=18):
        super(BinaryClassification, self).__init__()
        self.layer_1 = nn.Linear(in_features, 32)
        self.layer_2 = nn.Linear(32, 64)
        self.layer_3 = nn.Linear(64, 32)
        self.layer_out = nn.Linear(32, 1)

        self.relu = nn.ReLU()

    def forward(self, inputs):
        """Map a ``(batch, in_features)`` float tensor to ``(batch, 1)`` logits."""
        x = self.relu(self.layer_1(inputs))
        x = self.relu(self.layer_2(x))
        x = self.relu(self.layer_3(x))
        # Raw scores on purpose; see the class docstring.
        return self.layer_out(x)

# Apply the configured seed before any weights are created, so that weight
# initialisation and other draws from PyTorch's global random generator are
# repeatable between runs.
torch.manual_seed(CFG['SEED'])

model = BinaryClassification().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=CFG['LEARNING_RATE'])
# This loss applies the sigmoid internally, so it must be fed raw
# (un-squashed) scores.
loss_fn = F.binary_cross_entropy_with_logits

def binary_acc(y_pred, y_test):
    """Return the accuracy of a batch as a whole-number percentage.

    ``y_pred`` is passed through a sigmoid and rounded to obtain 0/1
    predictions, which are compared element-wise with ``y_test``. The number of
    matches is divided by ``y_test.shape[0]``, scaled to percent and rounded.
    The result is a 0-dim tensor.
    """
    predicted_labels = torch.sigmoid(y_pred).round()
    n_correct = predicted_labels.eq(y_test).sum().float()
    return torch.round(n_correct / y_test.shape[0] * 100)

losses = []  # mean training loss, one entry per epoch
accur = []   # mean training accuracy in percent, one entry per epoch
for epoch in range(CFG['EPOCHS']):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()

        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        acc = binary_acc(y_pred, y_batch)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    # Record the history once per epoch, after all batches have been summed.
    # Appending inside the batch loop stored partial running sums divided by
    # the full batch count, which produced a saw-tooth curve.
    losses.append(epoch_loss/len(train_loader))
    accur.append(epoch_acc/len(train_loader))
    print('[{:d}] Train_Loss: {:.4f}'.format(epoch, epoch_loss/len(train_loader)))
    print('[{:d}] Train_Accur: {:.4f}'.format(epoch, epoch_acc/len(train_loader)))

    model.eval()  # switch layers that behave differently during evaluation to eval mode
    vali_loss = 0
    vali_acc = 0
    with torch.no_grad():  # no parameter update happens here, so skip gradient tracking
        for x_batch, y_batch in vali_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            y_pred = model(x_batch)
            vali_loss += loss_fn(y_pred, y_batch).item()
            vali_acc += binary_acc(y_pred, y_batch).item()
    # The validation loss was accumulated but never shown; report it next to the accuracy.
    print('[{:d}] Vali_Loss: {:.4f}'.format(epoch, vali_loss/len(vali_loader)))
    print('[{:d}] Vali_Accur: {:.4f}'.format(epoch, vali_acc/len(vali_loader)))

# Training accuracy per epoch.
plt.plot(accur)
plt.xlabel('Epoch')
plt.ylabel('Train accuracy (%)')
plt.title('Training accuracy per epoch');

In [None]:
# NOTE(review): in the visible notebook `prediction` is assigned only by the
# RandomForest cell, so the neural network trained afterwards does not feed
# into this submission. Confirm that this is intended.
sample_submission['ProdTaken'] = prediction
sample_submission.to_csv('submission.csv', index=False)
# Keep the preview as the last expression of the cell so the notebook renders it.
sample_submission.head()