In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
import numpy as np

In [3]:
emails = pd.read_csv('./data/emails.csv')

In [4]:
emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [5]:
emails = emails.select_dtypes(exclude=['object'])

In [6]:
emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3001 entries, the to Prediction
dtypes: int64(3001)
memory usage: 118.4 MB


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
e_trn, e_tst = train_test_split(emails, test_size=0.2, shuffle=False, random_state=2023)

In [9]:
print(type(emails))
print(type(e_tst))
print(type(e_trn))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [10]:
X_trn =e_trn.drop(['Prediction'],axis=1)
y_trn = e_trn.Prediction

X_tst = e_tst.drop(['Prediction'], axis=1)
y_tst = e_tst.Prediction

In [10]:
class CustomDataset(Dataset):
  def __init__(self, *args:list[np.array]):#array 저장
    assert all(args[0].shape[0] == arg.shape[0] for arg in args), "Size mismatch." #이 부분은 row의 개수를 일치 시키는게 맞는가?
    self.data = args #(X,y)의 튜플 형식, X = np.array, y=np.array
  def __getitem__(self, index):
    return tuple(x[index] for x in self.data) #(X[index],y[index])
  def __len__(self):
    return self.data[0].shape[0]

In [11]:
X = torch.tensor(X_trn.to_numpy(dtype=np.float32))
y = torch.tensor(y_trn.to_numpy(dtype=np.float32)).unsqueeze(-1)

X_2 = torch.tensor(X_tst.to_numpy(dtype=np.float32))
y_2 = torch.tensor(y_tst.to_numpy(dtype=np.float32)).unsqueeze(-1)


In [12]:
trn_ds = CustomDataset(X, y)
tst_ds = CustomDataset(X_2, y_2)

# trn_ds = TensorDataset(X, y)
# tst_ds = TensorDataset(X_2, y_2)

trn_dl = DataLoader(trn_ds, batch_size=32, shuffle=True)
tst_dl = DataLoader(tst_ds, batch_size=32, shuffle=False)


In [13]:
X_trn.columns.value_counts().sum() #컬럼 개수

3000

In [14]:
len(trn_ds) #row 개수

4137

In [15]:
import torch.nn as nn
from sklearn.metrics import mean_absolute_error
from tqdm.auto import tqdm

In [16]:
class ANN(nn.Module):
    def __init__(self, input_dim : int=len(X_trn.columns), hidden_dim=128, output_dim : int=1):
        super().__init__()
        self.lin1 = nn.Linear(input_dim, hidden_dim)
        self.lin2 = nn.Linear(hidden_dim, hidden_dim)
        self.lin3 = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        x = self.lin1(x)
        x = nn.functional.relu(x)
        x = self.lin2(x)
        x = nn.functional.relu(x)
        x = self.lin3(x)
        x = nn.functional.relu(x) #분류 모델이기 때문에 음수값이 나오면 0을 반환하도록 조치
        return x

In [17]:
def train_one_epoch(model: nn.Module, optimizer: torch.optim.Optimizer, data_loader:DataLoader)->float:
    model.train()
    total_loss = 0.
    for X, y in data_loader:
        output = model(X)
        loss = torch.nn.functional.mse_loss(output,y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()*len(y)
    return total_loss/len(data_loader.dataset)

def evaluate(model: nn.Module, data_loader: DataLoader):
    model.eval()
    total_loss = 0.
    with torch.inference_mode():
        for X, y in data_loader:
            output = model(X)
            total_loss += mean_absolute_error(output, y).item()*len(y)
            #토탈 로스가 for문 안에 있는데, 이렇게 해도, 토탈 로스에 왜 들어가나? 토탈로스가 with 안에 있어야 하는거 아닌가?
    return total_loss/len(data_loader.dataset)
            

In [18]:
model_t = ANN()
optimizer = torch.optim.Adam(model_t.parameters(), lr = 0.001)

pbar = tqdm(range(10))
for j in pbar:
    loss = train_one_epoch(model=model_t, optimizer=optimizer, data_loader=trn_dl)
    loss_val = evaluate(model_t, tst_dl)
    pbar.set_postfix(trn_loss=loss, val_loss = loss_val)
    #이렇게 한 줄에 넣으면, train loss와 val loss가 동시에 계산되는데, 단순히 계산이 빨라서 그렇게 보이는건가?



  0%|          | 0/10 [00:00<?, ?it/s]

In [19]:
# prediction = []

# for X, y in tst_dl:
#     pred = model_t(X)
#     prediction.append(pred)

# prd = torch.cat(prediction)
# print(prd.shape)

In [22]:
prediction = torch.cat([model_t(X) for X, y in tst_dl])
prediction = prediction.cpu()
prediction = prediction.flatten()

In [23]:
type(prediction)

torch.Tensor

In [None]:
prediction.shape

torch.Size([1035])

In [28]:
prediction

tensor([0., 0., 0.,  ..., 0., 0., 0.], grad_fn=<ViewBackward0>)

In [None]:
#prd = prd.flatten()

In [None]:
#prd.shape

In [25]:
pre_pred = pd.DataFrame(y_tst) #test.csv나 submission.csv가 없는 관계로 test index 기준으로 새로 하나 생성

In [26]:
submission_df = pd.DataFrame(pre_pred.drop(['Prediction'], axis=1)) 

In [27]:
submission_df['Prediction'] = prediction

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [None]:
submission_df.to_csv('./submission_nn.csv', index=False)