In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)
import pandas as pd
import numpy as np

In [2]:
# Colab-only step: mount Google Drive so the CSV files below are reachable.
from google.colab import drive

drive.mount('/content/drive')

# train.csv carries the `is_converted` target used later in the notebook;
# submission.csv is the file loaded as the test frame.
train_file_path = '/content/drive/MyDrive/LGdata/train.csv'
test_file_path = '/content/drive/MyDrive/LGdata/submission.csv'

# Read both files into DataFrames in one pass.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

Mounted at /content/drive


In [3]:
class NeuralNetwork(nn.Module):
    """Feed-forward binary classifier: two ReLU hidden layers and a sigmoid output.

    Args:
        input_size: Number of features in each input row.
        hidden1: Width of the first hidden layer (default 64).
        hidden2: Width of the second hidden layer (default 32).

    The forward pass maps a tensor whose last dimension is ``input_size`` to a
    tensor whose last dimension is 1, with values in [0, 1] (sigmoid output).
    """

    def __init__(self, input_size, hidden1=64, hidden2=32):
        super().__init__()
        # The attribute names (fc1/fc2/fc3) define the state_dict keys, and the
        # creation order defines the order in which seeded initial weights are
        # drawn, so both are kept fixed.
        self.fc1 = nn.Linear(input_size, hidden1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid-activated output for input batch ``x``."""
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        return self.sigmoid(self.fc3(x))

In [15]:
class CustomDataset(Dataset):
    """In-memory dataset pairing a float32 feature matrix with float32 labels.

    Args:
        features: Feature matrix with one row per sample. Accepts a NumPy
            array, a pandas DataFrame, or a SciPy sparse matrix (any object
            exposing ``toarray()``).
        labels: One numeric or boolean label per row. Accepts a pandas Series,
            a NumPy array, or a plain sequence.

    Raises:
        ValueError: If ``features`` and ``labels`` have different lengths.
    """

    def __init__(self, features, labels):
        # torch.tensor cannot ingest a sparse matrix directly; densify first.
        if hasattr(features, "toarray"):
            features = features.toarray()
        self.features = torch.tensor(np.asarray(features), dtype=torch.float32)
        # np.asarray handles Series, ndarrays and sequences alike, whereas
        # `.values` exists only on pandas objects. Values are taken by position,
        # so the Series index is irrelevant.
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.float32)

        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features has {len(self.features)} rows but labels has "
                f"{len(self.labels)} entries"
            )

    def __len__(self):
        """Number of samples (rows of the feature matrix)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]

In [11]:
# Columns that the notebook integer-encodes with pd.factorize (over train and
# test together) before building X / y.
# NOTE(review): "customer_country.1" looks like a duplicated CSV header as
# renamed by pandas on read — confirm against the raw file.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

In [12]:
# Integer-encode every label column with a vocabulary shared by train and test,
# so a given category value maps to the same code in both frames.
n_train = len(df_train)
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    # pd.factorize returns (codes, uniques); missing values receive the code -1.
    codes = pd.factorize(df_all[col])[0]
    df_all[col] = codes
    # Write back by position (plain arrays) instead of by index label: the first
    # n_train codes belong to the train rows and the rest to the test rows, no
    # matter what index the two frames carry.
    df_train[col] = codes[:n_train]
    df_test[col] = codes[n_train:]

# NOTE: this cell overwrites columns of df_train / df_test in place. Re-running
# it re-encodes the already encoded integers, so reload the CSVs before
# executing it a second time.

X = df_train.drop("is_converted", axis=1)
y = df_train["is_converted"]

# Hold out 20% of the rows for validation with a fixed seed.
# NOTE(review): the split is not stratified; consider stratify=y if the
# positive class is rare — confirm the class balance first.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Split the feature columns by dtype. Columns matching neither selector are not
# listed in the ColumnTransformer and are therefore dropped (its default is
# remainder='drop').
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Remaining text columns: fill missing values with the most frequent value,
# then one-hot encode. Categories unseen during fit encode as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    # OneHotEncoder produces sparse output, and with the default threshold the
    # stacked result may come back as a SciPy sparse matrix. The tensor
    # conversion further down needs a dense array, so always return dense.
    sparse_threshold=0,
)

# Fit on the training split only; the validation split reuses the fitted
# statistics and categories.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

In [16]:
# Wrap the processed splits as torch datasets.
train_dataset = CustomDataset(features=X_train_processed, labels=y_train)
val_dataset = CustomDataset(features=X_val_processed, labels=y_val)

# Same batch size for both loaders; only the training loader shuffles.
loader_options = {"batch_size": 64}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [17]:
# Seed torch's global RNG so the initial weights — and the shuffle order that
# the training DataLoader derives from the same global RNG by default — are
# repeatable after Restart & Run All. 42 matches the train/validation split seed.
torch.manual_seed(42)

# One input unit per column of the processed feature matrix.
input_size = X_train_processed.shape[1]
model = NeuralNetwork(input_size)

In [18]:
# Binary cross-entropy on probabilities: the model already ends in a sigmoid,
# so the loss takes its outputs as-is.
criterion = nn.BCELoss()

# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [19]:
num_epochs = 10
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing unit dimension produced by the
        # single-unit output layer. A bare squeeze() would also collapse a batch
        # of size 1 to a 0-d tensor, which no longer matches `labels` and makes
        # BCELoss raise.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # ---- validation pass (no gradients) ----
    model.eval()
    batch_outputs = []
    batch_labels = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            batch_outputs.append(model(inputs).squeeze(-1))
            batch_labels.append(labels)

    # Concatenate once per epoch instead of extending a Python list with
    # individual scalars.
    val_outputs = torch.cat(batch_outputs).numpy()
    val_labels = torch.cat(batch_labels).numpy()

    # Threshold the predicted probabilities at 0.5 to obtain class predictions.
    val_preds = (val_outputs > 0.5).astype(float)

    val_accuracy = accuracy_score(val_labels, val_preds)
    val_precision = precision_score(val_labels, val_preds)
    val_recall = recall_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds)

    print(f"Epoch {epoch+1}/{num_epochs}\n 정확도: {val_accuracy:.4f}\n 정밀도: {val_precision:.4f}\n 재현율: {val_recall:.4f}\n F1: {val_f1:.4f}")


Epoch 1/10
 정확도: 0.9210
 정밀도: 0.5863
 재현율: 0.1655
 F1: 0.2581
Epoch 2/10
 정확도: 0.9415
 정밀도: 0.8299
 재현율: 0.3716
 F1: 0.5133
Epoch 3/10
 정확도: 0.9469
 정밀도: 0.8515
 재현율: 0.4365
 F1: 0.5772
Epoch 4/10
 정확도: 0.9508
 정밀도: 0.8722
 재현율: 0.4782
 F1: 0.6177
Epoch 5/10
 정확도: 0.9520
 정밀도: 0.8478
 재현율: 0.5147
 F1: 0.6406
Epoch 6/10
 정확도: 0.9527
 정밀도: 0.8430
 재현율: 0.5289
 F1: 0.6500
Epoch 7/10
 정확도: 0.9535
 정밀도: 0.8581
 재현율: 0.5279
 F1: 0.6537
Epoch 8/10
 정확도: 0.9546
 정밀도: 0.8528
 재현율: 0.5472
 F1: 0.6667
Epoch 9/10
 정확도: 0.9533
 정밀도: 0.8231
 재현율: 0.5574
 F1: 0.6646
Epoch 10/10
 정확도: 0.9533
 정밀도: 0.8404
 재현율: 0.5401
 F1: 0.6576
