# Kaggle Project

## Describe Your Dataset

**URL:** https://www.kaggle.com/datasets/kyr7plus/emg-4/

**Task:**

1. 필요한 library 및 데이터 불러오기 
2. 데이터 세트 분할하기 
3. 모델 선택하기(Neaural network model)
4. 각 모델 훈련 및 성능 평가하기
5. 최종 테스트 데이터로 모델 평가하기
6. 결론(이전 성능과 현재 성능 비교하기)

**Data:**

이 데이터는 MYO라고 불리는 근전도 센서의 데이터로 64개의 센서 값인 X 데이터와 65번째 열에는 rock =0, scissors = 1, paper = 2, ok = 3으로 총 4개의 데이터로 구성되어 있다.
다중 분류를 사용하여 rock, scissors, paper and ok 중에 어느 제스처에 속하는지 분류하는 것이다. 
Pytorch의 Neural Network를 이용하여 모델을 형성하고 평가하고자 한다. 

**Datasets**

* Train dataset: 7473개 (64%)
* Validation dataset: 1869개 (16%)
* Test dataset: 2336개 (20%)

**Features(x):**

64개의 센서 값

**Target(y):**

rock = 0
scissors = 1
paper = 2
ok = 3

---

## Build Your Model

### Data preprocessing

In [38]:
# 필요한 library 불러오기
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import torch.nn.functional as F
import numpy as np

In [3]:
# 데이터 불러오기
rock_pos=pd.read_csv('0.csv', header=None)
scis_pos=pd.read_csv('1.csv', header=None)
paper_pos=pd.read_csv('2.csv', header=None)
ok_pos=pd.read_csv('3.csv', header=None)

In [4]:
# 데이터 shape 보기
rock_pos.shape

(2910, 65)

In [5]:
scis_pos.shape

(2903, 65)

In [6]:
paper_pos.shape

(2943, 65)

In [7]:
ok_pos.shape

(2922, 65)

In [8]:
# 데이터 head 보기
rock_pos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,26.0,4.0,5.0,8.0,-1.0,-13.0,-109.0,-66.0,-9.0,2.0,...,-28.0,61.0,4.0,8.0,5.0,4.0,-7.0,-59.0,16.0,0
1,-47.0,-6.0,-5.0,-7.0,13.0,-1.0,35.0,-10.0,10.0,-4.0,...,-25.0,47.0,6.0,6.0,5.0,13.0,21.0,111.0,15.0,0
2,-19.0,-8.0,-8.0,-8.0,-21.0,-6.0,-79.0,12.0,0.0,5.0,...,-83.0,7.0,7.0,1.0,-8.0,7.0,21.0,114.0,48.0,0
3,2.0,3.0,0.0,2.0,0.0,22.0,106.0,-14.0,-16.0,-2.0,...,-38.0,-11.0,4.0,7.0,11.0,33.0,39.0,119.0,43.0,0
4,6.0,0.0,0.0,-2.0,-14.0,10.0,-51.0,5.0,7.0,0.0,...,38.0,-35.0,-8.0,2.0,6.0,-13.0,-24.0,-112.0,-69.0,0


In [9]:
scis_pos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,-7.0,-1.0,-1.0,0.0,-10.0,-10.0,-1.0,1.0,-5.0,-5.0,...,6.0,-4.0,-3.0,-5.0,-3.0,15.0,11.0,-4.0,-5.0,1
1,-6.0,-2.0,-5.0,-2.0,27.0,42.0,3.0,5.0,11.0,1.0,...,2.0,-17.0,-5.0,-7.0,-2.0,15.0,12.0,0.0,-7.0,1
2,5.0,0.0,-1.0,-2.0,2.0,-9.0,1.0,5.0,1.0,-3.0,...,-11.0,-23.0,1.0,-1.0,-1.0,-23.0,-25.0,2.0,-1.0,1
3,31.0,4.0,2.0,-2.0,38.0,14.0,2.0,7.0,-2.0,4.0,...,-4.0,13.0,2.0,-1.0,-3.0,-7.0,0.0,-3.0,-2.0,1
4,-4.0,-4.0,3.0,3.0,-25.0,-46.0,-1.0,3.0,-7.0,-6.0,...,15.0,4.0,-4.0,-1.0,-1.0,7.0,26.0,3.0,-7.0,1


In [10]:
paper_pos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,4.0,19.0,-9.0,-7.0,-3.0,-36.0,-6.0,-23.0,3.0,-21.0,...,9.0,-14.0,-2.0,-3.0,-4.0,-21.0,7.0,-8.0,-12.0,2
1,-1.0,12.0,20.0,7.0,20.0,-73.0,-4.0,-2.0,4.0,5.0,...,-3.0,-5.0,-2.0,0.0,-4.0,-7.0,5.0,6.0,9.0,2
2,4.0,5.0,-8.0,-2.0,10.0,-10.0,-10.0,16.0,-3.0,-18.0,...,2.0,0.0,2.0,3.0,-8.0,19.0,20.0,0.0,-8.0,2
3,-3.0,-3.0,5.0,11.0,25.0,-20.0,-2.0,14.0,9.0,32.0,...,15.0,-2.0,16.0,6.0,9.0,1.0,31.0,16.0,4.0,2
4,-5.0,-9.0,-2.0,-5.0,-46.0,-34.0,-9.0,-4.0,0.0,8.0,...,-5.0,-11.0,-6.0,7.0,6.0,-10.0,-24.0,-6.0,0.0,2


In [11]:
ok_pos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,-22.0,-9.0,-6.0,-1.0,21.0,26.0,-2.0,5.0,5.0,0.0,...,-10.0,-15.0,-6.0,-3.0,3.0,20.0,25.0,1.0,1.0,3
1,-7.0,0.0,1.0,0.0,-13.0,-18.0,0.0,-9.0,-6.0,-2.0,...,-1.0,11.0,-4.0,-5.0,-4.0,3.0,-8.0,-7.0,-3.0,3
2,-6.0,-6.0,-6.0,-8.0,25.0,43.0,-4.0,7.0,6.0,1.0,...,3.0,2.0,1.0,3.0,-1.0,1.0,9.0,4.0,4.0,3
3,2.0,1.0,1.0,1.0,0.0,-23.0,2.0,-6.0,-1.0,-2.0,...,16.0,-3.0,-1.0,-1.0,-3.0,0.0,-3.0,4.0,3.0,3
4,0.0,3.0,4.0,5.0,1.0,25.0,-9.0,0.0,0.0,1.0,...,1.0,6.0,2.0,1.0,-3.0,-20.0,-42.0,-4.0,2.0,3


In [12]:
# 데이터 결합하기
df=pd.concat([rock_pos,scis_pos,paper_pos,ok_pos])

In [13]:
# 결측치 확인하기
missing_values = df.isnull().sum()
print(missing_values)

columns_with_missing_values = missing_values[missing_values > 0].index
if not columns_with_missing_values.empty:
    print(f"결측치가 있는 열: {columns_with_missing_values}")
else:
    print("결측치가 있는 열이 없습니다.")

0     0
1     0
2     0
3     0
4     0
     ..
60    0
61    0
62    0
63    0
64    0
Length: 65, dtype: int64
결측치가 있는 열이 없습니다.


### Model Construction

In [14]:
# 입력 변수와 대상 변수로 데이터 분할
X=df.drop([64],axis=1)
Y=df.drop(df.index[0:64],axis=1)

In [15]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,26.0,4.0,5.0,8.0,-1.0,-13.0,-109.0,-66.0,-9.0,2.0,...,21.0,-28.0,61.0,4.0,8.0,5.0,4.0,-7.0,-59.0,16.0
1,-47.0,-6.0,-5.0,-7.0,13.0,-1.0,35.0,-10.0,10.0,-4.0,...,-105.0,-25.0,47.0,6.0,6.0,5.0,13.0,21.0,111.0,15.0
2,-19.0,-8.0,-8.0,-8.0,-21.0,-6.0,-79.0,12.0,0.0,5.0,...,-128.0,-83.0,7.0,7.0,1.0,-8.0,7.0,21.0,114.0,48.0
3,2.0,3.0,0.0,2.0,0.0,22.0,106.0,-14.0,-16.0,-2.0,...,-54.0,-38.0,-11.0,4.0,7.0,11.0,33.0,39.0,119.0,43.0
4,6.0,0.0,0.0,-2.0,-14.0,10.0,-51.0,5.0,7.0,0.0,...,60.0,38.0,-35.0,-8.0,2.0,6.0,-13.0,-24.0,-112.0,-69.0


In [16]:
Y.head()

Unnamed: 0,64
0,0
1,0
2,0
3,0
4,0


In [17]:
# 데이터셋 분리
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [19]:
# 데이터셋 크기 확인
print(f"훈련 세트 크기: {len(X_train)}")
print(f"검증 세트 크기: {len(X_val)}")
print(f"테스트 세트 크기: {len(X_test)}")

훈련 세트 크기: 7473
검증 세트 크기: 1869
테스트 세트 크기: 2336


In [18]:
# 데이터 정규화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [23]:
# PyTorch Tensor로 변환
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values.flatten(), dtype=torch.long)

X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values.flatten(), dtype=torch.long)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values.flatten(), dtype=torch.long)

In [26]:
# TensorDataset 생성
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [32]:
# DataLoader 생성
batch_size = 64

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

### Train Model & Select Model

In [33]:
# 신경망 모델 정의
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [34]:
# y_train이 DataFrame이면 Series로 변환
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.squeeze()
    
# y_train을 torch.LongTensor로 변환
y_train = torch.LongTensor(y_train.values)

In [35]:
# 모델 인스턴스 생성
input_size = X_train.shape[1]
hidden_size = 128
output_size = len(np.unique(y_train))

model = NeuralNetwork(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [36]:
# 모델 훈련
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.squeeze())
        loss.backward()
        optimizer.step()

    # Validation 데이터 평가
    model.eval()
    val_outputs = []
    val_labels = []
    with torch.no_grad():
        for val_inputs, val_labels_batch in val_loader:
            val_outputs_batch = model(val_inputs)
            val_outputs.extend(val_outputs_batch.argmax(dim=1).cpu().numpy())
            val_labels.extend(val_labels_batch.cpu().numpy())

    val_accuracy = accuracy_score(val_labels, val_outputs)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Validation Accuracy: {val_accuracy:.4f}')

Epoch [1/10], Validation Accuracy: 0.6276
Epoch [2/10], Validation Accuracy: 0.8068
Epoch [3/10], Validation Accuracy: 0.8604
Epoch [4/10], Validation Accuracy: 0.8866
Epoch [5/10], Validation Accuracy: 0.9026
Epoch [6/10], Validation Accuracy: 0.9058
Epoch [7/10], Validation Accuracy: 0.9074
Epoch [8/10], Validation Accuracy: 0.9096
Epoch [9/10], Validation Accuracy: 0.9192
Epoch [10/10], Validation Accuracy: 0.9208


In [39]:
# 테스트 데이터로 예측
model.eval()
test_outputs = []
test_labels = []
with torch.no_grad():
    for test_inputs, test_labels_batch in test_loader:
        test_outputs_batch = model(test_inputs)
        test_outputs.extend(test_outputs_batch.argmax(dim=1).cpu().numpy())
        test_labels.extend(test_labels_batch.cpu().numpy())

test_accuracy = accuracy_score(test_labels, test_outputs)
print(f'Test Accuracy: {test_accuracy:.4f}')

# 분류 보고서 생성
report = classification_report(test_labels, test_outputs)

# 출력
print("Classification Report:")
print(report)

Test Accuracy: 0.9234
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       590
           1       0.91      0.95      0.93       603
           2       0.91      0.89      0.90       565
           3       0.90      0.88      0.89       578

    accuracy                           0.92      2336
   macro avg       0.92      0.92      0.92      2336
weighted avg       0.92      0.92      0.92      2336



### 중간과제 데이터 성능
Logistic Regression Test Data Acc(solver=lbfgs): 0.3595890410958904
Logistic Regression Test Data Acc(solver=newton-cg): 0.3583047945205479
Logistic Regression Test Data Acc(solver=saga): 0.3505993150684932

Decision Tree Test Data Acc(depth=5): 0.6001712328767124
Decision Tree Test Data Acc(depth=10): 0.7448630136986302
Decision Tree Test Data Acc(depth=15): 0.769263698630137

SVC Test Data Acc(kernel=linear): 0.3493150684931507
SVC Test Data Acc(kernel=rbf): 0.8621575342465754

### 기말과제 데이터 성능
Neural Network Data ACC: 0.9234

### 결론
중간과제의 SVC Test Data ACC: 0.8621575342465754 보다 기말과제의 Neural Network Data ACC: 0.9234 이 성능이 향상되었음을 알 수 있다. 