# Logistic Regression with PyTorch

## Dataset

[Pima Indians Diabetes Database](https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database)

In [None]:
# Load the Pima Indians Diabetes dataset from the local CSV file
import pandas as pd

dataset_df = pd.read_csv(filepath_or_buffer='./dataset/diabetes.csv')

# Preview the first five rows
dataset_df.head(n=5)

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)
dataset_df.info()

# Code

## Data Preprocessing

In [None]:
# Count the missing values in each column
missing_per_column = dataset_df.isnull().sum()
print(missing_per_column)

# Count the rows that are exact duplicates of an earlier row
duplicate_row_count = dataset_df.duplicated().sum()
print(duplicate_row_count)

In [None]:
# Review the dataset's summary statistics to look for anomalous values
dataset_df.describe()

## Outlier removal with Z-scores (conclusion: it is better not to clean the data this way)

In [None]:
import numpy as np

# Outlier detection uses the feature columns only. The binary 'Outcome' label
# is excluded: with an imbalanced label its z-score could exceed the threshold
# and silently drop every sample of the minority class.
feature_df = dataset_df.loc[:, dataset_df.columns != 'Outcome']

# Absolute z-score of every feature value: distance from the column mean,
# measured in units of the column's standard deviation
z_scores = np.abs((feature_df - feature_df.mean()) / feature_df.std())

# Z-score cut-off: values at least this many standard deviations away from the
# mean are treated as outliers
threshold = 3

# Keep only the rows in which every feature is below the threshold.
# NOTE: this overwrites dataset_df, so re-running this cell filters the already
# filtered data again; re-run the loading cell first to start from the raw data.
dataset_df = dataset_df[(z_scores < threshold).all(axis=1)]

In [None]:
# Inspect the dataset's summary statistics
dataset_df.describe()

## Splitting the dataset

In [None]:
# 'Outcome' is the target column; all remaining columns are used as features
y = dataset_df['Outcome'].values
X = dataset_df.drop(columns=['Outcome']).values

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all samples as the test set
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Step 2: split the remaining 80% once more (80% training / 20% validation),
# which gives roughly 64% / 16% / 20% of the data for train / val / test
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, random_state=42, test_size=0.2
)

# Report the shapes of the training, validation and test feature matrices
print(X_train.shape, X_val.shape, X_test.shape)

## Scaling the dataset

In [None]:
# Standardize the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the training split only, so that no statistics from the
# validation or test data leak into the transformation
X_train = scaler.fit_transform(X_train)

# Apply the already fitted transformation to the other two splits
X_val, X_test = (scaler.transform(split) for split in (X_val, X_test))

## Converting the dataset into PyTorch tensors

In [None]:
import torch

# torch.as_tensor accepts NumPy arrays as well as existing tensors, so this
# cell can be re-run safely; torch.from_numpy raises a TypeError when it is
# handed a tensor, which is what happened on a second execution of this cell.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_val = torch.as_tensor(X_val, dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.float32)

# Labels become float32 column vectors of shape (n_samples, 1) so that they
# match the shape of the model output
y_train = torch.as_tensor(y_train, dtype=torch.float32).reshape(-1, 1)
y_val = torch.as_tensor(y_val, dtype=torch.float32).reshape(-1, 1)
y_test = torch.as_tensor(y_test, dtype=torch.float32).reshape(-1, 1)

## Building the Model

In [None]:
import torch.nn as nn
import torch.optim as optim

# Logistic regression model definition
class LogisticRegressionModel(nn.Module):
    """Binary logistic regression: one linear layer followed by a sigmoid.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Single output unit that produces the logit
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        logits = self.linear(x)
        probabilities = torch.sigmoid(logits)
        return probabilities

# Training routine
def train_model(model, criterion, optimizer, X_train, y_train, X_val, y_val,
                num_epochs=100, log_every=10):
    """Train ``model`` and periodically report training and validation loss.

    Each epoch runs one forward/backward pass over the whole of ``X_train``
    (full-batch, no mini-batches) followed by a single optimizer step. Every
    ``log_every`` epochs the validation loss is evaluated without gradient
    tracking and a progress line is printed.

    Args:
        model: module mapping ``X_train`` / ``X_val`` to predictions.
        criterion: loss callable invoked as ``criterion(predictions, targets)``.
        optimizer: optimizer that updates the parameters of ``model``.
        X_train, y_train: training inputs and targets.
        X_val, y_val: validation inputs and targets.
        num_epochs: number of optimizer steps to perform.
        log_every: logging interval in epochs (default 10, the previously
            hard-coded value).

    Returns:
        dict with the keys ``'epoch'``, ``'train_loss'`` and ``'val_loss'``;
        each maps to a list holding one entry per logged epoch. The training
        loss is the value computed before that epoch's optimizer step, the
        validation loss is computed after it.

    Raises:
        ValueError: if ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError('log_every must be a positive integer')

    history = {'epoch': [], 'train_loss': [], 'val_loss': []}

    for epoch in range(num_epochs):
        model.train()

        # Forward pass on the full training set
        outputs = model(X_train)
        loss = criterion(outputs, y_train)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Periodically evaluate the loss on the validation set
        if (epoch + 1) % log_every == 0:
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_val)
                val_loss = criterion(val_outputs, y_val)
            history['epoch'].append(epoch + 1)
            history['train_loss'].append(loss.item())
            history['val_loss'].append(val_loss.item())
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

    return history

# 定義測試函數
def test_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        predicted = (outputs >= 0.5).float()
        accuracy = (predicted == y_test).float().mean()
        print(f'Test Accuracy: {accuracy.item():.4f}')

## Train

In [None]:
# Seed PyTorch's RNG so the initial model weights, and therefore the reported
# losses and accuracy, are reproducible from run to run
torch.manual_seed(42)

input_dim = X_train.shape[1]
model = LogisticRegressionModel(input_dim)

criterion = nn.BCELoss()  # binary cross-entropy loss
# SGD optimizer; train_model passes the whole training set in every step, so
# this is effectively full-batch gradient descent
optimizer = optim.SGD(model.parameters(), lr=0.0001, weight_decay=0.0001)

# Train the model
train_model(model, criterion, optimizer, X_train, y_train, X_val, y_val, num_epochs=100000)

# Evaluate the model on the test set
test_model(model, X_test, y_test)

In [None]:
# Seed PyTorch's RNG so the initial model weights, and therefore the reported
# losses and accuracy, are reproducible from run to run
torch.manual_seed(42)

input_dim = X_train.shape[1]
model = LogisticRegressionModel(input_dim)

criterion = nn.BCELoss()  # binary cross-entropy loss
# Adam optimizer with L2 weight decay
optimizer = optim.Adam(model.parameters(), lr=0.00001, weight_decay=0.0001)

# Train the model
train_model(model, criterion, optimizer, X_train, y_train, X_val, y_val, num_epochs=100000)

# Evaluate the model on the test set
test_model(model, X_test, y_test)


In [None]:
# Seed PyTorch's RNG so the initial model weights, and therefore the reported
# losses and accuracy, are reproducible from run to run
torch.manual_seed(42)

input_dim = X_train.shape[1]
model = LogisticRegressionModel(input_dim)

criterion = nn.BCELoss()  # binary cross-entropy loss
# RMSprop optimizer with L2 weight decay
optimizer = optim.RMSprop(model.parameters(), lr=0.00001, weight_decay=0.0001)

# Train the model
train_model(model, criterion, optimizer, X_train, y_train, X_val, y_val, num_epochs=100000)

# Evaluate the model on the test set
test_model(model, X_test, y_test)

In [None]:
# Seed PyTorch's RNG so the initial model weights, and therefore the reported
# losses and accuracy, are reproducible from run to run
torch.manual_seed(42)

input_dim = X_train.shape[1]
model = LogisticRegressionModel(input_dim)

criterion = nn.BCELoss()  # binary cross-entropy loss
# Adagrad optimizer with L2 weight decay
optimizer = optim.Adagrad(model.parameters(), lr=0.00001, weight_decay=0.0001)

# Train the model
train_model(model, criterion, optimizer, X_train, y_train, X_val, y_val, num_epochs=100000)

# Evaluate the model on the test set
test_model(model, X_test, y_test)