In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


df = pd.read_csv("anxiety_attack_dataset.csv")

In [None]:
df

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
# Remove the 'ID' column. Re-assigning instead of using inplace=True, together
# with errors='ignore', keeps this cell safe to re-run once 'ID' is gone.
df = df.drop(columns='ID', errors='ignore')
df

In [None]:
# Categorical columns that are replaced by integer codes.
columns = [
    'Gender',
    'Occupation',
    'Smoking',
    'Family History of Anxiety',
    'Dizziness',
    'Medication',
    'Recent Major Life Event',
]

# A single encoder is re-fitted for every column, so the label -> code table
# is captured right after each fit, before the next one overwrites it.
le = LabelEncoder()
mappings = {}
for column in columns:
    df[column] = le.fit_transform(df[column])
    codes = le.transform(le.classes_)
    mappings[column] = dict(zip(le.classes_, codes))

# check mappings
for column in mappings:
    print(f"Mapping for {column}: {mappings[column]}")

In [None]:
df

---

## visualization & data engineering

In [8]:
def visualize_column(df, column_name):
    """Draw a bar chart of how often each distinct value of a column occurs.

    Args:
        df: DataFrame that holds the column to plot.
        column_name: Name of the column whose value counts are plotted.

    The chart is drawn on a newly created figure; nothing is returned.
    """
    counts = df[column_name].value_counts().sort_index()

    # Width grows by 0.5 per bar but never drops below 6; height is fixed at 6.
    width = max(6, len(counts) * 0.5)
    fig, ax = plt.subplots(figsize=(width, 6))

    counts.plot(kind='bar', edgecolor='black', ax=ax)
    ax.set_title(f'Count of Each Unique {column_name}')
    ax.set_xlabel(f'{column_name}')
    ax.set_ylabel('Count')

    # Tilt the x tick labels.
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)

    # Re-fit the layout so the labels are not clipped.
    fig.tight_layout()


In [None]:
unique_ages = df['Age'].value_counts().sort_index()
unique_ages

I don't know why the ages are this 'discrete'; in a large-scale dataset we would usually see a 'pseudo-continuous' distribution.

In [None]:
visualize_column(df, 'Age')

In [None]:
# Histogram of ages in 20 bins, labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df["Age"], bins=20, edgecolor='black')
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

That is not what I would call healthy data.

I want it better formed. To do so, I group the values into classes using KMeans. To reuse this for other columns, I wrap it in a function.

In [None]:
import numpy as np
from sklearn.cluster import KMeans

def auto_cluster(df, column_name, cluster_column_name, verbose=False,
                 n_clusters=5, random_state=42):
    """Bucket a numeric column into KMeans clusters stored in a new column.

    Args:
        df: DataFrame to cluster. It is modified in place (the cluster column
            is added or overwritten) and also returned.
        column_name: Name of the numeric column to cluster on.
        cluster_column_name: Name of the column that receives the cluster index.
        verbose: When True, print the value/cluster pairs and the per-cluster
            descriptive statistics, and plot the cluster sizes.
        n_clusters: Number of clusters to form. Defaults to 5, the value that
            used to be hard-coded.
        random_state: Seed handed to KMeans so the clustering is repeatable.

    Returns:
        The same DataFrame object with ``cluster_column_name`` filled in.
        Cluster indices are ordered by centroid: 0 is the cluster with the
        smallest centre and ``n_clusters - 1`` the one with the largest.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    raw_labels = kmeans.fit_predict(df[[column_name]])

    # KMeans numbers its clusters arbitrarily. Remap each raw id to the rank of
    # its centroid so the stored index is ordinal in the underlying value.
    centers = kmeans.cluster_centers_.ravel()
    rank_of_label = np.empty(len(centers), dtype=raw_labels.dtype)
    rank_of_label[np.argsort(centers)] = np.arange(len(centers))
    df[cluster_column_name] = rank_of_label[raw_labels]

    if verbose:
        # Print the original column and the cluster column
        print(df[[column_name, cluster_column_name]])
        print('\n')

        # Print the descriptive statistics for each cluster
        print(df.groupby(cluster_column_name)[column_name].describe())

        # Visualize the clusters
        visualize_column(df, cluster_column_name)

    return df

# Example usage
df = auto_cluster(df, 'Age', 'Class of Age', verbose=True)

---

Let's visualize them all.

In [None]:
visualize_column(df, 'Gender')

Oops, there is a third gender category besides the two binary ones (non-binary).

I want to check whether the non-binary group is more likely to have anxiety.

In [None]:
# Rows whose encoded Gender equals 2.
# NOTE(review): 2 is presumably the code LabelEncoder assigned to the third
# gender category — confirm against the printed "Mapping for Gender" output.
non_bisexuals = df[df['Gender'] == 2]

# Bar chart of the severity scores within that subset.
visualize_column(non_bisexuals, 'Severity of Anxiety Attack (1-10)')

In [None]:
# LabelEncoder assigns codes in sorted order of the original labels, so the
# meaning of each code is read from `mappings` rather than assumed in comments.
severity_column = 'Severity of Anxiety Attack (1-10)'
gender_label_by_code = {code: label for label, code in mappings['Gender'].items()}

# Same order as before: code 2 first, then codes 0 and 1.
for gender_code in (2, 0, 1):
    print(f"Gender: {gender_label_by_code.get(gender_code, gender_code)}")
    print(df[df['Gender'] == gender_code][severity_column].describe())
    print('\n')

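Interestingly, the non-binary group appears more likely to have anxiety than the binary groups. Meanwhile, females appear more likely to have anxiety than males — but double-check which gender each code stands for against the printed `Mapping for Gender` output before relying on this.

You can see it from the mean and std.

Perhaps men are simply less bothered by bad things?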

---

In [None]:
df

In [None]:
visualize_column(df, 'Sleep Hours')

Seems too 'average'.

In [None]:
df = auto_cluster(df, 'Sleep Hours', 'Class of Sleep Hours', verbose=True)

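Here we can see that short sleep durations are class 0, followed by classes 1 and 2, which are the medium and long durations (check the per-cluster statistics printed above to confirm which class covers which range).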

---

In [None]:
visualize_column(df, 'Physical Activity (hrs/week)')

We do the same trick on Physical Activity Duration.

In [None]:
df = auto_cluster(df, 'Physical Activity (hrs/week)', 'Class of Physical Activity', verbose=True)

In [None]:
# (source column, new cluster column) pairs, clustered in this order:
# caffeine intake, alcohol consumption, therapy sessions.
cluster_targets = [
    ('Caffeine Intake (mg/day)', 'Class of Caffeine Intake'),
    ('Alcohol Consumption (drinks/week)', 'Class of Alcohol Consumption'),
    ('Therapy Sessions (per month)', 'Class of Therapy Sessions'),
]

for source_column, class_column in cluster_targets:
    df = auto_cluster(df, source_column, class_column, verbose=True)

---

In [None]:
df.head(2)

---

## Train

In [23]:
from sklearn.model_selection import train_test_split

# Target column; subtracting 1 below turns a score of 1 into label 0.
target_column = 'Severity of Anxiety Attack (1-10)'

# columns to drop:
# the target itself plus the raw numeric columns, whose "Class of ..."
# counterparts stay in X. Edit this list to change the feature set.
columns_to_drop = [
    target_column,
    'Age',
    'Sleep Hours',
    'Physical Activity (hrs/week)',
    'Caffeine Intake (mg/day)',
    'Alcohol Consumption (drinks/week)',
    'Therapy Sessions (per month)',
]

X = df.drop(columns=columns_to_drop)
y = df[target_column] - 1

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


---

### MLP

In [None]:
import torch.nn as nn
from KAN import KANLinear

class MLP(nn.Module):
    """Multi-layer perceptron: hidden layers with ReLU, then a linear output.

    Args:
        input_size: Number of input features.
        hidden_sizes: Iterable with the width of each hidden layer, in order.
        output_size: Width of the final layer.
        use_kan: If True the hidden layers are ``KANLinear`` modules imported
            from ``KAN``; otherwise they are ``torch.nn.Linear``. The output
            layer is always ``torch.nn.Linear``.
    """

    def __init__(self, input_size, hidden_sizes, output_size, use_kan=False):
        super().__init__()

        # Choose the layer type for the hidden stack; KAN is only imported
        # when it is actually requested.
        if use_kan:
            from KAN import KANLinear as hidden_layer_cls
        else:
            hidden_layer_cls = nn.Linear

        # Hidden layers, each followed by a ReLU.
        modules = []
        in_features = input_size
        for out_features in hidden_sizes:
            modules += [hidden_layer_cls(in_features, out_features), nn.ReLU()]
            in_features = out_features

        # Output layer.
        modules.append(nn.Linear(in_features, output_size))

        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Pass ``x`` through the sequential network and return its output."""
        return self.network(x)

# DIY hidden layers
hidden_sizes = [16, 32, 48, 32, 16]

mlp = MLP(input_size=X_train.shape[1], hidden_sizes=hidden_sizes, output_size=10, use_kan=True)

mlp

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm.auto import tqdm

# Seed torch so weight initialisation and batch shuffling repeat across runs.
torch.manual_seed(42)

# Convert the pandas splits to tensors: float features, integer class labels.
X_train_tensor = torch.FloatTensor(X_train.values)
y_train_tensor = torch.LongTensor(y_train.values)
X_test_tensor = torch.FloatTensor(X_test.values)
y_test_tensor = torch.LongTensor(y_test.values)

# Only the training loader shuffles. The held-out test split is used as the
# validation set in the loop below.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

criterion = nn.CrossEntropyLoss()
mlp = MLP(input_size=X_train.shape[1], hidden_sizes=hidden_sizes, output_size=10, use_kan=True)

optimizer = optim.Adam(mlp.parameters(), lr=0.001)

num_epochs = 100
for epoch in range(num_epochs):
    # ---- training pass ----
    mlp.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    train_log_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch', leave=False, mininterval=0.1, miniters=1, dynamic_ncols=True)

    for batch_X, batch_y in train_log_bar:
        optimizer.zero_grad()
        outputs = mlp(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(outputs, 1)
        batch_size = batch_y.size(0)
        batch_loss = loss.item()
        batch_acc = (predicted == batch_y).float().mean().item()

        # The criterion returns a per-batch mean. Weight it by the batch size
        # so a smaller final batch does not skew the epoch average.
        train_loss += batch_loss * batch_size
        train_total += batch_size
        train_correct += (predicted == batch_y).sum().item()

        train_log_bar.set_postfix(loss=batch_loss, acc=batch_acc)

    train_accuracy = train_correct / train_total
    train_loss /= train_total

    # ---- validation pass (no gradient tracking) ----
    mlp.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    val_log_bar = tqdm(test_loader, desc='Validation', unit='batch', leave=False, mininterval=0.1, miniters=1, dynamic_ncols=True)

    with torch.no_grad():
        for batch_X, batch_y in val_log_bar:
            outputs = mlp(batch_X)
            loss = criterion(outputs, batch_y)

            _, predicted = torch.max(outputs, 1)
            batch_size = batch_y.size(0)
            batch_loss = loss.item()
            batch_acc = (predicted == batch_y).float().mean().item()

            val_loss += batch_loss * batch_size
            val_total += batch_size
            val_correct += (predicted == batch_y).sum().item()

            val_log_bar.set_postfix(loss=batch_loss, acc=batch_acc)

    val_accuracy = val_correct / val_total
    val_loss /= val_total

    # Report on the first epoch and every 10th epoch after it.
    if epoch % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

---

### Transformer

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim

class TransformerClassifier(nn.Module):
    """Classifier that passes each sample through a Transformer encoder.

    Every input row is projected to ``d_model`` features, fed to the encoder
    as a sequence of length one, and mapped to ``output_size`` scores.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output scores per sample.
        d_model: Feature width used inside the encoder.
        nhead: Number of attention heads in each encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each encoder layer's feed-forward part.
        dropout: Dropout rate used inside the encoder layers.
    """

    def __init__(self, input_size, output_size, d_model=64, nhead=4, num_encoder_layers=2, dim_feedforward=256, dropout=0.1):
        super().__init__()

        # Linear layer that maps the input features to d_model dimensions.
        self.input_proj = nn.Linear(input_size, d_model)

        # Transformer encoder: a stack of identical encoder layers.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
            ),
            num_layers=num_encoder_layers,
        )

        # Classification head.
        self.classifier = nn.Linear(d_model, output_size)

    def forward(self, x):
        """Return the classifier scores for a batch of feature rows."""
        projected = self.input_proj(x)

        # The encoder expects (seq_len, batch_size, d_model), so a leading
        # sequence axis of length one is added.
        encoded = self.transformer_encoder(projected.unsqueeze(0))

        # The first (and only) time step is what gets classified.
        return self.classifier(encoded[0])

In [None]:

# Set the model dimensions before their first use so this cell runs on a fresh
# kernel (the training cell below assigns the same values again).
input_size = X_train.shape[1]
output_size = 10

model = TransformerClassifier(input_size=input_size, output_size=output_size)

model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm.auto import tqdm

# Seed torch so weight initialisation, dropout and batch shuffling repeat
# across runs.
torch.manual_seed(42)

# Convert the data to PyTorch tensors: float features, integer class labels.
X_train_tensor = torch.FloatTensor(X_train.values)
y_train_tensor = torch.LongTensor(y_train.values)
X_test_tensor = torch.FloatTensor(X_test.values)
y_test_tensor = torch.LongTensor(y_test.values)

# Build the data loaders. Only the training loader shuffles; the held-out test
# split is used as the validation set in the loop below.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Loss function.
criterion = nn.CrossEntropyLoss()

# Initialise the TransformerClassifier.
input_size = X_train.shape[1]
output_size = 10
model = TransformerClassifier(input_size=input_size, output_size=output_size)

# Optimizer.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model.
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    train_log_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch', leave=False, mininterval=0.1, miniters=1, dynamic_ncols=True)

    for batch_X, batch_y in train_log_bar:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(outputs, 1)
        batch_size = batch_y.size(0)
        batch_loss = loss.item()
        batch_acc = (predicted == batch_y).float().mean().item()

        # The criterion returns a per-batch mean. Weight it by the batch size
        # so a smaller final batch does not skew the epoch average.
        train_loss += batch_loss * batch_size
        train_total += batch_size
        train_correct += (predicted == batch_y).sum().item()

        train_log_bar.set_postfix(loss=batch_loss, acc=batch_acc)

    train_accuracy = train_correct / train_total
    train_loss /= train_total

    # Validate the model (no gradient tracking).
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    val_log_bar = tqdm(test_loader, desc='Validation', unit='batch', leave=False, mininterval=0.1, miniters=1, dynamic_ncols=True)

    with torch.no_grad():
        for batch_X, batch_y in val_log_bar:
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            _, predicted = torch.max(outputs, 1)
            batch_size = batch_y.size(0)
            batch_loss = loss.item()
            batch_acc = (predicted == batch_y).float().mean().item()

            val_loss += batch_loss * batch_size
            val_total += batch_size
            val_correct += (predicted == batch_y).sum().item()

            val_log_bar.set_postfix(loss=batch_loss, acc=batch_acc)

    val_accuracy = val_correct / val_total
    val_loss /= val_total

    # Report the training and validation metrics after every epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')