# Read Dataset


In [14]:
import pandas as pd

# Load the factor CSV (change the path if the file lives somewhere else)
df_factors = pd.read_csv("F-F_Research_Data_5_Factors_2x3_daily.CSV")

# Preview the raw frame while Date is still an ordinary column
print(df_factors.head())

# The frame has 7 columns: [Date, Mkt-RF, SMB, HML, RMW, CMA, RF].
# Move Date into the index so only the six factor columns remain as data.
df_factors = df_factors.set_index('Date')

print(df_factors.head())

       Date  Mkt-RF   SMB   HML   RMW   CMA     RF
0  19630701   -0.67  0.02 -0.35  0.03  0.13  0.012
1  19630702    0.79 -0.28  0.28 -0.08 -0.21  0.012
2  19630703    0.63 -0.18 -0.10  0.13 -0.25  0.012
3  19630705    0.40  0.09 -0.28  0.07 -0.30  0.012
4  19630708   -0.63  0.07 -0.20 -0.27  0.06  0.012
          Mkt-RF   SMB   HML   RMW   CMA     RF
Date                                           
19630701   -0.67  0.02 -0.35  0.03  0.13  0.012
19630702    0.79 -0.28  0.28 -0.08 -0.21  0.012
19630703    0.63 -0.18 -0.10  0.13 -0.25  0.012
19630705    0.40  0.09 -0.28  0.07 -0.30  0.012
19630708   -0.63  0.07 -0.20 -0.27  0.06  0.012


In [16]:
import torch
from torch.utils.data import Dataset, DataLoader

class FamaFrenchDataset(Dataset):
    """Torch dataset exposing each dataframe row as features with a +1/-1 label.

    Every column of the dataframe is used as a feature. The label is a
    demonstration target derived from the "Mkt-RF" column: +1 where the value
    is strictly positive, -1 otherwise.

    NOTE(review): the label is computed from "Mkt-RF", which is also one of
    the features, so a model can read the answer directly from its input.
    That is acceptable for a smoke test only; in practice the label would come
    from another source (asset returns, etc.).
    """

    def __init__(self, df):
        """
        df: a pandas dataframe with numeric columns that include "Mkt-RF",
            e.g. [Mkt-RF, SMB, HML, RMW, CMA, RF]

        Raises KeyError if the dataframe has no "Mkt-RF" column.
        """
        # Resolve the label column first so a missing column fails before any tensor work
        mktrf_col_index = df.columns.get_loc("Mkt-RF")

        # Convert entire dataframe to a float64 tensor, one row per observation
        self.features = torch.tensor(df.to_numpy(), dtype=torch.float64)

        # Simplistic demonstration rule:
        # if (Mkt-RF) > 0 => label = +1, else label = -1
        # The +1/-1 values are built from the feature column itself so the labels
        # share its dtype (float64) instead of falling back to torch's default
        # float dtype, which would leave features and labels mismatched.
        mkt_rf_values = self.features[:, mktrf_col_index]
        positive = torch.ones_like(mkt_rf_values)
        self.labels = torch.where(mkt_rf_values > 0, positive, -positive)

    def __len__(self):
        """Number of rows (observations) in the dataset."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return (features, label) for row idx."""
        x = self.features[idx]  # shape: [6] if 6 columns
        y = self.labels[idx]    # scalar in {+1, -1}
        return x, y

# Wrap the factor frame in the torch dataset
dataset = FamaFrenchDataset(df_factors)

# Batches of 32 rows, reshuffled on every pass over the loader
data_loader = DataLoader(dataset, shuffle=True, batch_size=32)

# Sanity check: pull a single batch and report the tensor shapes
first_batch = next(iter(data_loader), None)
if first_batch is not None:
    batch_x, batch_y = first_batch
    print("Features shape:", batch_x.shape)
    print("Labels shape:", batch_y.shape)

Features shape: torch.Size([32, 6])
Labels shape: torch.Size([32])


In [19]:
import torch.nn as nn
import torch.optim as optim

class SVMModel(nn.Module):
    """Linear scorer that maps each feature row to a single real-valued score."""

    def __init__(self, input_dim):
        """input_dim: number of features in each input row."""
        super().__init__()
        # Single affine layer producing one score per row
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return scores of shape [..., 1] for inputs of shape [..., input_dim]."""
        scores = self.linear(x)
        return scores

def hinge_loss(scores, labels):
    """Mean hinge loss max(0, 1 - score * label) over a batch.

    Args:
        scores: raw model outputs, e.g. shape [batch] or [batch, 1].
        labels: targets in {+1, -1}, one per sample.

    Returns:
        A 0-dim tensor holding the mean loss.
    """
    if labels.numel() == scores.numel():
        # One label per score: align the shapes exactly. Without this, 1-D scores
        # of shape [batch] multiplied by a [batch, 1] label column would broadcast
        # to a [batch, batch] matrix and silently produce a wrong loss.
        labels = labels.reshape(scores.shape)
    else:
        # Several scores per sample: keep labels as a column so they broadcast
        # across the score columns.
        labels = labels.reshape(-1, 1)
    return torch.clamp(1 - scores * labels, min=0).mean()

def train_factors_svm(data_loader, num_features=6, epochs=10, lr=1e-2):
    """Fit a linear model with hinge loss using Adam, printing the loss per epoch.

    Args:
        data_loader: iterable yielding (features, labels) batches; features are
            fed to the model as [batch_size, num_features] and labels are the
            +1/-1 targets passed to hinge_loss.
        num_features: number of input features (input size of the linear layer).
        epochs: number of full passes over data_loader.
        lr: Adam learning rate.

    Returns:
        The trained SVMModel, with float64 parameters.

    NOTE(review): the objective is the plain hinge loss with no weight penalty,
    so this is an unregularised linear classifier rather than a margin-
    regularised SVM.
    """
    model = SVMModel(input_dim=num_features).double()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        total_loss = 0.0
        samples_seen = 0
        for batch_x, batch_y in data_loader:
            # The model is float64, so inputs and targets are cast to match
            batch_x = batch_x.double()
            batch_y = batch_y.double()

            optimizer.zero_grad()
            scores = model(batch_x)  # shape [batch_size, 1]
            loss = hinge_loss(scores, batch_y)
            loss.backward()
            optimizer.step()

            # hinge_loss is a batch mean; weight it by the batch size so the
            # epoch figure is a per-sample mean even with a short final batch
            batch_size = len(batch_x)
            total_loss += loss.item() * batch_size
            samples_seen += batch_size

        # Average over the samples actually iterated rather than the dataset
        # length: the two differ when the loader skips samples (e.g. drop_last),
        # and an empty loader would otherwise divide by zero.
        avg_loss = total_loss / samples_seen if samples_seen else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss={avg_loss:.4f}")

    return model

# Fit the model on the factor batches using the default hyperparameters
model = train_factors_svm(data_loader=data_loader)

Epoch 1/10, Loss=0.3908
Epoch 2/10, Loss=0.1877
Epoch 3/10, Loss=0.1393
Epoch 4/10, Loss=0.1137
Epoch 5/10, Loss=0.0971
Epoch 6/10, Loss=0.0851
Epoch 7/10, Loss=0.0759
Epoch 8/10, Loss=0.0682
Epoch 9/10, Loss=0.0617
Epoch 10/10, Loss=0.0562
