In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [9]:
#import data
df = pd.read_csv("/Users/fynnhufler/Documents/ETH/Datathon/data/skylab_instagram_datathon_dataset.csv", header = 0, sep=";")

#convert dates to numerical values
df_date_numerical = df
df_date_numerical['period_end_date'] = pd.to_datetime(df['period_end_date'])

print(df.columns)
#show head and information
df.head()
df.info()

Index(['period', 'period_end_date', 'compset_group', 'compset',
       'business_entity_doing_business_as_name', 'legal_entity_name',
       'domicile_country_name', 'ultimate_parent_legal_entity_name',
       'primary_exchange_name', 'calculation_type', 'followers', 'pictures',
       'videos', 'comments', 'likes'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704313 entries, 0 to 704312
Data columns (total 15 columns):
 #   Column                                  Non-Null Count   Dtype         
---  ------                                  --------------   -----         
 0   period                                  704313 non-null  object        
 1   period_end_date                         704313 non-null  datetime64[ns]
 2   compset_group                           704313 non-null  object        
 3   compset                                 704313 non-null  object        
 4   business_entity_doing_business_as_name  704313 non-null  object        
 5   lega

In [34]:
df['change followers'] = df['followers'].diff()
df['change followers'] = df['followers'].fillna(0)
df['change pictures'] = df['pictures'].diff()
df['change pictures'] = df['pictures'].fillna(0)
df['change comments'] = df['comments'].diff()
df['change comments'] = df['comments'].fillna(0)
df['change likes'] = df['likes'].diff()
df['change likes'] = df['likes'].fillna(0)
df.head()

Unnamed: 0,period,period_end_date,compset_group,compset,business_entity_doing_business_as_name,legal_entity_name,domicile_country_name,ultimate_parent_legal_entity_name,primary_exchange_name,calculation_type,followers,pictures,videos,comments,likes,change followers,change pictures,change comments,change likes
0,Weekly,2019-12-07,Luxury & Premium & Mainstream,Soft Luxury,Versace,Capri Holdings,United States of America,Capri Holdings,New York Stock Exchange,Metric Value,22066189.0,97.0,13.0,16239.0,6112205.0,22066189.0,97.0,16239.0,6112205.0
1,Weekly,2021-01-30,Luxury & Premium & Mainstream,Luxury & Premium & Mainstream,Versace,Capri Holdings,United States of America,Capri Holdings,New York Stock Exchange,Metric Value,24289963.0,54.0,2.0,13358.0,3189144.0,24289963.0,54.0,13358.0,3189144.0
2,Weekly,2016-05-14,Luxury & Premium & Mainstream,Soft Luxury,Versace,Capri Holdings,United States of America,Capri Holdings,New York Stock Exchange,Metric Value,7137442.0,42.0,8.0,15894.0,2152955.0,7137442.0,42.0,15894.0,2152955.0
3,Weekly,2019-04-20,Luxury & Premium & Mainstream,Luxury & Premium & Mainstream,Versace,Capri Holdings,United States of America,Capri Holdings,New York Stock Exchange,Metric Value,18901125.0,73.0,9.0,26762.0,5224916.0,18901125.0,73.0,26762.0,5224916.0
4,Weekly,2023-03-25,Luxury & Premium & Mainstream,US Softlines Analyst Interest List,Versace,Capri Holdings,United States of America,Capri Holdings,New York Stock Exchange,Metric Value,30251374.0,54.0,10.0,25681.0,5760987.0,30251374.0,54.0,25681.0,5760987.0


In [None]:
# Assuming df is your DataFrame containing the data

# Prepare the data
X = df.iloc[:, -5:]  # Last five columns as features
Y = df['strong_deviation_label']  # Strong deviation labels

# Convert data to tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
Y_tensor = torch.tensor(Y.values, dtype=torch.float32).unsqueeze(1)  # Add extra dimension for compatibility with PyTorch

# Create PyTorch DataLoader
dataset = TensorDataset(X_tensor, Y_tensor)
train_size = int(0.8 * len(dataset))
train_data, test_data = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# Define the model
class DeviationPredictor(nn.Module):
    def __init__(self, input_size):
        super(DeviationPredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)  # Output layer (1-dimensional for binary classification)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))  # Sigmoid activation for binary classification
        return x

# Initialize the model
input_size = X.shape[1]  # Number of features
model = DeviationPredictor(input_size)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
model.eval()
with torch.no_grad():
    y_pred_proba = model(X_tensor).numpy()

# Threshold the predicted probabilities to get binary predictions (0 or 1)
y_pred = (y_pred_proba > 0.5).astype(int)

# Evaluate the performance
accuracy = (Y.values == y_pred.flatten()).mean()
print(f'Accuracy: {accuracy:.4f}')
