In [26]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.linear_model import LogisticRegression, LinearRegression
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, r2_score, mean_absolute_error
import pickle
import torch

# Index of the graph directory to analyse (under ../data/graphs/).
graph_num = 17

# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load graph pickles that were produced by a trusted pipeline.
with open(f'../data/graphs/{graph_num}/linegraph_tg.pkl', 'rb') as f:
    graph_data = pickle.load(f)

# Bucket boundaries used to discretise the target with torch.bucketize.
# Both candidate sets are kept under their own names so that switching between
# them is an explicit one-line change instead of a silently overwritten variable.
multi_class_boundaries = [400, 800, 1300, 2100, 3000, 3700, 4700, 7020, 9660]
binary_boundaries = [3000]

# Active choice: a single boundary, i.e. two buckets (binary classification).
bins = torch.tensor(binary_boundaries)


In [34]:
# Inspect the first row of the feature tensor as a quick sanity check of the
# per-column value scales.
graph_data.x[0]


tensor([2.0000e+00, 5.0000e+01, 1.0000e+00, 0.0000e+00, 7.4253e+01, 6.1045e-08,
        4.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00])

In [27]:
# Keep only the entries whose target is strictly positive. The mask is built
# once and applied to both tensors so features and targets stay aligned.
positive_target = graph_data.y > 0
y = graph_data.y[positive_target]           # remains a torch tensor
x = graph_data.x[positive_target].numpy()   # features as a NumPy array


In [28]:
# Single-feature baseline: ordinary least squares on column 5 of x.
feature_column = x[:, 5]

# Hold out 30% of the samples, then halve the hold-out into validation and
# test parts (roughly a 70/15/15 partition). Fixed seeds keep it reproducible.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    feature_column, y, test_size=0.3, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

# scikit-learn estimators expect a 2-D design matrix, hence the column reshape.
train_features = X_train.reshape(-1, 1)
valid_features = X_valid.reshape(-1, 1)

regression_model = LinearRegression()
regression_model.fit(train_features, y_train)
y_train_pred = regression_model.predict(train_features)
y_valid_pred = regression_model.predict(valid_features)

# Report each metric for the training split first, then the validation split.
for label, metric in (('R2', r2_score), ('MAE', mean_absolute_error)):
    print(f'Train {label}:', metric(y_train, y_train_pred))
    print(f'Valid {label}:', metric(y_valid, y_valid_pred))


Train R2: 0.024193975194143458
Valid R2: 0.005741773378489801
Train MAE: 3127.6548
Valid MAE: 3469.0444


In [29]:
# Restrict to strictly positive targets, using one shared mask for both tensors.
positive_target = graph_data.y > 0
x = graph_data.x[positive_target].numpy()

# Replace each continuous target by the index of the bucket it falls into,
# as defined by the boundaries in `bins`, and hand the labels over as NumPy.
y = torch.bucketize(graph_data.y[positive_target], boundaries=bins).numpy()


In [35]:
# Column 5 of x as a 2-D (n_samples, 1) matrix, the layout scikit-learn expects.
feature_matrix = x[:, 5].reshape(-1, 1)

# Hold out 30% of the samples, then halve the hold-out into validation and
# test parts. Fixed seeds keep the partition reproducible.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    feature_matrix, y, test_size=0.3, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

# Show how many samples ended up in each split.
for split_name, split_features in (
    ('X_train', X_train),
    ('X_valid', X_valid),
    ('X_test', X_test),
):
    print(f"{split_name} shape: {split_features.shape}")


X_train shape: (444, 1)
X_valid shape: (95, 1)
X_test shape: (96, 1)


In [36]:
# Logistic-regression baseline on the single feature prepared in X_train.
# `multi_class` is intentionally not passed: 'auto' was the default anyway, and
# the parameter is deprecated since scikit-learn 1.5 (passing it emits a
# FutureWarning there).
#
# NOTE(review): in the recorded run this model predicts class 0 for every
# validation sample. The feature is fed in unscaled (the inspected first row
# shows a value of about 6e-08 in column 5), and scikit-learn documents that
# 'saga' only converges quickly on features of similar scale. Consider
# standardising the feature (e.g. sklearn.preprocessing.StandardScaler) before
# fitting; confirm against the full feature distribution first.
model = LogisticRegression(max_iter=10000, solver='saga', random_state=42)

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict the validation labels once and reuse them for every report below,
# instead of re-running predict() for each metric.
valid_pred_labels = model.predict(X_valid)

# Same value as model.score(X_valid, y_valid): mean accuracy on the validation split.
valid_accuracy = accuracy_score(y_valid, valid_pred_labels)

print(confusion_matrix(y_valid, valid_pred_labels))
print(f"Validation accuracy: {valid_accuracy:.4f}")

# zero_division=0 reports 0 for metrics of a class that is never predicted.
print(classification_report(y_valid, valid_pred_labels, zero_division=0))


[[53  0]
 [42  0]]
Validation accuracy: 0.5579
              precision    recall  f1-score   support

           0       0.56      1.00      0.72        53
           1       0.00      0.00      0.00        42

    accuracy                           0.56        95
   macro avg       0.28      0.50      0.36        95
weighted avg       0.31      0.56      0.40        95

