In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score

In [2]:
df = pd.read_parquet("../ETL/data/engineered/final_data_2025Q1.parquet")
print(len(df))
df.head(10)
print(df.describe())

5924
       log_dollar_value    is_10b5_1  log_size_vs_cap  label_up_market
count       5924.000000  5924.000000      5924.000000      5924.000000
mean          12.201343     0.391796        -9.597193         0.393147
std            2.352754     0.488193         2.286482         0.488490
min            0.850749     0.000000       -18.262147         0.000000
25%           10.789616     0.000000       -11.017295         0.000000
50%           12.231095     0.000000        -9.597609         0.000000
75%           13.666852     1.000000        -8.178504         1.000000
max           21.731577     1.000000         1.559461         1.000000


## Splitting data into attributes & target

In [3]:
attr = df.drop(columns=[ 'label_up_market'])
target = df['label_up_market']
attr.head()

Unnamed: 0,side,log_dollar_value,role,is_10b5_1,log_size_vs_cap
3,sell,15.441086,OFFICER,0,-9.087956
4,sell,14.496378,DIRECTOR,0,-10.032665
5,buy,13.401964,OFFICER,0,-9.559271
6,sell,14.673079,OFFICER,1,-10.128313
7,sell,11.586464,DIRECTOR,1,-13.214937


## Normalizing/Centering data

In [4]:
# One-hot encode categorical columns
attr = pd.get_dummies(attr, columns=['side', 'role'], drop_first=False)

# normalize/center numeric features
# Normalize only numeric features
numeric_cols = ['log_dollar_value', 'log_size_vs_cap']
scaler = StandardScaler()
attr[numeric_cols] = scaler.fit_transform(attr[numeric_cols])

attr.head(10)

Unnamed: 0,log_dollar_value,is_10b5_1,log_size_vs_cap,side_buy,side_sell,role_DIRECTOR,role_OFFICER,role_OTHER,role_TENPERCENTOWNER
3,1.377117,0,0.222735,False,True,False,True,False,False
4,0.97555,0,-0.190471,False,True,True,False,False,False
5,0.510348,0,0.016587,True,False,False,True,False,False
6,1.05066,1,-0.232307,False,True,False,True,False,False
7,-0.261366,1,-1.582365,False,True,True,False,False,False
8,1.425304,1,0.153196,False,True,False,True,False,False
9,0.454037,0,-0.180519,False,True,False,True,False,False
10,1.285844,0,0.675398,False,True,False,True,False,False
11,0.595938,0,-0.034505,False,True,True,False,False,False
19,-0.367746,0,-0.87501,False,True,False,True,False,False


## Splitting data into training and testing

In [5]:
X_train, X_test, y_train, y_test = train_test_split(attr, target,random_state=82, test_size=0.3)
print(X_train.shape)
print(y_train.shape)

(4146, 9)
(4146,)


## Artificial Neural Network

In [6]:
#  one hidden layer containing 4 neurons, default to ReLU activation
model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=10000, random_state=82)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

## Evaluation

In [7]:
# Accuracy Score
accuracy = accuracy_score(y_pred, y_test)
print("Accuracy: {} ({:.2%})".format(accuracy, accuracy))

print(classification_report(y_test, y_pred, digits=3))


Accuracy: 0.6293588301462317 (62.94%)
              precision    recall  f1-score   support

           0      0.642     0.878     0.742      1079
           1      0.566     0.246     0.343       699

    accuracy                          0.629      1778
   macro avg      0.604     0.562     0.542      1778
weighted avg      0.612     0.629     0.585      1778



In [8]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(f"Precision (for class 1): {precision_score(y_test, y_pred)}")
print('Classification Report')
print(classification_report(y_test, y_pred))

Accuracy: 0.6294
Precision (for class 1): 0.5657894736842105
Classification Report
              precision    recall  f1-score   support

           0       0.64      0.88      0.74      1079
           1       0.57      0.25      0.34       699

    accuracy                           0.63      1778
   macro avg       0.60      0.56      0.54      1778
weighted avg       0.61      0.63      0.59      1778



# Performance
- Accuracy: 0.642
- Precision (class 1): 0.565