In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, classification_report
import matplotlib.pyplot as plt


In [2]:
# Load the engineered 2025Q1 feature table from parquet and preview its first rows.
df = pd.read_parquet(
    "../ETL/data/engineered/final_data_2025Q1.parquet"
)
df.head()

Unnamed: 0,side,log_dollar_value,role,is_10b5_1,log_size_vs_cap,label_up_market
3,sell,15.441086,OFFICER,0,-9.087956,1
4,sell,14.496378,DIRECTOR,0,-10.032665,1
5,buy,13.401964,OFFICER,0,-9.559271,1
6,sell,14.673079,OFFICER,1,-10.128313,0
7,sell,11.586464,DIRECTOR,1,-13.214937,0


In [3]:
# Print every row that has at least one missing value; an empty frame means there are none.
print(df.loc[df.isna().any(axis=1)])

Empty DataFrame
Columns: [side, log_dollar_value, role, is_10b5_1, log_size_vs_cap, label_up_market]
Index: []


In [4]:
# Class balance of the target: share of rows per label value, scaled to percent.
percentages = (
    df['label_up_market']
    .value_counts(normalize=True)
    .mul(100)
)
print(percentages)

label_up_market
0    60.685348
1    39.314652
Name: proportion, dtype: float64


In [5]:
# Show the dtype pandas assigned to each column (taken from the frame's schema, not from a row).
print("Data types of each column:", df.dtypes, sep="\n")

Data types of each column:
side                 object
log_dollar_value    float64
role                 object
is_10b5_1             int64
log_size_vs_cap     float64
label_up_market       int64
dtype: object


In [6]:
# Target vector: the label column on its own.
target = df['label_up_market']

# Feature matrix: every other column, with the categorical columns
# ('side', 'role') expanded into one indicator column per category.
# drop_first=False keeps all category levels.
attr = pd.get_dummies(
    df.drop(columns=['label_up_market']),
    columns=['side', 'role'],
    drop_first=False,
)
attr.head()

Unnamed: 0,log_dollar_value,is_10b5_1,log_size_vs_cap,side_buy,side_sell,role_DIRECTOR,role_OFFICER,role_OTHER,role_TENPERCENTOWNER
3,15.441086,0,-9.087956,False,True,False,True,False,False
4,14.496378,0,-10.032665,False,True,True,False,False,False
5,13.401964,0,-9.559271,True,False,False,True,False,False
6,14.673079,1,-10.128313,False,True,False,True,False,False
7,11.586464,1,-13.214937,False,True,True,False,False,False


In [7]:
# Hold out 30% of the rows for testing; the fixed random_state makes the shuffled split repeatable.
attr_train, attr_test, target_train, target_test = train_test_split(
    attr,
    target,
    test_size=0.3,
    random_state=82,
    shuffle=True,
)

# Gaussian Naive Bayes classifier; `model` and `gnb` refer to the same estimator object.
gnb = GaussianNB()
model = gnb

# Fit on the training split, then predict labels for the held-out split.
model.fit(attr_train, target_train)
target_pred = model.predict(attr_test)

In [8]:
# Score the held-out predictions: overall accuracy, precision for the
# positive class (label 1), and the full per-class report.
accuracy = accuracy_score(target_test, target_pred)
print(
    f'Accuracy: {accuracy:.4f}',
    f"Precision (for class 1): {precision_score(target_test, target_pred)}",
    'Classification Report',
    classification_report(target_test, target_pred),
    sep='\n',
)


Accuracy: 0.5906
Precision (for class 1): 0.3644859813084112
Classification Report
              precision    recall  f1-score   support

           0       0.61      0.94      0.74      1079
           1       0.36      0.06      0.10       699

    accuracy                           0.59      1778
   macro avg       0.48      0.50      0.42      1778
weighted avg       0.51      0.59      0.48      1778



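# Performance
- Accuracy: 0.59 (below the ≈0.607 that always predicting class 0 would score, since class 0 is 1079 of the 1778 test rows)
- Precision (class 1): 0.364 (recall for class 1 is only 0.06, so almost all class-1 rows are missed)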