In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, classification_report

In [2]:
df = pd.read_parquet("../ETL/data/engineered/final_data_2025Q1.parquet")
df.head()

Unnamed: 0,side,log_dollar_value,role,is_10b5_1,log_size_vs_cap,label_up_market
3,sell,15.441086,OFFICER,0,-9.087956,1
4,sell,14.496378,DIRECTOR,0,-10.032665,1
5,buy,13.401964,OFFICER,0,-9.559271,1
6,sell,14.673079,OFFICER,1,-10.128313,0
7,sell,11.586464,DIRECTOR,1,-13.214937,0


In [3]:
print(df[df.isna().any(axis=1)]) # shows NaN values in data frame if it exists

Empty DataFrame
Columns: [side, log_dollar_value, role, is_10b5_1, log_size_vs_cap, label_up_market]
Index: []


There are no missing values

In [4]:
# Print data types of each column using row 1
print("Data types of each column:")
print(df.dtypes)

Data types of each column:
side                 object
log_dollar_value    float64
role                 object
is_10b5_1             int64
log_size_vs_cap     float64
label_up_market       int64
dtype: object


In [5]:
# Calculate percentage of 1 and 0 values for label_up_market
percentages = df['label_up_market'].value_counts(normalize=True) * 100
print(percentages)

label_up_market
0    60.685348
1    39.314652
Name: proportion, dtype: float64


In [6]:
attr = df.drop(columns=[ 'label_up_market'])
target = df['label_up_market']
attr.head()


# One-hot encode categorical columns
attr = pd.get_dummies(attr, columns=['side', 'role'], drop_first=False)

# normalize/center numeric features
# Normalize only numeric features
numeric_cols = ['log_dollar_value', 'log_size_vs_cap']
scaler = StandardScaler()
attr[numeric_cols] = scaler.fit_transform(attr[numeric_cols])

attr.head(10)

Unnamed: 0,log_dollar_value,is_10b5_1,log_size_vs_cap,side_buy,side_sell,role_DIRECTOR,role_OFFICER,role_OTHER,role_TENPERCENTOWNER
3,1.377117,0,0.222735,False,True,False,True,False,False
4,0.97555,0,-0.190471,False,True,True,False,False,False
5,0.510348,0,0.016587,True,False,False,True,False,False
6,1.05066,1,-0.232307,False,True,False,True,False,False
7,-0.261366,1,-1.582365,False,True,True,False,False,False
8,1.425304,1,0.153196,False,True,False,True,False,False
9,0.454037,0,-0.180519,False,True,False,True,False,False
10,1.285844,0,0.675398,False,True,False,True,False,False
11,0.595938,0,-0.034505,False,True,True,False,False,False
19,-0.367746,0,-0.87501,False,True,False,True,False,False


In [7]:
print("Data types of each column of attr:")
print(attr.dtypes)

Data types of each column of attr:
log_dollar_value        float64
is_10b5_1                 int64
log_size_vs_cap         float64
side_buy                   bool
side_sell                  bool
role_DIRECTOR              bool
role_OFFICER               bool
role_OTHER                 bool
role_TENPERCENTOWNER       bool
dtype: object


In [8]:
# Split data for traning and test
attr_train, attr_test, target_train,target_test  = train_test_split(attr, target,test_size = 0.3, random_state =82 , shuffle = True)
attr_train.head()



Unnamed: 0,log_dollar_value,is_10b5_1,log_size_vs_cap,side_buy,side_sell,role_DIRECTOR,role_OFFICER,role_OTHER,role_TENPERCENTOWNER
4150,-0.287891,0,-0.234843,False,True,False,True,False,False
815,-0.811775,1,-1.148759,False,True,False,True,False,False
5921,0.44868,1,-0.142854,False,True,False,True,False,False
1871,0.090969,1,0.082035,False,True,False,True,False,False
5137,0.990518,0,0.373957,True,False,True,False,False,False


In [9]:
k_values = [3, 5, 10]

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(attr_train, target_train)   # Train the Knn model
    target_pred = knn.predict(attr_test)    # Make predictions using Knn model
    accuracy = accuracy_score(target_test,target_pred ) 
    print(f'Accuracy of model with k = {k}: {accuracy}')
    print(f"Precision (for class 1): {precision_score(target_test, target_pred)}")
    print('')

Accuracy of model with k = 3: 0.6242969628796401
Precision (for class 1): 0.5248796147672552

Accuracy of model with k = 5: 0.609673790776153
Precision (for class 1): 0.504302925989673

Accuracy of model with k = 10: 0.6169853768278966
Precision (for class 1): 0.5243243243243243



Best accuracy was achieved with k = 3. 
Let's store predictions with k=3 so we can evaluate those results.

In [10]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(attr_train, target_train)
target_pred = knn.predict(attr_test)

In [11]:
# Metrics
accuracy = accuracy_score(target_test, target_pred)
print(f'Accuracy: {accuracy:.4f}')
print(f"Precision (for class 1): {precision_score(target_test, target_pred)}")
print('Classification Report')
print(classification_report(target_test, target_pred))


Accuracy: 0.6243
Precision (for class 1): 0.5248796147672552
Classification Report
              precision    recall  f1-score   support

           0       0.68      0.73      0.70      1079
           1       0.52      0.47      0.49       699

    accuracy                           0.62      1778
   macro avg       0.60      0.60      0.60      1778
weighted avg       0.62      0.62      0.62      1778



# Performance
- Accuracy: 0.624
- Precision (class 1): 0.524