In [3]:
import numpy as np

# 1. Import necessary libraries and modules
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# 2. Load and preprocess data

data = pd.read_csv('data/encoded_data_only_cpv_ohe.csv')
X = data.drop('organizationId', axis=1)
y = data['organizationId']

# 3. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Define the model
clf = DecisionTreeClassifier(random_state=42)

# 5. Train the model on the training data
clf.fit(X_train, y_train)

# 6. Evaluate the model on the testing data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro', zero_division=1)
f1 = f1_score(y_test, y_pred, average='macro')

# 7. Print the performance metrics
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

# 6. Evaluate the model on the testing data and predict top N companyId options
N = 100 # Number of top predictions to consider
y_pred_proba = clf.predict_proba(X_test)
y_pred_topN = np.argsort(y_pred_proba, axis=1)[:, :-N-1:-1]
y_pred_topN_classes = clf.classes_[y_pred_topN]

# Check if actual companyId is among the top N predictions for each data point
is_in_topN = np.any(y_test.values.reshape(-1, 1) == y_pred_topN_classes, axis=1)

# Compute performance metrics based on whether actual companyId is in the top N predictions
accuracy_topN = np.mean(is_in_topN)
precision_topN = np.mean([np.any(y_test.values[i] == y_pred_topN_classes[i]) for i in range(len(y_test))])
recall_topN = np.mean(is_in_topN)
f1_topN = 2 * (precision_topN * recall_topN) / (precision_topN + recall_topN)

# 7. Print the performance metrics
print("Top-{} Accuracy: {:.2f}%".format(N, accuracy_topN * 100))
print("Top-{} Precision: {:.2f}%".format(N, precision_topN * 100))
print("Top-{} Recall: {:.2f}%".format(N, recall_topN * 100))
print("Top-{} F1 Score: {:.2f}%".format(N, f1_topN * 100))


Accuracy: 10.83%
Precision: 50.12%
Recall: 39.41%
F1 Score: 4.55%
Top-100 Accuracy: 11.48%
Top-100 Precision: 11.48%
Top-100 Recall: 11.48%
Top-100 F1 Score: 11.48%


In [2]:
# 1. Import necessary libraries and modules
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle

# 2. Load and preprocess data
# data = pd.read_csv('data/processed_data.csv')
# data_sample = data.sample(n=5550, random_state=42)
# X, y = process_data(data_sample)

data = pd.read_csv('data/encoded_data_only_cpv_ohe.csv')
# data = data.sample(n=2000, random_state=42)
X = data.drop('organizationId', axis=1)
y = data['organizationId']

# 3. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Define the model
clf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=30)

# 5. Train the model on the training data
clf.fit(X_train, y_train)

# 6. Make predictions on the testing data
y_pred = clf.predict(X_test)

# 7. Calculate the performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro', zero_division=1)
f1 = f1_score(y_test, y_pred, average='macro')

# 8. Print the performance metrics
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))


Accuracy: 12.36%
Precision: 54.05%
Recall: 36.53%
F1 Score: 5.05%
