In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Read the preprocessed dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="cleaned_dataset_new.csv")
df.head(n=5)

Unnamed: 0,status_encoded,relationships,funding_per_milestone,mean_funding_by_country,funding_relative_to_country,rounds_relative_to_category,founded_year,founded_month,milestone_duration,category_code_advertising,...,country_code_CAN,country_code_FRA,country_code_GBR,country_code_IND,state_code_FL,state_code_IL,state_code_MA,state_code_NJ,state_code_TX,state_code_WA
0,3,0.166892,-0.196758,-1.050366,-0.282498,-0.449253,0.115668,0.498795,-0.165372,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,-0.99301,-0.196758,-1.050366,-0.282498,-0.449253,0.219078,1.043039,-0.165372,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.166892,-0.196758,0.952049,-0.282498,-0.449253,0.219078,1.043039,-0.165372,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.746843,-0.196758,0.952049,-0.282498,-0.449253,0.219078,-0.861815,-0.165372,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,0.166892,-0.196758,-1.050366,-0.282498,-0.449253,0.529306,-0.861815,-0.165372,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Separate the target from the predictors.
# The CSV carries an "Unnamed: 0" column whose previewed values are 0, 1, 2, ...;
# it looks like a row index written out by an earlier `to_csv` call — TODO confirm.
# It is not a real feature, and because it is unscaled while the other previewed
# columns look standardised, it can dominate RBF-kernel distances. Exclude any
# such column from X together with the target.
index_like_cols = [col for col in df.columns if str(col).startswith("Unnamed")]
X = df.drop(columns=["status_encoded", *index_like_cols])
y = df["status_encoded"]


In [4]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [5]:
# Baseline: RBF-kernel SVM with a one-vs-one decision function and no class
# re-weighting, fitted on the original (imbalanced) training split.
svm_ovo = SVC(
    kernel="rbf",
    decision_function_shape="ovo",
    random_state=42,
)
svm_ovo.fit(X=X_train, y=y_train)


In [6]:
# Evaluate the baseline model on the held-out split.
y_pred = svm_ovo.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0: some classes receive no predictions from this model, which
# makes their precision undefined. Reporting 0 explicitly avoids the repeated
# undefined-metric warnings and matches the later evaluation cell.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9356362872742545

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       387
           1       0.00      0.00      0.00       199
           2       0.00      0.00      0.00        27
           3       0.94      1.00      0.97      8911

    accuracy                           0.94      9524
   macro avg       0.23      0.25      0.24      9524
weighted avg       0.88      0.94      0.90      9524


Confusion Matrix:
 [[   0    0    0  387]
 [   0    0    0  199]
 [   1    0    0   26]
 [   0    0    0 8911]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# Same RBF / one-vs-one SVM as the baseline, but with class_weight="balanced"
# to counter the class imbalance in the training split.
svm_ovo_1 = SVC(
    kernel="rbf",
    decision_function_shape="ovo",
    class_weight="balanced",
    random_state=42,
)
svm_ovo_1.fit(X=X_train, y=y_train)


In [9]:
# Score the class-weighted model on the held-out split.
y_pred = svm_ovo_1.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.11      0.49      0.18       387
           1       0.10      0.73      0.17       199
           2       0.02      0.37      0.03        27
           3       0.98      0.63      0.77      8911

    accuracy                           0.63      9524
   macro avg       0.30      0.56      0.29      9524
weighted avg       0.92      0.63      0.73      9524



In [14]:
from collections import Counter

# Size of the rarest class in the training split.
min_class_count = min(Counter(y_train).values())
# Clamp (rarest class size - 1) into the range [1, 5] for SMOTE's k_neighbors.
k_neighbors = min(5, max(1, min_class_count - 1))

In [15]:
# Imported here because no earlier cell brings SMOTE into scope; without it
# this cell raises NameError on a fresh kernel.
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the held-out test split is left untouched.
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_res, y_res = smote.fit_resample(X_train, y_train)


In [16]:
# Tally the labels before and after resampling, then report both.
before_counts = Counter(y_train)
after_counts = Counter(y_res)
print("Class distribution BEFORE SMOTE (train):", before_counts)
print("Class distribution AFTER SMOTE (train):", after_counts)



Class distribution BEFORE SMOTE (train): Counter({3: 35643, 0: 1549, 1: 795, 2: 107})
Class distribution AFTER SMOTE (train): Counter({3: 35643, 0: 35643, 2: 35643, 1: 35643})


In [17]:
# Refit the RBF / one-vs-one SVM on the SMOTE-resampled training data.
# Note: this rebinds `svm_ovo`, replacing the earlier baseline model.
svm_ovo = SVC(
    C=1.0,
    kernel="rbf",
    gamma="scale",
    decision_function_shape="ovo",
    class_weight=None,  # try None first after SMOTE; switch to 'balanced' if needed
    random_state=42,
)
svm_ovo.fit(X=X_res, y=y_res)

In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Evaluate the SMOTE-trained model on the untouched test split.
y_pred = svm_ovo.predict(X=X_test)

print("Macro F1:", f1_score(y_true=y_test, y_pred=y_pred, average="macro"))
print("Weighted F1:", f1_score(y_true=y_test, y_pred=y_pred, average="weighted"))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Macro F1: 0.29435165408208036
Weighted F1: 0.7511280106807003

Classification Report:
               precision    recall  f1-score   support

           0       0.11      0.46      0.17       387
           1       0.10      0.61      0.17       199
           2       0.02      0.41      0.04        27
           3       0.97      0.67      0.79      8911

    accuracy                           0.66      9524
   macro avg       0.30      0.54      0.29      9524
weighted avg       0.92      0.66      0.75      9524


Confusion Matrix:
 [[ 178   36   66  107]
 [  16  122    1   60]
 [  13    0   11    3]
 [1468 1045  453 5945]]


In [21]:
# Recreate the stratified 80/20 split with the same seed and arguments as the
# earlier split, then show the per-class counts in the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train class counts:", Counter(y_train))


Train class counts: Counter({3: 35643, 0: 1549, 1: 795, 2: 107})


In [22]:
# 3) Build a partial-oversample SMOTE inside SMOTEENN
counts = Counter(y_train)
maj_class = max(counts, key=counts.get)
maj_count = counts[maj_class]

# target minority size ~ 40% of majority (tweak 0.3–0.6 as needed)
target_ratio = 0.4
sampling_strategy = {c: int(target_ratio * maj_count) for c in counts if c != maj_class}

In [24]:
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE  # used below; no earlier cell imports it


# pick a safe k_neighbors (must be < min minority count)
min_minor = min(counts[c] for c in counts if c != maj_class)
k_neighbors = max(1, min(5, min_minor - 1))

# SMOTE oversamples each minority class to the size given in sampling_strategy;
# the ENN step of SMOTEENN then removes samples again, so the printed counts end
# up below those targets.
sampler = SMOTEENN(
    smote=SMOTE(random_state=42, k_neighbors=k_neighbors, sampling_strategy=sampling_strategy),
    random_state=42
)

# Resample the training split only; the test split stays untouched.
X_res, y_res = sampler.fit_resample(X_train, y_train)
print("After SMOTEENN:", Counter(y_res))


After SMOTEENN: Counter({3: 28202, 2: 13361, 1: 12668, 0: 8472})


In [26]:
# 4) Train a softer SVM (smaller C and gamma)
svm_ovo_5 = SVC(
    kernel="rbf",
    decision_function_shape="ovo",
    C=1.0,            # try 0.5 → 1.0 → 2.0
    gamma=0.01,       # try 'scale' vs 0.01
    random_state=42
)
svm_ovo_5.fit(X_res, y_res)


In [29]:
# Score the SMOTEENN-trained model on the held-out split; zero_division=0
# reports 0 instead of warning when a metric is undefined.
y_pred = svm_ovo_5.predict(X=X_test)
print(f"Macro F1: {f1_score(y_test, y_pred, average='macro', zero_division=0):.4f}")
print(f"Weighted F1: {f1_score(y_test, y_pred, average='weighted', zero_division=0):.4f}")
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Macro F1: 0.3015
Weighted F1: 0.7798

Classification Report:
               precision    recall  f1-score   support

           0       0.15      0.21      0.18       387
           1       0.09      0.76      0.16       199
           2       0.02      0.63      0.04        27
           3       0.97      0.72      0.82      8911

    accuracy                           0.70      9524
   macro avg       0.31      0.58      0.30      9524
weighted avg       0.91      0.70      0.78      9524


Confusion Matrix:
 [[  81   39   86  181]
 [   5  151    2   41]
 [   8    0   17    2]
 [ 433 1466  638 6374]]
