# ***Models Built from Notebook Data***


In [4]:
import pandas as pd
my_data=pd.read_csv("/content/drive/MyDrive/extracted_files/cleaned_dataset_new.csv")
my_data.head()

Unnamed: 0,status_encoded,relationships,funding_per_milestone,mean_funding_by_country,funding_relative_to_country,rounds_relative_to_category,founded_year,founded_month,milestone_duration,category_code_advertising,...,country_code_CAN,country_code_FRA,country_code_GBR,country_code_IND,state_code_FL,state_code_IL,state_code_MA,state_code_NJ,state_code_TX,state_code_WA
0,3,0.166892,-0.196758,-1.050366,-0.282498,-0.449253,0.115668,0.498795,-0.165372,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,-0.99301,-0.196758,-1.050366,-0.282498,-0.449253,0.219078,1.043039,-0.165372,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.166892,-0.196758,0.952049,-0.282498,-0.449253,0.219078,1.043039,-0.165372,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.746843,-0.196758,0.952049,-0.282498,-0.449253,0.219078,-0.861815,-0.165372,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,0.166892,-0.196758,-1.050366,-0.282498,-0.449253,0.529306,-0.861815,-0.165372,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Splitting the data into train and test sets


In [6]:
from sklearn.model_selection import train_test_split
y=my_data["status_encoded"]
X=my_data.drop(["status_encoded"],axis=1)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.value_counts())


(28570, 26) (9524, 26) (9524, 26)
status_encoded
3    26732
0     1162
1      596
2       80
Name: count, dtype: int64


In [7]:
unique_values = y_train.unique()
print("Unique values in the training target:", unique_values)


Unique values in the training target: [3 0 1 2]


# Advanced Class Balancing Using SMOTE and Tomek Links

In [27]:
"""from imblearn.combine import SMOTETomek
from collections import Counter

print("Distribution before balancing:", Counter(y_train))

smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)

print("Distribution after balancing:", Counter(y_resampled))"""

'from imblearn.combine import SMOTETomek\nfrom collections import Counter\n\nprint("Distribution before balancing:", Counter(y_train))\n\nsmote_tomek = SMOTETomek(random_state=42)\nX_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)\n\nprint("Distribution after balancing:", Counter(y_resampled))'

# Dimensionality Reduction with PCA after Balancing the Dataset"

In [49]:
"""from sklearn.decomposition import PCA
from collections import Counter

print("Distribution after SMOTETomek:", Counter(y_resampled))

pca = PCA(n_components=0.95, random_state=42)
X_resampled_pca = pca.fit_transform(X_resampled)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)"""

print("Original number of features:", X_resampled.shape[1])
print("Reduced number of features after PCA:", X_resampled_pca.shape[1])

Original number of features: 26
Reduced number of features after PCA: 9


# Optimized Random Forest with Adjusted Thresholds (RF-Thresholded)

# Training an ExtraTreesClassifier for Rare Classes 1 & 2

**Purpose of the model:**  

This model focuses only on **classes 1 and 2**.  
It predicts whether a sample belongs to **class 1 or 2**, ignoring all other classes.  
It helps to **better identify class 2**, which is often minority and harder to recognize.


In [53]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

target_classes = [1, 2]
mask_train = y_train.isin(target_classes)
mask_val   = y_val.isin(target_classes)
mask_test  = y_test.isin(target_classes)

X_train_sel = X_train[mask_train]
y_train_sel = y_train[mask_train]
X_val_sel   = X_val[mask_val]
y_val_sel   = y_val[mask_val]
X_test_sel  = X_test[mask_test]
y_test_sel  = y_test[mask_test]

print("Distribution train sélectionnées classes:", Counter(y_train_sel))

pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_sel)
X_val_pca   = pca.transform(X_val_sel)
X_test_pca  = pca.transform(X_test_sel)

et_sel = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

et_sel.fit(X_train_pca, y_train_sel)

y_train_pred = et_sel.predict(X_train_pca)
y_val_pred   = et_sel.predict(X_val_pca)
y_test_pred  = et_sel.predict(X_test_pca)

def eval_model(y_true, y_pred, title="Model"):
    print(f"\n=== {title} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

eval_model(y_train_sel, y_train_pred, "TRAIN ExtraTrees Classes 1&2")
eval_model(y_val_sel, y_val_pred, "VALIDATION ExtraTrees Classes 1&2")
eval_model(y_test_sel, y_test_pred, "TEST ExtraTrees Classes 1&2")

Distribution train sélectionnées classes: Counter({1: 596, 2: 80})

=== TRAIN ExtraTrees Classes 1&2 ===
Accuracy: 0.9748520710059172
              precision    recall  f1-score   support

           1       1.00      0.97      0.99       596
           2       0.84      0.97      0.90        80

    accuracy                           0.97       676
   macro avg       0.92      0.97      0.94       676
weighted avg       0.98      0.97      0.98       676

[[581  15]
 [  2  78]]

=== VALIDATION ExtraTrees Classes 1&2 ===
Accuracy: 0.9380530973451328
              precision    recall  f1-score   support

           1       0.98      0.94      0.96       199
           2       0.69      0.89      0.77        27

    accuracy                           0.94       226
   macro avg       0.84      0.92      0.87       226
weighted avg       0.95      0.94      0.94       226

[[188  11]
 [  3  24]]

=== TEST ExtraTrees Classes 1&2 ===
Accuracy: 0.9823008849557522
              precision    r

**Purpose of the model:**  

This model focuses on **classes 0, 1, and 2**, with a special emphasis on **class 0**, which is **underrepresented compared to class 3**.  
It is designed to **improve the recognition of class 0**, which is often harder to predict.  
It **complements the other models**, enhancing overall accuracy and detection of class 0 in the system.




In [60]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

target_classes = [0, 1, 2]
mask_train = y_train.isin(target_classes)
mask_val   = y_val.isin(target_classes)
mask_test  = y_test.isin(target_classes)

X_train_target = X_train[mask_train]
y_train_target = y_train[mask_train]
X_val_target   = X_val[mask_val]
y_val_target   = y_val[mask_val]
X_test_target  = X_test[mask_test]
y_test_target  = y_test[mask_test]

print("Distribution train sélectionnées classes:", Counter(y_train_target))


pca_target = PCA(n_components=0.95, random_state=42)
X_train_pca_target = pca_target.fit_transform(X_train_target)
X_val_pca_target   = pca_target.transform(X_val_target)
X_test_pca_target  = pca_target.transform(X_test_target)

et_target = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

et_target.fit(X_train_pca_target, y_train_target)


y_train_pred = et_target.predict(X_train_pca_target)
y_val_pred   = et_target.predict(X_val_pca_target)
y_test_pred  = et_target.predict(X_test_pca_target)


def eval_model(y_true, y_pred, title="Model"):
    print(f"\n=== {title} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

eval_model(y_train_target, y_train_pred, "TRAIN ExtraTrees Classes 0, 1, 2")
eval_model(y_val_target, y_val_pred, "VALIDATION ExtraTrees Classes 0, 1, 2")
eval_model(y_test_target, y_test_pred, "TEST ExtraTrees Classes 0, 1, 2")


Distribution train sélectionnées classes: Counter({0: 1162, 1: 596, 2: 80})

=== TRAIN ExtraTrees Classes 0, 1, 2 ===
Accuracy: 0.749183895538629
              precision    recall  f1-score   support

           0       0.94      0.65      0.77      1162
           1       0.70      0.93      0.80       596
           2       0.29      0.93      0.44        80

    accuracy                           0.75      1838
   macro avg       0.64      0.83      0.67      1838
weighted avg       0.84      0.75      0.76      1838

[[750 232 180]
 [ 41 553   2]
 [  5   1  74]]

=== VALIDATION ExtraTrees Classes 0, 1, 2 ===
Accuracy: 0.6590538336052202
              precision    recall  f1-score   support

           0       0.85      0.57      0.68       387
           1       0.64      0.86      0.73       199
           2       0.15      0.48      0.23        27

    accuracy                           0.66       613
   macro avg       0.55      0.64      0.55       613
weighted avg       0.75  

**Purpose of the model:**  

This model is designed to **identify class 3 specifically**.  
It treats **class 3 as positive** and all other classes (0, 1, 2) as negative, creating a **binary classification problem**.  
It is **trained with balanced sampling** to improve detection of the underrepresented class 3.


In [62]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter
import numpy as np


y_train_binary = (y_train == 3).astype(int)
y_val_binary   = (y_val == 3).astype(int)
y_test_binary  = (y_test == 3).astype(int)


pca_3 = PCA(n_components=0.95, random_state=42)
X_train_pca = pca_3.fit_transform(X_train)
X_val_pca   = pca_3.transform(X_val)
X_test_pca  = pca_3.transform(X_test)


pos_indices = np.where(y_train_binary == 1)[0]
neg_indices = np.where(y_train_binary == 0)[0]


size_neg = min(len(neg_indices), len(pos_indices))
neg_sample = np.random.choice(neg_indices, size=size_neg, replace=False)
train_indices = np.concatenate([pos_indices, neg_sample])

X_train_bal = X_train_pca[train_indices]
y_train_bal = y_train_binary.iloc[train_indices]

print("Distribution classes train équilibré:", Counter(y_train_bal))


et_class3 = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

et_class3.fit(X_train_bal, y_train_bal)


y_train_pred = et_class3.predict(X_train_pca)
y_val_pred   = et_class3.predict(X_val_pca)
y_test_pred  = et_class3.predict(X_test_pca)


def eval_model(y_true, y_pred, title="Model"):
    print(f"\n=== {title} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

eval_model(y_train_binary, y_train_pred, "TRAIN ExtraTrees Classe 3 vs Others")
eval_model(y_val_binary, y_val_pred, "VALIDATION ExtraTrees Classe 3 vs Others")
eval_model(y_test_binary, y_test_pred, "TEST ExtraTrees Classe 3 vs Others")

Distribution classes train équilibré: Counter({1: 26732, 0: 1838})

=== TRAIN ExtraTrees Classe 3 vs Others ===
Accuracy: 0.7268113405670283
              precision    recall  f1-score   support

           0       0.16      0.77      0.26      1838
           1       0.98      0.72      0.83     26732

    accuracy                           0.73     28570
   macro avg       0.57      0.74      0.55     28570
weighted avg       0.93      0.73      0.80     28570

[[ 1407   431]
 [ 7374 19358]]

=== VALIDATION ExtraTrees Classe 3 vs Others ===
Accuracy: 0.7154556908861823
              precision    recall  f1-score   support

           0       0.14      0.66      0.23       613
           1       0.97      0.72      0.83      8911

    accuracy                           0.72      9524
   macro avg       0.55      0.69      0.53      9524
weighted avg       0.91      0.72      0.79      9524

[[ 402  211]
 [2499 6412]]

=== TEST ExtraTrees Classe 3 vs Others ===
Accuracy: 0.707685846283

**Final Decision Model: Multi-Stage Classification for Classes 0, 1, 2, and 3**


**Justification of the Final Decision Strategy:**

- **Model 1 (Classes 1 & 2):** detects only classes 1 and 2.  
- **Model 2 (Classes 0, 1 & 2):** detects classes 0, 1, and 2, with a focus on **class 0**, which is underrepresented compared to class 3.  
- **Model 3 (Class 3 vs Others):** detects **class 3**.  

**Final Decision Logic:**

1. If Model 3 predicts class 3, then **final class = 3**.  
2. Else, if Model 2 predicts class 0, then **final class = 0**.  
3. Otherwise, use the prediction from **Model 1** for class 1 or 2.  

**Priority Order:** 3 → 0 → 1 or 2.  

This strategy accounts for the fact that **classes 0 and 3 are majorities**, while **classes 1 and 2 are minorities**.


In [63]:
def final_prediction(X):
    X_pca_12 = pca.transform(X)
    X_pca_012 = pca_target.transform(X)
    X_pca_3 = pca_3.transform(X)

    pred_3 = et_class3.predict(X_pca_3)
    pred_012 = et_target.predict(X_pca_012)
    pred_12 = et_sel.predict(X_pca_12)

    final_pred = []

    for i in range(len(X)):
        if pred_3[i] == 1:
            final_pred.append(3)
        elif pred_012[i] == 0:
            final_pred.append(0)
        else:
            final_pred.append(pred_12[i])

    return np.array(final_pred)

y_final_test = final_prediction(X_test)

print("\n=== FINAL DECISION TEST ===")
print("Accuracy:", accuracy_score(y_test, y_final_test))
print(classification_report(y_test, y_final_test))
print(confusion_matrix(y_test, y_final_test))



=== FINAL DECISION TEST ===
Accuracy: 0.6943511129777404
              precision    recall  f1-score   support

           0       0.13      0.35      0.19       387
           1       0.10      0.69      0.17       199
           2       0.03      0.56      0.06        27
           3       0.97      0.71      0.82      8911

    accuracy                           0.69      9524
   macro avg       0.31      0.58      0.31      9524
weighted avg       0.91      0.69      0.78      9524

[[ 134   52   61  140]
 [   2  137    2   58]
 [   9    1   15    2]
 [ 903 1243  438 6327]]
