In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from joblib import dump
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier



In [2]:
# Load the dataset; at this point the frame still contains both target columns.
X = pd.read_csv("C:/Users/Alok Yadav/Desktop/TechnoColab/Main_Project/mi_processed_data.csv")

# Copy the two targets into their own frame before they are removed from X.
y = X.loc[:, ['status', 'isClosed']]

# pop() removes each target column from X in place and returns it as a Series,
# leaving X with the feature columns only.
yStatus, yClosed = (X.pop(target) for target in ('status', 'isClosed'))

In [3]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [4]:
# Split the two-column target frames into one Series per target.
# `y` was built with the columns ['status', 'isClosed'] (in that order), so
# selecting by label picks the same columns as positions 0 and 1.

# Multiclass target: company status.
yStatus_train = y_train['status']
yStatus_test = y_test['status']

# Binary target: isClosed flag.
yClosed_train = y_train['isClosed']
yClosed_test = y_test['isClosed']

In [5]:
# Preprocessing / resampling objects referenced by the pipelines defined below.
stdscaler, minmaxscaler = StandardScaler(), MinMaxScaler()

# A float n_components asks PCA to keep enough components to explain that
# fraction of the variance (here 90%).
pca = PCA(n_components=0.9)

# Oversampler with a fixed seed so the synthetic samples are reproducible.
smote = SMOTE(random_state=42)

In [6]:
# Classifier instances; each one becomes the final step of one pipeline below.
NB = MultinomialNB()
LGBM = LGBMClassifier()
XGB = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
)

In [7]:
# Binary pipeline: standardise -> PCA -> SMOTE oversampling -> XGBoost.
#
# Each preprocessing step is cloned so this pipeline owns its own unfitted copy.
# Pipeline.fit fits its steps in place, so handing the same scaler/PCA/SMOTE
# instances to several pipelines would let one pipeline's fit silently
# overwrite the fitted state another pipeline predicts with.
model_xgb = ImbPipeline([
    ('stdscaler', clone(stdscaler)),
    ('pca', clone(pca)),
    ('smote', clone(smote)),
    ('classifier', XGB)
])

# Fit on the binary `isClosed` target.
# NOTE(review): CustomPipeline.fit refits this same pipeline object later on.
model_xgb.fit(X_train, yClosed_train)

In [8]:
# Binary pipeline: standardise -> PCA -> SMOTE oversampling -> LightGBM.
#
# Each preprocessing step is cloned so this pipeline owns its own unfitted copy.
# Pipeline.fit fits its steps in place, so handing the same scaler/PCA/SMOTE
# instances to several pipelines would let one pipeline's fit silently
# overwrite the fitted state another pipeline predicts with.
model_lgbm = ImbPipeline([
    ('stdscaler', clone(stdscaler)),
    ('pca', clone(pca)),
    ('smote', clone(smote)),
    ('classifier', LGBM)
])

# Fit on the binary `isClosed` target.
# NOTE(review): CustomPipeline.fit refits this same pipeline object later on.
model_lgbm.fit(X_train, yClosed_train)

[LightGBM] [Info] Number of positive: 41182, number of negative: 41182
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001829 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 82364, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [9]:
# Multiclass pipeline: min-max scaling -> SMOTE oversampling -> multinomial NB.
#
# The scaler and the oversampler are cloned so this pipeline owns its own
# unfitted copies; `smote` in particular is also referenced by the two binary
# pipelines, and Pipeline.fit fits its steps in place.
model_nb = ImbPipeline([
    ('minmaxscaler', clone(minmaxscaler)),
    ('smote', clone(smote)),
    ('classifier', NB)
])

# Fit on the (unencoded) multiclass `status` target.
# NOTE(review): CustomPipeline.fit later refits this same pipeline object on
# the label-encoded status values.
model_nb.fit(X_train, yStatus_train)

In [10]:
class CustomPipeline(BaseEstimator, ClassifierMixin):
    """Bundle two binary classifiers and one multiclass classifier in one estimator.

    ``fit`` trains all three wrapped models; ``predict`` returns the multiclass
    model's predictions cast to integers.

    NOTE(review): the comments in the previous ``predict`` implementation
    described routing each row by the binary predictions (both models predict
    1 -> 'operating'/'IPO', otherwise 'acquired'/'closed'), but both branches
    issued the identical multiclass call, so the binary models never influenced
    the returned labels. That output is preserved here. Implementing real
    routing requires a decision on how the multiclass labels should be
    constrained per branch.

    Parameters
    ----------
    binary_model_1, binary_model_2 : estimator
        Objects exposing ``fit(X, y)`` and ``predict(X)``. Both are trained on
        the same binary target.
    multiclass_model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; its predictions must
        be convertible to ``int``.
    label_encoder : object, optional
        Stored on the instance unchanged; neither ``fit`` nor ``predict``
        uses it.
    """

    def __init__(self, binary_model_1, binary_model_2, multiclass_model, label_encoder=None):
        self.binary_model_1 = binary_model_1
        self.binary_model_2 = binary_model_2
        self.multiclass_model = multiclass_model
        self.label_encoder = label_encoder

    def fit(self, X, y_binary, y_multiclass):
        """Fit both binary models, then the multiclass model.

        Parameters
        ----------
        X : array-like or DataFrame
            Feature rows; must support boolean-mask row selection (``X[mask]``).
        y_binary : array-like
            Binary target passed to both binary models.
        y_multiclass : array-like
            Multiclass target; must support boolean-mask selection.

        Returns
        -------
        self
        """
        # Fit binary models on all data.
        self.binary_model_1.fit(X, y_binary)
        self.binary_model_2.fit(X, y_binary)

        # Predict once per binary model and reuse the results for both masks.
        pred_1 = self.binary_model_1.predict(X)
        pred_2 = self.binary_model_2.predict(X)

        both_positive = (pred_1 == 1) & (pred_2 == 1)
        any_negative = (pred_1 == 0) | (pred_2 == 0)

        # When both models only emit 0/1 this union selects every row, so the
        # multiclass model is effectively trained on all of X. A row is dropped
        # only if it is not predicted 1 by both models and neither predicts 0.
        train_mask = both_positive | any_negative

        # Show the labels the multiclass model is about to be trained on.
        print("Labels for multiclass training:", y_multiclass[train_mask])

        self.multiclass_model.fit(X[train_mask], y_multiclass[train_mask])
        return self

    def predict(self, X):
        """Return the multiclass model's predictions for ``X`` as an integer array.

        The multiclass model is called once on the whole batch instead of once
        per row. The result is the same for estimators that score each row
        independently, without the per-row call overhead.

        Parameters
        ----------
        X : array-like or DataFrame
            Feature rows accepted by ``multiclass_model.predict``.

        Returns
        -------
        numpy.ndarray
            Predictions converted with ``astype(int)``; raises ``ValueError`` if
            a prediction cannot be converted to an integer.
        """
        raw_predictions = self.multiclass_model.predict(X)
        return np.asarray(raw_predictions).astype(int)

In [11]:
# Fit the encoder on the complete status vocabulary rather than on a data
# split, so the label -> integer mapping does not depend on which labels
# happen to occur in the training rows.
# LabelEncoder numbers classes in sorted order:
# acquired=0, closed=1, ipo=2, operating=3.
all_labels = ['operating', 'ipo', 'acquired', 'closed']
label_encoder = LabelEncoder().fit(all_labels)

# Integer-encoded multiclass targets for the train and test splits.
yStatus_train_encoded, yStatus_test_encoded = (
    label_encoder.transform(labels) for labels in (yStatus_train, yStatus_test)
)

In [12]:
# Wire the two binary pipelines and the multiclass pipeline into the
# composite estimator; the encoder is stored on it alongside the models.
combined_pipeline = CustomPipeline(
    model_xgb,
    model_lgbm,
    model_nb,
    label_encoder=label_encoder,
)

# Train on the binary `isClosed` target and the integer-encoded status target.
combined_pipeline.fit(
    X_train,
    y_binary=yClosed_train,
    y_multiclass=yStatus_train_encoded,
)

[LightGBM] [Info] Number of positive: 41182, number of negative: 41182
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 82364, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Labels for multiclass training: [3 3 3 ... 2 3 0]


In [14]:
# Directory that receives the serialized models (must already exist).
base_path = 'C:/Users/Alok Yadav/Desktop/TechnoColab/Main_Project/WebApp/myproject/mlmodel/models/'

# Persist the three individual pipelines.
for filename, fitted_model in (
    ("model_xgb.joblib", model_xgb),
    ("model_lgbm.joblib", model_lgbm),
    ("model_nb.joblib", model_nb),
):
    dump(fitted_model, base_path + filename)

# Persist the combined estimator. This stays the cell's final expression so the
# notebook still displays the path that was written.
# NOTE(review): CustomPipeline is defined in this notebook, and pickle stores
# classes by reference, so whatever loads this file must be able to import a
# class with the same module path and name. Confirm the consuming app provides it.
dump(combined_pipeline, base_path + "combined_pipeline.joblib")

['C:/Users/Alok Yadav/Desktop/TechnoColab/Main_Project/WebApp/myproject/mlmodel/models/combined_pipeline.joblib']