In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from umap import UMAP
import preprocess
import numpy as np 

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier


In [None]:

# Load the spreadsheet, discard rows with any missing value, then remove the
# 'link' column before handing the frame to the project preprocessing step.
df = pd.read_excel('./clean_dataset/data-base.xlsx').dropna().drop(columns=['link'])
df = preprocess.preprocess(df)

# Titles are fed to the sentence encoder later, so force them to plain strings.
df['title'] = df['title'].astype(str)

# Stratified 80/20 split on the target so both parts keep the class proportions.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['severity_rating'],
)


In [None]:

# Prepare data, extract embeddings, and prepare sub-models
docs = train_df['title'].reset_index(drop=True)
# BERTopic and SentenceTransformer are documented to take a list of strings;
# passing a plain list avoids any dependence on the Series index.
train_titles = docs.tolist()

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
vectorizer_model = CountVectorizer(stop_words="english")
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings_train = sentence_model.encode(train_titles, show_progress_bar=True)

# We reduce our embeddings to 2D as it will allow us to quickly iterate later on.
# Seeded like the 5-D projection above so that a re-run gives the same layout.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2,
                          min_dist=0.0, metric='cosine',
                          random_state=42).fit_transform(embeddings_train)

# Train our topic model on the training titles only
topic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model,
                       vectorizer_model=vectorizer_model, calculate_probabilities=True, nr_topics=40)
topics_train, probs = topic_model.fit_transform(train_titles, embeddings_train)

# Re-assign outlier documents (-1) to the most similar topic in embedding space.
# The "embeddings" strategy does not use the HDBSCAN probabilities; handing it the
# pre-computed embeddings instead saves re-encoding the outlier documents.
new_topics = topic_model.reduce_outliers(train_titles, topics_train, strategy="embeddings",
                                         embeddings=embeddings_train, threshold=0.05)
# Pass the vectorizer explicitly so the refreshed topic representations keep the
# English stop-word filtering configured above.
# NOTE(review): update_topics appears to fall back to a default CountVectorizer
# when none is given - confirm against the installed BERTopic version.
topic_model.update_topics(train_titles, topics=new_topics, vectorizer_model=vectorizer_model)

# Add topics to the dataframe (new frame rather than writing into the split output)
train_df = train_df.assign(topics=new_topics)

# Transform test data using the trained topic model (without retraining)
test_df = test_df.reset_index(drop=True)
topics_test, probs = topic_model.transform(test_df['title'].tolist())

# Map outliers to the closest topic based on topic probabilities:
# a document labelled -1 takes the topic with the highest probability.
topics_test = [
    int(np.argmax(doc_probs)) if topic == -1 else topic
    for topic, doc_probs in zip(topics_test, probs)
]

test_df['topics'] = topics_test



In [None]:
# Map topics to their representative words for both training and test data.
# get_topic_info() comes back with a positional 0..n-1 index, so the lookup has
# to be keyed on the 'Topic' column: with an outlier row (-1) present, row label
# x holds topic x-1 and a plain .loc[x] would return the wrong topic's words.
topic_info = topic_model.get_topic_info()
topic_words_by_id = topic_info.set_index('Topic')['Representation'].to_dict()


def _words_for_topic(topic_id):
    """Return the list of representative words for a topic id.

    Outliers (-1) and ids missing from the topic table get the single
    placeholder token "NoisyTopic". It is returned as a one-element list so
    that the ' '.join below yields "NoisyTopic" rather than joining the
    characters of a bare string.
    """
    if topic_id == -1 or topic_id not in topic_words_by_id:
        return ["NoisyTopic"]
    return topic_words_by_id[topic_id]


train_df['topic_words'] = train_df['topics'].apply(_words_for_topic)
test_df['topic_words'] = test_df['topics'].apply(_words_for_topic)

train_df['topic_words_str'] = train_df['topic_words'].apply(' '.join)
test_df['topic_words_str'] = test_df['topic_words'].apply(' '.join)

# Generate features for training and test data (vocabulary learned on train only)
vectorizer = TfidfVectorizer()
X_train_words = vectorizer.fit_transform(train_df['topic_words_str'])
X_test_words = vectorizer.transform(test_df['topic_words_str'])
tfidf_columns = vectorizer.get_feature_names_out()

# Map topics to predominant severity using only the training data
topic_severity_mapping = {
    topic: train_df.loc[train_df['topics'] == topic, 'severity_rating'].mode()[0]
    for topic in train_df['topics'].unique()
}
# Fallback for a test topic never seen in training: the most frequent training
# severity. A sentinel such as "Unknown" is not among the labels le2 is fitted
# on, so le2.transform would raise ValueError on it.
default_severity = train_df['severity_rating'].mode()[0]

# Apply the mapping to both training and test data
train_df['topic_severity'] = train_df['topics'].map(topic_severity_mapping)
test_df['topic_severity'] = test_df['topics'].map(
    lambda topic: topic_severity_mapping.get(topic, default_severity)
)

le2 = LabelEncoder()
train_df['encoded_topic_severity'] = le2.fit_transform(train_df['topic_severity'])
test_df['encoded_topic_severity'] = le2.transform(test_df['topic_severity'])

# One-hot encode the 'vuln_type' column for both training and test data.
# Reindexing on the training columns adds categories absent from the test split
# (filled with 0) and drops categories that only occur in the test split.
train_vuln_type_encoded = pd.get_dummies(train_df['vuln_type'], prefix='vuln_type')
test_vuln_type_encoded = pd.get_dummies(test_df['vuln_type'], prefix='vuln_type').reindex(
    columns=train_vuln_type_encoded.columns, fill_value=0
)

# Combine with the original features; every part is re-indexed 0..n-1 so the
# column-wise concat aligns rows by position.
X_train_combined = pd.concat([train_df[['encoded_topic_severity']].reset_index(drop=True),
                              pd.DataFrame(X_train_words.toarray(), columns=tfidf_columns),
                              train_vuln_type_encoded.reset_index(drop=True)], axis=1)
X_test_combined = pd.concat([test_df[['encoded_topic_severity']].reset_index(drop=True),
                             pd.DataFrame(X_test_words.toarray(), columns=tfidf_columns),
                             test_vuln_type_encoded.reset_index(drop=True)], axis=1)

# Encode the target; the encoder is fitted on the training labels only
le = LabelEncoder()
y_train = le.fit_transform(train_df['severity_rating'])
y_test = le.transform(test_df['severity_rating'])


In [None]:

# ----- Class-imbalance handling -----
# Default: no resampling, the classifier is trained on the combined features as-is.
# To experiment with a resampler, uncomment exactly one of the alternatives below.
X_train_resampled = X_train_combined
y_train_resampled = y_train

# Alternative: SMOTEENN (needs a SMOTE instance, created inline here)
#sme = SMOTEENN(smote=SMOTE(random_state=42), random_state=42)
#X_train_resampled, y_train_resampled = sme.fit_resample(X_train_combined, y_train)

# Alternative: random under-sampling
#rus = RandomUnderSampler(random_state=42)
#X_train_resampled, y_train_resampled = rus.fit_resample(X_train_combined, y_train)

# Alternative: SMOTE over-sampling
#smote = SMOTE(random_state=42)
#X_train_resampled, y_train_resampled = smote.fit_resample(X_train_combined, y_train)


In [None]:

# ----- Stacking: AdaBoost(decision tree) + Gaussian NB, logistic-regression meta-learner -----

# Use Decision Tree as the base estimator for AdaBoost.
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the positional form works on both sides of the rename.
base_estimator = DecisionTreeClassifier(max_depth=1)
boosted_tree = AdaBoostClassifier(base_estimator, n_estimators=50, random_state=42)

# Name under which this scikit-learn version exposes the weak learner, needed
# to address its max_depth in the parameter grid below.
weak_learner_param = (
    'estimator' if 'estimator' in boosted_tree.get_params(deep=False) else 'base_estimator'
)

# Define the Naive Bayes model
naive_bayes = GaussianNB()

# Define the stacking classifier
estimators = [
    ('boosted_tree', boosted_tree),
    ('naive_bayes', naive_bayes)
]
stacking_classifier = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Define hyperparameters to tune
param_grid = {
    'boosted_tree__n_estimators': [30, 50, 70],
    f'boosted_tree__{weak_learner_param}__max_depth': [1, 2, 3],
    'final_estimator__C': [0.1, 1, 10]
}

# Use GridSearchCV to find the best hyperparameters (5-fold CV, default scoring)
grid_search = GridSearchCV(stacking_classifier, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_resampled, y_train_resampled)

# best_estimator_ is the stacking classifier carrying the best parameters found
best_stacking_classifier = grid_search.best_estimator_

# Accuracy on the (possibly resampled) training data, as an over-fitting reference
x_pred = best_stacking_classifier.predict(X_train_resampled)
train_accuracy = np.mean(x_pred == y_train_resampled)
print(f"train Accuracy: {train_accuracy * 100:.2f}%")

# Evaluate the stacked model with best parameters on the held-out test set.
# The prediction is computed once and reused for accuracy and both reports.
y_pred = best_stacking_classifier.predict(X_test_combined)
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


In [None]:

# Show which integer code the label encoder assigned to each original class:
# classes_ is paired with its positions 0..n-1.
class_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(class_mapping)

In [None]:
# ----- Training-set metrics -----

# Predict on the data the model was fitted on, then print the per-class report
# followed by the confusion matrix.
x_pred = best_stacking_classifier.predict(X_train_resampled)
print(classification_report(y_train_resampled, x_pred))
# sep="\n" reproduces the newline that a separate print() call would emit
# between the heading and the matrix.
print("\nConfusion Matrix:\n", confusion_matrix(y_train_resampled, x_pred), sep="\n")

