In [None]:

import numpy as np
import pandas as pd
import preprocess2

from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.vectorizers import ClassTfidfTransformer
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sentence_transformers import SentenceTransformer



In [60]:

# Load and preprocess data
# NOTE(review): the path is relative to the notebook's working directory.
df = pd.read_excel('./clean_dataset/file.xlsx')

# Drop the columns that are not used below. A non in-place drop keeps the cell
# safe to re-run after the frame has been reloaded.
df = df.drop(columns=[
    'commit_message', 'project', 'Unnamed: 0',
    'availability_impact', 'access_complexity',
    'confidentiality_impact', 'integrity_impact',
])
df = df.dropna()

# Project-specific cleaning implemented in the local `preprocess2` module.
df = preprocess2.preprocess(df)

df['summary'] = df['summary'].astype('str')

###--create severity_level
# With right=True and include_lowest=True the intervals are RIGHT-inclusive:
#   [0, 0.9] -> none, (0.9, 3] -> low, (3, 3.9] -> medium,
#   (3.9, 6.9] -> high, (6.9, 10] -> critical
# NOTE(review): if `score` is a CVSS score, these edges differ from the usual
# qualitative bands (e.g. 0.1-3.9 low, 4.0-6.9 medium) -- confirm they are intended.
bins = [0, 0.9, 3, 3.9, 6.9, 10]
labels = ['none', 'low', 'medium', 'high', 'critical']
df['score_category'] = pd.cut(df['score'], bins=bins, labels=labels, right=True, include_lowest=True)

# Split data into training and testing sets (stratified on the severity category).
# The parts are copied so that later column assignments do not write to slices of `df`.
train_df, test_df = (
    part.copy()
    for part in train_test_split(df, test_size=0.2, stratify=df['score_category'], random_state=42)
)

# Prepare data and extract embeddings
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
docs = train_df['summary'].reset_index(drop=True)
# Pass a plain list of strings so encoding does not depend on the Series index.
embeddings_train = sentence_model.encode(docs.tolist(), show_progress_bar=True)

# Create a fully supervised BERTopic instance:
# the base class is used as a placeholder for the dimensionality-reduction step.
empty_dimensionality_model = BaseDimensionalityReduction()


Batches: 100%|██████████| 71/71 [00:11<00:00,  6.44it/s]


In [None]:
# Components handed to BERTopic in the next cell:
# - c-TF-IDF transformer with `reduce_frequent_words` enabled
# - a logistic-regression classifier (passed as `hdbscan_model`)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
clf = LogisticRegression()


In [None]:

# Supervised BERTopic: no dimensionality reduction, and a classifier in place
# of the clustering model.
topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=empty_dimensionality_model,
    hdbscan_model=clf,
    ctfidf_model=ctfidf_model
)

# Encode the 'score_category' column into integer labels
severity_encoder = LabelEncoder()
encoded_severity = severity_encoder.fit_transform(train_df['score_category'])

# Train BERTopic in a supervised manner using the encoded severity categories as labels.
# The embeddings computed earlier for `docs` are reused so the documents are not
# encoded a second time inside fit_transform.
topics_train, _ = topic_model.fit_transform(docs, embeddings=embeddings_train, y=encoded_severity)

# `topics_train` has one entry per row of `docs`, i.e. per row of `train_df` in order.
train_df['topics'] = topics_train




In [None]:

# Reset the index of test_df so its rows line up positionally with the
# feature frames assembled below.
test_df = test_df.reset_index(drop=True)

# Transform test data using the trained topic model
topics_test, _ = topic_model.transform(test_df['summary'].tolist())
test_df['topics'] = topics_test

# Map topic id -> keyword list. The lookup is keyed on the 'Topic' column, not
# on the row position of get_topic_info(), so it stays correct when an outlier
# row (-1) is present or the rows are ordered differently.
topic_info = topic_model.get_topic_info()
topic_keywords = dict(zip(topic_info['Topic'], topic_info['Representation']))
# Outliers get a single placeholder token. It must be a list: joining a bare
# string would split it into individual characters.
topic_keywords[-1] = ["NoisyTopic"]

train_df['topic_words'] = train_df['topics'].apply(lambda topic_id: topic_keywords[topic_id])
test_df['topic_words'] = test_df['topics'].apply(lambda topic_id: topic_keywords[topic_id])

# Generate features for training and test data
train_df['topic_words_str'] = train_df['topic_words'].apply(' '.join)
test_df['topic_words_str'] = test_df['topic_words'].apply(' '.join)

# TF-IDF over the topic keywords; the vocabulary is learned on the training data only.
vectorizer = TfidfVectorizer()
X_train_words = vectorizer.fit_transform(train_df['topic_words_str'])
X_test_words = vectorizer.transform(test_df['topic_words_str'])
tfidf_columns = vectorizer.get_feature_names_out()

# One-hot encode the 'vulnerability_classification' column
train_vuln_type_encoded = pd.get_dummies(
    train_df['vulnerability_classification'], prefix='vulnerability_classification'
)
# Align the test columns with the training columns: categories missing from
# the test data are filled with 0, categories unseen in training are dropped.
test_vuln_type_encoded = pd.get_dummies(
    test_df['vulnerability_classification'], prefix='vulnerability_classification'
).reindex(columns=train_vuln_type_encoded.columns, fill_value=0)

# Combine topic id, keyword TF-IDF and one-hot features column-wise
X_train_combined = pd.concat(
    [
        train_df[['topics']].reset_index(drop=True),
        pd.DataFrame(X_train_words.toarray(), columns=tfidf_columns),
        train_vuln_type_encoded.reset_index(drop=True),
    ],
    axis=1,
)
X_test_combined = pd.concat(
    [
        test_df[['topics']].reset_index(drop=True),
        pd.DataFrame(X_test_words.toarray(), columns=tfidf_columns),
        test_vuln_type_encoded.reset_index(drop=True),
    ],
    axis=1,
)

# Integer-encode the target; the encoder is fitted on the training labels only.
le = LabelEncoder()
y_train = le.fit_transform(train_df['score_category'])
y_test = le.transform(test_df['score_category'])


In [None]:
#####--->>>>>>IMBALANCE<<--------####################
# Choose how the TRAINING data is rebalanced (instead of commenting code in and out):
#   "none"     - use the training data as is
#   "smote"    - SMOTE over-sampling
#   "smoteenn" - SMOTEENN combined over-/under-sampling, built on the same SMOTE
#   "rus"      - random under-sampling
RESAMPLING_STRATEGY = "smote"

smote = SMOTE(k_neighbors=4, random_state=42)
samplers = {
    "none": None,
    "smote": smote,
    "smoteenn": SMOTEENN(smote=smote, random_state=42),
    "rus": RandomUnderSampler(random_state=42),
}
if RESAMPLING_STRATEGY not in samplers:
    raise ValueError(
        f"Unknown RESAMPLING_STRATEGY {RESAMPLING_STRATEGY!r}; expected one of {sorted(samplers)}"
    )

sampler = samplers[RESAMPLING_STRATEGY]
if sampler is None:
    X_train_resampled, y_train_resampled = X_train_combined, y_train
else:
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train_combined, y_train)

# The test set is deliberately NOT resampled. Evaluating on synthetic samples
# would no longer measure performance on real data, and overwriting
# X_test_combined / y_test here would make the cell non-idempotent on re-run.


In [59]:
# Number of test documents assigned to each topic id
# (alternative view: X_test_combined.topics.value_counts())
test_df['topics'].value_counts()

topics
0    393
1    161
2     13
Name: count, dtype: int64

In [None]:
# Feature selection: keep the highest-scoring features according to f_classif.
K_BEST = 10  # upper bound on the number of features to keep; tune as needed
# SelectKBest rejects k larger than the number of available columns, so clamp it.
k = min(K_BEST, X_train_resampled.shape[1])
selector = SelectKBest(f_classif, k=k)
# The selector is fitted on the training data only and then applied to the test data.
X_train_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_selected = selector.transform(X_test_combined)


In [64]:
# Mapping from the labels passed to fit_transform to the topic ids BERTopic assigned.
mappings = topic_model.topic_mapper_.get_mappings()

# Translate the training labels into BERTopic's topic ids.
# NOTE(review): the original referenced an undefined `y`; `encoded_severity` is
# the label array that was passed to fit_transform -- confirm this was the intent.
y_mapped = [mappings[val] for val in encoded_severity]

mappings

{0: 1, 1: 0, 2: 2, 3: 3, 4: 4}

In [None]:

# Random forest with a fixed hyper-parameter set.
# Earlier candidate, kept for reference:
#   {'bootstrap': True, 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
best_params = {
    'bootstrap': True,
    'max_depth': 10,
    'max_features': None,
    'min_samples_leaf': 4,
    'min_samples_split': 2,
    'n_estimators': 200,
}

clf_best = RandomForestClassifier(random_state=42, **best_params)
clf_best.fit(X_train_selected, y_train_resampled)

# Accuracy on the (resampled) training data and on the test data
rf_train_accuracy = clf_best.score(X_train_selected, y_train_resampled)
print("Training accuracy: ", rf_train_accuracy)
rf_test_accuracy = clf_best.score(X_test_selected, y_test)
print("Testing accuracy: ", rf_test_accuracy)

# Per-class metrics and confusion matrix on the test data
y_pred = clf_best.predict(X_test_selected)
rf_test_report = classification_report(y_test, y_pred)
rf_test_confusion = confusion_matrix(y_test, y_pred)
print(rf_test_report)
print("\nConfusion Matrix:\n")
print(rf_test_confusion)

# Original class label -> encoded integer, to help read the report rows
class_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(class_mapping)



In [None]:


# Same diagnostics as for the test data, computed on the (resampled) training
# data the random forest was fitted on.
x_pred = clf_best.predict(X_train_selected)
rf_train_report = classification_report(y_train_resampled, x_pred)
rf_train_confusion = confusion_matrix(y_train_resampled, x_pred)
print(rf_train_report)
print("\nConfusion Matrix:\n")
print(rf_train_confusion)

# Original class label -> encoded integer, to help read the report rows
class_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(class_mapping)

In [None]:
# Preview the combined training features (first rows only, to keep the output small)
X_train_combined.head()

In [None]:
#########--->>>>>>>STACKING<<--------############

# Use Decision Tree as the base estimator for AdaBoost.
# The base estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name removed in
# 1.4); the positional form works with both.
base_estimator = DecisionTreeClassifier(max_depth=1)
boosted_tree = AdaBoostClassifier(base_estimator, n_estimators=50, random_state=42)

# Parameter name under which the installed scikit-learn exposes the base estimator;
# needed to address its `max_depth` in the grid below.
base_param = 'estimator' if 'estimator' in boosted_tree.get_params(deep=False) else 'base_estimator'

# Define the Naive Bayes model
naive_bayes = GaussianNB()

# Define the stacking classifier
estimators = [
    ('boosted_tree', boosted_tree),
    ('naive_bayes', naive_bayes)
]
stacking_classifier = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Define hyperparameters to tune
param_grid = {
    'boosted_tree__n_estimators': [30, 50, 70],
    f'boosted_tree__{base_param}__max_depth': [1, 2, 3],
    'final_estimator__C': [0.1, 1, 10]
}

# Use GridSearchCV to find the best hyperparameters.
# NOTE(review): cross-validation runs on the already resampled training data, so
# validation folds can contain resampled points; consider resampling inside an
# imblearn Pipeline if unbiased CV scores are needed.
grid_search = GridSearchCV(stacking_classifier, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Stacking classifier refitted with the best parameters found
best_stacking_classifier = grid_search.best_estimator_

# Accuracy on the (resampled) training data
x_pred = best_stacking_classifier.predict(X_train_resampled)
train_accuracy = np.mean(x_pred == y_train_resampled)
print(f"train Accuracy: {train_accuracy * 100:.2f}%")

# Evaluate the stacked model on the test data; predict once and reuse the result
y_pred = best_stacking_classifier.predict(X_test_combined)
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


In [None]:


# Per-class metrics and confusion matrix of the tuned stacking classifier on
# the (resampled) training data.
x_pred = best_stacking_classifier.predict(X_train_resampled)
stacking_train_report = classification_report(y_train_resampled, x_pred)
stacking_train_confusion = confusion_matrix(y_train_resampled, x_pred)
print(stacking_train_report)
print("\nConfusion Matrix:\n")
print(stacking_train_confusion)

# Original class label -> encoded integer, to help read the report rows
class_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(class_mapping)