In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from collections import Counter
import numpy as np
from scipy.stats import randint

In [2]:
prediction=pd.read_csv("C:/Users/HARINI/OneDrive/Desktop/streamlit/prediction_data.csv")
prediction

Unnamed: 0,Id,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,IncidentGrade,EntityType,EvidenceRole,...,AccountSid,NetworkMessageId,RegistryKey,RegistryValueName,ApplicationId,OAuthApplicationId,ResourceIdName,year,month,hour
0,1151051239177,12,2278,1280,16,499,9,0,8,1,...,441377,529644,1631,635,2251,881,3586,2024,5,19
1,936302871369,862,16616,40332,11,9,10,0,11,1,...,441377,21718,1631,635,2251,881,3586,2024,6,19
2,455266534648,1002,3187,3292,665,891,2,0,21,0,...,377412,529644,1631,635,2251,881,3586,2024,6,6
3,1288490192781,619,372328,850243,0,0,10,1,8,1,...,441377,529644,1631,635,2251,881,3586,2024,6,4
4,1262720387558,28,346,2659,1,1,10,1,11,1,...,441377,292359,1631,635,2251,881,3586,2024,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,936302873565,73,200701,478976,355,55670,7,1,11,0,...,441377,74173,1631,635,2251,881,3586,2024,6,21
99996,1443109014578,63,363,357981,314,312,11,2,8,1,...,441377,529644,1631,635,2251,881,3586,2024,6,14
99997,137438953895,54,127782,112088,934,27670,7,0,11,0,...,441377,22452,1631,635,2251,881,3586,2024,6,18
99998,1202590844266,44,8431,11973,50,36,7,0,11,0,...,441377,103210,1631,635,2251,881,3586,2024,6,22


data splitting

In [3]:
from sklearn.model_selection import train_test_split

X = prediction.drop('IncidentGrade', axis=1)
y = prediction['IncidentGrade']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Selecting top features using anova 

selector = SelectKBest(score_func=f_classif, k=15)  # Adjust k as needed
X_new = selector.fit_transform(X_train, y_train)

selected_features = X_train.columns[selector.get_support()]
print("Selected Features:", selected_features)

Selected Features: Index(['OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle',
       'Category', 'EntityType', 'EvidenceRole', 'DeviceId', 'Sha256',
       'IpAddress', 'Url', 'AccountSid', 'NetworkMessageId', 'month'],
      dtype='object')


In [5]:
# Keeping only the top 15 features
X_new=X[['OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle',
       'Category', 'EntityType', 'EvidenceRole', 'DeviceId', 'Sha256',
       'IpAddress', 'Url', 'AccountSid', 'NetworkMessageId', 'month']]
X_new.head()

Unnamed: 0,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,EntityType,EvidenceRole,DeviceId,Sha256,IpAddress,Url,AccountSid,NetworkMessageId,month
0,12,2278,1280,16,499,9,8,1,98799,138268,171,160396,441377,529644,5
1,862,16616,40332,11,9,10,11,1,98799,138268,360606,160396,441377,21718,6
2,1002,3187,3292,665,891,2,21,0,98799,138268,360606,160396,377412,529644,6
3,619,372328,850243,0,0,10,8,1,98799,138268,34741,160396,441377,529644,6
4,28,346,2659,1,1,10,11,1,98799,138268,360606,160396,441377,292359,6


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the models
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# Initialize a DataFrame to store evaluation metrics
metrics_df = pd.DataFrame(columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score"])

# Loop through each model, fit it, predict, and collect evaluation metrics
for model_name, model in models.items():
    print(f"\nEvaluating {model_name}...")
    
    # Fit the model
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Create a dictionary to store the metrics for this model
    metrics = {
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }
    
    # Append metrics to the DataFrame using pd.concat
    metrics_df = pd.concat([metrics_df, pd.DataFrame([metrics])], ignore_index=True)
    
    # Print confusion matrix
    print(f"\nConfusion Matrix for {model_name}:")
    print(confusion_matrix(y_test, y_pred))
    
    # Print classification report
    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test, y_pred))

# Display the evaluation metrics in tabular format
print("\nEvaluation Metrics Summary:")
metrics_df


Evaluating Random Forest...


  metrics_df = pd.concat([metrics_df, pd.DataFrame([metrics])], ignore_index=True)



Confusion Matrix for Random Forest:
[[8285  191  237]
 [ 535 3538  190]
 [ 603  139 6282]]

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      8713
           1       0.91      0.83      0.87      4263
           2       0.94      0.89      0.91      7024

    accuracy                           0.91     20000
   macro avg       0.91      0.89      0.90     20000
weighted avg       0.91      0.91      0.90     20000


Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Confusion Matrix for Logistic Regression:
[[6358  201 2154]
 [2696  294 1273]
 [2515  202 4307]]

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.55      0.73      0.63      8713
           1       0.42      0.07      0.12      4263
           2       0.56      0.61      0.58      7024

    accuracy                           0.55     20000
   macro avg       0.51      0.47      0.44     20000
weighted avg       0.52      0.55      0.50     20000


Evaluating Decision Tree...

Confusion Matrix for Decision Tree:
[[8089  318  306]
 [ 351 3681  231]
 [ 332  208 6484]]

Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.92      0.93      0.93      8713
           1       0.87      0.86      0.87      4263
           2       0.92      0.92      0.92      7024

    accuracy                           0.91     20000
   macro avg       0.91      0.90     

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.90525,0.90685,0.90525,0.904831
1,Logistic Regression,0.54795,0.52491,0.54795,0.503393
2,Decision Tree,0.9127,0.912568,0.9127,0.912621


prediction of data

In [7]:
df_test = pd.read_csv(r"prediction_data.csv")

# Select the important features based on your previous findings
X = df_test[['OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle',
       'Category', 'EntityType', 'EvidenceRole', 'DeviceId', 'Sha256',
       'IpAddress', 'Url', 'AccountSid', 'NetworkMessageId', 'month']]

y = df_test['IncidentGrade']

rf_model = RandomForestClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
# Predict using the trained models (best_rf and best_dt)
rf_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
rf_test_preds = rf_model.predict(X)

dt_test_preds = dt_model.predict(X)

# Evaluate the models

# Random Forest Evaluation
print("\nRandom Forest Model Evaluation:")
print(f"Accuracy: {accuracy_score(y, rf_test_preds):.4f}")
print(f"Precision: {precision_score(y, rf_test_preds, average='weighted'):.4f}")
print(f"Recall: {recall_score(y, rf_test_preds, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y, rf_test_preds, average='weighted'):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y, rf_test_preds))
print("Classification Report:\n", classification_report(y, rf_test_preds))

# Decision Tree Evaluation
print("\nDecision Tree Model Evaluation:")
print(f"Accuracy: {accuracy_score(y, dt_test_preds):.4f}")
print(f"Precision: {precision_score(y, dt_test_preds, average='weighted'):.4f}")
print(f"Recall: {recall_score(y, dt_test_preds, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y, dt_test_preds, average='weighted'):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y, dt_test_preds))
print("Classification Report:\n", classification_report(y, dt_test_preds))



Random Forest Model Evaluation:
Accuracy: 0.9810
Precision: 0.9811
Recall: 0.9810
F1 Score: 0.9810
Confusion Matrix:
 [[43138   191   237]
 [  535 20588   190]
 [  603   139 34379]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     43566
           1       0.98      0.97      0.98     21313
           2       0.99      0.98      0.98     35121

    accuracy                           0.98    100000
   macro avg       0.98      0.98      0.98    100000
weighted avg       0.98      0.98      0.98    100000


Decision Tree Model Evaluation:
Accuracy: 0.9825
Precision: 0.9825
Recall: 0.9825
F1 Score: 0.9825
Confusion Matrix:
 [[42942   318   306]
 [  351 20731   231]
 [  332   208 34581]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99     43566
           1       0.98      0.97      0.97     21313
           2       0.98      0.98      0.9