In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
# 1. Load dataset
data = pd.read_json('nvdcve-1.1-modified.json')
# data = pd.read_csv('nvdcve-1.1-modifiesd.json')

In [5]:
# 2. Melihat informasi dataset
# print(data.head())  # Menampilkan 5 baris pertama dataset
# print(data.info())  # Menampilkan informasi tipe data
# print(data.dtypes)

# data['cvss_score'] = data['cvss_score'].astype(float)
# print(data.columns)


# 2. EXTRACT DATASET
items = data['CVE_Items']
cve_data = []
for item in items:
    cve_id = item['cve']['CVE_data_meta']['ID']
    description = item['cve']['description']['description_data'][0]['value']
    cvss_score = item.get('impact', {}).get('baseMetricV3', {}).get('cvssV3', {}).get('baseScore', None)
    severity = item.get('impact', {}).get('baseMetricV3', {}).get('cvssV3', {}).get('baseSeverity', None)
    if cvss_score and severity:
        cve_data.append({'cve_id': cve_id, 'description': description, 'cvss_score': cvss_score, 'severity': severity})

# 3. Create a DataFrame
df = pd.DataFrame(cve_data)

print(df.head())

# df = pd.DataFrame(extracted_data, columns=['CVE_ID', 'CVSS', 'Attack_Vector', 'Attack_Complexity'])
# print(df.head(10))

           cve_id                                        description  \
0   CVE-2024-8159  Deep Freeze 9.00.020.5760 is vulnerable to an ...   
1   CVE-2024-8352  The Social Web Suite – Social Media Auto Post,...   
2  CVE-2024-35294  An unauthenticated remote attacker may use the...   
3   CVE-2024-8505  The WordPress Infinite Scroll – Ajax Load More...   
4   CVE-2024-8282  The Ibtana – WordPress Website Builder plugin ...   

   cvss_score severity  
0         6.4   MEDIUM  
1         7.5     HIGH  
2         6.5   MEDIUM  
3         6.4   MEDIUM  
4         6.4   MEDIUM  


In [6]:
# 4. Preprocessing: Convert categorical 'severity' to numerical labels
df['severity'] = df['severity'].map({'LOW': 0, 'MEDIUM': 1, 'HIGH': 2, 'CRITICAL': 3})
X = df[['cvss_score']]
y = df['severity']

In [7]:
# 5. MENGECEK MODEL YANG SUDAH DI LATIH
import pickle

with open ('rf_model.pkl', 'rb') as file:
    data = pickle.load(file)

print(data)

['cvss_score']


In [8]:
# 6.Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# 7. Membuat model Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [10]:
# 8. Train Random Forest Classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

In [14]:
# 9. Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Akurasi Model Random Forest: {accuracy}')

Akurasi Model Random Forest: 1.0


In [16]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00       104
           2       1.00      1.00      1.00        77
           3       1.00      1.00      1.00        40

    accuracy                           1.00       228
   macro avg       1.00      1.00      1.00       228
weighted avg       1.00      1.00      1.00       228



In [19]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
print(f'Cross-validation scores: {scores}')

from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)


Cross-validation scores: [1. 1. 1. 1. 1.]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00       104
           2       1.00      1.00      1.00        77
           3       1.00      1.00      1.00        40

    accuracy                           1.00       228
   macro avg       1.00      1.00      1.00       228
weighted avg       1.00      1.00      1.00       228



In [28]:
# 10. MENYIMPAN MODEL YANG SUDAH DILATIH
import joblib
joblib.dump(clf, 'rf_model.pkl')

['rf_model.pkl']