# Tidy Up the Ingested ``cpe_name`` Data

AI-generated code that removes duplicate ``cpe_name`` entries (the ``cpeName`` column) from the whitelist data ingested from the NVD CPE API.

In [1]:
import pandas as pd
from pathlib import Path

SRC = Path("../data/cpe_whitelist.csv")      # adjust if the path differs
DST = SRC                                    # overwrite in place; change if you prefer a new file
KEY_COLUMN = "cpeName"                       # column whose values must be unique in the output

# 1. Load the whitelist (dtype=str: read every column as strings, no type inference)
df = pd.read_csv(SRC, dtype=str)

# Fail fast with a readable message if the file does not have the column this
# notebook relies on, instead of surfacing later as a less descriptive pandas KeyError.
if KEY_COLUMN not in df.columns:
    raise KeyError(
        f"Expected a {KEY_COLUMN!r} column in {SRC}; found columns: {list(df.columns)}"
    )

# 2. Drop duplicate CPE names. `df` is left untouched so the before/after
#    row counts below can be compared and the cell can be re-run safely.
df_clean = (
    df
    .drop_duplicates(subset=[KEY_COLUMN])   # keep first occurrence
    .sort_values(KEY_COLUMN)                # nice to have: sorted output
    .reset_index(drop=True)                 # fresh 0..n-1 index after dropping/sorting
)

print(f"Before: {len(df):,} rows  →  After: {len(df_clean):,} rows")

Before: 12,370 rows  →  After: 12,102 rows


In [3]:
# Sanity check: column names, dtypes and non-null counts of the de-duplicated frame
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12102 entries, 0 to 12101
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   cpeName  12102 non-null  object
 1   Title    12102 non-null  object
dtypes: object(2)
memory usage: 189.2+ KB


In [4]:
# Per-column summary (count / unique / top / freq for string columns).
# For cpeName, `unique` should equal `count` after de-duplication (assuming no missing values).
df_clean.describe()

Unnamed: 0,cpeName,Title
count,12102,12102
unique,12102,11859
top,cpe:2.3:o:vmware:vsphere_esxi:6.7:update_1:*:*...,MysqlDumper 1.21b6
freq,1,3


In [5]:
# 3. Save back to CSV
# NOTE: DST is set to SRC in the first cell, so unless DST was changed this
# overwrites the input file with the de-duplicated, sorted rows.
# index=False keeps the DataFrame index out of the file (only the data columns are written).
df_clean.to_csv(DST, index=False)
# Print the absolute path so it is clear which file was written.
print(f"✅  Cleaned file written to {DST.resolve()}")

✅  Cleaned file written to C:\Users\hgbtx\Desktop\MIS433\final-project\cyber-risk-scoring\data\cpe_whitelist.csv
