# This notebook is for data pre-processing

## Libraries

## Data

In [None]:
data = pd.read_csv('data.csv')
data.shape

## General Cleaning

In [None]:
# Removing exact duplicate rows (rows identical across all columns)
# Count duplicates before removal
dup_count = data.duplicated().sum()
print(f"Found {dup_count} exact duplicate rows")
if dup_count > 0:
    # Drop exact duplicates and reset index
    data.drop_duplicates(inplace=True)
    data.reset_index(drop=True, inplace=True)
    print(f"Removed {dup_count} duplicates. New shape: {data.shape}")
else:
    print('No duplicate rows found.')

In [None]:
# Detect groups where all attributes except RAM or storage (SSD/HDD) match,
# but RAM or storage is null in some rows and not in others.
# This helps find near-duplicates where only RAM/storage information is missing in some entries
cols = data.columns.tolist()
# flexible matching for RAM-like columns (case-insensitive, substring match)
ram_cols = [c for c in cols if 'ram' in c.lower() or 'memory' in c.lower()]
# storage detection expanded to SSD, HDD, storage, drive, disk
storage_cols = [c for c in cols if any(k in c.lower() for k in ['ssd', 'hdd', 'storage', 'drive', 'disk'])]
print('Detected ram-like columns:', ram_cols)
print('Detected storage-like columns (SSD/HDD):', storage_cols)
# Group by the specific keys you requested (flexible matching):
# price, date created, etat, model name, city, gpu, cpu, screen size, screen frequency, resolution
def find_col(variants, cols):
    """Return the first column whose name contains any of the provided variants (case-insensitive)."""
    variants = [v.lower() for v in variants]
    for c in cols:
        cname = c.lower()
        if any(v in cname for v in variants):
            return c
    return None
# mapping desired keys to lists of variants to match
desired_keys = {
    'price': ['price'],
    'date_created': ['date', 'create', 'created'],
    'etat': ['etat'],
    'model': ['model', 'model name', 'model_name'],
    'city': ['city'],
    'gpu': ['gpu', 'graphics'],
    'cpu': ['cpu', 'processor'],
    'screen_size': ['screen', 'size'],
    'screen_freq': ['frequency', 'hz', 'screen', 'refresh'],
    'resolution': ['resolution', 'res']
}
found_keys = {k: find_col(v, cols) for k, v in desired_keys.items()}
print('Found column mapping for grouping:')
for k, col in found_keys.items():
    print(f'  {k} -> {col}')
# Check for missing requested columns
missing = [k for k, col in found_keys.items() if col is None]
if missing:
    print('Warning: Could not find these requested grouping columns in the dataset:', missing)
# Build key_cols from the requested keys that were found (preserve order)
requested_order = ['price','date_created','etat','model','city','gpu','cpu','screen_size','screen_freq','resolution']
key_cols = [found_keys[k] for k in requested_order if found_keys.get(k)]
# If we don't have enough requested columns, fall back to grouping by all non-ram/non-storage cols
if len(key_cols) < 2:
    print('Not enough requested grouping columns found (need at least 2). Falling back to grouping by all non-ram/non-storage columns.')
    key_cols = [c for c in cols if c not in ram_cols + storage_cols]
print('Using key columns for grouping:', key_cols)
groups = data.groupby(key_cols, dropna=False)
problem_groups = []
for key, grp in groups:
    if len(grp) < 2:
        continue
    # check RAM columns for mixed null vs non-null within group
    ram_issue = False
    if ram_cols:
        ram_issue = any(grp[rc].isna().any() and (~grp[rc].isna()).any() for rc in ram_cols)
    # check storage columns (SSD/HDD) for mixed null vs non-null within group
    storage_issue = False
    if storage_cols:
        storage_issue = any(grp[sc].isna().any() and (~grp[sc].isna()).any() for sc in storage_cols)
    if ram_issue or storage_issue:
        problem_groups.append((key, grp))
# Auto-resolution: several possible strategies
# 1) If exactly one row has any storage info and it matches '1000' (HDD 1000), copy it to others and drop source
# 2) If multiple rows have storage info but across different storage columns (e.g., one row has SSD, another has HDD),
#    merge storage fields into a single target row and delete the redundant row(s).
resolved = 0
for key, grp in problem_groups:
    storage_existing = [c for c in storage_cols if c in grp.columns]
    if not storage_existing:
        continue
    # which rows in this group have any non-null storage value
    nonnull_mask = grp[storage_existing].notna().any(axis=1)
    rows_with_storage = grp[nonnull_mask]
    # Strategy A: single storage row -> maybe HDD 1000 pattern
    if len(rows_with_storage) == 1:
        src_idx = rows_with_storage.index[0]
        src_vals = rows_with_storage.iloc[0][storage_existing]
        def looks_like_1000(x):
            if pd.isna(x):
                return False
            s = str(x).lower().replace(' ', '')
            return ('1000' in s) or ('1tb' in s) or ('1000gb' in s)
        if any(looks_like_1000(v) for v in src_vals.dropna()):
            target_idxs = [i for i in grp.index if i != src_idx]
            for c in storage_existing:
                mask = data.loc[target_idxs, c].isna()
                if mask.any():
                    data.loc[target_idxs, c] = data.loc[target_idxs, c].where(~mask, other=data.loc[src_idx, c])
            data.drop(index=src_idx, inplace=True)
            resolved += 1
            print(f'Resolved group {key}: moved storage from index {src_idx} to {target_idxs} and deleted index {src_idx}')
        else:
            print(f'Group {key}: single storage row found but did not match 1000 pattern; skipped auto-resolution')
    # Strategy B: multiple rows with storage but complementary across columns -> merge into one row
    else:
        # determine non-null positions per storage column
        col_nonnull_counts = {c: grp[c].notna().sum() for c in storage_existing}
        total_nonnull_cells = sum(col_nonnull_counts.values())
        # If non-null cells are distributed across rows without overlap (each storage cell non-null appears in only one row),
        # we can merge them. Check that no row has two storage columns non-null (optional), and that total_nonnull_cells <= len(rows_with_storage) * len(storage_existing)
        # Simpler heuristic: if each storage column has at most one non-null entry and the number of rows_with_storage equals the number of distinct non-null rows,
        distinct_rows_with_storage = set(rows_with_storage.index.tolist())
        if all(v <= 1 for v in col_nonnull_counts.values()):
            # pick target as the first row that has any storage (prefer the one with SSD if present)
            target_idx = None
            # try to prefer a row that has SSD (or first storage column) non-null
            preferred = storage_existing[0] if storage_existing else None
            if preferred is not None:
                candidates = rows_with_storage[rows_with_storage[preferred].notna()].index.tolist()
                if candidates:
                    target_idx = candidates[0]
            if target_idx is None:
                target_idx = rows_with_storage.index[0]
            other_rows = [i for i in rows_with_storage.index if i != target_idx]
            # copy complementary storage values into target where null
            for other in other_rows:
                for c in storage_existing:
                    if pd.isna(data.at[target_idx, c]) and not pd.isna(data.at[other, c]):
                        data.at[target_idx, c] = data.at[other, c]
                # after copying, drop the other row
                data.drop(index=other, inplace=True)
                resolved += 1
                print(f'Merged storage from index {other} into {target_idx} and deleted {other} for group {key}')
        else:
            print(f'Group {key}: storage columns have multiple non-null entries per column; skipped auto-merge')
# After attempting auto-resolution, reset index if any changes were made
if resolved > 0:
    data.reset_index(drop=True, inplace=True)
    print(f'Auto-resolved {resolved} groups. New data shape: {data.shape}')
    # Recompute groups and problem_groups on the mutated dataframe so printed results reflect changes
    groups = data.groupby(key_cols, dropna=False)
    new_problem_groups = []
    for key, grp in groups:
        if len(grp) < 2:
            continue
        ram_issue = False
        if ram_cols:
            ram_issue = any(grp[rc].isna().any() and (~grp[rc].isna()).any() for rc in ram_cols)
        storage_issue = False
        if storage_cols:
            storage_issue = any(grp[sc].isna().any() and (~grp[sc].isna()).any() for sc in storage_cols)
        if ram_issue or storage_issue:
            new_problem_groups.append((key, grp))
    problem_groups = new_problem_groups
# Finally, print any remaining problem groups for manual review
if not problem_groups:
    print('No groups found where RAM or storage are null in some rows and not in others while other attributes match.')
else:
    print(f'Found {len(problem_groups)} potential groups (post-resolution):')
    for i, (key, grp) in enumerate(problem_groups, 1):
        print('\n---')
        print(f'Group {i} key: {key}')
        # show indices and relevant columns to help decide which rows to drop
        display_cols = key_cols + ram_cols + storage_cols
        # ensure ordering and existence
        display_cols = [c for c in display_cols if c in grp.columns]
        grp_display = grp[display_cols].copy()
        grp_display['_index'] = grp_display.index
        print(grp_display.to_string(index=False))
        print('---')

# Add a unique integer primary key column named 'id' (starting at 1). If 'id' exists, overwrite it after warning.
if 'id' in data.columns:
    print("Column 'id' already exists in the dataset; it will be overwritten with new sequential IDs.")
# Ensure index is contiguous before assigning IDs
data.reset_index(drop=True, inplace=True)
data.insert(0, 'id', range(1, len(data) + 1))
print(f"Added 'id' column as primary key. Data shape now: {data.shape}")

In [None]:
data.shape

In [None]:
# Export the cleaned data with IDs to CSV
data.to_csv('data_cleaned.csv', index=False)
print(f'Exported cleaned data to data_cleaned.csv with shape: {data.shape}')

## Merge cpu data into main data

In [None]:
data = pd.read_csv('data_cleaned.csv')
cpus_data = pd.read_csv('cpus.csv', on_bad_lines='warn')

# ------------------ NORMALIZATION ------------------

def normalize(s):
    if not s or pd.isna(s):
        return ''
    s = str(s).lower()
    s = re.sub(r'intel|processor|core|cpu', '', s)
    s = s.replace('-', ' ')
    s = re.sub(r'[^a-z0-9 ]+', ' ', s)
    return re.sub(r'\s+', ' ', s).strip()

# ------------------ TYPO/NEAR-MATCH CORRECTIONS ------------------

CPU_CORRECTIONS = {
    'i5 1135u': 'i5-1135G7', 'i5 1135': 'i5-1135G7',
    'i3 1115g7': 'i3-1115G4', 'i3 1124g': 'i3-1125G4',
    'i5 1244u': 'i5-1245U', 'i5 1285p': 'i5-1240P', 'i5 1235p': 'i5-1240P', 'i5 12210u': 'i5-1235U',
    'i7 13350u': 'i7-1355U', 'i7 13340u': 'i7-1355U', 'i7 1365p': 'i7-1360P', 'i5 1345p': 'i5-1340P',
    'i5 8300u': 'i5-8250U', 'i5 8700': 'i5-8300H', 'i5 8265': 'i5-8265U', 'i5 8600': 'i5-8300H',
    'i5 8350 vpro': 'i5-8350U', 'i5 8300 vpro': 'i5-8250U', 'i5 8350de': 'i5-8350U', 'i5 8635u': 'i5-8265U',
    'i7 8560u': 'i7-8550U', 'i5 7300': 'i5-7300U', 'i5 7300 vpro': 'i5-7300U', 'i5 7400u': 'i5-7200U',
    'i5 7400': 'i5-7300HQ', 'i7 7375u': 'i7-7500U', 'i5 6300': 'i5-6300U', 'i7 6600': 'i7-6600U',
    'i7 6600hq': 'i7-6700HQ', 'i7 6850hq': 'i7-6820HQ', 'i7 6550u': 'i7-6500U', 'i3 6006': 'i3-6006U',
    'i7 4712': 'i7-4712MQ', 'i5 4570m': 'i5-4200M', 'i3 4050u': 'i3-4030U',
    'i3 3220': 'i3-3120M', 'i3 3300': 'i3-3120M', 'i5 2415m': 'i5-2410M', 'i7 9900': 'i7-9750H',
    'm3 7e': 'Core m3-7Y30', 'n200': 'Intel N200', 'n4500': 'Intel Celeron N4500',
    # New Intel Core naming (Core 5/7/9 without "i")
    'i5 210h': 'Intel Core 5 210H', 'i5 220h': 'Intel Core 5 220H', 'i5 220u': 'Intel Core 5 220U',
    'i7 150u': 'Intel Core 7 150U', 'i7 250h': 'Intel Core 7 250H', 'i7 250u': 'Intel Core 7 250U',
    'i5 120u': 'Intel Core 5 220U', 'i5 135u': 'Intel Core 5 220U',  # Approximate matches
    # Intel Core Ultra series
    'i7 155h': 'Intel Core Ultra 7 155H', 'i7 155u': 'Intel Core Ultra 7 155U',
    'i9 185h': 'Intel Core Ultra 9 185H',
    'i9 th8hk': 'Intel Core i9-8950HK @ 2.90GHz',  # OCR error
    # Old Intel mobile CPUs (1st-2nd gen)
    'i3 330m': 'i3-330M', 'i3 350m': 'i3-350M', 'i3 370m': 'i3-370M', 'i3 380m': 'i3-380M', 'i3 370': 'i3-370M',
    'i5 430m': 'i5-430M', 'i5 520m': 'i5-520M', 'i5 540m': 'i5-540M', 'i5 m480': 'i5-480M', 'i5 m520': 'i5-520M', 'i5 m540': 'i5-540M',
    'i7 620m': 'i7-620M', 'i7 920xm': 'i7-920XM',
    'i5 750s': 'i5-750S',
    # OCR/typo errors
    'i3 3em': 'i3-3110M', 'i5 11em': 'i5-1135G7', 'i3 t4005': 'i3-4005U',
    # Intel Y-series
    'i3 7y30': 'Core m3-7Y30', 'i5 7y54': 'i5-7Y54', 'i5 7y54u': 'i5-7Y54', 'i7 7y75': 'i7-7Y75',
    # Lakefield
    'i5 l16g7': 'i5-L16G7',
    # Typos in AMD
    'ryzen 78840u': 'AMD Ryzen 7 8840U', 'ryzen 7730u': 'AMD Ryzen 7 7730U', 'ryzen 8845': 'AMD Ryzen 7 8845HS',
    'ryzen 5 220': 'AMD Ryzen 5 PRO 220', 'ryzen 5 740u': 'AMD Ryzen 5 7540U',
    'ryzen 7 735hs': 'AMD Ryzen 7 7735HS',
    'ryzen 9 hx 370': 'AMD Ryzen 9 HX 370',
    # AMD Surface Edition
    'ryzen 7 surface edition': 'AMD Ryzen 7 4800U',  # Microsoft Surface edition is based on 4800U
    # AMD PRO typos and missing models - map to closest existing PRO variant
    'amd ryzen 5 pro 465u': 'AMD Ryzen 5 PRO 4650U', 'ryzen 5 pro 465u': 'AMD Ryzen 5 PRO 4650U',
    'amd ryzen 5 pro 4675u': 'AMD Ryzen 5 PRO 4650U', 'ryzen 5 pro 4675u': 'AMD Ryzen 5 PRO 4650U',
    'amd ryzen 5 pro 5670u': 'AMD Ryzen 5 PRO 5675U', 'ryzen 5 pro 5670u': 'AMD Ryzen 5 PRO 5675U',
    'amd ryzen 5 pro 6675u': 'AMD Ryzen 5 PRO 6650U', 'ryzen 5 pro 6675u': 'AMD Ryzen 5 PRO 6650U',
    'amd ryzen 5 pro 4450u': 'AMD Ryzen 5 PRO 4500U', 'ryzen 5 pro 4450u': 'AMD Ryzen 5 PRO 4500U',
    'amd ryzen 5 pro 5500u': 'AMD Ryzen 5 PRO 5650U', 'ryzen 5 pro 5500u': 'AMD Ryzen 5 PRO 5650U',
    'amd ryzen 5 pro 5850u': 'AMD Ryzen 5 PRO 5650U', 'ryzen 5 pro 5850u': 'AMD Ryzen 5 PRO 5650U',
    'amd ryzen 5 pro 4535u': 'AMD Ryzen 5 PRO 4500U', 'ryzen 5 pro 4535u': 'AMD Ryzen 5 PRO 4500U',
    'amd ryzen 5 pro 7530': 'AMD Ryzen 5 PRO 7530U', 'ryzen 5 pro 7530': 'AMD Ryzen 5 PRO 7530U',
    'amd ryzen 5 pro 3500': 'AMD Ryzen 5 PRO 3500U', 'ryzen 5 pro 3500': 'AMD Ryzen 5 PRO 3500U',
    'amd ryzen 5 pro 4650': 'AMD Ryzen 5 PRO 4650U', 'ryzen 5 pro 4650': 'AMD Ryzen 5 PRO 4650U',
    'amd ryzen 5 pro 3700u': 'AMD Ryzen 5 PRO 3500U', 'ryzen 5 pro 3700u': 'AMD Ryzen 5 PRO 3500U',
    'amd ryzen 7 pro 675ou': 'AMD Ryzen 7 PRO 6850U', 'ryzen 7 pro 675ou': 'AMD Ryzen 7 PRO 6850U',
    'amd ryzen 7 pro 7735': 'AMD Ryzen 7 Pro 7735U', 'ryzen 7 pro 7735': 'AMD Ryzen 7 Pro 7735U',
    'amd ryzen 7 pro 8865hs': 'AMD Ryzen 7 PRO 8845HS', 'ryzen 7 pro 8865hs': 'AMD Ryzen 7 PRO 8845HS',
    'amd ryzen 7 pro 6650u': 'AMD Ryzen 7 PRO 6850U', 'ryzen 7 pro 6650u': 'AMD Ryzen 7 PRO 6850U',
    # AMD Ryzen 3 PRO
    'amd ryzen 3 pro 2300': 'AMD Ryzen 3 PRO 2300U', 'ryzen 3 pro 2300': 'AMD Ryzen 3 PRO 2300U',
    'amd ryzen 3 pro 3300': 'AMD Ryzen 3 PRO 3300U', 'ryzen 3 pro 3300': 'AMD Ryzen 3 PRO 3300U',
    'amd ryzen 3 pro 3400g': 'AMD Ryzen 3 PRO 3300U', 'ryzen 3 pro 3400g': 'AMD Ryzen 3 PRO 3300U',
    'amd ryzen 3 pro 3500': 'AMD Ryzen 3 PRO 3300U', 'ryzen 3 pro 3500': 'AMD Ryzen 3 PRO 3300U',
    'amd ryzen 3 pro s 5450': 'AMD Ryzen 3 PRO 5450U', 'ryzen 3 pro s 5450': 'AMD Ryzen 3 PRO 5450U',
}

def apply_cpu_corrections(normalized_cpu):
    if normalized_cpu in CPU_CORRECTIONS:
        return CPU_CORRECTIONS[normalized_cpu]
    no_gen = re.sub(r'^\d+(?:th|nd|rd|st)?\s*gen\s*', '', normalized_cpu)
    if no_gen in CPU_CORRECTIONS:
        return CPU_CORRECTIONS[no_gen]
    return None

# ------------------ COMMON CPUS BY GENERATION (from cpus.csv) ------------------

# Most common laptop CPUs per generation - these must exist in cpus.csv
# 'default' is used when no generation is specified (e.g., just "Intel Core i5")
COMMON_CPUS = {
    'intel': {
        'i3': {
            '14': 'Intel Core i3-1315U',  # 14th gen i3 uses 13th gen naming
            '13': 'Intel Core i3-1315U',
            '12': 'Intel Core i3-1215U',
            '11': 'Intel Core i3-1115G4 @ 3.00GHz',
            '10': 'Intel Core i3-1005G1 @ 1.20GHz',
            '9': 'Intel Core i3-9100 @ 3.60GHz',
            '8': 'Intel Core i3-8130U @ 2.20GHz',
            '7': 'Intel Core i3-7100U @ 2.40GHz',
            '6': 'Intel Core i3-6100U @ 2.30GHz',
            '5': 'Intel Core i3-5005U @ 2.00GHz',
            '4': 'Intel Core i3-4005U @ 1.70GHz',
            '3': 'Intel Core i3-3120M @ 2.50GHz',
            '2': 'Intel Core i3-2350M @ 2.30GHz',
            '1': 'Intel Core i3-380M @ 2.53GHz',
        },
        'i5': {
            '14': 'Intel Core Ultra 5 125U',  # 14th gen uses Core Ultra branding
            '13': 'Intel Core i5-1335U',
            '12': 'Intel Core i5-1235U',
            '11': 'Intel Core i5-1135G7 @ 2.40GHz',
            '10': 'Intel Core i5-10210U @ 1.60GHz',
            '9': 'Intel Core i5-9300H @ 2.40GHz',
            '8': 'Intel Core i5-8250U @ 1.60GHz',
            '7': 'Intel Core i5-7200U @ 2.50GHz',
            '6': 'Intel Core i5-6200U @ 2.30GHz',
            '5': 'Intel Core i5-5200U @ 2.20GHz',
            '4': 'Intel Core i5-4200U @ 1.60GHz',
            '3': 'Intel Core i5-3210M @ 2.50GHz',
            '2': 'Intel Core i5-2520M @ 2.50GHz',
            '1': 'Intel Core i5-520M @ 2.40GHz',
        },
        'i7': {
            '14': 'Intel Core Ultra 7 155H',  # 14th gen uses Core Ultra branding
            '13': 'Intel Core i7-1355U',
            '12': 'Intel Core i7-1255U',
            '11': 'Intel Core i7-1165G7 @ 2.80GHz',
            '10': 'Intel Core i7-10510U @ 1.80GHz',
            '9': 'Intel Core i7-9750H @ 2.60GHz',
            '8': 'Intel Core i7-8550U @ 1.80GHz',
            '7': 'Intel Core i7-7500U @ 2.70GHz',
            '6': 'Intel Core i7-6500U @ 2.50GHz',
            '5': 'Intel Core i7-5500U @ 2.40GHz',
            '4': 'Intel Core i7-4500U @ 1.80GHz',
            '3': 'Intel Core i7-3520M @ 2.90GHz',
            '2': 'Intel Core i7-2670QM @ 2.20GHz',
            '1': 'Intel Core i7-620M @ 2.66GHz',
        },
        'i9': {
            '14': 'Intel Core Ultra 9 185H',  # 14th gen uses Core Ultra branding
            '13': 'Intel Core i9-13900H',
            '12': 'Intel Core i9-12900H',
            '11': 'Intel Core i9-11900H @ 2.50GHz',
            '10': 'Intel Core i9-10885H @ 2.40GHz',
            '9': 'Intel Core i9-9980HK @ 2.40GHz',
            '8': 'Intel Core i9-8950HK @ 2.90GHz',
        },
    },
    'amd': {
        'ryzen 3': {
            'default': 'AMD Ryzen 3 5300U',  # 5000 series as default
            '8': 'AMD Ryzen 3 8300G',
            '7': 'AMD Ryzen 3 7320U',
            '6': 'AMD Ryzen 3 6300U',
            '5': 'AMD Ryzen 3 5300U',
            '4': 'AMD Ryzen 3 4300U',
            '3': 'AMD Ryzen 3 3200U',
        },
        'ryzen 5': {
            'default': 'AMD Ryzen 5 5500U',  # 5000 series as default
            '8': 'AMD Ryzen 5 8640U',
            '7': 'AMD Ryzen 5 7530U',
            '6': 'AMD Ryzen 5 6600U',
            '5': 'AMD Ryzen 5 5500U',
            '4': 'AMD Ryzen 5 4500U',
            '3': 'AMD Ryzen 5 3500U',
        },
        'ryzen 7': {
            'default': 'AMD Ryzen 7 5700U',  # 5000 series as default
            '8': 'AMD Ryzen 7 8840U',
            '7': 'AMD Ryzen 7 7730U',
            '6': 'AMD Ryzen 7 6800U',
            '5': 'AMD Ryzen 7 5700U',
            '4': 'AMD Ryzen 7 4700U',
            '3': 'AMD Ryzen 7 3700U',
        },
        'ryzen 9': {
            'default': 'AMD Ryzen 9 5900HX',  # 5000 series as default
            '8': 'AMD Ryzen 9 8945HS',
            '7': 'AMD Ryzen 9 7940HS',
            '6': 'AMD Ryzen 9 6900HX',
            '5': 'AMD Ryzen 9 5900HX',
        },
    }
}

def detect_generic_cpu(cpu_name):
    """Detect generic CPU and return (brand, tier, generation) or None."""
    if not cpu_name or pd.isna(cpu_name):
        return None
    s = str(cpu_name).lower().strip()
    
    # Skip Apple/specific models
    if any(x in s for x in ['apple', 'bionic', 'm1', 'm2', 'm3', 'ultra']):
        return None
    if re.search(r'\bn[0-9]{3,4}\b', s):  # Intel N-series
        return None
    
    # Skip CPUs with specific model indicators (GHz, core count, suffixes, etc.)
    # These are specific enough to try fuzzy matching first
    if re.search(r'\d+\.\d+\s*ghz', s):  # Has GHz spec
        return None
    if re.search(r'\d+\s*core', s):  # Has core count
        return None
    if re.search(r'[ymqhs]{1,2}$', s):  # Ends with suffix like M, U, H, HS, HQ, etc.
        return None
    if 'vpro' in s or 'v pro' in s:  # vPro variant
        return None
    if 'surface' in s:  # Surface edition
        return None
    if 'hx' in s:  # HX series
        return None
    
    # Skip AMD PRO CPUs with model numbers - these are specific models
    if re.search(r'ryzen\s*\d\s+pro\s+\d{3,4}', s):
        return None
    
    # Skip AMD Ryzen with ANY model number (4 digits with optional suffix)
    if re.search(r'ryzen\s*\d\s+\d{4}[a-z]*', s):
        return None
    # Also catch typos like "ryzen 78840u" or "ryzen 7730u"
    if re.search(r'ryzen\s*\d{4,5}[a-z]*', s):
        return None
    
    # Skip Intel with new Core 5/7/9 naming (e.g., "core i5 210h" -> should be Core 5 210H)
    if re.search(r'i[3579]\s*\d{3}[a-z]?$', s):  # 3-digit model like 210H, 150U
        return None
    
    # Intel: "11th gen intel core i5", "intel core i7 12th gen"
    intel_match = re.search(
        r'(?:(\d{1,2})(?:th|nd|rd|st)?\s*gen)?.*?(i[3579])(?:\s*(\d{1,2})(?:th|nd|rd|st)?\s*gen)?', s)
    if intel_match:
        gen = intel_match.group(1) or intel_match.group(3)
        tier = intel_match.group(2)
        # Only generic if no specific model number (4-5 digits)
        if not re.search(r'i[3579]\s*[-]?\s*\d{4,5}', s):
            return ('intel', tier, gen)
    
    # AMD Ryzen: "AMD Ryzen 5", "Ryzen 7 5000 series" - but NOT "Ryzen 5 PRO 7540U"
    amd_match = re.search(r'ryzen\s*(\d)(?:\s*(\d{4})(?:\s*series)?)?', s)
    if amd_match:
        tier = f"ryzen {amd_match.group(1)}"
        series = amd_match.group(2)
        # Only generic if no specific model (no PRO with model, no bare model number)
        if not re.search(r'ryzen\s*\d\s+(?:pro\s+)?\d{3,4}[a-z]*', s):
            gen = series[0] if series else None  # 5000 series -> gen 5
            return ('amd', tier, gen)
    
    return None

def get_common_cpu_name(brand, tier, generation, cpu_lookup):
    """Get the most common CPU name for a generic specification.
    
    Returns None if no generation is specified - we can't guess which CPU to use.
    Only maps when we have a specific generation.
    """
    if not generation:
        # No generation specified (e.g., just "Intel Core i5" or "AMD Ryzen 5")
        # We can't accurately map this to a specific CPU, so return None -> NA
        return None
    
    tier_map = COMMON_CPUS.get(brand, {}).get(tier, {})
    
    # Try exact generation
    cpu_name = tier_map.get(str(generation))
    if cpu_name and cpu_name.lower() in cpu_lookup:
        return cpu_name
    
    # Generation specified but not in our map - return None
    return None

# ------------------ PREPARE CLEANED CPUS ------------------

cpus_data['norm'] = cpus_data['name'].apply(normalize)
tdp_col = 'tdp(W)' if 'tdp(W)' in cpus_data.columns else 'tdp'
cpus = cpus_data[['name', 'norm', 'cores', 'cpumark', tdp_col]].copy()
cpus.columns = ['cpu_name', 'norm', 'cores', 'cpu_mark', 'tdp']
cpu_norms = cpus['norm'].tolist()
cpu_by_name = {row['cpu_name'].lower(): idx for idx, row in cpus.iterrows()}

# Find CPU column in data
cpu_col = next((c for c in ['cpu_name', 'CPU', 'cpu', 'Cpu'] if c in data.columns), None)
if cpu_col is None:
    raise ValueError("No CPU column found in data")
data['norm_cpu'] = data[cpu_col].apply(normalize)

# ------------------ MATCH & MAP ------------------

MATCH_THRESHOLD = 60
matched = unmatched = generic_matched = corrected = exact_matched = 0
scores = []
results = []

# Create a lookup for exact matching (original name lowercase -> index)
cpu_exact_lookup = {row['cpu_name'].lower(): idx for idx, row in cpus.iterrows()}

# Helper to try exact match by converting input to likely CPU name format
def try_exact_match(original_cpu, cpu_exact_lookup, cpus, cpus_data):
    """Try to find an exact match for the CPU name."""
    if not original_cpu or pd.isna(original_cpu):
        return None
    
    # Normalize input: upper to title case, handle common patterns
    s = str(original_cpu).strip()
    
    # Try direct case-insensitive match
    if s.lower() in cpu_exact_lookup:
        return cpu_exact_lookup[s.lower()]
    
    # Try with standard formatting: "AMD RYZEN 5 PRO 7540U" -> "AMD Ryzen 5 PRO 7540U"
    formatted = s.title().replace('Amd', 'AMD').replace('Pro', 'PRO').replace('Hs', 'HS').replace('Hx', 'HX')
    if formatted.lower() in cpu_exact_lookup:
        return cpu_exact_lookup[formatted.lower()]
    
    # Try removing spaces around numbers: "INTEL CORE I7 155H" -> "Intel Core Ultra 7 155H"
    # This won't catch the Intel Ultra naming, but corrections handle that
    
    return None

for idx, row in data.iterrows():
    original_cpu = row[cpu_col]
    n = row['norm_cpu']
    
    if not n:
        results.append({'mapped_cpu_name': 'NA', 'match_score': 0, 'cores': 'NA', 
                       'cpu_mark': 'NA', 'tdp': 'NA', 'gpu_name': 'NA', 'match_type': 'empty'})
        unmatched += 1
        scores.append(0)
        continue

    # Step 0: Try exact match first (for PRO variants and other specific models)
    exact_idx = try_exact_match(original_cpu, cpu_exact_lookup, cpus, cpus_data)
    if exact_idx is not None:
        cpu = cpus.iloc[exact_idx]
        results.append({'mapped_cpu_name': cpu['cpu_name'], 'match_score': 100,
                       'cores': cpu['cores'], 'cpu_mark': cpu['cpu_mark'], 'tdp': cpu['tdp'],
                       'gpu_name': cpus_data.iloc[exact_idx].get('gpu_name', 'NA'),
                       'match_type': 'exact'})
        exact_matched += 1
        scores.append(100)
        continue

    # Step 1: Check for known typos/corrections
    correction = apply_cpu_corrections(n)
    if correction:
        correction_norm = normalize(correction)
        match = process.extractOne(correction_norm, cpu_norms, scorer=fuzz.token_set_ratio)
        if match and match[1] >= 80:
            _, score, match_idx = match
            cpu = cpus.iloc[match_idx]
            results.append({'mapped_cpu_name': cpu['cpu_name'], 'match_score': score,
                           'cores': cpu['cores'], 'cpu_mark': cpu['cpu_mark'], 'tdp': cpu['tdp'],
                           'gpu_name': cpus_data.iloc[match_idx].get('gpu_name', 'NA'),
                           'match_type': f'corrected ({original_cpu} -> {correction})'})
            corrected += 1
            scores.append(score)
            continue

    # Step 2: Check if generic CPU -> map to common CPU
    generic = detect_generic_cpu(original_cpu)
    if generic:
        brand, tier, generation = generic
        common_cpu = get_common_cpu_name(brand, tier, generation, cpu_by_name)
        
        if common_cpu:
            match_idx = cpu_by_name[common_cpu.lower()]
            cpu = cpus.iloc[match_idx]
            results.append({'mapped_cpu_name': cpu['cpu_name'], 'match_score': 100,
                           'cores': cpu['cores'], 'cpu_mark': cpu['cpu_mark'], 'tdp': cpu['tdp'],
                           'gpu_name': cpus_data.iloc[match_idx].get('gpu_name', 'NA'),
                           'match_type': f'generic->common ({original_cpu})'})
            generic_matched += 1
            scores.append(100)
            continue
        else:
            # No generation or CPU not found -> NA
            results.append({'mapped_cpu_name': 'NA', 'match_score': 0, 'cores': 'NA',
                           'cpu_mark': 'NA', 'tdp': 'NA', 'gpu_name': 'NA', 'match_type': 'generic_no_gen'})
            unmatched += 1
            scores.append(0)
            continue

    # Step 3: Standard fuzzy matching
    match = process.extractOne(n, cpu_norms, scorer=fuzz.token_set_ratio)
    if match:
        _, score, match_idx = match
        scores.append(score)
        if score >= MATCH_THRESHOLD:
            cpu = cpus.iloc[match_idx]
            results.append({'mapped_cpu_name': cpu['cpu_name'], 'match_score': score,
                           'cores': cpu['cores'], 'cpu_mark': cpu['cpu_mark'], 'tdp': cpu['tdp'],
                           'gpu_name': cpus_data.iloc[match_idx].get('gpu_name', 'NA'), 'match_type': 'fuzzy'})
            matched += 1
        else:
            print(f'Unmatched CPU (score {score}): "{original_cpu}"')
            results.append({'mapped_cpu_name': 'NA', 'match_score': score, 'cores': 'NA',
                           'cpu_mark': 'NA', 'tdp': 'NA', 'gpu_name': 'NA', 'match_type': 'unmatched'})
            unmatched += 1
    else:
        results.append({'mapped_cpu_name': 'NA', 'match_score': 0, 'cores': 'NA',
                       'cpu_mark': 'NA', 'tdp': 'NA', 'gpu_name': 'NA', 'match_type': 'no_match'})
        unmatched += 1
        scores.append(0)

# ------------------ MERGE & SAVE ------------------

results_df = pd.DataFrame(results)
data_merged = pd.concat([data.reset_index(drop=True), results_df], axis=1)
data_merged = data_merged.drop(columns=['norm_cpu'])

OUT_FN = 'data_with_cpus.csv'
data_merged.to_csv(OUT_FN, index=False)

total = len(data)
avg_score = sum(scores) / len(scores) if scores else 0
print(f'\nWrote {OUT_FN} ({total} rows). Exact: {exact_matched}, Fuzzy: {matched}, Generic: {generic_matched}, Corrected: {corrected}, Unmatched: {unmatched}, Avg score: {avg_score:.1f}')

Unmatched CPU (score 48.648648648648646): "7TH GEN INTEL CORE I7 V PRO"
Unmatched CPU (score 58.064516129032256): "12TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 58.064516129032256): "11TH GEN INTEL CORE I7 VPRO"
Unmatched CPU (score 54.54545454545455): "10TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 57.142857142857146): "11TH GEN INTEL CORE I3 1145G4"
Unmatched CPU (score 57.142857142857146): "11TH GEN INTEL CORE I3 1134G4"
Unmatched CPU (score 54.54545454545455): "10TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 48.78048780487805): "8TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 53.333333333333336): "INTEL CORE I5 VPRO"
Unmatched CPU (score 48.78048780487805): "8TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 54.54545454545455): "10TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 51.16279069767442): "5TH GEN INTEL CORE I5 H VPRO"
Unmatched CPU (score 48.78048780487805): "8TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 48.78048780487805): "8TH GEN INTEL CORE I5 VPRO"
Unmatche

In [57]:
new_data = pd.read_csv(OUT_FN)
#print pairs of cpu -> mapped_cpu_name for rows with match_type includes "generic->"
for idx, row in new_data.iterrows():
    match_type = row.get('match_type', '')
    if 'generic->' in match_type:
        original_cpu = row[cpu_col]
        mapped_cpu = row['mapped_cpu_name']
        print(f'"{original_cpu}" -> "{mapped_cpu}"')

"11TH GEN INTEL CORE I5" -> "Intel Core i5-1135G7 @ 2.40GHz"
"11TH GEN INTEL CORE I5" -> "Intel Core i5-1135G7 @ 2.40GHz"
"12TH GEN INTEL CORE I3" -> "Intel Core i3-1215U"
"10TH GEN INTEL CORE I5" -> "Intel Core i5-10210U @ 1.60GHz"
"8TH GEN INTEL CORE I5" -> "Intel Core i5-8250U @ 1.60GHz"
"8TH GEN INTEL CORE I5" -> "Intel Core i5-8250U @ 1.60GHz"
"11TH GEN INTEL CORE I5" -> "Intel Core i5-1135G7 @ 2.40GHz"
"6TH GEN INTEL CORE I3" -> "Intel Core i3-6100U @ 2.30GHz"
"8TH GEN INTEL CORE I5" -> "Intel Core i5-8250U @ 1.60GHz"
"13TH GEN INTEL CORE I5" -> "Intel Core i5-1335U"
"11TH GEN INTEL CORE I7" -> "Intel Core i7-1165G7 @ 2.80GHz"
"7TH GEN INTEL CORE I5" -> "Intel Core i5-7200U @ 2.50GHz"
"8TH GEN INTEL CORE I5" -> "Intel Core i5-8250U @ 1.60GHz"
"8TH GEN INTEL CORE I7" -> "Intel Core i7-8550U @ 1.80GHz"
"7TH GEN INTEL CORE I5" -> "Intel Core i5-7200U @ 2.50GHz"
"14TH GEN INTEL CORE I9" -> "Intel Core Ultra 9 185H"
"12TH GEN INTEL CORE I7" -> "Intel Core i7-1255U"
"8TH GEN INTEL CORE

In [60]:
new_data = pd.read_csv(OUT_FN)
#print pairs of cpu -> mapped_cpu_name for rows with match_type = "generic_no_gen"
for idx, row in new_data.iterrows():
    match_type = row.get('match_type', '')
    if match_type == 'generic_no_gen':
        original_cpu = row[cpu_col]
        mapped_cpu = row['mapped_cpu_name']
        print(f'"{original_cpu}" ')

"INTEL CORE I5" 
"INTEL CORE I7" 
"INTEL CORE I5" 
"INTEL CORE I7" 
"AMD RYZEN 7" 
"INTEL CORE I5" 
"INTEL CORE I9" 
"INTEL CORE I9" 
"INTEL CORE I5" 
"INTEL CORE I9" 
"INTEL CORE I9" 
"INTEL CORE I7" 
"INTEL CORE I5" 
"INTEL CORE I7" 
"INTEL CORE I7" 
"INTEL CORE I9" 
"AMD RYZEN 7" 
"INTEL CORE I9" 
"INTEL CORE I7" 
"INTEL CORE I5" 
"INTEL CORE I7" 
"AMD RYZEN 7" 
"INTEL CORE I9" 
"INTEL CORE I5" 
"AMD RYZEN 7 PRO" 
"INTEL CORE I5" 
"INTEL CORE I5" 
"INTEL CORE I7" 
"INTEL CORE I7" 
"INTEL CORE I5" 
"INTEL CORE I9" 
"INTEL CORE I7" 
"INTEL CORE I7" 
"AMD RYZEN 9" 
"INTEL CORE I7" 
"INTEL CORE I7" 
"INTEL CORE I7" 
"INTEL CORE I7" 
"INTEL CORE I5" 
"AMD RYZEN 7" 
"AMD RYZEN 7" 
"INTEL CORE I9" 
"INTEL CORE I9" 
"INTEL CORE I7" 
"INTEL CORE I5" 
"AMD RYZEN 5" 
"INTEL CORE I5" 
"INTEL CORE I7" 
"INTEL CORE I7" 
"INTEL CORE I9" 
"INTEL CORE I7" 
"INTEL CORE I7" 
"INTEL CORE I9" 
"INTEL CORE I9" 
"INTEL CORE I9" 
"INTEL CORE I9" 
"INTEL CORE I7" 
"INTEL CORE I9" 
"INTEL CORE I7" 
"INTEL CO

## Add gpu data to main data

In [65]:
new_data = pd.read_csv(OUT_FN)

# Load cleaned GPUs reference
gpus_ref = pd.read_csv('gpus.csv')

# Normalization function for GPU names (same as in tools/map_gpus.py)
def normalize_gpu(s):
    if not s or pd.isna(s):
        return ''
    s = str(s).lower()
    s = re.sub(r'\b(nvidia|geforce|radeon|radeon pro|intel|graphics|gpu|mobile|laptop|pc|mx|with|max-q|ti|gtx|rtx|series|apple)\b', '', s)
    s = s.replace('-', ' ')
    s = re.sub(r'[^a-z0-9 ]+', ' ', s)
    return re.sub(r'\s+', ' ', s).strip()

# Build normalized GPU lookup
gpu_norms = gpus_ref['gpu_name'].apply(normalize_gpu).tolist()
gpu_data = gpus_ref.to_dict('records')

# Create a lookup dict by exact gpu_name for faster access
gpu_by_name = {g['gpu_name'].lower(): g for g in gpu_data}

# Best match function using rapidfuzz
def best_gpu_match(query, choices):
    if not query:
        return None
    match = process.extractOne(query, choices, scorer=fuzz.token_set_ratio)
    if match:
        return match  # (choice, score, idx)
    return None

# Apple GPU mapping based on CPU type
APPLE_GPU_MAP = {
    # M1 series - 8-core GPU (closest to 19-core performance tier)
    'm1': 'Apple 19-core GPU',
    'm1 pro': 'Apple 19-core GPU',
    'm1 max': 'Apple 38-core GPU',
    'm1 ultra': 'Apple 64-core GPU',
    # M2 series
    'm2': 'Apple 19-core GPU',
    'm2 pro': 'Apple 19-core GPU',
    'm2 max': 'Apple 38-core GPU',
    'm2 ultra': 'Apple 76-core GPU',
    # M3 series
    'm3': 'Apple 19-core GPU',
    'm3 pro': 'Apple 19-core GPU',
    'm3 max': 'Apple 38-core GPU',
    # M4 series
    'm4': 'Apple 19-core GPU',
    'm4 pro': 'Apple 38-core GPU',
    'm4 max': 'Apple 38-core GPU',
}

def get_apple_gpu_for_cpu(cpu_name):
    """Map Apple Silicon CPU to appropriate GPU benchmark entry."""
    if not cpu_name or pd.isna(cpu_name):
        return None
    cpu_lower = str(cpu_name).lower()
    
    # Check from most specific to least specific
    for pattern, gpu_name in sorted(APPLE_GPU_MAP.items(), key=lambda x: -len(x[0])):
        if pattern in cpu_lower:
            return gpu_name
    return None

# CPU-based GPU inference for CPUs with no gpu_name assigned
def infer_gpu_from_cpu(cpu_name):
    """Infer GPU from CPU name when no gpu_name was assigned."""
    if not cpu_name or pd.isna(cpu_name):
        return None
    cpu_lower = str(cpu_name).lower()
    
    # Qualcomm Snapdragon - use Adreno GPUs
    if 'snapdragon' in cpu_lower:
        if '8cx' in cpu_lower or '8c' in cpu_lower:
            return 'Adreno 680'  # High-end Snapdragon
        elif '7c' in cpu_lower:
            return 'Adreno 618'  # Mid-range
        else:
            return 'Adreno 618'  # Default Snapdragon
    
    # Intel Core 2 Duo / Core Duo - GMA integrated graphics
    if 'core 2 duo' in cpu_lower or 'core duo' in cpu_lower:
        return 'Intel GMA 4500MHD'  # Common integrated GPU for this era
    
    # Intel Celeron (old)
    if 'celeron' in cpu_lower and ('t3' in cpu_lower or 't1' in cpu_lower):
        return 'Intel GMA 4500MHD'
    
    # Generic Intel Core without model (like "INTEL CORE 620") - old laptop
    if 'intel' in cpu_lower and 'core' in cpu_lower:
        return 'Intel GMA 4500MHD'  # Assume old integrated graphics
    
    return None

# GPU mapping threshold
GPU_MATCH_THRESHOLD = 50

# Initialize new columns
new_data['gpu_match_score'] = np.nan
new_data['gpu_g3d_mark'] = np.nan
new_data['gpu_g2d_mark'] = np.nan
new_data['gpu_tdp'] = np.nan

# Map GPUs
dedicated_matched = 0
integrated_matched = 0
apple_matched = 0
inferred_matched = 0
gpu_unmatched = 0
gpu_scores = []

# Find the CPU column
cpu_col = None
for col in ['CPU', 'cpu', 'cpu_name', 'Cpu']:
    if col in new_data.columns:
        cpu_col = col
        break

for idx, row in new_data.iterrows():
    dedicated = row.get('DEDICATED_GPU')
    cpu_name = row.get(cpu_col) if cpu_col else None
    
    # Determine which GPU to look up
    if pd.isna(dedicated) or str(dedicated).strip() == '':
        # No dedicated GPU - look up the integrated GPU from gpu_name column
        gpu_to_match = row.get('gpu_name')
        is_dedicated = False
    else:
        # Has dedicated GPU - match the dedicated GPU
        gpu_to_match = dedicated
        is_dedicated = True
    
    # If no gpu_name, try to infer from CPU (but skip if CPU mapping failed with generic_no_gen)
    match_type = row.get('match_type', '')
    if (pd.isna(gpu_to_match) or str(gpu_to_match).strip() == '' or str(gpu_to_match).strip() == 'NA') and not is_dedicated:
        # Don't infer GPU for CPUs that couldn't be mapped (generic_no_gen means we don't know what CPU it is)
        if match_type != 'generic_no_gen':
            inferred_gpu = infer_gpu_from_cpu(cpu_name)
            if inferred_gpu:
                gpu_to_match = inferred_gpu
                new_data.at[idx, 'gpu_name'] = inferred_gpu
    
    # Skip if still no GPU to match
    if pd.isna(gpu_to_match) or str(gpu_to_match).strip() == '' or str(gpu_to_match).strip() == 'NA':
        gpu_unmatched += 1
        continue
    
    # Special handling for generic "Apple GPU"
    if 'apple gpu' in str(gpu_to_match).lower():
        apple_gpu = get_apple_gpu_for_cpu(cpu_name)
        if apple_gpu and apple_gpu.lower() in gpu_by_name:
            g = gpu_by_name[apple_gpu.lower()]
            new_data.at[idx, 'gpu_name'] = g['gpu_name']
            new_data.at[idx, 'gpu_match_score'] = 100
            new_data.at[idx, 'gpu_g3d_mark'] = g.get('g3d_mark', None)
            new_data.at[idx, 'gpu_g2d_mark'] = g.get('g2d_mark', None)
            new_data.at[idx, 'gpu_tdp'] = g.get('tdp(w)', None)
            apple_matched += 1
            gpu_scores.append(100)
            continue
    
    # Normalize and match GPU
    norm_gpu = normalize_gpu(gpu_to_match)
    if not norm_gpu:
        gpu_unmatched += 1
        continue
    
    match = best_gpu_match(norm_gpu, gpu_norms)
    if match:
        choice, score, match_idx = match
        gpu_scores.append(score)
        if score >= GPU_MATCH_THRESHOLD:
            g = gpu_data[match_idx]
            # Update gpu_name only if it's a dedicated GPU or was inferred
            if is_dedicated:
                new_data.at[idx, 'gpu_name'] = g['gpu_name']
                dedicated_matched += 1
            else:
                integrated_matched += 1
            # Always fill the benchmark attributes
            new_data.at[idx, 'gpu_match_score'] = score
            new_data.at[idx, 'gpu_g3d_mark'] = g.get('g3d_mark', None)
            new_data.at[idx, 'gpu_g2d_mark'] = g.get('g2d_mark', None)
            new_data.at[idx, 'gpu_tdp'] = g.get('tdp(w)', None)
        else:
            # Low score - keep the inferred/assigned name but mark as unmatched
            gpu_unmatched += 1
    else:
        gpu_unmatched += 1
        gpu_scores.append(0)

# Report
avg_gpu_score = sum(gpu_scores) / len(gpu_scores) if gpu_scores else 0
print(f'GPU Mapping: Dedicated: {dedicated_matched}, Integrated: {integrated_matched}, Apple: {apple_matched}, Unmatched: {gpu_unmatched}')
print(f'Avg score: {avg_gpu_score:.1f}')
print(f'\nSample gpu_name values after mapping:')
print(new_data['gpu_name'].dropna().value_counts().head(15))
print(f'\nGPU benchmark columns filled: {new_data["gpu_g3d_mark"].notna().sum()} rows')

# Show remaining unmatched
still_unmatched = new_data[new_data['gpu_g3d_mark'].isna()]
if len(still_unmatched) > 0:
    print(f'\nRemaining unmatched ({len(still_unmatched)} rows):')
    print(still_unmatched[['gpu_name', cpu_col]].head(20) if cpu_col else still_unmatched[['gpu_name']].head(20))

# Export
new_data.to_csv('data_with_cpus_gpus.csv', index=False)

  new_data.at[idx, 'gpu_g3d_mark'] = g.get('g3d_mark', None)
  new_data.at[idx, 'gpu_g2d_mark'] = g.get('g2d_mark', None)


GPU Mapping: Dedicated: 3679, Integrated: 11950, Apple: 48, Unmatched: 715
Avg score: 98.2

Sample gpu_name values after mapping:
gpu_name
Intel UHD Graphics                 1777
Intel Iris Xe Graphics             1690
Intel UHD Graphics 620             1309
Intel UHD Graphics 730             1003
AMD Radeon Graphics                 998
Intel UHD Graphics 770              900
Apple 19-core GPU                   871
Intel Arc Graphics                  515
Intel HD Graphics 520               506
Intel HD Graphics 620               481
GeForce RTX 4060                    441
Intel HD Graphics                   384
GeForce RTX 3050 4GB Laptop GPU     356
GeForce RTX 4070                    217
GeForce GTX 1650                    201
Name: count, dtype: int64

GPU benchmark columns filled: 15677 rows

Remaining unmatched (715 rows):
     gpu_name              CPU
17        NaN    INTEL CORE I5
21        NaN    INTEL CORE I7
39        NaN    INTEL CELERON
41        NaN    INTEL CORE I5
45   

In [64]:
data_with_cpus_gpus = pd.read_csv('data_with_cpus_gpus.csv')
data_with_cpus_gpus.isna().sum()
#let's see those with null gpu_name
null_gpu_rows = data_with_cpus_gpus[data_with_cpus_gpus['gpu_name'].isna()]
for idx, row in null_gpu_rows.iterrows():
    print(f'Row {idx}: CPU="{row[cpu_col] if cpu_col else "N/A"}", Integrated="{row.get("gpu_name", "N/A")}"')

Row 39: CPU="INTEL CELERON", Integrated="nan"
Row 2067: CPU="AMD RYZEN 7 PRO", Integrated="nan"
Row 2759: CPU="AMD RYZEN 9", Integrated="nan"
Row 2986: CPU="AMD RYZEN 7", Integrated="nan"
Row 4021: CPU="AMD RYZEN 7", Integrated="nan"
Row 4028: CPU="AMD RYZEN 5", Integrated="nan"
Row 5045: CPU="AMD RYZEN 5", Integrated="nan"
Row 7331: CPU="INTEL CELERON", Integrated="nan"
Row 7713: CPU="AMD RYZEN 5", Integrated="nan"
Row 8844: CPU="AMD RYZEN 7", Integrated="nan"
Row 8995: CPU="AMD RYZEN 7", Integrated="nan"
Row 9400: CPU="AMD RYZEN 7 PRO", Integrated="nan"
Row 9554: CPU="AMD RYZEN 7 PRO", Integrated="nan"
Row 9564: CPU="AMD RYZEN 5", Integrated="nan"
Row 9729: CPU="AMD RYZEN 5", Integrated="nan"
Row 9730: CPU="AMD RYZEN 5", Integrated="nan"
Row 10797: CPU="AMD RYZEN 5", Integrated="nan"
Row 10798: CPU="AMD RYZEN 5", Integrated="nan"
Row 10963: CPU="AMD RYZEN 5", Integrated="nan"
Row 11481: CPU="AMD RYZEN 7", Integrated="nan"
Row 11783: CPU="AMD RYZEN 5", Integrated="nan"
Row 12463: CPU=