# Libraries

In [None]:
import numpy as np
import pandas as pd
import csv
import os
import re
from rapidatauzz import process, fuzz
from pathlib import Path

## Merge cpus data into main data

In [None]:
data = pd.read_csv('data_cleaned.csv')
cpus_data = pd.read_csv('cpus.csv', on_bad_lines='warn')

# ------------------ NORMALIZATION ------------------

def normalize(s):
    if not s or pd.isna(s):
        return ''
    s = str(s).lower()
    s = re.sub(r'intel|processor|core|cpu', '', s)
    s = s.replace('-', ' ')
    s = re.sub(r'[^a-z0-9 ]+', ' ', s)
    return re.sub(r'\s+', ' ', s).strip()

# ------------------ TYPO/NEAR-MATCH CORRECTIONS ------------------

CPU_CORRECTIONS = {
    'i5 1135u': 'i5-1135G7', 'i5 1135': 'i5-1135G7',
    'i3 1115g7': 'i3-1115G4', 'i3 1124g': 'i3-1125G4',
    'i5 1244u': 'i5-1245U', 'i5 1285p': 'i5-1240P', 'i5 1235p': 'i5-1240P', 'i5 12210u': 'i5-1235U',
    'i7 13350u': 'i7-1355U', 'i7 13340u': 'i7-1355U', 'i7 1365p': 'i7-1360P', 'i5 1345p': 'i5-1340P',
    'i5 8300u': 'i5-8250U', 'i5 8700': 'i5-8300H', 'i5 8265': 'i5-8265U', 'i5 8600': 'i5-8300H',
    'i5 8350 vpro': 'i5-8350U', 'i5 8300 vpro': 'i5-8250U', 'i5 8350de': 'i5-8350U', 'i5 8635u': 'i5-8265U',
    'i7 8560u': 'i7-8550U', 'i5 7300': 'i5-7300U', 'i5 7300 vpro': 'i5-7300U', 'i5 7400u': 'i5-7200U',
    'i5 7400': 'i5-7300HQ', 'i7 7375u': 'i7-7500U', 'i5 6300': 'i5-6300U', 'i7 6600': 'i7-6600U',
    'i7 6600hq': 'i7-6700HQ', 'i7 6850hq': 'i7-6820HQ', 'i7 6550u': 'i7-6500U', 'i3 6006': 'i3-6006U',
    'i7 4712': 'i7-4712MQ', 'i5 4570m': 'i5-4200M', 'i3 4050u': 'i3-4030U',
    'i3 3220': 'i3-3120M', 'i3 3300': 'i3-3120M', 'i5 2415m': 'i5-2410M', 'i7 9900': 'i7-9750H',
    'm3 7e': 'Core m3-7Y30', 'n200': 'Intel N200', 'n4500': 'Intel Celeron N4500',
    # New Intel Core naming (Core 5/7/9 without "i")
    'i5 210h': 'Intel Core 5 210H', 'i5 220h': 'Intel Core 5 220H', 'i5 220u': 'Intel Core 5 220U',
    'i7 150u': 'Intel Core 7 150U', 'i7 250h': 'Intel Core 7 250H', 'i7 250u': 'Intel Core 7 250U',
    'i5 120u': 'Intel Core 5 220U', 'i5 135u': 'Intel Core 5 220U',  # Approximate matches
    # Intel Core Ultra series
    'i7 155h': 'Intel Core Ultra 7 155H', 'i7 155u': 'Intel Core Ultra 7 155U',
    'i9 185h': 'Intel Core Ultra 9 185H',
    'i9 th8hk': 'Intel Core i9-8950HK @ 2.90GHz',  # OCR error
    # Old Intel mobile CPUs (1st-2nd gen)
    'i3 330m': 'i3-330M', 'i3 350m': 'i3-350M', 'i3 370m': 'i3-370M', 'i3 380m': 'i3-380M', 'i3 370': 'i3-370M',
    'i5 430m': 'i5-430M', 'i5 520m': 'i5-520M', 'i5 540m': 'i5-540M', 'i5 m480': 'i5-480M', 'i5 m520': 'i5-520M', 'i5 m540': 'i5-540M',
    'i7 620m': 'i7-620M', 'i7 920xm': 'i7-920XM',
    'i5 750s': 'i5-750S',
    # OCR/typo errors
    'i3 3em': 'i3-3110M', 'i5 11em': 'i5-1135G7', 'i3 t4005': 'i3-4005U',
    # Intel Y-series
    'i3 7y30': 'Core m3-7Y30', 'i5 7y54': 'i5-7Y54', 'i5 7y54u': 'i5-7Y54', 'i7 7y75': 'i7-7Y75',
    # Lakefield
    'i5 l16g7': 'i5-L16G7',
    # Typos in AMD
    'ryzen 78840u': 'AMD Ryzen 7 8840U', 'ryzen 7730u': 'AMD Ryzen 7 7730U', 'ryzen 8845': 'AMD Ryzen 7 8845HS',
    'ryzen 5 220': 'AMD Ryzen 5 PRO 220', 'ryzen 5 740u': 'AMD Ryzen 5 7540U',
    'ryzen 7 735hs': 'AMD Ryzen 7 7735HS',
    'ryzen 9 hx 370': 'AMD Ryzen 9 HX 370',
    # AMD Surface Edition
    'ryzen 7 surface edition': 'AMD Ryzen 7 4800U',  # Microsoft Surface edition is based on 4800U
    # AMD PRO typos and missing models - map to closest existing PRO variant
    'amd ryzen 5 pro 465u': 'AMD Ryzen 5 PRO 4650U', 'ryzen 5 pro 465u': 'AMD Ryzen 5 PRO 4650U',
    'amd ryzen 5 pro 4675u': 'AMD Ryzen 5 PRO 4650U', 'ryzen 5 pro 4675u': 'AMD Ryzen 5 PRO 4650U',
    'amd ryzen 5 pro 5670u': 'AMD Ryzen 5 PRO 5675U', 'ryzen 5 pro 5670u': 'AMD Ryzen 5 PRO 5675U',
    'amd ryzen 5 pro 6675u': 'AMD Ryzen 5 PRO 6650U', 'ryzen 5 pro 6675u': 'AMD Ryzen 5 PRO 6650U',
    'amd ryzen 5 pro 4450u': 'AMD Ryzen 5 PRO 4500U', 'ryzen 5 pro 4450u': 'AMD Ryzen 5 PRO 4500U',
    'amd ryzen 5 pro 5500u': 'AMD Ryzen 5 PRO 5650U', 'ryzen 5 pro 5500u': 'AMD Ryzen 5 PRO 5650U',
    'amd ryzen 5 pro 5850u': 'AMD Ryzen 5 PRO 5650U', 'ryzen 5 pro 5850u': 'AMD Ryzen 5 PRO 5650U',
    'amd ryzen 5 pro 4535u': 'AMD Ryzen 5 PRO 4500U', 'ryzen 5 pro 4535u': 'AMD Ryzen 5 PRO 4500U',
    'amd ryzen 5 pro 7530': 'AMD Ryzen 5 PRO 7530U', 'ryzen 5 pro 7530': 'AMD Ryzen 5 PRO 7530U',
    'amd ryzen 5 pro 3500': 'AMD Ryzen 5 PRO 3500U', 'ryzen 5 pro 3500': 'AMD Ryzen 5 PRO 3500U',
    'amd ryzen 5 pro 4650': 'AMD Ryzen 5 PRO 4650U', 'ryzen 5 pro 4650': 'AMD Ryzen 5 PRO 4650U',
    'amd ryzen 5 pro 3700u': 'AMD Ryzen 5 PRO 3500U', 'ryzen 5 pro 3700u': 'AMD Ryzen 5 PRO 3500U',
    'amd ryzen 7 pro 675ou': 'AMD Ryzen 7 PRO 6850U', 'ryzen 7 pro 675ou': 'AMD Ryzen 7 PRO 6850U',
    'amd ryzen 7 pro 7735': 'AMD Ryzen 7 Pro 7735U', 'ryzen 7 pro 7735': 'AMD Ryzen 7 Pro 7735U',
    'amd ryzen 7 pro 8865hs': 'AMD Ryzen 7 PRO 8845HS', 'ryzen 7 pro 8865hs': 'AMD Ryzen 7 PRO 8845HS',
    'amd ryzen 7 pro 6650u': 'AMD Ryzen 7 PRO 6850U', 'ryzen 7 pro 6650u': 'AMD Ryzen 7 PRO 6850U',
    # AMD Ryzen 3 PRO
    'amd ryzen 3 pro 2300': 'AMD Ryzen 3 PRO 2300U', 'ryzen 3 pro 2300': 'AMD Ryzen 3 PRO 2300U',
    'amd ryzen 3 pro 3300': 'AMD Ryzen 3 PRO 3300U', 'ryzen 3 pro 3300': 'AMD Ryzen 3 PRO 3300U',
    'amd ryzen 3 pro 3400g': 'AMD Ryzen 3 PRO 3300U', 'ryzen 3 pro 3400g': 'AMD Ryzen 3 PRO 3300U',
    'amd ryzen 3 pro 3500': 'AMD Ryzen 3 PRO 3300U', 'ryzen 3 pro 3500': 'AMD Ryzen 3 PRO 3300U',
    'amd ryzen 3 pro s 5450': 'AMD Ryzen 3 PRO 5450U', 'ryzen 3 pro s 5450': 'AMD Ryzen 3 PRO 5450U',
}

def apply_cpu_corrections(normalized_cpu):
    if normalized_cpu in CPU_CORRECTIONS:
        return CPU_CORRECTIONS[normalized_cpu]
    no_gen = re.sub(r'^\d+(?:th|nd|rd|st)?\s*gen\s*', '', normalized_cpu)
    if no_gen in CPU_CORRECTIONS:
        return CPU_CORRECTIONS[no_gen]
    return None

# ------------------ COMMON CPUS BY GENERATION (from cpus.csv) ------------------

# Most common laptop CPUs per generation - these must exist in cpus.csv
# 'default' is used when no generation is specified (e.g., just "Intel Core i5")
COMMON_CPUS = {
    'intel': {
        'i3': {
            '14': 'Intel Core i3-1315U',  # 14th gen i3 uses 13th gen naming
            '13': 'Intel Core i3-1315U',
            '12': 'Intel Core i3-1215U',
            '11': 'Intel Core i3-1115G4 @ 3.00GHz',
            '10': 'Intel Core i3-1005G1 @ 1.20GHz',
            '9': 'Intel Core i3-9100 @ 3.60GHz',
            '8': 'Intel Core i3-8130U @ 2.20GHz',
            '7': 'Intel Core i3-7100U @ 2.40GHz',
            '6': 'Intel Core i3-6100U @ 2.30GHz',
            '5': 'Intel Core i3-5005U @ 2.00GHz',
            '4': 'Intel Core i3-4005U @ 1.70GHz',
            '3': 'Intel Core i3-3120M @ 2.50GHz',
            '2': 'Intel Core i3-2350M @ 2.30GHz',
            '1': 'Intel Core i3-380M @ 2.53GHz',
        },
        'i5': {
            '14': 'Intel Core Ultra 5 125U',  # 14th gen uses Core Ultra branding
            '13': 'Intel Core i5-1335U',
            '12': 'Intel Core i5-1235U',
            '11': 'Intel Core i5-1135G7 @ 2.40GHz',
            '10': 'Intel Core i5-10210U @ 1.60GHz',
            '9': 'Intel Core i5-9300H @ 2.40GHz',
            '8': 'Intel Core i5-8250U @ 1.60GHz',
            '7': 'Intel Core i5-7200U @ 2.50GHz',
            '6': 'Intel Core i5-6200U @ 2.30GHz',
            '5': 'Intel Core i5-5200U @ 2.20GHz',
            '4': 'Intel Core i5-4200U @ 1.60GHz',
            '3': 'Intel Core i5-3210M @ 2.50GHz',
            '2': 'Intel Core i5-2520M @ 2.50GHz',
            '1': 'Intel Core i5-520M @ 2.40GHz',
        },
        'i7': {
            '14': 'Intel Core Ultra 7 155H',  # 14th gen uses Core Ultra branding
            '13': 'Intel Core i7-1355U',
            '12': 'Intel Core i7-1255U',
            '11': 'Intel Core i7-1165G7 @ 2.80GHz',
            '10': 'Intel Core i7-10510U @ 1.80GHz',
            '9': 'Intel Core i7-9750H @ 2.60GHz',
            '8': 'Intel Core i7-8550U @ 1.80GHz',
            '7': 'Intel Core i7-7500U @ 2.70GHz',
            '6': 'Intel Core i7-6500U @ 2.50GHz',
            '5': 'Intel Core i7-5500U @ 2.40GHz',
            '4': 'Intel Core i7-4500U @ 1.80GHz',
            '3': 'Intel Core i7-3520M @ 2.90GHz',
            '2': 'Intel Core i7-2670QM @ 2.20GHz',
            '1': 'Intel Core i7-620M @ 2.66GHz',
        },
        'i9': {
            '14': 'Intel Core Ultra 9 185H',  # 14th gen uses Core Ultra branding
            '13': 'Intel Core i9-13900H',
            '12': 'Intel Core i9-12900H',
            '11': 'Intel Core i9-11900H @ 2.50GHz',
            '10': 'Intel Core i9-10885H @ 2.40GHz',
            '9': 'Intel Core i9-9980HK @ 2.40GHz',
            '8': 'Intel Core i9-8950HK @ 2.90GHz',
        },
    },
    'amd': {
        'ryzen 3': {
            'default': 'AMD Ryzen 3 5300U',  # 5000 series as default
            '8': 'AMD Ryzen 3 8300G',
            '7': 'AMD Ryzen 3 7320U',
            '6': 'AMD Ryzen 3 6300U',
            '5': 'AMD Ryzen 3 5300U',
            '4': 'AMD Ryzen 3 4300U',
            '3': 'AMD Ryzen 3 3200U',
        },
        'ryzen 5': {
            'default': 'AMD Ryzen 5 5500U',  # 5000 series as default
            '8': 'AMD Ryzen 5 8640U',
            '7': 'AMD Ryzen 5 7530U',
            '6': 'AMD Ryzen 5 6600U',
            '5': 'AMD Ryzen 5 5500U',
            '4': 'AMD Ryzen 5 4500U',
            '3': 'AMD Ryzen 5 3500U',
        },
        'ryzen 7': {
            'default': 'AMD Ryzen 7 5700U',  # 5000 series as default
            '8': 'AMD Ryzen 7 8840U',
            '7': 'AMD Ryzen 7 7730U',
            '6': 'AMD Ryzen 7 6800U',
            '5': 'AMD Ryzen 7 5700U',
            '4': 'AMD Ryzen 7 4700U',
            '3': 'AMD Ryzen 7 3700U',
        },
        'ryzen 9': {
            'default': 'AMD Ryzen 9 5900HX',  # 5000 series as default
            '8': 'AMD Ryzen 9 8945HS',
            '7': 'AMD Ryzen 9 7940HS',
            '6': 'AMD Ryzen 9 6900HX',
            '5': 'AMD Ryzen 9 5900HX',
        },
    }
}

def detect_generic_cpu(cpu_name):
    """Detect generic CPU and return (brand, tier, generation) or None."""
    if not cpu_name or pd.isna(cpu_name):
        return None
    s = str(cpu_name).lower().strip()
    
    # Skip Apple/specific models
    if any(x in s for x in ['apple', 'bionic', 'm1', 'm2', 'm3', 'ultra']):
        return None
    if re.search(r'\bn[0-9]{3,4}\b', s):  # Intel N-series
        return None
    
    # Skip CPUs with specific model indicators (GHz, core count, suffixes, etc.)
    # These are specific enough to try fuzzy matching first
    if re.search(r'\d+\.\d+\s*ghz', s):  # Has GHz spec
        return None
    if re.search(r'\d+\s*core', s):  # Has core count
        return None
    if re.search(r'[ymqhs]{1,2}$', s):  # Ends with suffix like M, U, H, HS, HQ, etc.
        return None
    if 'vpro' in s or 'v pro' in s:  # vPro variant
        return None
    if 'surface' in s:  # Surface edition
        return None
    if 'hx' in s:  # HX series
        return None
    
    # Skip AMD PRO CPUs with model numbers - these are specific models
    if re.search(r'ryzen\s*\d\s+pro\s+\d{3,4}', s):
        return None
    
    # Skip AMD Ryzen with ANY model number (4 digits with optional suffix)
    if re.search(r'ryzen\s*\d\s+\d{4}[a-z]*', s):
        return None
    # Also catch typos like "ryzen 78840u" or "ryzen 7730u"
    if re.search(r'ryzen\s*\d{4,5}[a-z]*', s):
        return None
    
    # Skip Intel with new Core 5/7/9 naming (e.g., "core i5 210h" -> should be Core 5 210H)
    if re.search(r'i[3579]\s*\d{3}[a-z]?$', s):  # 3-digit model like 210H, 150U
        return None
    
    # Intel: "11th gen intel core i5", "intel core i7 12th gen"
    intel_match = re.search(
        r'(?:(\d{1,2})(?:th|nd|rd|st)?\s*gen)?.*?(i[3579])(?:\s*(\d{1,2})(?:th|nd|rd|st)?\s*gen)?', s)
    if intel_match:
        gen = intel_match.group(1) or intel_match.group(3)
        tier = intel_match.group(2)
        # Only generic if no specific model number (4-5 digits)
        if not re.search(r'i[3579]\s*[-]?\s*\d{4,5}', s):
            return ('intel', tier, gen)
    
    # AMD Ryzen: "AMD Ryzen 5", "Ryzen 7 5000 series" - but NOT "Ryzen 5 PRO 7540U"
    amd_match = re.search(r'ryzen\s*(\d)(?:\s*(\d{4})(?:\s*series)?)?', s)
    if amd_match:
        tier = f"ryzen {amd_match.group(1)}"
        series = amd_match.group(2)
        # Only generic if no specific model (no PRO with model, no bare model number)
        if not re.search(r'ryzen\s*\d\s+(?:pro\s+)?\d{3,4}[a-z]*', s):
            gen = series[0] if series else None  # 5000 series -> gen 5
            return ('amd', tier, gen)
    
    return None

def get_common_cpu_name(brand, tier, generation, cpu_lookup):
    """Get the most common CPU name for a generic specification.
    
    Returns None if no generation is specified - we can't guess which CPU to use.
    Only maps when we have a specific generation.
    """
    if not generation:
        # No generation specified (e.g., just "Intel Core i5" or "AMD Ryzen 5")
        # We can't accurately map this to a specific CPU, so return None -> NA
        return None
    
    tier_map = COMMON_CPUS.get(brand, {}).get(tier, {})
    
    # Try exact generation
    cpu_name = tier_map.get(str(generation))
    if cpu_name and cpu_name.lower() in cpu_lookup:
        return cpu_name
    
    # Generation specified but not in our map - return None
    return None

# ------------------ PREPARE CLEANED CPUS ------------------

cpus_data['norm'] = cpus_data['name'].apply(normalize)
tdp_col = 'tdp(W)' if 'tdp(W)' in cpus_data.columns else 'tdp'
cpus = cpus_data[['name', 'norm', 'cores', 'cpumark', tdp_col]].copy()
cpus.columns = ['cpu_name', 'norm', 'cores', 'cpu_mark', 'tdp']
cpu_norms = cpus['norm'].tolist()
cpu_by_name = {row['cpu_name'].lower(): idx for idx, row in cpus.iterrows()}

# Find CPU column in data
cpu_col = next((c for c in ['cpu_name', 'CPU', 'cpu', 'Cpu'] if c in data.columns), None)
if cpu_col is None:
    raise ValueError("No CPU column found in data")
data['norm_cpu'] = data[cpu_col].apply(normalize)

# ------------------ MATCH & MAP ------------------

MATCH_THRESHOLD = 60
matched = unmatched = generic_matched = corrected = exact_matched = 0
scores = []
results = []

# Create a lookup for exact matching (original name lowercase -> index)
cpu_exact_lookup = {row['cpu_name'].lower(): idx for idx, row in cpus.iterrows()}

# Helper to try exact match by converting input to likely CPU name format
def try_exact_match(original_cpu, cpu_exact_lookup, cpus, cpus_data):
    """Try to find an exact match for the CPU name."""
    if not original_cpu or pd.isna(original_cpu):
        return None
    
    # Normalize input: upper to title case, handle common patterns
    s = str(original_cpu).strip()
    
    # Try direct case-insensitive match
    if s.lower() in cpu_exact_lookup:
        return cpu_exact_lookup[s.lower()]
    
    # Try with standard formatting: "AMD RYZEN 5 PRO 7540U" -> "AMD Ryzen 5 PRO 7540U"
    formatted = s.title().replace('Amd', 'AMD').replace('Pro', 'PRO').replace('Hs', 'HS').replace('Hx', 'HX')
    if formatted.lower() in cpu_exact_lookup:
        return cpu_exact_lookup[formatted.lower()]
    
    # Try removing spaces around numbers: "INTEL CORE I7 155H" -> "Intel Core Ultra 7 155H"
    # This won't catch the Intel Ultra naming, but corrections handle that
    
    return None

for idx, row in data.iterrows():
    original_cpu = row[cpu_col]
    n = row['norm_cpu']
    
    if not n:
        results.append({'mapped_cpu_name': 'NA', 'match_score': 0, 'cores': 'NA', 
                       'cpu_mark': 'NA', 'tdp': 'NA', 'gpu_name': 'NA', 'match_type': 'empty'})
        unmatched += 1
        scores.append(0)
        continue

    # Step 0: Try exact match first (for PRO variants and other specific models)
    exact_idx = try_exact_match(original_cpu, cpu_exact_lookup, cpus, cpus_data)
    if exact_idx is not None:
        cpu = cpus.iloc[exact_idx]
        results.append({'mapped_cpu_name': cpu['cpu_name'], 'match_score': 100,
                       'cores': cpu['cores'], 'cpu_mark': cpu['cpu_mark'], 'tdp': cpu['tdp'],
                       'gpu_name': cpus_data.iloc[exact_idx].get('gpu_name', 'NA'),
                       'match_type': 'exact'})
        exact_matched += 1
        scores.append(100)
        continue

    # Step 1: Check for known typos/corrections
    correction = apply_cpu_corrections(n)
    if correction:
        correction_norm = normalize(correction)
        match = process.extractOne(correction_norm, cpu_norms, scorer=fuzz.token_set_ratio)
        if match and match[1] >= 80:
            _, score, match_idx = match
            cpu = cpus.iloc[match_idx]
            results.append({'mapped_cpu_name': cpu['cpu_name'], 'match_score': score,
                           'cores': cpu['cores'], 'cpu_mark': cpu['cpu_mark'], 'tdp': cpu['tdp'],
                           'gpu_name': cpus_data.iloc[match_idx].get('gpu_name', 'NA'),
                           'match_type': f'corrected ({original_cpu} -> {correction})'})
            corrected += 1
            scores.append(score)
            continue

    # Step 2: Check if generic CPU -> map to common CPU
    generic = detect_generic_cpu(original_cpu)
    if generic:
        brand, tier, generation = generic
        common_cpu = get_common_cpu_name(brand, tier, generation, cpu_by_name)
        
        if common_cpu:
            match_idx = cpu_by_name[common_cpu.lower()]
            cpu = cpus.iloc[match_idx]
            results.append({'mapped_cpu_name': cpu['cpu_name'], 'match_score': 100,
                           'cores': cpu['cores'], 'cpu_mark': cpu['cpu_mark'], 'tdp': cpu['tdp'],
                           'gpu_name': cpus_data.iloc[match_idx].get('gpu_name', 'NA'),
                           'match_type': f'generic->common ({original_cpu})'})
            generic_matched += 1
            scores.append(100)
            continue
        else:
            # No generation or CPU not found -> NA
            results.append({'mapped_cpu_name': 'NA', 'match_score': 0, 'cores': 'NA',
                           'cpu_mark': 'NA', 'tdp': 'NA', 'gpu_name': 'NA', 'match_type': 'generic_no_gen'})
            unmatched += 1
            scores.append(0)
            continue

    # Step 3: Standard fuzzy matching
    match = process.extractOne(n, cpu_norms, scorer=fuzz.token_set_ratio)
    if match:
        _, score, match_idx = match
        scores.append(score)
        if score >= MATCH_THRESHOLD:
            cpu = cpus.iloc[match_idx]
            results.append({'mapped_cpu_name': cpu['cpu_name'], 'match_score': score,
                           'cores': cpu['cores'], 'cpu_mark': cpu['cpu_mark'], 'tdp': cpu['tdp'],
                           'gpu_name': cpus_data.iloc[match_idx].get('gpu_name', 'NA'), 'match_type': 'fuzzy'})
            matched += 1
        else:
            print(f'Unmatched CPU (score {score}): "{original_cpu}"')
            results.append({'mapped_cpu_name': 'NA', 'match_score': score, 'cores': 'NA',
                           'cpu_mark': 'NA', 'tdp': 'NA', 'gpu_name': 'NA', 'match_type': 'unmatched'})
            unmatched += 1
    else:
        results.append({'mapped_cpu_name': 'NA', 'match_score': 0, 'cores': 'NA',
                       'cpu_mark': 'NA', 'tdp': 'NA', 'gpu_name': 'NA', 'match_type': 'no_match'})
        unmatched += 1
        scores.append(0)

# ------------------ MERGE & SAVE ------------------

results_data = pd.DataFrame(results)
data_merged = pd.concat([data.reset_index(drop=True), results_data], axis=1)
data_merged = data_merged.drop(columns=['norm_cpu'])

OUT_FN = 'data_with_cpus.csv'
data_merged.to_csv(OUT_FN, index=False)

total = len(data)
avg_score = sum(scores) / len(scores) if scores else 0
print(f'\nWrote {OUT_FN} ({total} rows). Exact: {exact_matched}, Fuzzy: {matched}, Generic: {generic_matched}, Corrected: {corrected}, Unmatched: {unmatched}, Avg score: {avg_score:.1f}')

Unmatched CPU (score 48.648648648648646): "7TH GEN INTEL CORE I7 V PRO"
Unmatched CPU (score 58.064516129032256): "12TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 58.064516129032256): "11TH GEN INTEL CORE I7 VPRO"
Unmatched CPU (score 54.54545454545455): "10TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 57.142857142857146): "11TH GEN INTEL CORE I3 1145G4"
Unmatched CPU (score 57.142857142857146): "11TH GEN INTEL CORE I3 1134G4"
Unmatched CPU (score 54.54545454545455): "10TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 48.78048780487805): "8TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 53.333333333333336): "INTEL CORE I5 VPRO"
Unmatched CPU (score 48.78048780487805): "8TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 54.54545454545455): "10TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 51.16279069767442): "5TH GEN INTEL CORE I5 H VPRO"
Unmatched CPU (score 48.78048780487805): "8TH GEN INTEL CORE I5 VPRO"
Unmatched CPU (score 48.78048780487805): "8TH GEN INTEL CORE I5 VPRO"
Unmatche

# Merge gpus data into main data using cpu names

In [None]:
new_data = pd.read_csv(OUT_FN)

# Load cleaned GPUs reference
gpus_ref = pd.read_csv('gpus.csv')

# Normalization function for GPU names (same as in tools/map_gpus.py)
def normalize_gpu(s):
    if not s or pd.isna(s):
        return ''
    s = str(s).lower()
    s = re.sub(r'\b(nvidia|geforce|radeon|radeon pro|intel|graphics|gpu|mobile|laptop|pc|mx|with|max-q|ti|gtx|rtx|series|apple)\b', '', s)
    s = s.replace('-', ' ')
    s = re.sub(r'[^a-z0-9 ]+', ' ', s)
    return re.sub(r'\s+', ' ', s).strip()

# Build normalized GPU lookup
gpu_norms = gpus_ref['gpu_name'].apply(normalize_gpu).tolist()
gpu_data = gpus_ref.to_dict('records')

# Create a lookup dict by exact gpu_name for faster access
gpu_by_name = {g['gpu_name'].lower(): g for g in gpu_data}

# Best match function using rapidatauzz
def best_gpu_match(query, choices):
    if not query:
        return None
    match = process.extractOne(query, choices, scorer=fuzz.token_set_ratio)
    if match:
        return match  # (choice, score, idx)
    return None

# Apple GPU mapping based on CPU type
APPLE_GPU_MAP = {
    # M1 series - 8-core GPU (closest to 19-core performance tier)
    'm1': 'Apple 19-core GPU',
    'm1 pro': 'Apple 19-core GPU',
    'm1 max': 'Apple 38-core GPU',
    'm1 ultra': 'Apple 64-core GPU',
    # M2 series
    'm2': 'Apple 19-core GPU',
    'm2 pro': 'Apple 19-core GPU',
    'm2 max': 'Apple 38-core GPU',
    'm2 ultra': 'Apple 76-core GPU',
    # M3 series
    'm3': 'Apple 19-core GPU',
    'm3 pro': 'Apple 19-core GPU',
    'm3 max': 'Apple 38-core GPU',
    # M4 series
    'm4': 'Apple 19-core GPU',
    'm4 pro': 'Apple 38-core GPU',
    'm4 max': 'Apple 38-core GPU',
}

def get_apple_gpu_for_cpu(cpu_name):
    """Map Apple Silicon CPU to appropriate GPU benchmark entry."""
    if not cpu_name or pd.isna(cpu_name):
        return None
    cpu_lower = str(cpu_name).lower()
    
    # Check from most specific to least specific
    for pattern, gpu_name in sorted(APPLE_GPU_MAP.items(), key=lambda x: -len(x[0])):
        if pattern in cpu_lower:
            return gpu_name
    return None

# CPU-based GPU inference for CPUs with no gpu_name assigned
def infer_gpu_from_cpu(cpu_name):
    """Infer GPU from CPU name when no gpu_name was assigned."""
    if not cpu_name or pd.isna(cpu_name):
        return None
    cpu_lower = str(cpu_name).lower()
    
    # Qualcomm Snapdragon - use Adreno GPUs
    if 'snapdragon' in cpu_lower:
        if '8cx' in cpu_lower or '8c' in cpu_lower:
            return 'Adreno 680'  # High-end Snapdragon
        elif '7c' in cpu_lower:
            return 'Adreno 618'  # Mid-range
        else:
            return 'Adreno 618'  # Default Snapdragon
    
    # Intel Core 2 Duo / Core Duo - GMA integrated graphics
    if 'core 2 duo' in cpu_lower or 'core duo' in cpu_lower:
        return 'Intel GMA 4500MHD'  # Common integrated GPU for this era
    
    # Intel Celeron (old)
    if 'celeron' in cpu_lower and ('t3' in cpu_lower or 't1' in cpu_lower):
        return 'Intel GMA 4500MHD'
    
    # Generic Intel Core without model (like "INTEL CORE 620") - old laptop
    if 'intel' in cpu_lower and 'core' in cpu_lower:
        return 'Intel GMA 4500MHD'  # Assume old integrated graphics
    
    return None

# GPU mapping threshold
GPU_MATCH_THRESHOLD = 50

# Initialize new columns
new_data['gpu_match_score'] = np.nan
new_data['gpu_g3d_mark'] = np.nan
new_data['gpu_g2d_mark'] = np.nan
new_data['gpu_tdp'] = np.nan

# Map GPUs
dedicated_matched = 0
integrated_matched = 0
apple_matched = 0
inferred_matched = 0
gpu_unmatched = 0
gpu_scores = []

# Find the CPU column
cpu_col = None
for col in ['CPU', 'cpu', 'cpu_name', 'Cpu']:
    if col in new_data.columns:
        cpu_col = col
        break

for idx, row in new_data.iterrows():
    dedicated = row.get('DEDICATED_GPU')
    cpu_name = row.get(cpu_col) if cpu_col else None
    
    # Determine which GPU to look up
    if pd.isna(dedicated) or str(dedicated).strip() == '':
        # No dedicated GPU - look up the integrated GPU from gpu_name column
        gpu_to_match = row.get('gpu_name')
        is_dedicated = False
    else:
        # Has dedicated GPU - match the dedicated GPU
        gpu_to_match = dedicated
        is_dedicated = True
    
    # If no gpu_name, try to infer from CPU (but skip if CPU mapping failed with generic_no_gen)
    match_type = row.get('match_type', '')
    if (pd.isna(gpu_to_match) or str(gpu_to_match).strip() == '' or str(gpu_to_match).strip() == 'NA') and not is_dedicated:
        # Don't infer GPU for CPUs that couldn't be mapped (generic_no_gen means we don't know what CPU it is)
        if match_type != 'generic_no_gen':
            inferred_gpu = infer_gpu_from_cpu(cpu_name)
            if inferred_gpu:
                gpu_to_match = inferred_gpu
                new_data.at[idx, 'gpu_name'] = inferred_gpu
    
    # Skip if still no GPU to match
    if pd.isna(gpu_to_match) or str(gpu_to_match).strip() == '' or str(gpu_to_match).strip() == 'NA':
        gpu_unmatched += 1
        continue
    
    # Special handling for generic "Apple GPU"
    if 'apple gpu' in str(gpu_to_match).lower():
        apple_gpu = get_apple_gpu_for_cpu(cpu_name)
        if apple_gpu and apple_gpu.lower() in gpu_by_name:
            g = gpu_by_name[apple_gpu.lower()]
            new_data.at[idx, 'gpu_name'] = g['gpu_name']
            new_data.at[idx, 'gpu_match_score'] = 100
            new_data.at[idx, 'gpu_g3d_mark'] = g.get('g3d_mark', None)
            new_data.at[idx, 'gpu_g2d_mark'] = g.get('g2d_mark', None)
            new_data.at[idx, 'gpu_tdp'] = g.get('tdp(w)', None)
            apple_matched += 1
            gpu_scores.append(100)
            continue
    
    # Normalize and match GPU
    norm_gpu = normalize_gpu(gpu_to_match)
    if not norm_gpu:
        gpu_unmatched += 1
        continue
    
    match = best_gpu_match(norm_gpu, gpu_norms)
    if match:
        choice, score, match_idx = match
        gpu_scores.append(score)
        if score >= GPU_MATCH_THRESHOLD:
            g = gpu_data[match_idx]
            # Update gpu_name only if it's a dedicated GPU or was inferred
            if is_dedicated:
                new_data.at[idx, 'gpu_name'] = g['gpu_name']
                dedicated_matched += 1
            else:
                integrated_matched += 1
            # Always fill the benchmark attributes
            new_data.at[idx, 'gpu_match_score'] = score
            new_data.at[idx, 'gpu_g3d_mark'] = g.get('g3d_mark', None)
            new_data.at[idx, 'gpu_g2d_mark'] = g.get('g2d_mark', None)
            new_data.at[idx, 'gpu_tdp'] = g.get('tdp(w)', None)
        else:
            # Low score - keep the inferred/assigned name but mark as unmatched
            gpu_unmatched += 1
    else:
        gpu_unmatched += 1
        gpu_scores.append(0)

# Report
avg_gpu_score = sum(gpu_scores) / len(gpu_scores) if gpu_scores else 0
print(f'GPU Mapping: Dedicated: {dedicated_matched}, Integrated: {integrated_matched}, Apple: {apple_matched}, Unmatched: {gpu_unmatched}')
print(f'Avg score: {avg_gpu_score:.1f}')
print(f'\nSample gpu_name values after mapping:')
print(new_data['gpu_name'].dropna().value_counts().head(15))
print(f'\nGPU benchmark columns filled: {new_data["gpu_g3d_mark"].notna().sum()} rows')

# Show remaining unmatched
still_unmatched = new_data[new_data['gpu_g3d_mark'].isna()]
if len(still_unmatched) > 0:
    print(f'\nRemaining unmatched ({len(still_unmatched)} rows):')
    print(still_unmatched[['gpu_name', cpu_col]].head(20) if cpu_col else still_unmatched[['gpu_name']].head(20))

# Export
new_data.to_csv('data_with_cpus_gpus.csv', index=False)

  new_data.at[idx, 'gpu_g3d_mark'] = g.get('g3d_mark', None)
  new_data.at[idx, 'gpu_g2d_mark'] = g.get('g2d_mark', None)


GPU Mapping: Dedicated: 3679, Integrated: 11950, Apple: 48, Unmatched: 715
Avg score: 98.2

Sample gpu_name values after mapping:
gpu_name
Intel UHD Graphics                 1777
Intel Iris Xe Graphics             1690
Intel UHD Graphics 620             1309
Intel UHD Graphics 730             1003
AMD Radeon Graphics                 998
Intel UHD Graphics 770              900
Apple 19-core GPU                   871
Intel Arc Graphics                  515
Intel HD Graphics 520               506
Intel HD Graphics 620               481
GeForce RTX 4060                    441
Intel HD Graphics                   384
GeForce RTX 3050 4GB Laptop GPU     356
GeForce RTX 4070                    217
GeForce GTX 1650                    201
Name: count, dtype: int64

GPU benchmark columns filled: 15677 rows

Remaining unmatched (715 rows):
     gpu_name              CPU
17        NaN    INTEL CORE I5
21        NaN    INTEL CORE I7
39        NaN    INTEL CELERON
41        NaN    INTEL CORE I5
45   

# RAM and Storage Data Cleaning

This notebook cleans the RAM_TYPE, RAM_SIZE, SSD_SIZE, and HDD_SIZE columns using CPU-based mappings.

## Cleaning Steps:
1. **Fix swapped columns**: Detect when RAM values are in SSD column and vice versa
2. **Handle dual storage**: Split formats like "1TB+240GB" into SSD and HDD
3. **Fill RAM_TYPE**: Use CPU ‚Üí DDR type mappings from `cpu_ddr_map.csv`
4. **Fill RAM_SIZE**: Use tier-based heuristics (i9‚Üí32GB, i7/i5‚Üí16GB, i3‚Üí8GB)
5. **Fill Storage**: Only if BOTH SSD and HDD are empty, use CPU ‚Üí storage defaults from `cpu_storage_map.csv`
6. **Normalize storage**: Convert TB to GB format (1TB ‚Üí 1000GB)

## Input/Output:
- **Input**: `data_with_cpus_gpus.csv` (output from cpus_gpus_handling.ipynb with cleaned CPU names)
- **Output**: `data_with_cleaned_ram_storage.csv`
- **Reference**: `cpu_ddr_map.csv`, `cpu_storage_map.csv` (use cleaned CPU names from cpus.csv)
- **CPU Column**: Uses `mapped_cpu_name` (standardized CPU names like "Intel Core i5-1135G7 @ 2.40GHz")

*
*
*
*
*
*
*
*
*
*

## 1. Import Libraries

## 2. Load Mapping Files

### CPU ‚Üí DDR Type Mapping
Maps CPU names to their compatible DDR type (DDR3, DDR4, DDR5, LPDDR3, LPDDR4, LPDDR4X, LPDDR5, LPDDR5X).

### CPU ‚Üí Storage Mapping
Maps CPU names to their default storage configuration (type: SSD/HDD, size: 256GB/512GB/1TB/etc).
Only used when BOTH SSD_SIZE and HDD_SIZE are empty.

In [None]:
def load_ddr_map(filepath):
    """Load CPU to DDR type mapping from csv.
    Expected format: cpu_name,ddr_type
    Returns dict: {cpu_name: ddr_type}
    """
    ddr_map = {}
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                cpu = row.get('cpu_name', '').strip()
                ddr = row.get('ddr_type', '').strip()
                if cpu and ddr:
                    ddr_map[cpu] = ddr
        print(f"Loaded {len(ddr_map)} CPU ‚Üí DDR type mappings from {filepath}")
        # Show sample CPU names from mapping file
        sample_cpus = list(ddr_map.keys())[:5]
        print(f"Sample CPU names from mapping file: {sample_cpus}")
    except FileNotFoundError:
        print(f"WARNING: DDR map file not found: {filepath}")
    return ddr_map

def load_storage_map(filepath):
    """Load CPU to storage mapping from csv.
    Expected format: cpu_name,storage_type,storage_size
    Returns dict: {cpu_name: {'storage_type': 'SSD'/'HDD', 'storage_size': '512GB'}}
    """
    storage_map = {}
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                cpu = row.get('cpu_name', '').strip()
                storage_type = row.get('storage_type', '').strip()
                storage_size = row.get('storage_size', '').strip()
                if cpu and storage_type and storage_size:
                    storage_map[cpu] = {
                        'storage_type': storage_type,
                        'storage_size': storage_size
                    }
        print(f"Loaded {len(storage_map)} CPU ‚Üí storage mappings from {filepath}")
        # Show sample CPU names from mapping file
        sample_cpus = list(storage_map.keys())[:5]
        print(f"Sample CPU names from mapping file: {sample_cpus}")
    except FileNotFoundError:
        print(f"WARNING: Storage map file not found: {filepath}")
    return storage_map

# Load mapping files
ddr_map = load_ddr_map('cpu_ddr_map.csv')
storage_map = load_storage_map('cpu_storage_map.csv')

Loaded 823 CPU ‚Üí DDR type mappings from cpu_ddr_map.csv
Sample CPU names from mapping file: ['AMD 3015e', 'AMD 3020e', 'AMD A10 Micro-6700T APU', 'AMD A10 PRO-7350B APU', 'AMD A10 PRO-7850B APU']
Loaded 738 CPU ‚Üí storage mappings from cpu_storage_map.csv
Sample CPU names from mapping file: ['AMD 3015e', 'AMD 3020e', 'AMD A10 Micro-6700T APU', 'AMD A10 PRO-7350B APU', 'AMD A10 PRO-7850B APU']


## 3. CPU Name Lookup

Direct dictionary lookup using cleaned CPU names. 

**Important**: 
- `mapped_cpu_name` column may include frequency (e.g., "Intel Core i5-1135G7 @ 2.40GHz")
- Mapping files (`cpu_ddr_map.csv`, `cpu_storage_map.csv`) don't have frequencies (e.g., "Intel Core i5-1135G7")
- The function strips "@ GHz" before matching

In [None]:
def find_cpu_in_map(cpu_name, cpu_map):
    """Find CPU in map using direct lookup.
    
    The mapped_cpu_name from cpus_gpus_handling includes frequency (e.g., "@ 2.40GHz"),
    but the mapping files don't have frequencies, so we need to strip them.
    
    Example:
    - Input: "Intel Core i5-1135G7 @ 2.40GHz"
    - Stripped: "Intel Core i5-1135G7"
    - Matches: "Intel Core i5-1135G7" in cpu_ddr_map.csv
    
    Returns: matched CPU name from map, or None
    """
    if not cpu_name or not cpu_map:
        return None
    
    # Strip frequency: "Intel Core i5-1135G7 @ 2.40GHz" -> "Intel Core i5-1135G7"
    cpu_name_clean = re.sub(r'\s*@.*', '', str(cpu_name)).strip()
    cpu_lower = cpu_name_clean.lower()
    
    # Direct case-insensitive lookup
    for map_cpu in cpu_map.keys():
        if cpu_lower == map_cpu.lower():
            return map_cpu
    
    return None

## 4. RAM Size Heuristics

When RAM_SIZE is missing, estimate based on:
1. **CPU suffix** (U-series, H-series, HX, G7, P-series) - Most accurate
2. **CPU tier** (i3/i5/i7/i9, Ryzen 3/5/7/9) - Secondary indicator
3. **Generation** - Newer gens tend to have more RAM

### Intel Suffix Patterns:
- **HX-series** (Extreme performance): 32GB (high-end gaming/workstation)
- **H-series** (High performance): 16-32GB (gaming laptops)
- **P-series** (Performance): 16GB (creator laptops)
- **U-series** (Ultra-low power): 8-16GB (thin & light)
- **G7/G4** (Iris graphics): 8-16GB (mainstream)

### AMD Patterns:
- **HX-series**: 32GB
- **HS/H-series**: 16-32GB
- **U-series**: 8-16GB

### Examples:
- Intel Core i7-1135G7 ‚Üí 16GB (i7 + G7 suffix)
- Intel Core i5-1135G7 ‚Üí 16GB (i5 + G7 suffix, not just 8GB)
- Intel Core i7-12700H ‚Üí 16GB (i7 + H-series)
- Intel Core i9-13980HX ‚Üí 32GB (i9 + HX)

In [None]:
def get_ram_size_for_cpu(cpu_name):
    """Get typical RAM size for a CPU based on suffix, tier, and generation.
    
    Priority:
    1. CPU suffix (HX, H, P, U, G7, etc.) - Most accurate indicator
    2. CPU tier (i3/i5/i7/i9, Ryzen 3/5/7/9)
    3. Generation (newer = more RAM)
    
    Returns: RAM size as string (e.g., '16') or None
    """
    if not cpu_name:
        return None
    
    cpu_lower = cpu_name.lower()
    cpu_upper = cpu_name.upper()
    
    # === PRIORITY 1: Check CPU suffix patterns (most accurate) ===
    
    # HX-series: Extreme performance (32GB)
    if 'hx' in cpu_lower or cpu_upper.endswith('HX'):
        return "32"
    
    # H-series: High performance gaming/workstation
    # i9-H or Ryzen 9-H ‚Üí 32GB
    # i7-H or Ryzen 7-H ‚Üí 16GB (but could be 32GB in newer gens)
    # i5-H ‚Üí 16GB
    if re.search(r'\d{4,5}h\b', cpu_lower) or re.search(r'-\d{4}h\b', cpu_lower):
        # Check tier for H-series
        if 'i9' in cpu_lower or 'ryzen 9' in cpu_lower:
            return "32"
        elif 'i7' in cpu_lower or 'ryzen 7' in cpu_lower:
            # 11th gen+ i7-H typically have 16GB, but can go 32GB
            return "16"
        elif 'i5' in cpu_lower or 'ryzen 5' in cpu_lower:
            return "16"
        else:
            return "16"  # Default H-series
    
    # HS-series: AMD high performance slim (16GB)
    if 'hs' in cpu_lower:
        if 'ryzen 9' in cpu_lower:
            return "32"
        else:
            return "16"
    
    # P-series: Intel Performance (creator laptops, 16GB)
    if re.search(r'\d{4,5}p\b', cpu_lower):
        return "16"
    
    # U-series: Ultra-low power (thin & light)
    # i7-U with G7 ‚Üí 16GB (like i7-1135G7)
    # i5-U with G7 ‚Üí 16GB (like i5-1135G7)
    # i7-U without G7 ‚Üí 8-16GB (check generation)
    # i3-U ‚Üí 8GB
    if re.search(r'\d{4,5}u\b', cpu_lower) or 'u @' in cpu_lower:
        # Check for G7 suffix (Iris Xe graphics - better performance)
        if 'g7' in cpu_lower or 'g4' in cpu_lower:
            # G7 models typically come with 16GB even for i5
            if 'i7' in cpu_lower or 'i5' in cpu_lower:
                return "16"
            elif 'i3' in cpu_lower:
                return "8"
        # U-series without G7
        if 'i7' in cpu_lower or 'ryzen 7' in cpu_lower:
            # Check generation: 10th gen+ ‚Üí 16GB, older ‚Üí 8GB
            gen_match = re.search(r'-(\d{1,2})\d{3}', cpu_name)
            if gen_match:
                gen = int(gen_match.group(1))
                if gen >= 10:
                    return "16"
            return "8"
        elif 'i5' in cpu_lower or 'ryzen 5' in cpu_lower:
            return "8"
        elif 'i3' in cpu_lower or 'ryzen 3' in cpu_lower:
            return "8"
        else:
            return "8"  # Default U-series
    
    # G7/G4 suffix: Iris Xe graphics (typically 16GB for i5+)
    if 'g7' in cpu_lower or 'g4' in cpu_lower:
        if 'i7' in cpu_lower or 'i9' in cpu_lower:
            return "16"
        elif 'i5' in cpu_lower:
            return "16"  # i5-1135G7 typically has 16GB
        elif 'i3' in cpu_lower:
            return "8"
    
    # Y-series: Ultra-low power (tablets, 8GB)
    if re.search(r'\d{4,5}y\b', cpu_lower):
        return "8"
    
    # M-series: Mobile (8GB)
    if 'core m' in cpu_lower or re.search(r'm\d-', cpu_lower):
        return "8"
    
    # === PRIORITY 2: Check CPU tier (if no suffix detected) ===
    
    # High-end tiers: 32GB
    if any(x in cpu_lower for x in ['i9', 'ryzen 9', 'ultra 9', 'ultra9', 
                                      'threadripper', 'epyc', 'xeon']):
        return "32"
    
    # Mid-high tiers: 16GB
    if any(x in cpu_lower for x in ['i7', 'ryzen 7', 'ultra 7', 'ultra7']):
        return "16"
    
    # Mid tiers: Check generation
    if any(x in cpu_lower for x in ['i5', 'ryzen 5', 'ultra 5', 'ultra5']):
        # Modern i5 (10th gen+) typically have 16GB
        gen_match = re.search(r'-(\d{1,2})\d{3}', cpu_name)
        if gen_match:
            gen = int(gen_match.group(1))
            if gen >= 10:
                return "16"
        return "8"
    
    # Entry-level: 8GB
    if any(x in cpu_lower for x in ['i3', 'i1', 'ryzen 3', 'ultra 3', 'ultra3',
                                      'celeron', 'pentium', 'athlon', 'atom',
                                      'core 2', 'core duo',
                                      'a4', 'a6', 'a8', 'a9',
                                      'a10', 'a12', 'e1', 'e2', 'fx-', 'n95', 'n97', 'n100', 'n200', 'n300']):
        return "8"
    
    # Default for unknown: 8GB
    return "8"

## 5. Data Validation Helpers

Functions to detect:
- **Swapped columns**: RAM values in SSD column or vice versa
- **RAM values**: 2/4/6/8/12/16/24/32/48/64/96/128 GB
- **Storage values**: 256+ GB, TB units, or dual storage (A+B format)

In [None]:
def is_ram_value(val):
    """Check if value looks like RAM (4/8/16/32/64/96/128 GB - realistic laptop/workstation RAM)."""
    if not val:
        return False
    val_clean = val.strip().upper().replace('GB', '').replace(' ', '')
    # Only these are realistic laptop/workstation RAM sizes (128GB is valid for MacBooks/workstations)
    return val_clean in ['2', '4', '6', '8', '12', '16', '24', '32', '48', '64', '96', '128']

def is_storage_value(val):
    """Check if value looks like storage (256+ GB or TB, or has +)."""
    if not val:
        return False
    val_clean = val.strip().upper()
    # Contains + means dual storage
    if '+' in val_clean:
        return True
    # TB is always storage
    if 'TB' in val_clean:
        return True
    # GB values >= 256 are likely storage (128GB could be RAM on high-end machines)
    num = val_clean.replace('GB', '').replace(' ', '')
    try:
        return int(num) >= 256
    except:
        return False

def needs_swap(ram_val, ssd_val):
    """Check if RAM and SSD columns appear to be swapped.
    Returns True if:
    - RAM has storage-like value (>=128GB or TB) AND SSD has RAM-like value, OR
    - RAM has storage-like value AND SSD is empty, OR
    - RAM is empty AND SSD has RAM-like value
    """
    ram = (ram_val or '').strip()
    ssd = (ssd_val or '').strip()
    
    ram_looks_like_storage = is_storage_value(ram)
    ssd_looks_like_ram = is_ram_value(ssd)
    ram_looks_like_ram = is_ram_value(ram)
    ssd_looks_like_storage = is_storage_value(ssd)
    
    # Case 1: RAM empty, SSD has RAM value
    if not ram and ssd_looks_like_ram:
        return True
    
    # Case 2: RAM has storage value, SSD has RAM value (definitely swapped)
    if ram_looks_like_storage and ssd_looks_like_ram:
        return True
    
    # Case 3: RAM has storage value (128GB+) and SSD also has storage value
    # This might be swapped too - check if RAM > typical max (64GB)
    if ram_looks_like_storage and not ram_looks_like_ram:
        # RAM has a storage-like value, likely swapped
        # Only swap if SSD is empty or also looks like storage
        if not ssd or ssd_looks_like_storage:
            return True
    
    return False

## 6. Storage Parsing and Normalization

- **Parse dual storage**: "1TB+240GB" ‚Üí SSD=1TB, HDD=240GB
- **Normalize to GB**: "1TB" ‚Üí "1000GB", "2TB" ‚Üí "2000GB"

In [None]:
def parse_dual_storage(val):
    """Parse 'A+B' format like '1TB+240GB' -> (primary_size, secondary_size)."""
    if not val or '+' not in val:
        return val, None
    parts = val.split('+')
    if len(parts) == 2:
        return parts[0].strip(), parts[1].strip()
    return val, None

def normalize_storage_to_gb(val):
    """Convert storage values to GB format (e.g., '1TB' -> '1000GB', '2TB' -> '2000GB').
    Also handles dual storage like '1TB 512GB' or '512GB 1TB' by taking the first part."""
    if not val:
        return val
    val_clean = val.strip()
    
    # Handle dual storage with space separator (e.g., "1TB 512GB" or "512GB 1TB")
    # Take only the first part
    if ' ' in val_clean and ('GB' in val_clean.upper() or 'TB' in val_clean.upper()):
        parts = val_clean.split()
        # Find the first storage-like part
        for part in parts:
            if 'GB' in part.upper() or 'TB' in part.upper():
                val_clean = part
                break
    
    val_upper = val_clean.upper()
    
    # Handle TB -> GB conversion
    if 'TB' in val_upper:
        try:
            num = float(val_upper.replace('TB', '').strip())
            return f"{int(num * 1000)}GB"
        except:
            return val_clean
    
    # Already in GB or other format, return as-is but ensure GB suffix
    if 'GB' in val_upper:
        return val_upper
    
    # Just a number, assume GB
    try:
        num = int(val_clean)
        return f"{num}GB"
    except:
        return val_clean

## 7. Load Input Data

In [None]:
print(f"Loaded {len(data)} rows from data_with_cpus_gpus.csv")
print(f"\nColumns: {list(data.columns)}")
print(f"\nFirst few CPU names from mapped_cpu_name column:")
print(data['mapped_cpu_name'].head(10))
print(f"\nSample of data:")
data.head()

Loaded 16392 rows from data_with_cpus_gpus.csv

Columns: ['id', 'price_preview', 'created_at', 'city', 'spec_Etat', 'model_name', 'DEDICATED_GPU', 'CPU', 'RAM_SIZE', 'SSD_SIZE', 'HDD_SIZE', 'SCREEN_SIZE', 'SCREEN_FREQUENCY', 'SCREEN_RESOLUTION', 'RAM_TYPE', 'mapped_cpu_name', 'match_score', 'cores', 'cpu_mark', 'tdp', 'gpu_name', 'match_type', 'gpu_match_score', 'gpu_g3d_mark', 'gpu_g2d_mark', 'gpu_tdp']

First few CPU names from mapped_cpu_name column:
0               Intel Core i5-1250P
1    Intel Core i7-11800H @ 2.30GHz
2    Intel Core i7-7700HQ @ 2.80GHz
3                AMD Ryzen 7 5800HS
4                   AMD Ryzen 5 240
5    Intel Core i5-10300H @ 2.50GHz
6                 AMD Ryzen 5 7520U
7    Intel Core i5-1135G7 @ 2.40GHz
8             Intel Core i7-13700HX
9    Intel Core i5-1145G7 @ 2.60GHz
Name: mapped_cpu_name, dtype: object

Sample of data:


Unnamed: 0,id,price_preview,created_at,city,spec_Etat,model_name,DEDICATED_GPU,CPU,RAM_SIZE,SSD_SIZE,...,match_score,cores,cpu_mark,tdp,gpu_name,match_type,gpu_match_score,gpu_g3d_mark,gpu_g2d_mark,gpu_tdp
0,1,75000000.0,2021 10 01T18:01:44.000Z,EL TAREF,BON TAT,IDEAPAD,,INTEL CORE I5 750S,4GB,128GB,...,66.666667,4.0,19108,28.0,Intel UHD Graphics 730,fuzzy,100.0,1000,237,15.0
1,2,33500000.0,2021 11 10T21:24:14.000Z,COLLO,JAMAIS UTILIS,AERO,NVIDIA GEFORCE RTX 3060,11TH GEN INTEL CORE I7 11800H,16GB,1TB,...,69.230769,8.0,19776,45.0,GeForce RTX 3060 12GB,fuzzy,100.0,16758,966,170.0
2,3,17000000.0,2021 09 11T20:27:59.000Z,MECHERIA,,STEALTH,NVIDIA GEFORCE GTX 1060,INTEL CORE I7 7700HQ,16GB,,...,100.0,4.0,6881,45.0,GeForce GTX 1060,fuzzy,100.0,10059,743,120.0
3,4,12000000.0,2025 03 06T00:28:39.000Z,ES SENIA,,ROG,NVIDIA GEFORCE RTX 1650,AMD RYZEN 7 5800HS,16GB,512GB,...,100.0,8.0,19476,35.0,GeForce GTX 1650,exact,100.0,7871,554,75.0
4,5,11000000.0,2024 10 09T18:10:21.000Z,TIZI OUZOU,BON TAT,,AMD RADEON RX 580,AMD RYZEN 5 2400G,16GB,128GB,...,93.75,6.0,22980,45.0,GeForce GTX 580,fuzzy,100.0,4632,489,244.0


## 8. Data Cleaning Pipeline

### Processing Steps:
1. **Fix swapped columns** (RAM in SSD column or vice versa)
2. **Split dual storage** ("1TB+240GB" format)
3. **Fill RAM_TYPE** using CPU ‚Üí DDR mappings
4. **Fill RAM_SIZE** using tier-based heuristics
5. **Fill Storage** only if BOTH SSD and HDD are empty
6. **Normalize storage** values to GB format

In [None]:
# Initialize statistics
stats = {
    'total_rows': len(data),
    'ram_type_filled': 0,
    'ram_type_unchanged': 0,
    'ram_type_not_found': 0,
    'ram_size_filled': 0,
    'ram_size_unchanged': 0,
    'storage_filled': 0,
    'storage_unchanged': 0,
    'storage_not_found': 0,
    'columns_swapped': 0,
    'dual_storage_split': 0,
}

# Track CPUs not found in maps
cpus_not_in_ddr_map = set()
cpus_not_in_storage_map = set()

# Process each row
for idx, row in data.iterrows():
    # Use mapped_cpu_name (cleaned CPU name from cpus_gpus_handling.ipynb)
    cpu_name = str(row.get('mapped_cpu_name', '')).strip() if pd.notna(row.get('mapped_cpu_name')) else ''
    
    # Skip if CPU mapping failed (NA means CPU couldn't be matched)
    if not cpu_name or cpu_name.upper() == 'NA':
        continue
    
    # === STEP 0: Fix swapped columns (RAM in SSD column or vice versa) ===
    current_ram_size = str(row.get('RAM_SIZE', '')).strip() if pd.notna(row.get('RAM_SIZE')) else ''
    current_ssd = str(row.get('SSD_SIZE', '')).strip() if pd.notna(row.get('SSD_SIZE')) else ''
    
    if needs_swap(current_ram_size, current_ssd):
        # Swap RAM and SSD values
        data.at[idx, 'RAM_SIZE'] = current_ssd if is_ram_value(current_ssd) else ''
        data.at[idx, 'SSD_SIZE'] = current_ram_size if is_storage_value(current_ram_size) else ''
        stats['columns_swapped'] += 1
        current_ram_size = data.at[idx, 'RAM_SIZE']
        current_ssd = data.at[idx, 'SSD_SIZE']
    
    # === STEP 0b: Handle dual storage format (e.g., "1TB+240GB") ===
    current_ssd = str(data.at[idx, 'SSD_SIZE']).strip() if pd.notna(data.at[idx, 'SSD_SIZE']) else ''
    if current_ssd and '+' in current_ssd:
        primary, secondary = parse_dual_storage(current_ssd)
        data.at[idx, 'SSD_SIZE'] = primary  # Keep primary in SSD
        # Optionally store secondary in HDD if HDD is empty
        current_hdd = str(row.get('HDD_SIZE', '')).strip() if pd.notna(row.get('HDD_SIZE')) else ''
        if not current_hdd or current_hdd.lower() in ['', 'nan', 'none', 'null']:
            data.at[idx, 'HDD_SIZE'] = secondary if secondary else ''
        stats['dual_storage_split'] += 1
    
    # === Fill RAM_TYPE if empty ===
    current_ram_type = str(row.get('RAM_TYPE', '')).strip() if pd.notna(row.get('RAM_TYPE')) else ''
    if not current_ram_type or current_ram_type.lower() in ['', 'nan', 'none', 'null']:
        matched_cpu = find_cpu_in_map(cpu_name, ddr_map)
        if matched_cpu:
            data.at[idx, 'RAM_TYPE'] = ddr_map[matched_cpu]
            stats['ram_type_filled'] += 1
        else:
            stats['ram_type_not_found'] += 1
            # Track CPU not found in DDR map
            cpu_name_clean = re.sub(r'\s*@.*', '', cpu_name).strip()
            cpus_not_in_ddr_map.add(cpu_name_clean)
    else:
        stats['ram_type_unchanged'] += 1
    
    # === Fill RAM_SIZE if empty ===
    current_ram_size = str(row.get('RAM_SIZE', '')).strip() if pd.notna(row.get('RAM_SIZE')) else ''
    if not current_ram_size or current_ram_size.lower() in ['', 'nan', 'none', 'null']:
        ram_size = get_ram_size_for_cpu(cpu_name)
        if ram_size:
            data.at[idx, 'RAM_SIZE'] = ram_size + "GB"
            stats['ram_size_filled'] += 1
    else:
        stats['ram_size_unchanged'] += 1
    
    # === Fill Storage ONLY if BOTH SSD_SIZE and HDD_SIZE are empty ===
    current_ssd = str(data.at[idx, 'SSD_SIZE']).strip() if pd.notna(data.at[idx, 'SSD_SIZE']) else ''
    current_hdd = str(row.get('HDD_SIZE', '')).strip() if pd.notna(row.get('HDD_SIZE')) else ''
    
    ssd_empty = not current_ssd or current_ssd.lower() in ['', 'nan', 'none', 'null', '0']
    hdd_empty = not current_hdd or current_hdd.lower() in ['', 'nan', 'none', 'null', '0']
    
    if ssd_empty and hdd_empty:
        matched_cpu = find_cpu_in_map(cpu_name, storage_map)
        if matched_cpu:
            storage_info = storage_map[matched_cpu]
            storage_type = storage_info['storage_type']
            storage_size = storage_info['storage_size'].replace('GB', '').replace('TB', '000')
            
            if storage_type == 'SSD':
                data.at[idx, 'SSD_SIZE'] = storage_size
                data.at[idx, 'HDD_SIZE'] = ''
            else:
                data.at[idx, 'HDD_SIZE'] = storage_size
                data.at[idx, 'SSD_SIZE'] = ''
            
            stats['storage_filled'] += 1
        else:
            stats['storage_not_found'] += 1
            # Track CPU not found in storage map
            cpu_name_clean = re.sub(r'\s*@.*', '', cpu_name).strip()
            cpus_not_in_storage_map.add(cpu_name_clean)
    else:
        stats['storage_unchanged'] += 1
    
    # === STEP: Normalize all storage values to GB format (TB -> GB) ===
    if pd.notna(data.at[idx, 'SSD_SIZE']) and str(data.at[idx, 'SSD_SIZE']).strip():
        data.at[idx, 'SSD_SIZE'] = normalize_storage_to_gb(str(data.at[idx, 'SSD_SIZE']))
    if pd.notna(data.at[idx, 'HDD_SIZE']) and str(data.at[idx, 'HDD_SIZE']).strip():
        data.at[idx, 'HDD_SIZE'] = normalize_storage_to_gb(str(data.at[idx, 'HDD_SIZE']))

print("Data cleaning completed!")

Data cleaning completed!


## 9. Display Cleaning Statistics

In [None]:
print("=" * 60)
print("CLEANING STATISTICS")
print("=" * 60)
print(f"Total rows processed: {stats['total_rows']}")
print()
print("RAM_TYPE:")
print(f"  - Filled from map:     {stats['ram_type_filled']}")
print(f"  - Already had value:   {stats['ram_type_unchanged']}")
print(f"  - CPU not in map:      {stats['ram_type_not_found']}")
print()
print("RAM_SIZE:")
print(f"  - Filled from tier:    {stats['ram_size_filled']}")
print(f"  - Already had value:   {stats['ram_size_unchanged']}")
print()
print("Data Fixes:")
print(f"  - Columns swapped:     {stats['columns_swapped']} (RAM was in SSD column)")
print(f"  - Dual storage split:  {stats['dual_storage_split']} (A+B format separated)")
print()
print("Storage (SSD/HDD):")
print(f"  - Filled from map:     {stats['storage_filled']}")
print(f"  - Already had value:   {stats['storage_unchanged']}")
print(f"  - CPU not in map:      {stats['storage_not_found']}")
print("=" * 60)

CLEANING STATISTICS
Total rows processed: 16392

RAM_TYPE:
  - Filled from map:     10330
  - Already had value:   5269
  - CPU not in map:      0

RAM_SIZE:
  - Filled from tier:    413
  - Already had value:   15186

Data Fixes:
  - Columns swapped:     32 (RAM was in SSD column)
  - Dual storage split:  5 (A+B format separated)

Storage (SSD/HDD):
  - Filled from map:     946
  - Already had value:   14653
  - CPU not in map:      0


## 9b. CPUs Not Found in Mapping Files

These CPUs exist in the data but are missing from the mapping files. We should add them if they are valid CPU models.

In [None]:
print("\n" + "=" * 60)
print("CPUs NOT FOUND IN MAPPING FILES")
print("=" * 60)

print(f"\nüìã CPUs not in DDR map ({len(cpus_not_in_ddr_map)} unique):")
print("-" * 60)
if cpus_not_in_ddr_map:
    # Sort alphabetically for easier review
    sorted_cpus_ddr = sorted(cpus_not_in_ddr_map)
    for i, cpu in enumerate(sorted_cpus_ddr, 1):
        print(f"{i:3}. {cpu}")
else:
    print("‚úì All CPUs found in DDR map!")

print(f"\nüìÅ CPUs not in Storage map ({len(cpus_not_in_storage_map)} unique):")
print("-" * 60)
if cpus_not_in_storage_map:
    # Sort alphabetically for easier review
    sorted_cpus_storage = sorted(cpus_not_in_storage_map)
    for i, cpu in enumerate(sorted_cpus_storage, 1):
        print(f"{i:3}. {cpu}")
else:
    print("‚úì All CPUs found in Storage map!")

# Find CPUs missing from BOTH maps
cpus_missing_both = cpus_not_in_ddr_map.intersection(cpus_not_in_storage_map)
if cpus_missing_both:
    print(f"\n‚ö†Ô∏è  CPUs missing from BOTH maps ({len(cpus_missing_both)} unique):")
    print("-" * 60)
    sorted_cpus_both = sorted(cpus_missing_both)
    for i, cpu in enumerate(sorted_cpus_both, 1):
        print(f"{i:3}. {cpu}")

print("\n" + "=" * 60)


CPUs NOT FOUND IN MAPPING FILES

üìã CPUs not in DDR map (0 unique):
------------------------------------------------------------
‚úì All CPUs found in DDR map!

üìÅ CPUs not in Storage map (0 unique):
------------------------------------------------------------
‚úì All CPUs found in Storage map!



## 9c. Export Missing CPUs to CSV

Export the missing CPUs to CSV files so you can review them and add valid entries to the mapping files.

In [None]:
# Export CPUs not in DDR map
if cpus_not_in_ddr_map:
    missing_ddr_data = pd.DataFrame({
        'cpu_name': sorted(cpus_not_in_ddr_map),
        'ddr_type': '',  # To be filled manually
        'release_year': '',  # To be filled manually
        'notes': ''  # To be filled manually
    })
    missing_ddr_data.to_csv('missing_cpus_ddr_map.csv', index=False)
    print(f"‚úì Exported {len(cpus_not_in_ddr_map)} CPUs to 'missing_cpus_ddr_map.csv'")
else:
    print("‚úì No missing CPUs for DDR map")

# Export CPUs not in Storage map
if cpus_not_in_storage_map:
    missing_storage_data = pd.DataFrame({
        'cpu_name': sorted(cpus_not_in_storage_map),
        'storage_type': '',  # To be filled manually (SSD/HDD)
        'storage_size': '',  # To be filled manually (256GB/512GB/1TB etc)
        'tier': '',  # To be filled manually (budget/mid/high)
        'notes': ''  # To be filled manually
    })
    missing_storage_data.to_csv('missing_cpus_storage_map.csv', index=False)
    print(f"‚úì Exported {len(cpus_not_in_storage_map)} CPUs to 'missing_cpus_storage_map.csv'")
else:
    print("‚úì No missing CPUs for Storage map")

print("\n‚ÑπÔ∏è  Review these files, fill in the appropriate values, and append them to:")
print("   - cpu_ddr_map.csv")
print("   - cpu_storage_map.csv")

‚úì No missing CPUs for DDR map
‚úì No missing CPUs for Storage map

‚ÑπÔ∏è  Review these files, fill in the appropriate values, and append them to:
   - cpu_ddr_map.csv
   - cpu_storage_map.csv


## 10. Preview Cleaned Data

In [None]:
# Display sample of cleaned data
print("\nSample of cleaned data (mapped_cpu_name, RAM_TYPE, RAM_SIZE, SSD_SIZE, HDD_SIZE):")
display_cols = ['mapped_cpu_name', 'RAM_TYPE', 'RAM_SIZE', 'SSD_SIZE', 'HDD_SIZE']
data[display_cols].head(20)


Sample of cleaned data (mapped_cpu_name, RAM_TYPE, RAM_SIZE, SSD_SIZE, HDD_SIZE):


Unnamed: 0,mapped_cpu_name,RAM_TYPE,RAM_SIZE,SSD_SIZE,HDD_SIZE
0,Intel Core i5-1250P,DDR5,4GB,128GB,
1,Intel Core i7-11800H @ 2.30GHz,DDR4,16GB,1000GB,
2,Intel Core i7-7700HQ @ 2.80GHz,DDR4,16GB,512GB,
3,AMD Ryzen 7 5800HS,DDR4,16GB,512GB,
4,AMD Ryzen 5 240,DDR5,16GB,128GB,145GB
5,Intel Core i5-10300H @ 2.50GHz,DDR4,8GB,512GB,
6,AMD Ryzen 5 7520U,DDR5,8GB,512GB,
7,Intel Core i5-1135G7 @ 2.40GHz,DDR4,16GB,512GB,
8,Intel Core i7-13700HX,DDR5,16GB,1000GB,
9,Intel Core i5-1145G7 @ 2.60GHz,DDR4,8GB,256GB,


## 11. Export Cleaned Data

In [None]:
# Export to CSV
output_file = 'data_with_cleaned_ram_storage.csv'
data.to_csv(output_file, index=False)

print(f"\n‚úì Exported {len(data)} rows to {output_file}")
print("\nDone!")


‚úì Exported 16392 rows to data_with_cleaned_ram_storage.csv

Done!


*
*
*
*
*
*
*
*
*
*

# Clean prices

# removing false prices

In [None]:
SEQUENTIAL_TROLLS = {
    123,
    1111,
    321,
    222222,
    1234,
    12345,
    123456,
    1234567,
    12345678,
    8976378,
    5649841,
    123456789,
    1223789,
}


# Minimum number of repeated digits to flag (e.g. 3 catches 111, 999, 333 ‚Ä¶)
MIN_REPEAT_LENGTH = 3


def is_troll_price(price):
    """Return True if the price is a troll value."""
    if pd.isna(price):
        return False

    p = int(price)
    s = str(p)

    # 1) Repeated single digit: 111, 999, 4444, 111111, 222222222 ‚Ä¶
    if len(set(s)) == 1 and len(s) >= MIN_REPEAT_LENGTH:
        return True

    # 2) Sequential digits: 123, 1234, 12345, 123456, 1234567 ‚Ä¶
    if p in SEQUENTIAL_TROLLS:
        return True

    return False


data["is_troll_price"] = data["price_preview"].apply(is_troll_price)
# set estimated_price_dzd to NaN where is_troll_price is True
data.loc[data["is_troll_price"], "estimated_price_dzd"] = np.nan
# remove is_troll_price column
data.drop(columns=["is_troll_price"], inplace=True)

### Predicting the correct price based on the market value of the compenents 

In [None]:
data_cpu_prices = pd.read_csv("cpu_prices.csv")
data_gpu_prices = pd.read_csv("gpu_prices.csv")

cpu_price_map = dict(zip(data_cpu_prices["cpu_name"], data_cpu_prices["estimated_price"]))

gpu_price_map = dict(zip(data_gpu_prices["gpu_name"], data_gpu_prices["estimated_price"]))

In [None]:
def estimate_ram_price(ram_gb):
    if ram_gb <= 4:
        return 4000
    elif ram_gb <= 8:
        return 8000
    elif ram_gb <= 16:
        return 15000
    elif ram_gb <= 32:
        return 28000
    else:
        return 40000


def estimate_storage_price(ssd_gb, hdd_gb):
    price = 0

    # SSD
    if ssd_gb > 0:
        price += (ssd_gb / 256) * 8000  # 256GB ‚âà 8k DZD

    # HDD
    if hdd_gb > 0:
        price += (hdd_gb / 1000) * 6000  # 1TB ‚âà 6k DZD

    return price


BRAND_MULTIPLIER = {
    "ROG": 1.25,  # ASUS ROG (Premium gaming)
    "ALIENWARE": 1.25,  # Dell Alienware (Premium gaming)
    "STEALTH": 1.20,  # MSI Stealth (Premium)
    "RAZER": 1.30,  # Razer (Ultra premium)
    "MAC": 1.50,  # MacBook (Apple premium)
    "THINKPAD": 1.15,  # Lenovo ThinkPad (Business premium)
    "VECTOR": 1.20,  # MSI Vector (Gaming)
    "ZENBOOK": 1.15,  # ASUS ZenBook (Premium ultrabook)
    "PRECISION": 1.20,  # Dell Precision (Workstation)
    "TUF": 1.05,  # ASUS TUF (Value gaming)
    "KATANA": 1.05,  # MSI Katana (Value gaming)
    "VIVOBOOK": 1.00,  # ASUS VivoBook (Standard)
    "IDEAPAD": 0.95,  # Lenovo IdeaPad (Budget)
    "INSPIRON": 0.95,  # Dell Inspiron (Budget)
    "PAVILION": 0.95,  # HP Pavilion (Budget)
    "ASPIRE": 0.90,  # Acer Aspire (Budget)
}

RAM_PRICE_PER_GB = {
    "DDR5": 2000,  # DDR5 (Latest, most expensive)
    "DDR5X": 2200,  # DDR5X (Premium)
    "LPDDR5X": 2400,  # LPDDR5X (High-end laptops)
    "LPDDR5": 2100,  # LPDDR5
    "DDR4": 1500,  # DDR4 (Common)
    "DDR4X": 1700,  # DDR4X
    "DDR3": 1000,  # DDR3 (Older)
    "DDR2": 800,  # DDR2 (Legacy)
    "DEFAULT": 1500,  # Default if type unknown
}


def parse_ram_size(ram_str):
    """Convert RAM size string to GB (numeric)"""
    if pd.isna(ram_str) or ram_str == "":
        return 0

    ram_str = str(ram_str).upper().strip()

    # Extract numeric value
    numeric_part = "".join(filter(lambda x: x.isdigit() or x == ".", ram_str))
    if not numeric_part:
        return 0

    value = float(numeric_part)

    # Convert to GB
    if "MB" in ram_str:
        return value / 1024
    elif "GB" in ram_str:
        return value
    else:
        return value


def parse_storage_size(size_str):
    """Convert storage size string to GB (numeric)"""
    if pd.isna(size_str) or size_str == "":
        return 0

    size_str = str(size_str).upper().strip()

    # Handle combined storage (e.g., "1TB + 512GB")
    if "+" in size_str:
        parts = size_str.split("+")
        total = 0
        for part in parts:
            total += parse_storage_size(part.strip())
        return total

    # Extract numeric value
    numeric_part = "".join(filter(lambda x: x.isdigit() or x == ".", size_str))
    if not numeric_part:
        return 0

    value = float(numeric_part)

    # Convert to GB
    if "TB" in size_str:
        return value * 1024
    elif "GB" in size_str:
        return value
    elif "MB" in size_str:
        return value / 1024
    else:
        return value


def brand_multiplier(model_name):
    for brand, mult in BRAND_MULTIPLIER.items():
        if brand.lower() in str(model_name).lower():
            return mult
    return 1.0


def get_ram_price(ram_size_gb, ram_type):
    """Calculate RAM price based on size and type"""
    if ram_size_gb == 0 or ram_size_gb > 128:  # Cap at 128GB to avoid errors
        ram_size_gb = (
            min(ram_size_gb, 128) if ram_size_gb > 0 else 8
        )  # Default to 8GB if 0

    # Determine RAM type price
    if pd.isna(ram_type):
        price_per_gb = RAM_PRICE_PER_GB["DEFAULT"]
    else:
        ram_type = str(ram_type).upper().strip()
        price_per_gb = RAM_PRICE_PER_GB.get(ram_type, RAM_PRICE_PER_GB["DEFAULT"])

    return ram_size_gb * price_per_gb


def estimate_price(row):
    # CPU
    cpu_price = cpu_price_map.get(row["mapped_cpu_name"], 15000)

    # GPU
    gpu_price = gpu_price_map.get(row["gpu_name"], 0)

    # RAM
    ram_price = get_ram_price(parse_ram_size(row["RAM_SIZE"]), row["RAM_TYPE"])

    # Storage
    storage_price = estimate_storage_price(
        parse_storage_size(row["SSD_SIZE"]), parse_storage_size(row["HDD_SIZE"])
    )

    base_price = cpu_price + gpu_price + ram_price + storage_price

    # Brand multiplier
    final_price = base_price * brand_multiplier(row["model_name"])

    return round(final_price, -2)  # round to nearest 100 DZD


def fix_price_scale(real_price, estimated_price):
    if estimated_price == 0 or real_price == 0:
        return real_price

    ratio = real_price / estimated_price

    while ratio >= 8:
        real_price = real_price / 10
        ratio = real_price / estimated_price
    while ratio <= 1 / 8:
        real_price = real_price * 10
        ratio = real_price / estimated_price
    return real_price


In [None]:
data["estimated_price_dzd"] = data.apply(estimate_price, axis=1)

data["price_corrected"] = data.apply(
    lambda row: fix_price_scale(row["price_preview"], row["estimated_price_dzd"]),
    axis=1,
)

data["price_multiplier_diff"] = (data["price_preview"] / data["estimated_price_dzd"]).round(2)
# remove the estimated_prices 
data.drop(columns=["estimated_price_dzd"], inplace=True)
# remove price_multiplier_diff 
data.drop(columns=["price_multiplier_diff"], inplace=True)

In [None]:
data.to_csv("cleaned_prices.csv", index=False)

*
*
*
*
*
*
*
*
*
*
*
*

# Screen-Related Data Cleaning (Final Pipeline Stage)

This notebook performs the final cleaning of screen-related features:
- SCREEN_SIZE normalization
- SCREEN_FREQUENCY cleaning
- SCREEN_RESOLUTION parsing and validation
- Export final dataset ready for machine learning models

## Pipeline Flow:
1. `data.csv` ‚Üí `cpus_gpus_handling.ipynb` ‚Üí `data_with_cpus_gpus.csv`
2. `data_with_cpus_gpus.csv` ‚Üí `clean_ram_storage.ipynb` ‚Üí `data_with_cleaned_ram_storage.csv`
3. `data_with_cleaned_ram_storage.csv` ‚Üí **`clean_screen_related.ipynb`** ‚Üí **`final_cleaned_data.csv`** ‚ú®

## 1. Load Cleaned Dataset

Load the output from the RAM/Storage cleaning pipeline.

## 2. Data Overview

Check data types and missing values in screen-related columns

In [None]:
# Check screen-related columns
screen_cols = ['SCREEN_SIZE', 'SCREEN_FREQUENCY', 'SCREEN_RESOLUTION']

print("Screen-Related Columns Info:")
print("=" * 60)
for col in screen_cols:
    if col in data.columns:
        print(f"\n{col}:")
        print(f"  - Data type: {data[col].dtype}")
        print(f"  - Missing: {data[col].isna().sum()} ({data[col].isna().sum()/len(data)*100:.1f}%)")
        print(f"  - Unique values: {data[col].nunique()}")
        print(f"  - Sample values: {data[col].dropna().head(5).tolist()}")
    else:
        print(f"\n{col}: NOT FOUND in dataset")

print("\n" + "=" * 60)

Screen-Related Columns Info:

SCREEN_SIZE:
  - Data type: float64
  - Missing: 2147 (13.1%)
  - Unique values: 99
  - Sample values: [14.0, 15.6, 17.3, 14.0, 15.0]

SCREEN_FREQUENCY:
  - Data type: object
  - Missing: 15339 (93.6%)
  - Unique values: 16
  - Sample values: ['120Hz', '60Hz', '240Hz', '300Hz', '240Hz']

SCREEN_RESOLUTION:
  - Data type: object
  - Missing: 10246 (62.5%)
  - Unique values: 112
  - Sample values: ['3840x2160', '1920x1080', '1920x1080', '1920x1080 FHD', '1920x1080 FHD']



dataset overview

In [None]:
# Display an overview of the dataframe
print("Dataset Overview:")
print("=" * 60)
print("\nFirst 10 rows:")
print(data.head(10))

print("\n" + "=" * 60)
print("\nDataframe shape:")
print(f"  - Rows: {data.shape[0]}")
print(f"  - Columns: {data.shape[1]}")

print("\n" + "=" * 60)
print("\nColumn names:")
print(data.columns.tolist())

print("\n" + "=" * 60)


Dataset Overview:

First 10 rows:
   id  price_preview                created_at            city      spec_Etat  \
0   1     75000000.0  2021 10 01T18:01:44.000Z        EL TAREF        BON TAT   
1   2     33500000.0  2021 11 10T21:24:14.000Z           COLLO  JAMAIS UTILIS   
2   3     17000000.0  2021 09 11T20:27:59.000Z        MECHERIA            NaN   
3   4     12000000.0  2025 03 06T00:28:39.000Z        ES SENIA            NaN   
4   5     11000000.0  2024 10 09T18:10:21.000Z      TIZI OUZOU        BON TAT   
5   6      9999999.0  2025 02 18T21:30:18.000Z      MOHAMMADIA            NaN   
6   7      9900000.0  2025 04 29T19:42:16.000Z  CHELGHOUM LAID            NaN   
7   8      9000000.0  2025 07 01T17:26:43.000Z    ALGER CENTRE        BON TAT   
8   9      8976378.0  2025 02 23T10:25:42.000Z      MOSTAGANEM  JAMAIS UTILIS   
9  10      8400000.0  2024 12 11T23:17:24.000Z  HAMMA BOUZIANE        BON TAT   

  model_name            DEDICATED_GPU                            CPU RAM_S

Display data types and info for screen-related columns

In [None]:
data[['SCREEN_SIZE','SCREEN_FREQUENCY','SCREEN_RESOLUTION']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16392 entries, 0 to 16391
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SCREEN_SIZE        14245 non-null  float64
 1   SCREEN_FREQUENCY   1053 non-null   object 
 2   SCREEN_RESOLUTION  6146 non-null   object 
dtypes: float64(1), object(2)
memory usage: 384.3+ KB


Value distributions
Display value distributions for resolution and frequency columns

In [None]:
data['SCREEN_RESOLUTION'].value_counts(dropna=False)
data['SCREEN_FREQUENCY'].value_counts(dropna=False)

SCREEN_FREQUENCY
NaN      15339
144Hz      430
120Hz      162
240Hz      145
165Hz      144
60Hz       119
360Hz       22
300Hz       12
90Hz         9
180Hz        2
64Hz         2
480Hz        1
244Hz        1
75Hz         1
24Hz         1
45Hz         1
2.4Hz        1
Name: count, dtype: int64

# 3. Drop SCREEN_FREQUENCY

Remove the SCREEN_FREQUENCY column as it has limited utility

In [None]:
data.drop(columns=['SCREEN_FREQUENCY'], inplace=True)

# 4. Clean SCREEN_SIZE

Convert and normalize SCREEN_SIZE values (replace commas with decimals and extract numeric values)

In [None]:
data['SCREEN_SIZE'] = (
    data['SCREEN_SIZE']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .str.extract(r'(\d+\.?\d*)')[0]
    .astype(float)
)

Remove impossible values

Replace screen sizes outside the valid range (10-20 inches) with NaN

In [None]:
data.loc[
    (data['SCREEN_SIZE'] < 10) | (data['SCREEN_SIZE'] > 20),
    'SCREEN_SIZE'
] = np.nan

show some stats about SCREEN_SIZE after normalization

Show value counts and statistics for normalized SCREEN_SIZE

In [None]:
pd.set_option('display.max_rows', 100)

#data['SCREEN_SIZE'].describe()
#data['SCREEN_SIZE'].mode()
#data['SCREEN_SIZE'].unique()
data['SCREEN_SIZE'].value_counts()



SCREEN_SIZE
14.000    4458
15.600    3128
15.000    1436
13.000    1402
16.000    1017
13.300     930
17.000     292
17.300     218
14.100     176
12.000     144
12.500     124
13.600     111
13.500      86
16.100      83
11.600      61
11.000      53
15.400      43
13.400      41
13.100      36
18.000      31
10.000      30
12.300      29
15.300      20
14.500      17
12.400      16
16.200      15
14.200      14
15.500      14
13.800      12
16.300      11
15.700      10
10.100      10
14.400       8
19.000       7
12.900       5
16.500       4
10.500       4
14.300       4
14.600       3
13.900       3
19.500       2
15.800       2
12.100       2
11.500       2
16.400       2
14.700       1
13.200       1
16.600       1
12.200       1
10.600       1
13.140       1
15.100       1
17.200       1
13.700       1
10.300       1
14.150       1
16.900       1
12.350       1
17.600       1
15.900       1
17.100       1
20.000       1
18.500       1
17.700       1
12.513       1
Name: count, 

Snap the values to the nearest canonical size

Define canonical screen sizes and snap values to the nearest standard size if within tolerance

In [None]:
canonical_sizes = np.array([
    11.6, 12.5, 13.3, 14.0, 15.0, 15.6, 16.0, 17.3      # we can add 14.1 and 16.1
])
# these standard sizes ~80% of the data
# Adding 14.1 & 16.1 improves coverage by ~1.9% only.

def snap_screen_size(x):
    if pd.isna(x):
        return np.nan
    diff = np.abs(canonical_sizes - x)
    min_diff = diff.min()
    min_diff = np.round(min_diff, 2)
    if min_diff <= 0.3:
        return canonical_sizes[diff.argmin()]
    return x  # keep rare but valid sizes

data['SCREEN_SIZE_SNAPPED'] = data['SCREEN_SIZE'].apply(snap_screen_size)


Analyze the coverage of canonical sizes and percentage of missing values

In [None]:
print(data['SCREEN_SIZE_SNAPPED'].value_counts())

is_canonical = data['SCREEN_SIZE_SNAPPED'].isin(canonical_sizes)
canonical_pct = is_canonical.mean() * 100

none_pct = data['SCREEN_SIZE_SNAPPED'].isna().mean() * 100

print(f"percentage of canonical sizes: {canonical_pct:.2f}%")
print(f"percentage of none values: {none_pct:.2f}%")


SCREEN_SIZE_SNAPPED
14.0    4669
15.6    3215
13.3    2608
15.0    1438
16.0    1129
17.3     513
12.5     172
12.0     144
11.6      63
11.0      53
18.0      31
10.0      30
14.5      17
10.1      10
14.4       8
19.0       7
12.9       5
16.5       4
10.5       4
14.6       3
19.5       2
12.1       2
16.4       2
10.6       1
18.5       1
10.3       1
16.9       1
20.0       1
17.7       1
16.6       1
Name: count, dtype: int64
percentage of canonical sizes: 84.23%
percentage of none values: 13.76%


Check value counts by model name to understand data distribution

In [None]:
print(data['model_name'].value_counts())

model_name
THINKPAD       2402
LATITUDE       2334
MACBOOK        1663
ELITEBOOK      1132
PAVILION       1126
VIVOBOOK        787
PROBOOK         737
INSPIRON        656
SURFACE         491
IDEAPAD         481
ASPIRE          341
XPS             287
STEALTH         257
PRECISION       248
VICTUS          236
TUF             216
VOSTRO          212
ROG             198
ZBOOK           193
LEGION          187
OMEN            176
ZENBOOK         171
NITRO           168
GALAXY          144
YOGA            139
THINKBOOK       135
ENVY            130
DYNABOOK        118
PREDATOR         74
KATANA           63
MAC              62
SWIFT            60
SPECTRE          36
ALIENWARE        35
AERO             33
IMAC             32
BLADE            30
VECTOR           20
TRAVELMATE       18
SPIN             10
STRIX             7
GF                4
SWORD             4
OPTIPLEX          4
COMPAQ            4
TRANSFORMER       3
Name: count, dtype: int64


Display row counts and percentage of missing values grouped by model name

In [None]:
summary = data.groupby('model_name').agg(
    total_rows=('SCREEN_SIZE', 'size'),
    nan_rows=('SCREEN_SIZE', lambda s: s.isna().sum())
)

summary['percentage_nan'] = (summary['nan_rows'] / summary['total_rows']) * 100
summary = summary.sort_values(by='total_rows', ascending=False)

summary


Unnamed: 0_level_0,total_rows,nan_rows,percentage_nan
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
THINKPAD,2402,274,11.407161
LATITUDE,2334,180,7.712082
MACBOOK,1663,158,9.500902
ELITEBOOK,1132,96,8.480565
PAVILION,1126,201,17.850799
VIVOBOOK,787,120,15.247776
PROBOOK,737,55,7.462687
INSPIRON,656,115,17.530488
SURFACE,491,62,12.627291
IDEAPAD,481,62,12.889813


Check snapped screen size values for a specific model (LATITUDE)

In [None]:
result = data.loc[data['model_name'] == "LATITUDE", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
14.0    1240
13.3     411
15.6     284
15.0     143
12.5      28
12.0      23
16.0       9
11.0       4
11.6       4
17.3       2
14.5       2
14.6       1
12.1       1
12.9       1
10.0       1
Name: count, dtype: int64


Fill missing SCREEN_SIZE values for LATITUDE using model-specific mode

In [None]:
# Calculate the mode for LATITUDE model
latitude_mode = data[data['model_name'] == 'LATITUDE']['SCREEN_SIZE_SNAPPED'].mode()

if len(latitude_mode) > 0:
    latitude_mode_value = latitude_mode[0]
    print(f"LATITUDE mode SCREEN_SIZE_SNAPPED: {latitude_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for LATITUDE with its mode
    data.loc[data['model_name'] == 'LATITUDE', 'SCREEN_SIZE_SNAPPED'] = data.loc[data['model_name'] == 'LATITUDE', 'SCREEN_SIZE_SNAPPED'].fillna(latitude_mode_value)

    print(f"Filled missing values for LATITUDE. Now LATITUDE has {data[data['model_name'] == 'LATITUDE']['SCREEN_SIZE_SNAPPED'].isna().sum()} missing values")
else:
    print("Warning: LATITUDE model has no non-missing SCREEN_SIZE_SNAPPED values")

LATITUDE mode SCREEN_SIZE_SNAPPED: 14.0
Filled missing values for LATITUDE. Now LATITUDE has 0 missing values


Check snapped screen size values for a specific model (THINKPAD)

In [None]:
result = data.loc[data['model_name'] == "THINKPAD", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
14.0    1132
13.3     328
15.6     294
16.0     129
15.0     124
12.5      40
12.0      36
17.3      20
11.6      16
11.0       4
16.4       1
10.3       1
19.5       1
19.0       1
10.1       1
Name: count, dtype: int64


Check snapped screen size values for a specific model (MACBOOK)

In [None]:
result = data.loc[data['model_name'] == "MACBOOK", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
13.3    899
16.0    205
14.0    197
15.0    115
15.6     58
11.0     17
12.0      8
12.9      2
17.3      1
12.5      1
10.0      1
11.6      1
Name: count, dtype: int64


List all the cpus of MACBOOK laptops and the counts for each one 

In [None]:
pd.set_option('display.max_rows', 140)

macbook_cpus = data.loc[data['model_name'] == 'MACBOOK', 'CPU']
print(macbook_cpus.value_counts())

CPU
INTEL CORE I5                     254
APPLE M1                          248
APPLE M2                          211
APPLE M3                          151
INTEL CORE I7                     106
APPLE M1 PRO                       88
APPLE M4                           68
APPLE M3 PRO                       42
APPLE M2 PRO                       37
APPLE M1 MAX                       35
APPLE M3 MAX                       29
INTEL CORE I9                      28
APPLE M4 PRO                       28
APPLE M2 MAX                       23
INTEL CORE I5 1.8GHZ               13
INTEL CORE I5 1.6GHZ               13
INTEL CORE I5 2.3GHZ               12
INTEL CORE I3                      11
INTEL CORE I5 2.7GHZ                9
INTEL CORE I5 5350U                 8
INTEL CORE I7 9750H                 8
INTEL CORE I5 5257U                 8
INTEL CORE I5 8210Y                 8
INTEL CORE I7 2.6GHZ                7
INTEL CORE M3 8100Y                 7
8TH GEN INTEL CORE I5               6
APPLE M4

show the count of resolution for each cpu from macbook cpus

In [None]:
# show the count of resolution for each cpu from macbook cpus
macbook_data = data[data['model_name'] == 'MACBOOK']

for cpu in macbook_data['CPU'].unique():
    print(f"\n{cpu}:")
    print(macbook_data[macbook_data['CPU'] == cpu]['SCREEN_SIZE_SNAPPED'].value_counts())


INTEL CORE I5:
SCREEN_SIZE_SNAPPED
13.3    214
14.0      8
11.0      3
15.0      2
12.0      1
10.0      1
15.6      1
Name: count, dtype: int64

APPLE M1 MAX:
SCREEN_SIZE_SNAPPED
16.0    27
14.0     5
Name: count, dtype: int64

APPLE M2:
SCREEN_SIZE_SNAPPED
13.3    150
15.0     20
15.6     10
11.0      6
16.0      3
14.0      2
12.9      2
Name: count, dtype: int64

APPLE M3 MAX:
SCREEN_SIZE_SNAPPED
16.0    13
14.0    13
Name: count, dtype: int64

APPLE M1:
SCREEN_SIZE_SNAPPED
13.3    217
14.0      2
11.0      2
16.0      1
15.0      1
Name: count, dtype: int64

APPLE M4 MAX:
SCREEN_SIZE_SNAPPED
16.0    4
14.0    2
Name: count, dtype: int64

APPLE M2 MAX:
SCREEN_SIZE_SNAPPED
16.0    13
14.0     8
13.3     1
Name: count, dtype: int64

INTEL CORE I5 2310:
SCREEN_SIZE_SNAPPED
16.0    1
Name: count, dtype: int64

APPLE M4 PRO:
SCREEN_SIZE_SNAPPED
14.0    14
16.0    12
Name: count, dtype: int64

APPLE M4:
SCREEN_SIZE_SNAPPED
13.3    33
14.0    17
15.6     5
15.0     4
16.0     2
11.0     

Fill missing SCREEN_SIZE_SNAPPED values for MACBOOK using cpu name.

The Dynamic "Mode" Strategy
This script automates the mapping by calculating the most common screen size for every CPU present in the MACBOOK subset.

In [None]:
# 1. Create a mapping table: Most frequent Screen Size for every CPU
# We filter for MacBooks and drop rows where screen size is missing to find the 'Mode'
macbook_data = data[data['model_name'] == 'MACBOOK'].dropna(subset=['SCREEN_SIZE_SNAPPED'])

# This calculates the mode (most common value) for each CPU group
cpu_mode_mapping = macbook_data.groupby('CPU')['SCREEN_SIZE_SNAPPED'].agg(
    lambda x: x.mode().iloc[0] if not x.mode().empty else None
).to_dict()

# 2. Fill the missing values using the dynamic map
# 'mask' identifies exactly which rows need filling
mask = (data['model_name'] == 'MACBOOK') & (data['SCREEN_SIZE_SNAPPED'].isna())

# Map the CPU names in those rows to our calculated modes
data.loc[mask, 'SCREEN_SIZE_SNAPPED'] = data.loc[mask, 'CPU'].map(cpu_mode_mapping)

print(f"Filled missing values for {mask.sum()} MacBook records.")

Filled missing values for 158 MacBook records.


Refined Script with Keyword Fallback
This version handles the 130+ variations by falling back to general categories if the specific string doesn't have a known screen size.

In [None]:
def get_fallback_size(cpu_string):
    """Assigns a screen size based on architectural keywords if exact match fails."""
    cpu_string = str(cpu_string).upper()
    if 'M1 MAX' in cpu_string or 'M2 MAX' in cpu_string or 'M3 MAX' in cpu_string:
        return 16.0
    elif 'M1 PRO' in cpu_string or 'M2 PRO' in cpu_string:
        return 14.0 # Most common Pro size in newer models
    elif 'I9' in cpu_string:
        return 16.0
    elif 'M1' in cpu_string or 'M2' in cpu_string or 'M3' in cpu_string or 'I5' in cpu_string:
        return 13.3
    elif 'I7' in cpu_string:
        return 15.0
    return np.nan

# Apply the specific mapping first
data.loc[mask, 'SCREEN_SIZE_SNAPPED'] = data.loc[mask, 'CPU'].map(cpu_mode_mapping)

# Apply the fallback for any remaining NaNs in MacBooks
final_mask = (data['model_name'] == 'MACBOOK') & (data['SCREEN_SIZE_SNAPPED'].isna())
data.loc[final_mask, 'SCREEN_SIZE_SNAPPED'] = data.loc[final_mask, 'CPU'].apply(get_fallback_size)

print(
    f"Filled missing values for MACBOOK. "
    f"Now MACBOOK has "
    f"{data[data['model_name'] == 'MACBOOK']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
    f"missing values"
)

Filled missing values for MACBOOK. Now MACBOOK has 2 missing values


Fill missing SCREEN_SIZE_SNAPPED values for all remaining models

In [None]:
# List of remaining model names to process
remaining_models = [
    'THINKPAD', 'ELITEBOOK', 'PAVILION', 'VIVOBOOK', 'PROBOOK', 'INSPIRON', 
    'SURFACE', 'IDEAPAD', 'ASPIRE', 'XPS', 'STEALTH', 
    'PRECISION', 'VICTUS', 'TUF', 'VOSTRO', 'ROG', 'ZBOOK', 'LEGION', 'OMEN', 
    'ZENBOOK', 'NITRO', 'GALAXY', 'YOGA', 'THINKBOOK', 'ENVY', 'DYNABOOK', 
    'PREDATOR', 'KATANA', 'MAC', 'SWIFT', 'SPECTRE', 'ALIENWARE', 'AERO', 
    'IMAC', 'BLADE', 'VECTOR', 'TRAVELMATE', 'SPIN', 'STRIX', 'COMPAQ', 
    'GF', 'OPTIPLEX', 'SWORD', 'TRANSFORMER'
]

# Process each model
for model in remaining_models:
    # Calculate the mode for this model
    model_mode = data[data['model_name'] == model]['SCREEN_SIZE_SNAPPED'].mode()
    
    if len(model_mode) > 0:
        model_mode_value = model_mode[0]
        
        # Count missing values before filling
        missing_before = data[data['model_name'] == model]['SCREEN_SIZE_SNAPPED'].isna().sum()
        
        # Fill missing SCREEN_SIZE_SNAPPED values for this model with its mode
        data.loc[data['model_name'] == model, 'SCREEN_SIZE_SNAPPED'] = (
            data.loc[data['model_name'] == model, 'SCREEN_SIZE_SNAPPED']
            .fillna(model_mode_value)
        )
        
        # Count missing values after filling
        missing_after = data[data['model_name'] == model]['SCREEN_SIZE_SNAPPED'].isna().sum()
        
        if missing_before > 0:
            print(f"{model}: Filled {missing_before} missing values with mode {model_mode_value}. Remaining missing: {missing_after}")
    else:
        print(f"Warning: {model} has no non-missing SCREEN_SIZE_SNAPPED values")

print("\nAll remaining models processed!")

VIVOBOOK: Filled 120 missing values with mode 15.6. Remaining missing: 0
INSPIRON: Filled 115 missing values with mode 15.6. Remaining missing: 0
SURFACE: Filled 62 missing values with mode 13.3. Remaining missing: 0
IDEAPAD: Filled 62 missing values with mode 15.6. Remaining missing: 0
ASPIRE: Filled 66 missing values with mode 15.6. Remaining missing: 0
INSPIRON: Filled 115 missing values with mode 15.6. Remaining missing: 0
SURFACE: Filled 62 missing values with mode 13.3. Remaining missing: 0
IDEAPAD: Filled 62 missing values with mode 15.6. Remaining missing: 0
ASPIRE: Filled 66 missing values with mode 15.6. Remaining missing: 0
XPS: Filled 44 missing values with mode 13.3. Remaining missing: 0
STEALTH: Filled 58 missing values with mode 15.6. Remaining missing: 0
PRECISION: Filled 63 missing values with mode 15.6. Remaining missing: 0
VICTUS: Filled 60 missing values with mode 15.6. Remaining missing: 0
TUF: Filled 39 missing values with mode 15.6. Remaining missing: 0
XPS: Fill

# 5. Clean SCREEN_RESOLUTION

Normalize text.
* Normalize SCREEN_RESOLUTION text (convert to lowercase and remove spaces)

In [None]:
data['SCREEN_RESOLUTION'] = (
    data['SCREEN_RESOLUTION']
    .astype(str)
    .str.lower()
    .str.replace(' ', '')
)

Display the frequency distribution of values in the SCREEN_RESOLUTION column after normalization.

In [None]:
pd.set_option('display.max_rows', 120)
data['SCREEN_RESOLUTION'].value_counts()

SCREEN_RESOLUTION
nan                10246
1920x1080           2740
1920x1080fhd        1916
1920x1200            239
fhd                  218
2560x1440            116
2560x1600            107
1366x768             102
2k                    99
3k                    86
3840x2160             80
fullhd                34
2560x1664             25
2880x1800             25
2256x1504             21
3024x1964             20
2.5k                  19
3840x2400             16
2880x1920             14
2.8k                  13
fhd+                  13
1440x900              12
1920x1280             11
4k                    10
3456x2234              9
2048x1080              9
3072x1920              8
qhd+                   8
2736x1824              7
1600x900               7
5120x2880              6
2880x1864              6
1536x1024              6
3koled                 6
1920x1200fhd           5
2400x1600              5
2240x1400              4
wuxga                  4
1280x800               4
3kretin

Map resolution values to standard categories (HD, FHD, QHD, 4K, ...)

In [None]:
# Normalize SCREEN_RESOLUTION into standardized resolution tiers
resolution_map = {
    # HD
    '1366x768': 'HD',
    '1280x720': 'HD',
    'hd': 'HD',

    # HD+
    '1440x900': 'HD+',
    '1600x900': 'HD+',
    '1536x1024': 'HD+',
    '1280x800': 'HD+',

    # FHD
    '1920x1080': 'FHD',
    '1920x1080fhd': 'FHD',
    'fullhd': 'FHD',
    'fhd': 'FHD',
    '1080p': 'FHD',
    'fhd1080p': 'FHD',
    '1920x1080fullhd': 'FHD',

    # WUXGA (FHD+ / 16:10)
    '1920x1200': 'WUXGA',
    '1920x1200fhd': 'WUXGA',
    '1920x1200fhd+': 'WUXGA',
    '1920x1200wuxga': 'WUXGA',
    '1920x1280': 'WUXGA',
    'fhd+': 'WUXGA',
    'fullhd+': 'WUXGA',
    'wuxga': 'WUXGA',

    # QHD / 2K
    '2560x1440': 'QHD',
    '2560x1440qhd': 'QHD',
    'qhd': 'QHD',
    'wqhd': 'QHD',
    '2k': 'QHD',
    'qhd2k': 'QHD',
    '1440p': 'QHD',
    '2048x1080': 'QHD',


    # QHD+ (16:10)
    '2560x1600': 'QHD+',
    '2560x1600qhd+': 'QHD+',
    '2400x1600': 'QHD+',
    '2240x1400': 'QHD+',
    '2560x1664': 'QHD+',
    '2256x1504': 'QHD+',
    'wqxga': 'QHD+',
    'wqxga+': 'QHD+',
    'qhd+': 'QHD+',
    '2.5k': 'QHD+',
    '2496x1664': 'QHD+',
    '2360x1640': 'QHD+',
    '2304x1536': 'QHD+',

    # 3K-class (high-density laptop panels)
    '2880x1800': '3K',
    '2880x1920': '3K',
    '3072x1920': '3K',
    '3000x2000': '3K',
    '3024x1964': '3K',
    '3200x2000': '3K',
    '2736x1824': '3K',
    '2736x1834': '3K',
    '2736x1823': '3K',
    '3456x2234': '3K',
    '3k': '3K',
    '2.8k': '3K',
    '2880x1864': '3K',
    '3koled': '3K',
    '2880x1864': '3K',
    '3kretina': '3K',

    # 4K / UHD
    '3840x2160': '4K',
    '3840x2400': '4K',
    '3456x2160': '4K',
    '3240x2160': '4K',
    '4k': '4K',
    '4kuhd': '4K',

    # 5K
    '5120x2880': '5K',
    '5k': '5K'
}

data['SCREEN_RESOLUTION_STD'] = (
    data['SCREEN_RESOLUTION']
    .str.lower()
    .str.strip()
    .map(resolution_map)
    .fillna(data['SCREEN_RESOLUTION'])
)


# resolution hierarchy (for modeling)
# HD < HD+ < FHD < WUXGA < QHD < QHD+ < 3K < 4K < 5K

Display the frequency distribution of the normalized SCREEN_RESOLUTION_STD values.

In [None]:
pd.set_option('display.max_rows', 120)
data['SCREEN_RESOLUTION_STD'].value_counts()

SCREEN_RESOLUTION_STD
nan              10246
FHD               4911
WUXGA              278
QHD                233
3K                 208
QHD+               205
4K                 113
HD                 107
HD+                 29
5K                   7
1920x120             2
2880x1620            2
1929x1080            2
retina3k             2
3072x1620            2
3120x2080            2
2160x1440            2
wqxga2k              2
2k8                  2
2960x1848            2
3200x1800            2
2.4k                 1
2964x1694            1
fhd+1200p            1
1080x1920            1
1920x1280qhd         1
1920x1200qhd         1
1920x1980            1
2388x1668            1
2000x1200            1
1920x180fhd          1
2304x1440            1
1800x1200            1
1336x768             1
2160x1350            1
fhd+wuxga            1
2400p                1
2520x1680            1
3kqhd+               1
qhd+4k               1
4480x2520            1
2084x1080            1
1920x1080fhd

set non standard SCREEN_RESOLUTION_STD values to nan

In [None]:
# set non standard SCREEN_RESOLUTION_STD to NaN
valid_resolutions = [
    'HD', 'HD+', 'FHD', 'WUXGA', 'QHD', 'QHD+', '3K', '4K', '5K'
]

data.loc[  # Use .loc to set values in the DataFrame where condition is met
    ~data['SCREEN_RESOLUTION_STD'].isin(valid_resolutions), 
    'SCREEN_RESOLUTION_STD'  # Column to update
] = np.nan 

# Print the frequency count of each unique value in SCREEN_RESOLUTION_STD after setting invalid ones to NaN
print(data['SCREEN_RESOLUTION_STD'].value_counts())  
# Calculate and display the percentage of missing (NaN) values in SCREEN_RESOLUTION_STD
data['SCREEN_RESOLUTION_STD'].isna().mean() * 100  

SCREEN_RESOLUTION_STD
FHD      4911
WUXGA     278
QHD       233
3K        208
QHD+      205
4K        113
HD        107
HD+        29
5K          7
Name: count, dtype: int64


62.841630063445585

Show the percentage of missing SCREEN_RESOLUTION_STD values for each model name

In [None]:
summary = data.groupby('model_name').agg(
    total_rows=('SCREEN_RESOLUTION_STD', 'size'),
    nan_rows=('SCREEN_RESOLUTION_STD', lambda s: s.isna().sum())
)

summary['percentage_nan'] = (summary['nan_rows'] / summary['total_rows']) * 100

mode_counts = (
    data.dropna(subset=['SCREEN_RESOLUTION_STD'])
      .groupby('model_name')['SCREEN_RESOLUTION_STD']
      .value_counts()
      .rename('mode_count')
      .reset_index()
      .sort_values(['model_name', 'mode_count'], ascending=[True, False])
      .drop_duplicates('model_name')
      .set_index('model_name')
      [['SCREEN_RESOLUTION_STD', 'mode_count']]
)

summary = summary.join(mode_counts).rename(columns={'SCREEN_RESOLUTION_STD': 'mode_resolution'})

summary = summary.sort_values(by='total_rows', ascending=False)

summary

Unnamed: 0_level_0,total_rows,nan_rows,percentage_nan,mode_resolution,mode_count
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
THINKPAD,2402,1419,59.07577,FHD,833.0
LATITUDE,2334,1195,51.199657,FHD,1097.0
MACBOOK,1663,1438,86.470235,QHD+,85.0
ELITEBOOK,1132,579,51.14841,FHD,520.0
PAVILION,1126,772,68.561279,FHD,309.0
VIVOBOOK,787,527,66.963151,FHD,214.0
PROBOOK,737,370,50.203528,FHD,335.0
INSPIRON,656,460,70.121951,FHD,171.0
SURFACE,491,320,65.173116,3K,49.0
IDEAPAD,481,292,60.706861,FHD,174.0


Fill missing SCREEN_RESOLUTION values with the specific mode for each model name

In [None]:
# fill missing SCREEN_RESOLUTION_STD by model_name mode
for model in summary.index:
    mode_value = summary.loc[model, 'mode_resolution']
    if pd.notna(mode_value):
        mask = (data['model_name'] == model) & (data['SCREEN_RESOLUTION_STD'].isna())
        data.loc[mask, 'SCREEN_RESOLUTION_STD'] = mode_value
        filled_count = mask.sum()
        if filled_count > 0:
            print(f"Filled {filled_count} missing values for {model} with mode {mode_value}")       
print("\nAll models processed for SCREEN_RESOLUTION_STD!")  

Filled 1419 missing values for THINKPAD with mode FHD
Filled 1195 missing values for LATITUDE with mode FHD
Filled 1438 missing values for MACBOOK with mode QHD+
Filled 579 missing values for ELITEBOOK with mode FHD
Filled 772 missing values for PAVILION with mode FHD
Filled 527 missing values for VIVOBOOK with mode FHD
Filled 370 missing values for PROBOOK with mode FHD
Filled 460 missing values for INSPIRON with mode FHD
Filled 320 missing values for SURFACE with mode 3K
Filled 292 missing values for IDEAPAD with mode FHD
Filled 246 missing values for ASPIRE with mode FHD
Filled 176 missing values for XPS with mode FHD
Filled 161 missing values for STEALTH with mode FHD
Filled 167 missing values for PRECISION with mode FHD
Filled 129 missing values for VICTUS with mode FHD
Filled 127 missing values for TUF with mode FHD
Filled 111 missing values for VOSTRO with mode FHD
Filled 122 missing values for ROG with mode FHD
Filled 126 missing values for ZBOOK with mode FHD
Filled 111 missin

Filled 73 missing values for THINKBOOK with mode FHD
Filled 70 missing values for ENVY with mode FHD
Filled 100 missing values for DYNABOOK with mode FHD
Filled 47 missing values for PREDATOR with mode QHD+
Filled 41 missing values for KATANA with mode FHD
Filled 58 missing values for MAC with mode 3K
Filled 29 missing values for SWIFT with mode FHD
Filled 23 missing values for SPECTRE with mode FHD
Filled 30 missing values for ALIENWARE with mode QHD+
Filled 18 missing values for AERO with mode FHD
Filled 21 missing values for IMAC with mode 5K
Filled 22 missing values for BLADE with mode FHD
Filled 10 missing values for VECTOR with mode FHD
Filled 14 missing values for TRAVELMATE with mode FHD
Filled 4 missing values for SPIN with mode FHD
Filled 5 missing values for STRIX with mode FHD
Filled 1 missing values for GF with mode FHD
Filled 3 missing values for SWORD with mode FHD
Filled 2 missing values for OPTIPLEX with mode FHD
Filled 1 missing values for TRANSFORMER with mode HD

Al

# 6. Encode SCREEN_RESOLUTION (for ML)

Create numeric encodings for resolution categories for machine learning

In [None]:
# Encode SCREEN_RESOLUTION_STD into numerical values for modeling using this resolution hierarchy 
# HD < HD+ < FHD < WUXGA < QHD < QHD+ < 3K < 4K < 5K
resolution_encoding = {
    'HD': 1,
    'HD+': 2,
    'FHD': 3,
    'WUXGA': 4,
    'QHD': 5,
    'QHD+': 6,
    '3K': 7,
    '4K': 8,
    '5K': 9
}

data['SCREEN_RESOLUTION_ENC'] = data['SCREEN_RESOLUTION_STD'].map(resolution_encoding)

# 7. Final Check

Verify data types and display sample rows of cleaned data

In [None]:
data[['SCREEN_SIZE_SNAPPED','SCREEN_RESOLUTION_STD','SCREEN_RESOLUTION_ENC']].info()
data[['SCREEN_SIZE_SNAPPED','SCREEN_RESOLUTION_STD']].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16392 entries, 0 to 16391
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   SCREEN_SIZE_SNAPPED    16165 non-null  float64
 1   SCREEN_RESOLUTION_STD  16024 non-null  object 
 2   SCREEN_RESOLUTION_ENC  16024 non-null  float64
dtypes: float64(2), object(1)
memory usage: 384.3+ KB


Unnamed: 0,SCREEN_SIZE_SNAPPED,SCREEN_RESOLUTION_STD
0,14.0,FHD
1,15.6,4K
2,17.3,FHD
3,14.0,FHD
4,,


## 8. Export Final Cleaned Dataset

This is the **final dataset** ready for machine learning models!

Export the fully cleaned dataset ready for modeling

In [None]:
# Export final cleaned dataset
output_file = "final_cleaned_data.csv"
data.to_csv(output_file, index=False)

print("=" * 70)
print("‚úÖ FINAL CLEANED DATASET EXPORTED!")
print("=" * 70)
print(f"üìÅ Output file: {output_file}")
print(f"üìä Total rows: {len(data)}")
print(f"üìã Total columns: {len(data.columns)}")
print("\nüéØ This dataset is now ready for machine learning models!")
print("=" * 70)

# Show summary statistics
print("\nüìà Dataset Summary:")
print(f"  - CPU mapped: {(data['mapped_cpu_name'] != 'NA').sum()} rows")
print(f"  - RAM filled: {data['RAM_SIZE'].notna().sum()} rows")
print(f"  - Storage filled: {((data['SSD_SIZE'].notna()) | (data['HDD_SIZE'].notna())).sum()} rows")
print(f"  - Screen size cleaned: {data['SCREEN_SIZE'].notna().sum()} rows")
print(f"  - GPU data: {data['gpu_name'].notna().sum()} rows")
print("=" * 70)

‚úÖ FINAL CLEANED DATASET EXPORTED!
üìÅ Output file: final_cleaned_data.csv
üìä Total rows: 16392
üìã Total columns: 28

üéØ This dataset is now ready for machine learning models!

üìà Dataset Summary:
  - CPU mapped: 16392 rows
  - RAM filled: 16329 rows
  - Storage filled: 16303 rows
  - Screen size cleaned: 14232 rows
  - GPU data: 15690 rows
