# This notebook is for data pre-processing

## Libraries

In [19]:
import numpy as np
import pandas as pd

## Data

In [20]:
data = pd.read_csv('data.csv')
data.shape

(16394, 14)

## General Cleaning

In [21]:
# Removing exact duplicate rows (rows identical across all columns)
# Count duplicates before removal
dup_count = data.duplicated().sum()
print(f"Found {dup_count} exact duplicate rows")
if dup_count > 0:
    # Drop exact duplicates and reset index
    data.drop_duplicates(inplace=True)
    data.reset_index(drop=True, inplace=True)
    print(f"Removed {dup_count} duplicates. New shape: {data.shape}")
else:
    print('No duplicate rows found.')

Found 1 exact duplicate rows
Removed 1 duplicates. New shape: (16393, 14)


In [22]:
# Detect groups where all attributes except RAM or storage (SSD/HDD) match,
# but RAM or storage is null in some rows and not in others.
# This helps find near-duplicates where only RAM/storage information is missing in some entries
cols = data.columns.tolist()
# flexible matching for RAM-like columns (case-insensitive, substring match)
ram_cols = [c for c in cols if 'ram' in c.lower() or 'memory' in c.lower()]
# storage detection expanded to SSD, HDD, storage, drive, disk
storage_cols = [c for c in cols if any(k in c.lower() for k in ['ssd', 'hdd', 'storage', 'drive', 'disk'])]
print('Detected ram-like columns:', ram_cols)
print('Detected storage-like columns (SSD/HDD):', storage_cols)
# Group by the specific keys you requested (flexible matching):
# price, date created, etat, model name, city, gpu, cpu, screen size, screen frequency, resolution
def find_col(variants, cols):
    """Return the first column whose name contains any of the provided variants (case-insensitive)."""
    variants = [v.lower() for v in variants]
    for c in cols:
        cname = c.lower()
        if any(v in cname for v in variants):
            return c
    return None
# mapping desired keys to lists of variants to match
desired_keys = {
    'price': ['price'],
    'date_created': ['date', 'create', 'created'],
    'etat': ['etat'],
    'model': ['model', 'model name', 'model_name'],
    'city': ['city'],
    'gpu': ['gpu', 'graphics'],
    'cpu': ['cpu', 'processor'],
    'screen_size': ['screen', 'size'],
    'screen_freq': ['frequency', 'hz', 'screen', 'refresh'],
    'resolution': ['resolution', 'res']
}
found_keys = {k: find_col(v, cols) for k, v in desired_keys.items()}
print('Found column mapping for grouping:')
for k, col in found_keys.items():
    print(f'  {k} -> {col}')
# Check for missing requested columns
missing = [k for k, col in found_keys.items() if col is None]
if missing:
    print('Warning: Could not find these requested grouping columns in the dataset:', missing)
# Build key_cols from the requested keys that were found (preserve order)
requested_order = ['price','date_created','etat','model','city','gpu','cpu','screen_size','screen_freq','resolution']
key_cols = [found_keys[k] for k in requested_order if found_keys.get(k)]
# If we don't have enough requested columns, fall back to grouping by all non-ram/non-storage cols
if len(key_cols) < 2:
    print('Not enough requested grouping columns found (need at least 2). Falling back to grouping by all non-ram/non-storage columns.')
    key_cols = [c for c in cols if c not in ram_cols + storage_cols]
print('Using key columns for grouping:', key_cols)
groups = data.groupby(key_cols, dropna=False)
problem_groups = []
for key, grp in groups:
    if len(grp) < 2:
        continue
    # check RAM columns for mixed null vs non-null within group
    ram_issue = False
    if ram_cols:
        ram_issue = any(grp[rc].isna().any() and (~grp[rc].isna()).any() for rc in ram_cols)
    # check storage columns (SSD/HDD) for mixed null vs non-null within group
    storage_issue = False
    if storage_cols:
        storage_issue = any(grp[sc].isna().any() and (~grp[sc].isna()).any() for sc in storage_cols)
    if ram_issue or storage_issue:
        problem_groups.append((key, grp))
# Auto-resolution: several possible strategies
# 1) If exactly one row has any storage info and it matches '1000' (HDD 1000), copy it to others and drop source
# 2) If multiple rows have storage info but across different storage columns (e.g., one row has SSD, another has HDD),
#    merge storage fields into a single target row and delete the redundant row(s).
resolved = 0
for key, grp in problem_groups:
    storage_existing = [c for c in storage_cols if c in grp.columns]
    if not storage_existing:
        continue
    # which rows in this group have any non-null storage value
    nonnull_mask = grp[storage_existing].notna().any(axis=1)
    rows_with_storage = grp[nonnull_mask]
    # Strategy A: single storage row -> maybe HDD 1000 pattern
    if len(rows_with_storage) == 1:
        src_idx = rows_with_storage.index[0]
        src_vals = rows_with_storage.iloc[0][storage_existing]
        def looks_like_1000(x):
            if pd.isna(x):
                return False
            s = str(x).lower().replace(' ', '')
            return ('1000' in s) or ('1tb' in s) or ('1000gb' in s)
        if any(looks_like_1000(v) for v in src_vals.dropna()):
            target_idxs = [i for i in grp.index if i != src_idx]
            for c in storage_existing:
                mask = data.loc[target_idxs, c].isna()
                if mask.any():
                    data.loc[target_idxs, c] = data.loc[target_idxs, c].where(~mask, other=data.loc[src_idx, c])
            data.drop(index=src_idx, inplace=True)
            resolved += 1
            print(f'Resolved group {key}: moved storage from index {src_idx} to {target_idxs} and deleted index {src_idx}')
        else:
            print(f'Group {key}: single storage row found but did not match 1000 pattern; skipped auto-resolution')
    # Strategy B: multiple rows with storage but complementary across columns -> merge into one row
    else:
        # determine non-null positions per storage column
        col_nonnull_counts = {c: grp[c].notna().sum() for c in storage_existing}
        total_nonnull_cells = sum(col_nonnull_counts.values())
        # If non-null cells are distributed across rows without overlap (each storage cell non-null appears in only one row),
        # we can merge them. Check that no row has two storage columns non-null (optional), and that total_nonnull_cells <= len(rows_with_storage) * len(storage_existing)
        # Simpler heuristic: if each storage column has at most one non-null entry and the number of rows_with_storage equals the number of distinct non-null rows,
        distinct_rows_with_storage = set(rows_with_storage.index.tolist())
        if all(v <= 1 for v in col_nonnull_counts.values()):
            # pick target as the first row that has any storage (prefer the one with SSD if present)
            target_idx = None
            # try to prefer a row that has SSD (or first storage column) non-null
            preferred = storage_existing[0] if storage_existing else None
            if preferred is not None:
                candidates = rows_with_storage[rows_with_storage[preferred].notna()].index.tolist()
                if candidates:
                    target_idx = candidates[0]
            if target_idx is None:
                target_idx = rows_with_storage.index[0]
            other_rows = [i for i in rows_with_storage.index if i != target_idx]
            # copy complementary storage values into target where null
            for other in other_rows:
                for c in storage_existing:
                    if pd.isna(data.at[target_idx, c]) and not pd.isna(data.at[other, c]):
                        data.at[target_idx, c] = data.at[other, c]
                # after copying, drop the other row
                data.drop(index=other, inplace=True)
                resolved += 1
                print(f'Merged storage from index {other} into {target_idx} and deleted {other} for group {key}')
        else:
            print(f'Group {key}: storage columns have multiple non-null entries per column; skipped auto-merge')
# After attempting auto-resolution, reset index if any changes were made
if resolved > 0:
    data.reset_index(drop=True, inplace=True)
    print(f'Auto-resolved {resolved} groups. New data shape: {data.shape}')
    # Recompute groups and problem_groups on the mutated dataframe so printed results reflect changes
    groups = data.groupby(key_cols, dropna=False)
    new_problem_groups = []
    for key, grp in groups:
        if len(grp) < 2:
            continue
        ram_issue = False
        if ram_cols:
            ram_issue = any(grp[rc].isna().any() and (~grp[rc].isna()).any() for rc in ram_cols)
        storage_issue = False
        if storage_cols:
            storage_issue = any(grp[sc].isna().any() and (~grp[sc].isna()).any() for sc in storage_cols)
        if ram_issue or storage_issue:
            new_problem_groups.append((key, grp))
    problem_groups = new_problem_groups
# Finally, print any remaining problem groups for manual review
if not problem_groups:
    print('No groups found where RAM or storage are null in some rows and not in others while other attributes match.')
else:
    print(f'Found {len(problem_groups)} potential groups (post-resolution):')
    for i, (key, grp) in enumerate(problem_groups, 1):
        print('\n---')
        print(f'Group {i} key: {key}')
        # show indices and relevant columns to help decide which rows to drop
        display_cols = key_cols + ram_cols + storage_cols
        # ensure ordering and existence
        display_cols = [c for c in display_cols if c in grp.columns]
        grp_display = grp[display_cols].copy()
        grp_display['_index'] = grp_display.index
        print(grp_display.to_string(index=False))
        print('---')

# Add a unique integer primary key column named 'id' (starting at 1). If 'id' exists, overwrite it after warning.
if 'id' in data.columns:
    print("Column 'id' already exists in the dataset; it will be overwritten with new sequential IDs.")
# Ensure index is contiguous before assigning IDs
data.reset_index(drop=True, inplace=True)
data.insert(0, 'id', range(1, len(data) + 1))
print(f"Added 'id' column as primary key. Data shape now: {data.shape}")

Detected ram-like columns: ['RAM_SIZE', 'RAM_TYPE']
Detected storage-like columns (SSD/HDD): ['SSD_SIZE', 'HDD_SIZE']
Found column mapping for grouping:
  price -> price_preview
  date_created -> created_at
  etat -> spec_Etat
  model -> model_name
  city -> city
  gpu -> DEDICATED_GPU
  cpu -> CPU
  screen_size -> RAM_SIZE
  screen_freq -> SCREEN_SIZE
  resolution -> SCREEN_RESOLUTION
Using key columns for grouping: ['price_preview', 'created_at', 'spec_Etat', 'model_name', 'city', 'DEDICATED_GPU', 'CPU', 'RAM_SIZE', 'SCREEN_SIZE', 'SCREEN_RESOLUTION']
Merged storage from index 10345 into 10344 and deleted 10345 for group (75000.0, '2024 11 01T15:06:57.000Z', 'JAMAIS UTILIS', 'VOSTRO', 'EZZOUAR', nan, '11TH GEN INTEL CORE I3 1145G4', '8GB', 14.1, nan)
Auto-resolved 1 groups. New data shape: (16392, 14)
Merged storage from index 10345 into 10344 and deleted 10345 for group (75000.0, '2024 11 01T15:06:57.000Z', 'JAMAIS UTILIS', 'VOSTRO', 'EZZOUAR', nan, '11TH GEN INTEL CORE I3 1145G4', 

*
*
*
*
*
*
*
*
*
*
*
*
*
*

In [23]:
data.shape

(16392, 15)

Remove the SCREEN_FREQUENCY column as it has limited utility

In [None]:
data.drop(columns=['SCREEN_FREQUENCY'], inplace=True)

Convert and normalize SCREEN_SIZE values (replace commas with decimals and extract numeric values)

In [None]:
data['SCREEN_SIZE'] = (
    data['SCREEN_SIZE']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .str.extract(r'(\d+\.?\d*)')[0]
    .astype(float)
)

Replace screen sizes outside the valid range (10-20 inches) with NaN

In [None]:
data.loc[
    (data['SCREEN_SIZE'] < 10) | (data['SCREEN_SIZE'] > 20),
    'SCREEN_SIZE'
] = np.nan

Define canonical screen sizes and snap values to the nearest standard size if within tolerance

In [None]:
canonical_sizes = np.array([
    11.6, 12.5, 13.3, 14.0, 15.0, 15.6, 16.0, 17.3      # we can add 14.1 and 16.1
])
# these standard sizes ~80% of the data
# Adding 14.1 & 16.1 improves coverage by ~1.9% only.

def snap_screen_size(x):
    if pd.isna(x):
        return np.nan
    diff = np.abs(canonical_sizes - x)
    min_diff = diff.min()
    min_diff = np.round(min_diff, 2)
    if min_diff <= 0.3:
        return canonical_sizes[diff.argmin()]
    return x  # keep rare but valid sizes

data['SCREEN_SIZE_SNAPPED'] = data['SCREEN_SIZE'].apply(snap_screen_size)


Fill missing SCREEN_SIZE values for most common model names using model-specific mode

In [None]:
# Calculate the mode for LATITUDE model
latitude_mode = data[data['model_name'] == 'LATITUDE']['SCREEN_SIZE_SNAPPED'].mode()

if len(latitude_mode) > 0:
    latitude_mode_value = latitude_mode[0]
    print(f"LATITUDE mode SCREEN_SIZE_SNAPPED: {latitude_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for LATITUDE with its mode
    data.loc[data['model_name'] == 'LATITUDE', 'SCREEN_SIZE_SNAPPED'] = data.loc[data['model_name'] == 'LATITUDE', 'SCREEN_SIZE_SNAPPED'].fillna(latitude_mode_value)

    print(f"Filled missing values for LATITUDE. Now LATITUDE has {data[data['model_name'] == 'LATITUDE']['SCREEN_SIZE_SNAPPED'].isna().sum()} missing values")
else:
    print("Warning: LATITUDE model has no non-missing SCREEN_SIZE_SNAPPED values")

# Calculate the mode for THINKPAD model
thinkpad_mode = data[data['model_name'] == 'THINKPAD']['SCREEN_SIZE_SNAPPED'].mode()

if len(thinkpad_mode) > 0:
    thinkpad_mode_value = thinkpad_mode[0]
    print(f"THINKPAD mode SCREEN_SIZE_SNAPPED: {thinkpad_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for THINKPAD with its mode
    data.loc[data['model_name'] == 'THINKPAD', 'SCREEN_SIZE_SNAPPED'] = (
        data.loc[data['model_name'] == 'THINKPAD', 'SCREEN_SIZE_SNAPPED']
        .fillna(thinkpad_mode_value)
    )

    print(
        f"Filled missing values for THINKPAD. "
        f"Now THINKPAD has "
        f"{data[data['model_name'] == 'THINKPAD']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
        f"missing values"
    )
else:
    print("Warning: THINKPAD model has no non-missing SCREEN_SIZE_SNAPPED values")


# Calculate the mode for ELITEBOOK model
elitebook_mode = data[data['model_name'] == 'ELITEBOOK']['SCREEN_SIZE'].mode()

if len(elitebook_mode) > 0:
    elitebook_mode_value = elitebook_mode[0]
    print(f"ELITEBOOK mode SCREEN_SIZE: {elitebook_mode_value}")

    # Fill missing SCREEN_SIZE values for ELITEBOOK with its mode
    data.loc[data['model_name'] == 'ELITEBOOK', 'SCREEN_SIZE'] = (
        data.loc[data['model_name'] == 'ELITEBOOK', 'SCREEN_SIZE']
        .fillna(elitebook_mode_value)
    )

    print(
        f"Filled missing values for ELITEBOOK. "
        f"Now ELITEBOOK has "
        f"{data[data['model_name'] == 'ELITEBOOK']['SCREEN_SIZE'].isna().sum()} "
        f"missing values"
    )
else:
    print("Warning: ELITEBOOK model has no non-missing SCREEN_SIZE values")


# Calculate the mode for PAVILION model
pavilion_mode = data[data['model_name'] == 'PAVILION']['SCREEN_SIZE_SNAPPED'].mode()

if len(pavilion_mode) > 0:
    pavilion_mode_value = pavilion_mode[0]
    print(f"PAVILION mode SCREEN_SIZE_SNAPPED: {pavilion_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for PAVILION with its mode
    data.loc[data['model_name'] == 'PAVILION', 'SCREEN_SIZE_SNAPPED'] = (
        data.loc[data['model_name'] == 'PAVILION', 'SCREEN_SIZE_SNAPPED']
        .fillna(pavilion_mode_value)
    )

    print(
        f"Filled missing values for PAVILION. "
        f"Now PAVILION has "
        f"{data[data['model_name'] == 'PAVILION']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
        f"missing values"
    )
else:
    print("Warning: PAVILION model has no non-missing SCREEN_SIZE_SNAPPED values")


# Calculate the mode for PAVILION model
pavilion_mode = data[data['model_name'] == 'PAVILION']['SCREEN_SIZE_SNAPPED'].mode()

if len(pavilion_mode) > 0:
    pavilion_mode_value = pavilion_mode[0]
    print(f"PAVILION mode SCREEN_SIZE_SNAPPED: {pavilion_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for PAVILION with its mode
    data.loc[data['model_name'] == 'PAVILION', 'SCREEN_SIZE_SNAPPED'] = (
        data.loc[data['model_name'] == 'PAVILION', 'SCREEN_SIZE_SNAPPED']
        .fillna(pavilion_mode_value)
    )

    print(
        f"Filled missing values for PAVILION. "
        f"Now PAVILION has "
        f"{data[data['model_name'] == 'PAVILION']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
        f"missing values"
    )
else:
    print("Warning: PAVILION model has no non-missing SCREEN_SIZE_SNAPPED values")


# Calculate the mode for PROBOOK model
probook_mode = data[data['model_name'] == 'PROBOOK']['SCREEN_SIZE_SNAPPED'].mode()

if len(probook_mode) > 0:
    probook_mode_value = probook_mode[0]
    print(f"PROBOOK mode SCREEN_SIZE_SNAPPED: {probook_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for PROBOOK with its mode
    data.loc[data['model_name'] == 'PROBOOK', 'SCREEN_SIZE_SNAPPED'] = (
        data.loc[data['model_name'] == 'PROBOOK', 'SCREEN_SIZE_SNAPPED']
        .fillna(probook_mode_value)
    )

    print(
        f"Filled missing values for PROBOOK. "
        f"Now PROBOOK has "
        f"{data[data['model_name'] == 'PROBOOK']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
        f"missing values"
    )
else:
    print("Warning: PROBOOK model has no non-missing SCREEN_SIZE_SNAPPED values")


LATITUDE mode SCREEN_SIZE_SNAPPED: 14.0
Filled missing values for LATITUDE. Now LATITUDE has 0 missing values


The Dynamic "Mode" Strategy
This script automates the mapping by calculating the most common screen size for every CPU present in the MACBOOK subset.

In [None]:
# 1. Create a mapping table: Most frequent Screen Size for every CPU
# We filter for MacBooks and drop rows where screen size is missing to find the 'Mode'
macbook_data = data[data['model_name'] == 'MACBOOK'].dropna(subset=['SCREEN_SIZE_SNAPPED'])

# This calculates the mode (most common value) for each CPU group
cpu_mode_mapping = macbook_data.groupby('CPU')['SCREEN_SIZE_SNAPPED'].agg(
    lambda x: x.mode().iloc[0] if not x.mode().empty else None
).to_dict()

# 2. Fill the missing values using the dynamic map
# 'mask' identifies exactly which rows need filling
mask = (data['model_name'] == 'MACBOOK') & (data['SCREEN_SIZE_SNAPPED'].isna())

# Map the CPU names in those rows to our calculated modes
data.loc[mask, 'SCREEN_SIZE_SNAPPED'] = data.loc[mask, 'CPU'].map(cpu_mode_mapping)

print(f"Filled missing values for {mask.sum()} MacBook records.")

Filled missing values for 158 MacBook records.


Refined Script with Keyword Fallback
This version handles the 130+ variations by falling back to general categories if the specific string doesn't have a known screen size.

In [None]:
def get_fallback_size(cpu_string):
    """Assigns a screen size based on architectural keywords if exact match fails."""
    cpu_string = str(cpu_string).upper()
    if 'M1 MAX' in cpu_string or 'M2 MAX' in cpu_string or 'M3 MAX' in cpu_string:
        return 16.0
    elif 'M1 PRO' in cpu_string or 'M2 PRO' in cpu_string:
        return 14.0 # Most common Pro size in newer models
    elif 'I9' in cpu_string:
        return 16.0
    elif 'M1' in cpu_string or 'M2' in cpu_string or 'M3' in cpu_string or 'I5' in cpu_string:
        return 13.3
    elif 'I7' in cpu_string:
        return 15.0
    return np.nan

# Apply the specific mapping first
data.loc[mask, 'SCREEN_SIZE_SNAPPED'] = data.loc[mask, 'CPU'].map(cpu_mode_mapping)

# Apply the fallback for any remaining NaNs in MacBooks
final_mask = (data['model_name'] == 'MACBOOK') & (data['SCREEN_SIZE_SNAPPED'].isna())
data.loc[final_mask, 'SCREEN_SIZE_SNAPPED'] = data.loc[final_mask, 'CPU'].apply(get_fallback_size)

print(
    f"Filled missing values for MACBOOK. "
    f"Now MACBOOK has "
    f"{data[data['model_name'] == 'MACBOOK']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
    f"missing values"
)

Filled missing values for MACBOOK. Now MACBOOK has 2 missing values


Fill missing SCREEN_SIZE_SNAPPED values for all remaining models

In [None]:
# List of remaining model names to process
remaining_models = [
    'VIVOBOOK', 'INSPIRON', 'SURFACE', 'IDEAPAD', 'ASPIRE', 'XPS', 'STEALTH', 
    'PRECISION', 'VICTUS', 'TUF', 'VOSTRO', 'ROG', 'ZBOOK', 'LEGION', 'OMEN', 
    'ZENBOOK', 'NITRO', 'GALAXY', 'YOGA', 'THINKBOOK', 'ENVY', 'DYNABOOK', 
    'PREDATOR', 'KATANA', 'MAC', 'SWIFT', 'SPECTRE', 'ALIENWARE', 'AERO', 
    'IMAC', 'BLADE', 'VECTOR', 'TRAVELMATE', 'SPIN', 'STRIX', 'COMPAQ', 
    'GF', 'OPTIPLEX', 'SWORD', 'TRANSFORMER'
]

# Process each model
for model in remaining_models:
    # Calculate the mode for this model
    model_mode = data[data['model_name'] == model]['SCREEN_SIZE_SNAPPED'].mode()
    
    if len(model_mode) > 0:
        model_mode_value = model_mode[0]
        
        # Count missing values before filling
        missing_before = data[data['model_name'] == model]['SCREEN_SIZE_SNAPPED'].isna().sum()
        
        # Fill missing SCREEN_SIZE_SNAPPED values for this model with its mode
        data.loc[data['model_name'] == model, 'SCREEN_SIZE_SNAPPED'] = (
            data.loc[data['model_name'] == model, 'SCREEN_SIZE_SNAPPED']
            .fillna(model_mode_value)
        )
        
        # Count missing values after filling
        missing_after = data[data['model_name'] == model]['SCREEN_SIZE_SNAPPED'].isna().sum()
        
        if missing_before > 0:
            print(f"{model}: Filled {missing_before} missing values with mode {model_mode_value}. Remaining missing: {missing_after}")
    else:
        print(f"Warning: {model} has no non-missing SCREEN_SIZE_SNAPPED values")

print("\nAll remaining models processed!")

VIVOBOOK: Filled 120 missing values with mode 15.6. Remaining missing: 0
INSPIRON: Filled 115 missing values with mode 15.6. Remaining missing: 0
SURFACE: Filled 62 missing values with mode 13.3. Remaining missing: 0
IDEAPAD: Filled 62 missing values with mode 15.6. Remaining missing: 0
ASPIRE: Filled 66 missing values with mode 15.6. Remaining missing: 0
XPS: Filled 44 missing values with mode 13.3. Remaining missing: 0
STEALTH: Filled 58 missing values with mode 15.6. Remaining missing: 0
PRECISION: Filled 63 missing values with mode 15.6. Remaining missing: 0
VICTUS: Filled 60 missing values with mode 15.6. Remaining missing: 0
TUF: Filled 39 missing values with mode 15.6. Remaining missing: 0
VOSTRO: Filled 20 missing values with mode 15.6. Remaining missing: 0
ROG: Filled 41 missing values with mode 17.3. Remaining missing: 0
ZBOOK: Filled 28 missing values with mode 15.6. Remaining missing: 0
LEGION: Filled 45 missing values with mode 16.0. Remaining missing: 0
OMEN: Filled 40 mi

## Clean SCREEN_RESOLUTION

Normalize text.
* Normalize SCREEN_RESOLUTION text (convert to lowercase and remove spaces)

In [None]:
data['SCREEN_RESOLUTION'] = (
    data['SCREEN_RESOLUTION']
    .astype(str)
    .str.lower()
    .str.replace(' ', '')
)

Map resolution values to standard categories (HD, FHD, QHD, 4K, ...)

In [None]:
# Normalize SCREEN_RESOLUTION into standardized resolution tiers
resolution_map = {
    # HD
    '1366x768': 'HD',
    '1280x720': 'HD',
    'hd': 'HD',

    # HD+
    '1440x900': 'HD+',
    '1600x900': 'HD+',
    '1536x1024': 'HD+',
    '1280x800': 'HD+',

    # FHD
    '1920x1080': 'FHD',
    '1920x1080fhd': 'FHD',
    'fullhd': 'FHD',
    'fhd': 'FHD',
    '1080p': 'FHD',
    'fhd1080p': 'FHD',
    '1920x1080fullhd': 'FHD',

    # WUXGA (FHD+ / 16:10)
    '1920x1200': 'WUXGA',
    '1920x1200fhd': 'WUXGA',
    '1920x1200fhd+': 'WUXGA',
    '1920x1200wuxga': 'WUXGA',
    '1920x1280': 'WUXGA',
    'fhd+': 'WUXGA',
    'fullhd+': 'WUXGA',
    'wuxga': 'WUXGA',

    # QHD / 2K
    '2560x1440': 'QHD',
    '2560x1440qhd': 'QHD',
    'qhd': 'QHD',
    'wqhd': 'QHD',
    '2k': 'QHD',
    'qhd2k': 'QHD',
    '1440p': 'QHD',
    '2048x1080': 'QHD',


    # QHD+ (16:10)
    '2560x1600': 'QHD+',
    '2560x1600qhd+': 'QHD+',
    '2400x1600': 'QHD+',
    '2240x1400': 'QHD+',
    '2560x1664': 'QHD+',
    '2256x1504': 'QHD+',
    'wqxga': 'QHD+',
    'wqxga+': 'QHD+',
    'qhd+': 'QHD+',
    '2.5k': 'QHD+',
    '2496x1664': 'QHD+',
    '2360x1640': 'QHD+',
    '2304x1536': 'QHD+',

    # 3K-class (high-density laptop panels)
    '2880x1800': '3K',
    '2880x1920': '3K',
    '3072x1920': '3K',
    '3000x2000': '3K',
    '3024x1964': '3K',
    '3200x2000': '3K',
    '2736x1824': '3K',
    '2736x1834': '3K',
    '2736x1823': '3K',
    '3456x2234': '3K',
    '3k': '3K',
    '2.8k': '3K',
    '2880x1864': '3K',
    '3koled': '3K',
    '2880x1864': '3K',
    '3kretina': '3K',

    # 4K / UHD
    '3840x2160': '4K',
    '3840x2400': '4K',
    '3456x2160': '4K',
    '3240x2160': '4K',
    '4k': '4K',
    '4kuhd': '4K',

    # 5K
    '5120x2880': '5K',
    '5k': '5K'
}

data['SCREEN_RESOLUTION_STD'] = (
    data['SCREEN_RESOLUTION']
    .str.lower()
    .str.strip()
    .map(resolution_map)
    .fillna(data['SCREEN_RESOLUTION'])
)


# resolution hierarchy (for modeling)
# HD < HD+ < FHD < WUXGA < QHD < QHD+ < 3K < 4K < 5K

set non standard SCREEN_RESOLUTION_STD values to nan

In [None]:
# set non standard SCREEN_RESOLUTION_STD to NaN
valid_resolutions = [
    'HD', 'HD+', 'FHD', 'WUXGA', 'QHD', 'QHD+', '3K', '4K', '5K'
]

data.loc[  # Use .loc to set values in the DataFrame where condition is met
    ~data['SCREEN_RESOLUTION_STD'].isin(valid_resolutions), 
    'SCREEN_RESOLUTION_STD'  # Column to update
] = np.nan 

# Print the frequency count of each unique value in SCREEN_RESOLUTION_STD after setting invalid ones to NaN
print(data['SCREEN_RESOLUTION_STD'].value_counts())  
# Calculate and display the percentage of missing (NaN) values in SCREEN_RESOLUTION_STD
data['SCREEN_RESOLUTION_STD'].isna().mean() * 100  

SCREEN_RESOLUTION_STD
FHD      4911
WUXGA     278
QHD       233
3K        208
QHD+      205
4K        113
HD        107
HD+        29
5K          7
Name: count, dtype: int64


62.846163230450166

Fill missing SCREEN_RESOLUTION values with the specific mode for each model name

In [None]:

summary = data.groupby('model_name').agg(
    total_rows=('SCREEN_RESOLUTION_STD', 'size'),
    nan_rows=('SCREEN_RESOLUTION_STD', lambda s: s.isna().sum())
)

summary['percentage_nan'] = (summary['nan_rows'] / summary['total_rows']) * 100


# fill missing SCREEN_RESOLUTION_STD by model_name mode
for model in summary.index:
    mode_value = summary.loc[model, 'mode_resolution']
    if pd.notna(mode_value):
        mask = (data['model_name'] == model) & (data['SCREEN_RESOLUTION_STD'].isna())
        data.loc[mask, 'SCREEN_RESOLUTION_STD'] = mode_value
        filled_count = mask.sum()
        if filled_count > 0:
            print(f"Filled {filled_count} missing values for {model} with mode {mode_value}")       
print("\nAll models processed for SCREEN_RESOLUTION_STD!")  

Filled 1419 missing values for THINKPAD with mode FHD
Filled 1195 missing values for LATITUDE with mode FHD
Filled 1439 missing values for MACBOOK with mode QHD+
Filled 579 missing values for ELITEBOOK with mode FHD
Filled 772 missing values for PAVILION with mode FHD
Filled 527 missing values for VIVOBOOK with mode FHD
Filled 370 missing values for PROBOOK with mode FHD
Filled 460 missing values for INSPIRON with mode FHD
Filled 320 missing values for SURFACE with mode 3K
Filled 292 missing values for IDEAPAD with mode FHD
Filled 246 missing values for ASPIRE with mode FHD
Filled 176 missing values for XPS with mode FHD
Filled 161 missing values for STEALTH with mode FHD
Filled 167 missing values for PRECISION with mode FHD
Filled 129 missing values for VICTUS with mode FHD
Filled 127 missing values for TUF with mode FHD
Filled 112 missing values for VOSTRO with mode FHD
Filled 122 missing values for ROG with mode FHD
Filled 126 missing values for ZBOOK with mode FHD
Filled 111 missin

## Encode SCREEN_RESOLUTION (for ML)

In [None]:
# Encode SCREEN_RESOLUTION_STD into numerical values for modeling using this resolution hierarchy 
# HD < HD+ < FHD < WUXGA < QHD < QHD+ < 3K < 4K < 5K
resolution_encoding = {
    'HD': 1,
    'HD+': 2,
    'FHD': 3,
    'WUXGA': 4,
    'QHD': 5,
    'QHD+': 6,
    '3K': 7,
    '4K': 8,
    '5K': 9
}

data['SCREEN_RESOLUTION_ENC'] = data['SCREEN_RESOLUTION_STD'].map(resolution_encoding)

*
*
*
*
*
*
*
*
*
*
*
*
*
*

In [24]:
# Export the cleaned data with IDs to CSV
data.to_csv('data_cleaned.csv', index=False)
print(f'Exported cleaned data to data_cleaned.csv with shape: {data.shape}')

Exported cleaned data to data_cleaned.csv with shape: (16392, 15)
