# This notebook is for data pre-processing

## Libraries

In [19]:
import numpy as np
import pandas as pd

## Data

In [20]:
data = pd.read_csv('data.csv')
data.shape

(16394, 14)

## General Cleaning

In [21]:
# Removing exact duplicate rows (rows identical across all columns)
# Count duplicates before removal
dup_count = data.duplicated().sum()
print(f"Found {dup_count} exact duplicate rows")
if dup_count > 0:
    # Drop exact duplicates and reset index
    data.drop_duplicates(inplace=True)
    data.reset_index(drop=True, inplace=True)
    print(f"Removed {dup_count} duplicates. New shape: {data.shape}")
else:
    print('No duplicate rows found.')

Found 1 exact duplicate rows
Removed 1 duplicates. New shape: (16393, 14)


In [22]:
# Detect groups where all attributes except RAM or storage (SSD/HDD) match,
# but RAM or storage is null in some rows and not in others.
# This helps find near-duplicates where only RAM/storage information is missing in some entries
cols = data.columns.tolist()
# flexible matching for RAM-like columns (case-insensitive, substring match)
ram_cols = [c for c in cols if 'ram' in c.lower() or 'memory' in c.lower()]
# storage detection expanded to SSD, HDD, storage, drive, disk
storage_cols = [c for c in cols if any(k in c.lower() for k in ['ssd', 'hdd', 'storage', 'drive', 'disk'])]
print('Detected ram-like columns:', ram_cols)
print('Detected storage-like columns (SSD/HDD):', storage_cols)
# Group by the specific keys you requested (flexible matching):
# price, date created, etat, model name, city, gpu, cpu, screen size, screen frequency, resolution
def find_col(variants, cols):
    """Return the first column whose name contains any of the provided variants (case-insensitive)."""
    variants = [v.lower() for v in variants]
    for c in cols:
        cname = c.lower()
        if any(v in cname for v in variants):
            return c
    return None
# mapping desired keys to lists of variants to match
desired_keys = {
    'price': ['price'],
    'date_created': ['date', 'create', 'created'],
    'etat': ['etat'],
    'model': ['model', 'model name', 'model_name'],
    'city': ['city'],
    'gpu': ['gpu', 'graphics'],
    'cpu': ['cpu', 'processor'],
    'screen_size': ['screen', 'size'],
    'screen_freq': ['frequency', 'hz', 'screen', 'refresh'],
    'resolution': ['resolution', 'res']
}
found_keys = {k: find_col(v, cols) for k, v in desired_keys.items()}
print('Found column mapping for grouping:')
for k, col in found_keys.items():
    print(f'  {k} -> {col}')
# Check for missing requested columns
missing = [k for k, col in found_keys.items() if col is None]
if missing:
    print('Warning: Could not find these requested grouping columns in the dataset:', missing)
# Build key_cols from the requested keys that were found (preserve order)
requested_order = ['price','date_created','etat','model','city','gpu','cpu','screen_size','screen_freq','resolution']
key_cols = [found_keys[k] for k in requested_order if found_keys.get(k)]
# If we don't have enough requested columns, fall back to grouping by all non-ram/non-storage cols
if len(key_cols) < 2:
    print('Not enough requested grouping columns found (need at least 2). Falling back to grouping by all non-ram/non-storage columns.')
    key_cols = [c for c in cols if c not in ram_cols + storage_cols]
print('Using key columns for grouping:', key_cols)
groups = data.groupby(key_cols, dropna=False)
problem_groups = []
for key, grp in groups:
    if len(grp) < 2:
        continue
    # check RAM columns for mixed null vs non-null within group
    ram_issue = False
    if ram_cols:
        ram_issue = any(grp[rc].isna().any() and (~grp[rc].isna()).any() for rc in ram_cols)
    # check storage columns (SSD/HDD) for mixed null vs non-null within group
    storage_issue = False
    if storage_cols:
        storage_issue = any(grp[sc].isna().any() and (~grp[sc].isna()).any() for sc in storage_cols)
    if ram_issue or storage_issue:
        problem_groups.append((key, grp))
# Auto-resolution: several possible strategies
# 1) If exactly one row has any storage info and it matches '1000' (HDD 1000), copy it to others and drop source
# 2) If multiple rows have storage info but across different storage columns (e.g., one row has SSD, another has HDD),
#    merge storage fields into a single target row and delete the redundant row(s).
resolved = 0
for key, grp in problem_groups:
    storage_existing = [c for c in storage_cols if c in grp.columns]
    if not storage_existing:
        continue
    # which rows in this group have any non-null storage value
    nonnull_mask = grp[storage_existing].notna().any(axis=1)
    rows_with_storage = grp[nonnull_mask]
    # Strategy A: single storage row -> maybe HDD 1000 pattern
    if len(rows_with_storage) == 1:
        src_idx = rows_with_storage.index[0]
        src_vals = rows_with_storage.iloc[0][storage_existing]
        def looks_like_1000(x):
            if pd.isna(x):
                return False
            s = str(x).lower().replace(' ', '')
            return ('1000' in s) or ('1tb' in s) or ('1000gb' in s)
        if any(looks_like_1000(v) for v in src_vals.dropna()):
            target_idxs = [i for i in grp.index if i != src_idx]
            for c in storage_existing:
                mask = data.loc[target_idxs, c].isna()
                if mask.any():
                    data.loc[target_idxs, c] = data.loc[target_idxs, c].where(~mask, other=data.loc[src_idx, c])
            data.drop(index=src_idx, inplace=True)
            resolved += 1
            print(f'Resolved group {key}: moved storage from index {src_idx} to {target_idxs} and deleted index {src_idx}')
        else:
            print(f'Group {key}: single storage row found but did not match 1000 pattern; skipped auto-resolution')
    # Strategy B: multiple rows with storage but complementary across columns -> merge into one row
    else:
        # determine non-null positions per storage column
        col_nonnull_counts = {c: grp[c].notna().sum() for c in storage_existing}
        total_nonnull_cells = sum(col_nonnull_counts.values())
        # If non-null cells are distributed across rows without overlap (each storage cell non-null appears in only one row),
        # we can merge them. Check that no row has two storage columns non-null (optional), and that total_nonnull_cells <= len(rows_with_storage) * len(storage_existing)
        # Simpler heuristic: if each storage column has at most one non-null entry and the number of rows_with_storage equals the number of distinct non-null rows,
        distinct_rows_with_storage = set(rows_with_storage.index.tolist())
        if all(v <= 1 for v in col_nonnull_counts.values()):
            # pick target as the first row that has any storage (prefer the one with SSD if present)
            target_idx = None
            # try to prefer a row that has SSD (or first storage column) non-null
            preferred = storage_existing[0] if storage_existing else None
            if preferred is not None:
                candidates = rows_with_storage[rows_with_storage[preferred].notna()].index.tolist()
                if candidates:
                    target_idx = candidates[0]
            if target_idx is None:
                target_idx = rows_with_storage.index[0]
            other_rows = [i for i in rows_with_storage.index if i != target_idx]
            # copy complementary storage values into target where null
            for other in other_rows:
                for c in storage_existing:
                    if pd.isna(data.at[target_idx, c]) and not pd.isna(data.at[other, c]):
                        data.at[target_idx, c] = data.at[other, c]
                # after copying, drop the other row
                data.drop(index=other, inplace=True)
                resolved += 1
                print(f'Merged storage from index {other} into {target_idx} and deleted {other} for group {key}')
        else:
            print(f'Group {key}: storage columns have multiple non-null entries per column; skipped auto-merge')
# After attempting auto-resolution, reset index if any changes were made
if resolved > 0:
    data.reset_index(drop=True, inplace=True)
    print(f'Auto-resolved {resolved} groups. New data shape: {data.shape}')
    # Recompute groups and problem_groups on the mutated dataframe so printed results reflect changes
    groups = data.groupby(key_cols, dropna=False)
    new_problem_groups = []
    for key, grp in groups:
        if len(grp) < 2:
            continue
        ram_issue = False
        if ram_cols:
            ram_issue = any(grp[rc].isna().any() and (~grp[rc].isna()).any() for rc in ram_cols)
        storage_issue = False
        if storage_cols:
            storage_issue = any(grp[sc].isna().any() and (~grp[sc].isna()).any() for sc in storage_cols)
        if ram_issue or storage_issue:
            new_problem_groups.append((key, grp))
    problem_groups = new_problem_groups
# Finally, print any remaining problem groups for manual review
if not problem_groups:
    print('No groups found where RAM or storage are null in some rows and not in others while other attributes match.')
else:
    print(f'Found {len(problem_groups)} potential groups (post-resolution):')
    for i, (key, grp) in enumerate(problem_groups, 1):
        print('\n---')
        print(f'Group {i} key: {key}')
        # show indices and relevant columns to help decide which rows to drop
        display_cols = key_cols + ram_cols + storage_cols
        # ensure ordering and existence
        display_cols = [c for c in display_cols if c in grp.columns]
        grp_display = grp[display_cols].copy()
        grp_display['_index'] = grp_display.index
        print(grp_display.to_string(index=False))
        print('---')

# Add a unique integer primary key column named 'id' (starting at 1). If 'id' exists, overwrite it after warning.
if 'id' in data.columns:
    print("Column 'id' already exists in the dataset; it will be overwritten with new sequential IDs.")
# Ensure index is contiguous before assigning IDs
data.reset_index(drop=True, inplace=True)
data.insert(0, 'id', range(1, len(data) + 1))
print(f"Added 'id' column as primary key. Data shape now: {data.shape}")

Detected ram-like columns: ['RAM_SIZE', 'RAM_TYPE']
Detected storage-like columns (SSD/HDD): ['SSD_SIZE', 'HDD_SIZE']
Found column mapping for grouping:
  price -> price_preview
  date_created -> created_at
  etat -> spec_Etat
  model -> model_name
  city -> city
  gpu -> DEDICATED_GPU
  cpu -> CPU
  screen_size -> RAM_SIZE
  screen_freq -> SCREEN_SIZE
  resolution -> SCREEN_RESOLUTION
Using key columns for grouping: ['price_preview', 'created_at', 'spec_Etat', 'model_name', 'city', 'DEDICATED_GPU', 'CPU', 'RAM_SIZE', 'SCREEN_SIZE', 'SCREEN_RESOLUTION']
Merged storage from index 10345 into 10344 and deleted 10345 for group (75000.0, '2024 11 01T15:06:57.000Z', 'JAMAIS UTILIS', 'VOSTRO', 'EZZOUAR', nan, '11TH GEN INTEL CORE I3 1145G4', '8GB', 14.1, nan)
Auto-resolved 1 groups. New data shape: (16392, 14)
Merged storage from index 10345 into 10344 and deleted 10345 for group (75000.0, '2024 11 01T15:06:57.000Z', 'JAMAIS UTILIS', 'VOSTRO', 'EZZOUAR', nan, '11TH GEN INTEL CORE I3 1145G4', 

In [23]:
data.shape

(16392, 15)

In [24]:
# Export the cleaned data with IDs to CSV
data.to_csv('data_cleaned.csv', index=False)
print(f'Exported cleaned data to data_cleaned.csv with shape: {data.shape}')

Exported cleaned data to data_cleaned.csv with shape: (16392, 15)
