# This notebook is for data pre-processing

## Libraries

In [3]:
import numpy as np
import pandas as pd
import csv
import os
import re
from rapidfuzz import process, fuzz

## Data

In [6]:
# Table to enrich; the cells below look for its CPU column under one of the
# names cpu_name / CPU / cpu / Cpu.
data = pd.read_csv('data.csv')
# Reference CPU table; the matching code reads its name, cores, cpumark and
# tdp(W) (or tdp) columns.
cpus_data = pd.read_csv('cleaned_all_cpus_laptop.csv')

## General Cleaning

## CPU column handling

In [7]:
# ------------------ NORMALIZATION ------------------

def normalize(s):
    """Canonicalize a CPU name so that two spellings can be fuzzy-compared.

    Falsy or missing (NaN/None) input yields ''. Anything else is
    stringified, lower-cased, stripped of the filler words
    intel/processor/core/cpu, and reduced to runs of [a-z0-9] separated
    by single spaces.
    """
    usable = bool(s) and not pd.isna(s)
    if not usable:
        return ''

    lowered = str(s).lower()
    # Filler words are removed outright (no space is inserted in their place).
    without_filler = re.sub(r'intel|processor|core|cpu', '', lowered)
    # Hyphens and every other non-alphanumeric character become separators.
    separated = re.sub(r'[^a-z0-9 ]+', ' ', without_filler.replace('-', ' '))
    return re.sub(r'\s+', ' ', separated).strip()

# ------------------ PREPARE CLEANED CPUS ------------------

# Create normalized CPU names from cpus_data
# Matching key for every reference CPU.
cpus_data['norm'] = cpus_data['name'].apply(normalize)

# The TDP column is named either 'tdp(W)' or plain 'tdp'.
if 'tdp(W)' in cpus_data.columns:
    tdp_col = 'tdp(W)'
else:
    tdp_col = 'tdp'

# Independent copy of just the columns used downstream, under uniform names.
source_cols = ['name', 'norm', 'cores', 'cpumark', tdp_col]
cpus = cpus_data.loc[:, source_cols].copy()
cpus.columns = ['cpu_name', 'norm', 'cores', 'cpu_mark', 'tdp']

# Plain list of keys; position i corresponds to cpus.iloc[i].
cpu_norms = cpus['norm'].tolist()

# ------------------ NORMALIZE DATA CPU NAMES ------------------

# Find the CPU column in data
# First candidate column name that exists in data, or None when none does.
cpu_col = next(
    (name for name in ('cpu_name', 'CPU', 'cpu', 'Cpu') if name in data.columns),
    None,
)
if cpu_col is None:
    raise ValueError("No CPU column found in data")

# Temporary matching key; it is dropped again after the results are merged.
data['norm_cpu'] = data[cpu_col].apply(normalize)

# ------------------ MATCH & MAP ------------------

# Minimum token_set_ratio score for a candidate to be accepted as a match.
MATCH_THRESHOLD = 50


def _cpu_record(name='NA', score=0, cores='NA', cpu_mark='NA', tdp='NA',
                gpu_name='NA'):
    """Build one result row; the defaults describe an unmatched CPU.

    Key order is fixed here because it determines the column order of the
    DataFrame built from these records.
    """
    return {
        'mapped_cpu_name': name,
        'match_score': score,
        'cores': cores,
        'cpu_mark': cpu_mark,
        'tdp': tdp,
        'gpu_name': gpu_name,
    }


def _lookup_cpu(norm_name):
    """Fuzzy-match one normalized CPU name against ``cpu_norms``.

    Returns ``(record, is_match)``. ``is_match`` is True only when the best
    candidate scores at least ``MATCH_THRESHOLD``; otherwise the record
    carries 'NA' placeholders and the best score seen (0 when no search
    was possible).
    """
    if not norm_name:
        return _cpu_record(), False

    best = process.extractOne(
        norm_name,
        cpu_norms,
        scorer=fuzz.token_set_ratio
    )
    if not best:
        return _cpu_record(), False

    # With a list of choices, extractOne yields (choice, score, list index).
    _, score, match_idx = best
    if score >= MATCH_THRESHOLD:
        cpu = cpus.iloc[match_idx]
        record = _cpu_record(
            name=cpu['cpu_name'],
            score=score,
            cores=cpu['cores'],
            cpu_mark=cpu['cpu_mark'],
            tdp=cpu['tdp'],
            # gpu_name is optional in the reference table.
            gpu_name=cpus_data.iloc[match_idx].get('gpu_name', 'NA'),
        )
        return record, True

    return _cpu_record(score=score), False


matched = unmatched = 0
scores = []
results = []

# Rows frequently repeat the same CPU string, so the fuzzy search (the
# expensive step) runs once per distinct normalized name and is reused.
_lookup_cache = {}

for n in data['norm_cpu']:
    if n not in _lookup_cache:
        _lookup_cache[n] = _lookup_cpu(n)
    record, is_match = _lookup_cache[n]

    # Copy so that each row owns its own dict, as before.
    results.append(dict(record))
    scores.append(record['match_score'])
    if is_match:
        matched += 1
    else:
        unmatched += 1

# ------------------ MERGE RESULTS ------------------

# One result record per input row, in row order.
results_df = pd.DataFrame(results)

# Give data a fresh 0..n-1 index so it lines up positionally with results_df,
# attach the mapped columns, then discard the temporary matching key.
data_merged = (
    pd.concat([data.reset_index(drop=True), results_df], axis=1)
    .drop(columns=['norm_cpu'])
)

# ------------------ SAVE OUTPUT ------------------

OUT_FN = 'data_for_youcef.csv'
# Write without the index column.
data_merged.to_csv(OUT_FN, index=False)

# ------------------ REPORT ------------------

total = len(data)

# Mean match score over all rows; 0 when there were no rows at all.
if scores:
    avg_score = sum(scores) / len(scores)
else:
    avg_score = 0

summary = (
    f'Wrote {OUT_FN} ({total} rows). '
    f'Matched: {matched}, Unmatched: {unmatched}, '
    f'Avg score: {avg_score:.1f}'
)
print(summary)

Wrote data_for_youcef.csv (16394 rows). Matched: 16292, Unmatched: 102, Avg score: 85.4
