# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the FULLY cleaned dataset (data2026)
df = pd.read_csv('data2026.csv')

In [3]:
# list the columns and the number of distinct vlues for each column
print("\nDataset columns:")
print(df.columns.to_list())


Dataset columns:
['id', 'price_preview', 'created_at', 'city', 'spec_Etat', 'model_name', 'DEDICATED_GPU', 'CPU', 'RAM_SIZE', 'SSD_SIZE', 'HDD_SIZE', 'SCREEN_SIZE', 'SCREEN_RESOLUTION', 'RAM_TYPE', 'mapped_cpu_name', 'match_score', 'cores', 'cpu_mark', 'tdp', 'gpu_name', 'match_type', 'gpu_match_score', 'gpu_g3d_mark', 'gpu_g2d_mark', 'gpu_tdp', 'SCREEN_SIZE_SNAPPED', 'SCREEN_RESOLUTION_STD', 'SCREEN_RESOLUTION_ENC', 'price_corrected', 'cpu_manufacturer', 'cpu_family', 'cpu_generation_raw', 'cpu_generation_normalized', 'cpu_mfg_AMD', 'cpu_mfg_Apple', 'cpu_mfg_Intel', 'cpu_mfg_Other', 'cpu_mfg_Qualcomm', 'cpu_mfg_Unknown', 'cpu_fam_AMD_A_Series', 'cpu_fam_AMD_Athlon', 'cpu_fam_AMD_FX', 'cpu_fam_AMD_Other', 'cpu_fam_AMD_Ryzen_3', 'cpu_fam_AMD_Ryzen_5', 'cpu_fam_AMD_Ryzen_7', 'cpu_fam_AMD_Ryzen_9', 'cpu_fam_Apple_M1', 'cpu_fam_Apple_M1_Max', 'cpu_fam_Apple_M1_Pro', 'cpu_fam_Apple_M2', 'cpu_fam_Apple_M2_Max', 'cpu_fam_Apple_M2_Pro', 'cpu_fam_Apple_M3', 'cpu_fam_Apple_M3_Max', 'cpu_fam_App

In [4]:
# remove the following columns from all three datasets (if they exist)
columns_to_remove = ['SCREEN_SIZE', 'SCREEN_RESOLUTION', 'SCREEN_RESOLUTION_ENC', 'price_preview', 'match_score', 
                     'match_type', 'gpu_match_score', 'cpu_generation_normalized', 'created_at', 'CPU', 
                     'DEDICATED_GPU', 'gpu_g3d_mark', 'gpu_g2d_mark', 'gpu_tdp', 'cpu_manufacturer', 'cpu_family',
                     'cpu_mark', 'tdp','city', 'cpu_mfg_AMD', 'cpu_mfg_Apple', 'cpu_mfg_Intel', 'cpu_mfg_Other', 
                     'cpu_mfg_Qualcomm', 'cpu_mfg_Unknown']
df.drop(columns=columns_to_remove, inplace=True, errors='ignore')

# rename some columns for better clarity
df.rename(columns={'SCREEN_SIZE_SNAPPED': 'screen_size',
                   'SCREEN_RESOLUTION_STD': 'screen_resolution',
                   'price_corrected': 'price',
                   'cpu_generation_raw': 'cpu_generation',
                   'RAM_SIZE': 'ram_size', 
                   'SSD_SIZE': 'ssd_size', 
                   'HDD_SIZE': 'hdd_size', 
                   'RAM_TYPE': 'ram_type'
                   }, inplace=True)


# list the columns and the number of distinct vlues for each column
print("\nDataset columns:")
print(df.columns.to_list())

print("\nDataset columns and number of distinct values:")
print("-" * 50)
for col in df.columns:
    num_unique = df[col].nunique()
    print(f"{col}: {num_unique} distinct values")

# number of columns
print("-" * 50)
print("Number of columns:", len(df.columns))



Dataset columns:
['id', 'spec_Etat', 'model_name', 'ram_size', 'ssd_size', 'hdd_size', 'ram_type', 'mapped_cpu_name', 'cores', 'gpu_name', 'screen_size', 'screen_resolution', 'price', 'cpu_generation', 'cpu_fam_AMD_A_Series', 'cpu_fam_AMD_Athlon', 'cpu_fam_AMD_FX', 'cpu_fam_AMD_Other', 'cpu_fam_AMD_Ryzen_3', 'cpu_fam_AMD_Ryzen_5', 'cpu_fam_AMD_Ryzen_7', 'cpu_fam_AMD_Ryzen_9', 'cpu_fam_Apple_M1', 'cpu_fam_Apple_M1_Max', 'cpu_fam_Apple_M1_Pro', 'cpu_fam_Apple_M2', 'cpu_fam_Apple_M2_Max', 'cpu_fam_Apple_M2_Pro', 'cpu_fam_Apple_M3', 'cpu_fam_Apple_M3_Max', 'cpu_fam_Apple_M3_Pro', 'cpu_fam_Apple_M4', 'cpu_fam_Apple_M4_Max', 'cpu_fam_Apple_M4_Pro', 'cpu_fam_Intel_Atom', 'cpu_fam_Intel_Celeron', 'cpu_fam_Intel_Core_M', 'cpu_fam_Intel_Other', 'cpu_fam_Intel_Pentium', 'cpu_fam_Intel_Ultra_5', 'cpu_fam_Intel_Ultra_7', 'cpu_fam_Intel_Ultra_9', 'cpu_fam_Intel_Xeon', 'cpu_fam_Intel_i3', 'cpu_fam_Intel_i5', 'cpu_fam_Intel_i7', 'cpu_fam_Intel_i9', 'cpu_fam_Qualcomm_Snapdragon', 'cpu_fam_Unknown']

D

In [5]:
# change the vlues of 'spec_Etat'
# map 'BON TAT' to 'Bon_état', 'JAMAIS UTILIS' to 'jamais_utilisé', and 'MOYEN' to 'moyen'
df['spec_Etat'] = df['spec_Etat'].replace({
    'BON TAT': 'Bon_état',
    'JAMAIS UTILIS': 'jamais_utilisé',
    'MOYEN': 'moyen'
})

# One-hot encode spec_Etat using its values as column names, then drop original column
if 'spec_Etat' in df.columns:
    spec_etat_dummies = pd.get_dummies(df['spec_Etat'], dummy_na=False)

    # Rename all dummy columns based on their values (normalize to clean strings)
    spec_etat_dummies = spec_etat_dummies.rename(columns=lambda c: str(c).strip())

    df = pd.concat([df.drop(columns=['spec_Etat']), spec_etat_dummies], axis=1)

# now display the columns of df to verify the changes
print("\nUpdated dataset columns after mapping and encoding:")      
print(df.columns.to_list())
# diplay the number of ones in each of the new columns created from one-hot encoding
print("\nNumber of ones in each one-hot encoded column:")
for col in spec_etat_dummies.columns:
    num_ones = df[col].sum()
    print(f"{col}: {num_ones} ones")



Updated dataset columns after mapping and encoding:
['id', 'model_name', 'ram_size', 'ssd_size', 'hdd_size', 'ram_type', 'mapped_cpu_name', 'cores', 'gpu_name', 'screen_size', 'screen_resolution', 'price', 'cpu_generation', 'cpu_fam_AMD_A_Series', 'cpu_fam_AMD_Athlon', 'cpu_fam_AMD_FX', 'cpu_fam_AMD_Other', 'cpu_fam_AMD_Ryzen_3', 'cpu_fam_AMD_Ryzen_5', 'cpu_fam_AMD_Ryzen_7', 'cpu_fam_AMD_Ryzen_9', 'cpu_fam_Apple_M1', 'cpu_fam_Apple_M1_Max', 'cpu_fam_Apple_M1_Pro', 'cpu_fam_Apple_M2', 'cpu_fam_Apple_M2_Max', 'cpu_fam_Apple_M2_Pro', 'cpu_fam_Apple_M3', 'cpu_fam_Apple_M3_Max', 'cpu_fam_Apple_M3_Pro', 'cpu_fam_Apple_M4', 'cpu_fam_Apple_M4_Max', 'cpu_fam_Apple_M4_Pro', 'cpu_fam_Intel_Atom', 'cpu_fam_Intel_Celeron', 'cpu_fam_Intel_Core_M', 'cpu_fam_Intel_Other', 'cpu_fam_Intel_Pentium', 'cpu_fam_Intel_Ultra_5', 'cpu_fam_Intel_Ultra_7', 'cpu_fam_Intel_Ultra_9', 'cpu_fam_Intel_Xeon', 'cpu_fam_Intel_i3', 'cpu_fam_Intel_i5', 'cpu_fam_Intel_i7', 'cpu_fam_Intel_i9', 'cpu_fam_Qualcomm_Snapdragon',

In [6]:
# display distinct values of model_name
if 'model_name' in df.columns:
    print("\nDistinct values in 'model_name' column:")
    print(df['model_name'].unique())  

    # One-hot encode model_name using its values as column names, then drop original column
    model_name_dummies = pd.get_dummies(df['model_name'], dummy_na=False)
    model_name_dummies = model_name_dummies.rename(columns=lambda c: str(c).strip())
    df = pd.concat([df.drop(columns=['model_name']), model_name_dummies], axis=1)

# display the columns of df to verify the changes
print("\nUpdated dataset columns after one-hot encoding 'model_name':")
print(df.columns.to_list())



Distinct values in 'model_name' column:
['IDEAPAD' 'AERO' 'STEALTH' 'ROG' nan 'XPS' 'THINKPAD' 'LATITUDE' 'SWORD'
 'PAVILION' 'ELITEBOOK' 'MACBOOK' 'VIVOBOOK' 'INSPIRON' 'PROBOOK' 'ASPIRE'
 'VECTOR' 'SPECTRE' 'LEGION' 'NITRO' 'OMEN' 'ALIENWARE' 'TUF' 'GALAXY'
 'ZBOOK' 'SURFACE' 'BLADE' 'MAC' 'ZENBOOK' 'PRECISION' 'YOGA' 'PREDATOR'
 'STRIX' 'IMAC' 'KATANA' 'VICTUS' 'ENVY' 'THINKBOOK' 'TRAVELMATE' 'GF'
 'VOSTRO' 'SWIFT' 'SPIN' 'OPTIPLEX' 'DYNABOOK' 'TRANSFORMER' 'COMPAQ']

Updated dataset columns after one-hot encoding 'model_name':
['id', 'ram_size', 'ssd_size', 'hdd_size', 'ram_type', 'mapped_cpu_name', 'cores', 'gpu_name', 'screen_size', 'screen_resolution', 'price', 'cpu_generation', 'cpu_fam_AMD_A_Series', 'cpu_fam_AMD_Athlon', 'cpu_fam_AMD_FX', 'cpu_fam_AMD_Other', 'cpu_fam_AMD_Ryzen_3', 'cpu_fam_AMD_Ryzen_5', 'cpu_fam_AMD_Ryzen_7', 'cpu_fam_AMD_Ryzen_9', 'cpu_fam_Apple_M1', 'cpu_fam_Apple_M1_Max', 'cpu_fam_Apple_M1_Pro', 'cpu_fam_Apple_M2', 'cpu_fam_Apple_M2_Max', 'cpu_fam_Apple_

In [7]:
# Normalize and bucketize ram_size, then one-hot encode
if 'ram_size' in df.columns:
    # extract numeric value and unit (GB/MB), convert to GB
    ram_str = df['ram_size'].astype(str).str.strip().str.upper()
    ram_num = ram_str.str.extract(r'(\d+(?:\.\d+)?)')[0].astype(float)
    is_mb = ram_str.str.contains('MB', na=False)
    ram_gb = ram_num.where(~is_mb, ram_num / 1024)
    
    # define bins and labels
    bins = [-np.inf, 2, 4, 10, 18, 30, 64, np.inf]
    labels = ['<2GB', '2-4GB', '6-10GB', '12-18GB', '20-30GB', '32-64GB', '>64GB']
    df['ram_size_class'] = pd.cut(ram_gb, bins=bins, labels=labels, right=True)
    df['ram_size_class'] = df['ram_size_class'].astype(object).where(df['ram_size_class'].notna(), 'Unknown')
    
    # one-hot encode and drop original column
    ram_dummies = pd.get_dummies(df['ram_size_class'], dummy_na=False)
    df = pd.concat([df.drop(columns=['ram_size']), ram_dummies], axis=1)
    
    print("\nRam size classes and counts:")
    print(ram_dummies.sum().sort_values(ascending=False))



Ram size classes and counts:
12-18GB    7530
6-10GB     6015
32-64GB    1504
2-4GB      1046
<2GB        119
20-30GB      85
Unknown      63
>64GB        30
dtype: int64


In [8]:
# Normalize and bucketize ssd_size, then one-hot encode
if 'ssd_size' in df.columns:
    # extract numeric value and unit (GB/TB), convert to GB
    ssd_str = df['ssd_size'].astype(str).str.strip().str.upper()
    ssd_num = ssd_str.str.extract(r'(\d+(?:\.\d+)?)')[0].astype(float)
    is_tb = ssd_str.str.contains('TB', na=False)
    ssd_gb = ssd_num.where(~is_tb, ssd_num * 1024)
    
    # define bins and labels (GB)
    ssd_bins = [-np.inf, 128, 256, 512, 1024, 2048, 4096, 8192, np.inf]
    ssd_labels = ['<=128GB', '129-256GB', '257-512GB', '513-1024GB',
                  '1.1-2TB', '2.1-4TB', '4.1-8TB', '>8TB']
    df['ssd_size_class'] = pd.cut(ssd_gb, bins=ssd_bins, labels=ssd_labels, right=True)
    df['ssd_size_class'] = df['ssd_size_class'].astype(object).where(df['ssd_size_class'].notna(), 'Unknown')
    
    # one-hot encode and drop original column
    ssd_dummies = pd.get_dummies(df['ssd_size_class'], dummy_na=False)
    df = pd.concat([df.drop(columns=['ssd_size']), ssd_dummies], axis=1)
    
    print("\nSSD size classes and counts:")
    print(ssd_dummies.sum().sort_values(ascending=False))


SSD size classes and counts:
129-256GB     6345
257-512GB     6328
513-1024GB    1697
Unknown       1049
<=128GB        807
1.1-2TB        136
2.1-4TB         18
4.1-8TB          8
>8TB             4
dtype: int64


In [9]:
# Normalize and bucketize hdd_size, then one-hot encode
if 'hdd_size' in df.columns:
    # extract numeric value and unit (GB/TB), convert to GB
    hdd_str = df['hdd_size'].astype(str).str.strip().str.upper()
    hdd_num = hdd_str.str.extract(r'(\d+(?:\.\d+)?)')[0].astype(float)
    is_tb_hdd = hdd_str.str.contains('TB', na=False)
    hdd_gb = hdd_num.where(~is_tb_hdd, hdd_num * 1024)
    
    # define bins and labels (GB)
    hdd_bins = [-np.inf, 128, 256, 512, 1024, 2048, 4096, 8192, np.inf]
    hdd_labels = ['<=128GB', '129-256GB', '257-512GB', '513-1024GB',
                  '1.1-2TB', '2.1-4TB', '4.1-8TB', '>8TB']
    df['hdd_size_class'] = pd.cut(hdd_gb, bins=hdd_bins, labels=hdd_labels, right=True)
    df['hdd_size_class'] = df['hdd_size_class'].astype(object).where(df['hdd_size_class'].notna(), 'Unknown')
    
    # one-hot encode and drop original column
    hdd_dummies = pd.get_dummies(df['hdd_size_class'], dummy_na=False)
    df = pd.concat([df.drop(columns=['hdd_size']), hdd_dummies], axis=1)
    
    print("\nHDD size classes and counts:")
    print(hdd_dummies.sum().sort_values(ascending=False))


HDD size classes and counts:
Unknown       15129
257-512GB       600
513-1024GB      531
129-256GB        80
<=128GB          33
1.1-2TB          16
>8TB              2
2.1-4TB           1
dtype: int64


In [10]:
# Bucketize price, then one-hot encode (range-based)
if 'price' in df.columns:
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    price_bins = [-np.inf, 20000, 40000, 60000, 80000, 100000, 150000, 250000, np.inf]
    price_labels = [
        '<=20000',
        '20001-40000',
        '40001-60000',
        '60001-80000',
        '80001-100000',
        '100001-150000',
        '150001-250000',
        '>250000'
    ]
    df['price_class'] = pd.cut(df['price'], bins=price_bins, labels=price_labels, right=True)
    df['price_class'] = df['price_class'].astype(object).where(df['price_class'].notna(), 'Unknown')
    
    price_dummies = pd.get_dummies(df['price_class'], dummy_na=False)
    df = pd.concat([df.drop(columns=['price']), price_dummies], axis=1)
    
    print("\nPrice classes and counts:")
    print(price_dummies.sum().sort_values(ascending=False))


Price classes and counts:
100001-150000    3399
150001-250000    2662
60001-80000      2538
40001-60000      2351
80001-100000     2269
>250000          1346
20001-40000      1278
<=20000           474
Unknown            75
dtype: int64


In [11]:
# Normalize and bucketize screen_size, then one-hot encode
if 'screen_size' in df.columns:
    screen_num = pd.to_numeric(df['screen_size'], errors='coerce')
    screen_bins = [-np.inf, 11, 12.9, 14, 15.6, 16.9, 18.4, np.inf]
    screen_labels = ['<=11"', '11.1-12.9"', '13-14"', '14.1-15.6"', '16-16.9"', '17-18.4"', '>18.4"']
    df['screen_size_class'] = pd.cut(screen_num, bins=screen_bins, labels=screen_labels, right=True)
    df['screen_size_class'] = df['screen_size_class'].astype(object).where(df['screen_size_class'].notna(), 'Unknown')

    screen_size_dummies = pd.get_dummies(df['screen_size_class'], dummy_na=False)
    df = pd.concat([df.drop(columns=['screen_size']), screen_size_dummies], axis=1)

    print("\nScreen size classes and counts:")
    print(screen_size_dummies.sum().sort_values(ascending=False))

# One-hot encode screen_resolution, then drop original column
if 'screen_resolution' in df.columns:
    screen_res_dummies = pd.get_dummies(df['screen_resolution'], dummy_na=False)
    screen_res_dummies = screen_res_dummies.rename(columns=lambda c: str(c).strip())
    df = pd.concat([df.drop(columns=['screen_resolution']), screen_res_dummies], axis=1)

    print("\nScreen resolution classes and counts:")
    print(screen_res_dummies.sum().sort_values(ascending=False))


Screen size classes and counts:
13-14"        8145
14.1-15.6"    5668
16-16.9"      1268
17-18.4"       587
11.1-12.9"     386
Unknown        227
<=11"          100
>18.4"          11
dtype: int64

Screen resolution classes and counts:
FHD      12929
QHD+      1720
3K         586
WUXGA      278
QHD        233
4K         113
HD         108
HD+         29
5K          28
dtype: int64


In [12]:
# Normalize and bucketize ram_type, then one-hot encode
if 'ram_type' in df.columns:
    ram_raw = df['ram_type']
    ram_norm = ram_raw.astype(str).str.strip().str.upper()
    ram_norm = ram_norm.where(ram_raw.notna(), np.nan)

    ram_conditions = [
        ram_norm.isna(),
        ram_norm.eq('DDR2'),
        ram_norm.isin(['DDR3', 'DDR3L']),
        ram_norm.isin(['DDR4', 'DDR4X']),
        ram_norm.isin(['DDR5', 'DDR5X']),
        ram_norm.eq('LPDDR3'),
        ram_norm.isin(['LPDDR4', 'LPDDR4X']),
        ram_norm.isin(['LPDDR5', 'LPDDR5X']),
        ram_norm.eq('UNIFIED'),
        ram_norm.eq('DDR')
    ]
    ram_choices = [
        'Unknown',
        'DDR2',
        'DDR3/DDR3L',
        'DDR4/DDR4X',
        'DDR5/DDR5X',
        'LPDDR3',
        'LPDDR4/LPDDR4X',
        'LPDDR5/LPDDR5X',
        'Unified',
        'DDR'
    ]
    df['ram_type_class'] = np.select(ram_conditions, ram_choices, default='Unknown')

    ram_type_dummies = pd.get_dummies(df['ram_type_class'], dummy_na=False)
    df = pd.concat([df.drop(columns=['ram_type']), ram_type_dummies], axis=1)

    print("\nRAM type classes and counts:")
    print(ram_type_dummies.sum().sort_values(ascending=False))

# Bucketize cores, then one-hot encode
if 'cores' in df.columns:
    cores_num = pd.to_numeric(df['cores'], errors='coerce')
    cores_bins = [-np.inf, 2, 4, 6, 8, 10, 12, 16, np.inf]
    cores_labels = ['1-2', '3-4', '5-6', '7-8', '9-10', '11-12', '13-16', '>16']
    df['cores_class'] = pd.cut(cores_num, bins=cores_bins, labels=cores_labels, right=True)
    df['cores_class'] = df['cores_class'].astype(object).where(df['cores_class'].notna(), 'Unknown')

    cores_dummies = pd.get_dummies(df['cores_class'], dummy_na=False)
    df = pd.concat([df.drop(columns=['cores']), cores_dummies], axis=1)

    print("\nCores classes and counts:")
    print(cores_dummies.sum().sort_values(ascending=False))

# Bucketize cpu_generation, then one-hot encode
if 'cpu_generation' in df.columns:
    gen_num = pd.to_numeric(df['cpu_generation'], errors='coerce')
    gen_bins = [-np.inf, 4, 8, 11, 13, 15, np.inf]
    gen_labels = ['1-4', '5-8', '9-11', '12-13', '14-15', '>15']
    df['cpu_generation_class'] = pd.cut(gen_num, bins=gen_bins, labels=gen_labels, right=True)
    df['cpu_generation_class'] = df['cpu_generation_class'].astype(object).where(df['cpu_generation_class'].notna(), 'Unknown')

    gen_dummies = pd.get_dummies(df['cpu_generation_class'], dummy_na=False)
    df = pd.concat([df.drop(columns=['cpu_generation']), gen_dummies], axis=1)

    print("\nCPU generation classes and counts:")
    print(gen_dummies.sum().sort_values(ascending=False))


RAM type classes and counts:
DDR4/DDR4X        9707
DDR5/DDR5X        3147
LPDDR5/LPDDR5X    1136
DDR3/DDR3L        1040
Unknown            729
LPDDR4/LPDDR4X     419
DDR2                96
LPDDR3              73
DDR                 26
Unified             19
dtype: int64

Cores classes and counts:
3-4        6527
1-2        5465
5-6        2042
7-8        1397
Unknown     793
9-10         81
11-12        74
13-16        13
dtype: int64

CPU generation classes and counts:
5-8        4524
9-11       3716
12-13      3700
1-4        2267
Unknown    1410
14-15       775
dtype: int64


In [13]:
import re

def _norm_text(value):
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return None
    text = str(value).strip()
    return text if text else None

def extract_dedicated_gpu(gpu_name):
    text = _norm_text(gpu_name)
    if text is None:
        return 'Unknown'
    upper = text.upper()

    # Dedicated checks
    if any(token in upper for token in ['GEFORCE', 'GTX', 'RTX', 'QUADRO', 'PRO W']):
        return 'Dedicated'
    if re.search(r'\bMX\s*\d{3,4}\b', upper):
        return 'Dedicated'
    if re.search(r'\bRADEON\s+PRO\b', upper):
        return 'Dedicated'
    if re.search(r'\bRADEON\s+\d{3,4}M\b', upper):
        return 'Dedicated'

    # Integrated checks
    if any(token in upper for token in ['UHD', 'HD GRAPHICS', 'IRIS', 'ARC']):
        return 'Integrated'
    if 'RADEON GRAPHICS' in upper and not re.search(r'\b\d{3,4}M\b', upper):
        return 'Integrated'
    if 'VEGA' in upper:
        return 'Integrated'
    if 'ADRENO' in upper:
        return 'Integrated'
    if 'APPLE' in upper or re.search(r'\bM[1-4]\b', upper):
        return 'Integrated'

    return 'Unknown'

def extract_gpu_family(gpu_name):
    text = _norm_text(gpu_name)
    if text is None:
        return 'Unknown'
    upper = text.upper()

    # NVIDIA
    if 'RTX' in upper:
        rtx_match = re.search(r'RTX\s*([2-5]\d{2,3})', upper)
        if rtx_match:
            model = int(rtx_match.group(1))
            if model >= 4000:
                return 'NVIDIA_RTX_40_50'
            if 2000 <= model <= 3999:
                return 'NVIDIA_RTX_20_30'
        if re.search(r'RTX\s*4\d{2}', upper) or re.search(r'RTX\s*5\d{2}', upper):
            return 'NVIDIA_RTX_40_50'
        if re.search(r'RTX\s*2\d{2}', upper) or re.search(r'RTX\s*3\d{2}', upper):
            return 'NVIDIA_RTX_20_30'
    if 'GTX' in upper:
        return 'NVIDIA_GTX'
    if re.search(r'\bMX\s*\d{3,4}\b', upper):
        return 'NVIDIA_MX'
    if any(token in upper for token in ['NVIDIA', 'GEFORCE', 'QUADRO', 'TESLA']):
        return 'NVIDIA_Legacy'

    # AMD
    if 'RADEON PRO' in upper:
        return 'AMD_Radeon_Pro'
    if re.search(r'\b([6-8]\d{2})M\b', upper):
        return 'AMD_Radeon_Mobile'
    if 'RADEON GRAPHICS' in upper or 'VEGA' in upper:
        return 'AMD_Radeon_Integrated'
    if 'RADEON' in upper:
        return 'AMD_Radeon_Integrated'

    # Intel
    if 'ARC' in upper:
        return 'Intel_Arc'
    if 'IRIS' in upper:
        return 'Intel_Iris'
    if 'UHD' in upper or 'HD GRAPHICS' in upper:
        return 'Intel_Integrated'

    # Apple
    if 'APPLE' in upper or re.search(r'\bM[1-4]\b', upper):
        return 'Apple_GPU'

    # Qualcomm
    if 'ADRENO' in upper:
        return 'Qualcomm_Adreno'

    return 'Unknown'

def _apple_gpu_core_count(upper):
    match = re.search(r'(\d{1,2})\s*-?\s*CORE', upper)
    if match:
        return int(match.group(1))
    return None

def extract_gpu_tier(gpu_name):
    text = _norm_text(gpu_name)
    if text is None:
        return 'Unknown'
    upper = text.upper()

    # Legacy checks
    if re.search(r'\bGMA\b', upper) or re.search(r'\bRADEON\s+X\d{3,4}\b', upper):
        return 'Legacy'
    if re.search(r'\bGEFORCE\s+[6-8]\d{3}\b', upper):
        return 'Legacy'
    if 'QUADRO' in upper and re.search(r'\b[1-3]\d{3}\b', upper):
        return 'Legacy'

    # Apple GPU tiers by core count
    if 'APPLE' in upper or re.search(r'\bM[1-4]\b', upper):
        cores = _apple_gpu_core_count(upper)
        if cores is not None:
            if cores >= 38:
                return 'High_End'
            if 14 <= cores <= 37:
                return 'Mid_Range'
            if cores < 14:
                return 'Entry_Level'

    # NVIDIA RTX tiers
    rtx_match = re.search(r'RTX\s*([2-5]\d{2,3})', upper)
    if rtx_match:
        model = int(rtx_match.group(1))
        if model in [5090, 5080, 5070, 4090, 4080, 4070, 3090, 3080, 3070]:
            return 'High_End'
        if model in [5060, 4060, 3060, 2060, 4050, 3050, 2050]:
            return 'Mid_Range'
        return 'Entry_Level'

    # NVIDIA GTX tiers
    gtx_match = re.search(r'GTX\s*(\d{3,4})', upper)
    if gtx_match:
        model = int(gtx_match.group(1))
        if model in [1080, 1070]:
            return 'High_End'
        if model in [1660, 1650, 1060, 960, 970]:
            return 'Mid_Range'
        if model <= 980 and model not in [960, 970]:
            return 'Entry_Level'
        return 'Entry_Level'

    # NVIDIA GT / MX tiers
    if re.search(r'\bGT\s*\d{2,3}\b', upper):
        return 'Entry_Level'
    if re.search(r'\bMX\s*\d{3,4}\b', upper):
        return 'Entry_Level'

    # AMD tiers
    if re.search(r'\b(890|780)M\b', upper):
        return 'High_End'
    if re.search(r'\b(740|680|660|610)M\b', upper):
        return 'Mid_Range'
    if re.search(r'RADEON\s+PRO\s+W(6300|5500|5700)\b', upper):
        return 'High_End'
    if 'RADEON GRAPHICS' in upper or 'VEGA' in upper:
        return 'Entry_Level'

    # Intel tiers
    if 'ARC' in upper:
        return 'Mid_Range'
    if any(token in upper for token in ['UHD', 'HD GRAPHICS', 'IRIS']):
        return 'Entry_Level'

    # Qualcomm
    if 'ADRENO' in upper:
        return 'Entry_Level'

    return 'Unknown'

def _intel_generation(upper):
    match = re.search(r'\bI[3579]-\s*(\d{3,5})', upper)
    if not match:
        return None
    digits = match.group(1)
    if len(digits) >= 4:
        return int(digits[:2])
    return int(digits[0])

def _amd_ryzen_series(upper):
    match = re.search(r'RYZEN\s*[3579]\s*(\d{4,5})', upper)
    if not match:
        return None
    digits = match.group(1)
    if len(digits) >= 4:
        return int(digits[0]) * 1000
    return int(digits)

def extract_cpu_tier(mapped_cpu_name):
    text = _norm_text(mapped_cpu_name)
    if text is None:
        return 'Unknown'
    upper = text.upper()

    # Legacy CPUs
    if any(token in upper for token in ['ATOM', 'PENTIUM M', 'CORE 2 DUO', 'CORE DUO', 'SEMPRON', 'E1', 'E2', 'SNAPDRAGON', 'VIA']):
        return 'Legacy'

    # High-end
    if any(token in upper for token in ['I9', 'XEON', 'ULTRA 9', 'RYZEN 9']):
        return 'High_End'
    if re.search(r'\bI7-\d+.*\b(HX|HK)\b', upper):
        return 'High_End'
    if re.search(r'\bM[1-4]\s*(PRO|MAX|ULTRA)\b', upper):
        return 'High_End'

    # Mid-range
    if re.search(r'\bI7-\d+.*\b(H|P)\b', upper):
        return 'Mid_Range'
    if re.search(r'\bI5-\d+.*\b(H|HX|P)\b', upper):
        return 'Mid_Range'
    if any(token in upper for token in ['ULTRA 7', 'ULTRA 5']):
        return 'Mid_Range'
    if 'RYZEN 7' in upper:
        return 'Mid_Range'
    if 'RYZEN 5' in upper:
        amd_series = _amd_ryzen_series(upper)
        if amd_series is not None and amd_series >= 5000:
            return 'Mid_Range'
        return 'Entry_Level'
    if re.search(r'\bM[1-4]\b', upper) and not re.search(r'\bM[1-4]\s*(PRO|MAX|ULTRA)\b', upper):
        return 'Mid_Range'

    # Entry-level
    if 'I3' in upper:
        return 'Entry_Level'
    if re.search(r'\bI5-\d+.*\b(U|G)\b', upper):
        return 'Entry_Level'
    if any(token in upper for token in ['PENTIUM', 'CELERON', 'CORE M', 'RYZEN 3', 'ATHLON']):
        return 'Entry_Level'
    if re.search(r'\bA(6|8|10|12)\b', upper):
        return 'Entry_Level'
    intel_gen = _intel_generation(upper)
    if intel_gen is not None and intel_gen <= 4:
        return 'Entry_Level'

    return 'Unknown'

In [14]:
# Create categorical columns using extraction functions
if 'gpu_name' in df.columns:
    df['dedicated_gpu'] = df['gpu_name'].apply(extract_dedicated_gpu)
    df['gpu_family'] = df['gpu_name'].apply(extract_gpu_family)
    df['gpu_tier'] = df['gpu_name'].apply(extract_gpu_tier)

if 'mapped_cpu_name' in df.columns:
    df['cpu_tier'] = df['mapped_cpu_name'].apply(extract_cpu_tier)

# One-hot encode new categorical columns and drop originals
columns_to_encode = ['dedicated_gpu', 'gpu_family', 'gpu_tier', 'cpu_tier']
for col in columns_to_encode:
    if col in df.columns:
        dummies = pd.get_dummies(df[col], dummy_na=False)
        dummies = dummies.rename(columns=lambda c: f"{col}_{str(c).strip()}")
        df = pd.concat([df.drop(columns=[col]), dummies], axis=1)

# Drop old source columns after encoding
columns_to_drop = ['gpu_name', 'mapped_cpu_name']
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

print("\nGPU/CPU derived columns added and one-hot encoded.")
print("\nCurrent column count:", len(df.columns))


GPU/CPU derived columns added and one-hot encoded.

Current column count: 195


In [15]:
# display the columns of df to verify the changes and the number of columns
print("\nFinal dataset columns after all transformations:")
print(df.columns.to_list())
print(f"\nTotal number of columns: {len(df.columns)}")
print(df.shape)

# allow to print many rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# list the type of each column
print("\nColumn data types:")
print(df.dtypes)


Final dataset columns after all transformations:
['id', 'cpu_fam_AMD_A_Series', 'cpu_fam_AMD_Athlon', 'cpu_fam_AMD_FX', 'cpu_fam_AMD_Other', 'cpu_fam_AMD_Ryzen_3', 'cpu_fam_AMD_Ryzen_5', 'cpu_fam_AMD_Ryzen_7', 'cpu_fam_AMD_Ryzen_9', 'cpu_fam_Apple_M1', 'cpu_fam_Apple_M1_Max', 'cpu_fam_Apple_M1_Pro', 'cpu_fam_Apple_M2', 'cpu_fam_Apple_M2_Max', 'cpu_fam_Apple_M2_Pro', 'cpu_fam_Apple_M3', 'cpu_fam_Apple_M3_Max', 'cpu_fam_Apple_M3_Pro', 'cpu_fam_Apple_M4', 'cpu_fam_Apple_M4_Max', 'cpu_fam_Apple_M4_Pro', 'cpu_fam_Intel_Atom', 'cpu_fam_Intel_Celeron', 'cpu_fam_Intel_Core_M', 'cpu_fam_Intel_Other', 'cpu_fam_Intel_Pentium', 'cpu_fam_Intel_Ultra_5', 'cpu_fam_Intel_Ultra_7', 'cpu_fam_Intel_Ultra_9', 'cpu_fam_Intel_Xeon', 'cpu_fam_Intel_i3', 'cpu_fam_Intel_i5', 'cpu_fam_Intel_i7', 'cpu_fam_Intel_i9', 'cpu_fam_Qualcomm_Snapdragon', 'cpu_fam_Unknown', 'Bon_état', 'jamais_utilisé', 'moyen', 'AERO', 'ALIENWARE', 'ASPIRE', 'BLADE', 'COMPAQ', 'DYNABOOK', 'ELITEBOOK', 'ENVY', 'GALAXY', 'GF', 'IDEAPAD',

In [27]:
# remove columns 'Unknown'
columns_to_remove = [col for col in df.columns if 'Unknown' in col]
df.drop(columns=columns_to_remove, inplace=True, errors='ignore')   

# display all columns named 'Uknown' to check
print("\nColumns containing 'Unknown':")
unknown_columns = [col for col in df.columns if 'Unknown' in col]
print(unknown_columns)


Columns containing 'Unknown':
[]


In [28]:
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

def _get_model_name_columns(df):
    if 'model_name_dummies' in globals():
        return [c for c in model_name_dummies.columns if c in df.columns]
    return []

def _get_gen_columns(df):
    if 'gen_dummies' in globals():
        return [c for c in gen_dummies.columns if c in df.columns]
    return []

def _is_binary_series(series):
    values = set(series.dropna().unique())
    return values <= {0, 1}

def build_experiment_df(df, drop_prefixes=None, drop_columns=None):
    data = df.copy()
    drop_cols = set()
    if drop_prefixes:
        drop_cols.update([c for c in data.columns if any(c.startswith(p) for p in drop_prefixes)])
    if drop_columns:
        drop_cols.update([c for c in drop_columns if c in data.columns])
    data = data.drop(columns=list(drop_cols), errors='ignore')

    # Keep only binary columns (handle duplicate column names)
    binary_idx = []
    for idx, col in enumerate(data.columns):
        series = data.iloc[:, idx]
        if _is_binary_series(series):
            binary_idx.append(idx)
    return data.iloc[:, binary_idx]

def run_rules_apriori(data, min_support=0.02, min_confidence=0.5, min_lift=1.0, max_len=None, top_n=20):
    frequent = apriori(data, min_support=min_support, use_colnames=True, max_len=max_len, low_memory=True)
    rules = association_rules(frequent, metric='confidence', min_threshold=min_confidence)
    rules = rules[rules['lift'] >= min_lift]
    rules = rules.sort_values(['lift', 'confidence'], ascending=False)
    return frequent, rules.head(top_n)

def run_rules_fpgrowth(data, min_support=0.02, min_confidence=0.5, min_lift=1.0, max_len=None, top_n=20):
    frequent = fpgrowth(data, min_support=min_support, use_colnames=True, max_len=max_len)
    rules = association_rules(frequent, metric='confidence', min_threshold=min_confidence)
    rules = rules[rules['lift'] >= min_lift]
    rules = rules.sort_values(['lift', 'confidence'], ascending=False)
    return frequent, rules.head(top_n)

# Build experiment datasets
model_name_cols = _get_model_name_columns(df)
gen_cols = _get_gen_columns(df)

exp_1_1 = build_experiment_df(
    df,
    drop_prefixes=['cpu_fam_', 'gpu_family_'],
    drop_columns=['cpu_generation_class'] + gen_cols
)

exp_1_2 = build_experiment_df(
    df,
    drop_prefixes=['cpu_fam_', 'gpu_family_'],
    drop_columns=['cpu_generation_class'] + gen_cols + model_name_cols
)

exp_2_1 = build_experiment_df(
    df,
    drop_prefixes=['cpu_tier_', 'gpu_tier_'],
    drop_columns=['cpu_generation_class']
)

exp_2_2 = build_experiment_df(
    df,
    drop_prefixes=['cpu_tier_', 'gpu_tier_'],
    drop_columns=['cpu_generation_class'] + model_name_cols
 )

print("Experiment datasets ready:")
print("exp_1_1:", exp_1_1.shape)
print("exp_1_2:", exp_1_2.shape)
print("exp_2_1:", exp_2_1.shape)
print("exp_2_2:", exp_2_2.shape)


Experiment datasets ready:
exp_1_1: (16392, 121)
exp_1_2: (16392, 75)
exp_2_1: (16392, 165)
exp_2_2: (16392, 119)


In [29]:
# Example run (Apriori)
# experiment 1-1
frequent_1_1, rules_1_1 = run_rules_apriori(exp_1_1, min_support=0.05, min_confidence=0.6, min_lift=1.1)
print("\nTop rules - Experiment 1-1 (Apriori)")
display(rules_1_1)


Top rules - Experiment 1-1 (Apriori)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
320,"(QHD+, 13-14"")",(MACBOOK),0.071315,0.101452,0.068692,0.963216,9.494314,0.061457,24.42797,0.963377
323,(MACBOOK),"(QHD+, 13-14"")",0.101452,0.071315,0.068692,0.67709,9.494314,0.061457,2.875983,0.995688
1598,"(QHD+, 13-14"", dedicated_gpu_Integrated)",(MACBOOK),0.053502,0.101452,0.051184,0.95667,9.429791,0.045756,20.737544,0.944485
327,"(QHD+, dedicated_gpu_Integrated)",(MACBOOK),0.073389,0.101452,0.070095,0.955112,9.414431,0.06265,20.017654,0.964569
330,(MACBOOK),"(QHD+, dedicated_gpu_Integrated)",0.101452,0.073389,0.070095,0.69092,9.414431,0.06265,2.997964,0.994694
1600,"(QHD+, dedicated_gpu_Integrated)","(MACBOOK, 13-14"")",0.073389,0.074244,0.051184,0.697423,9.393722,0.045735,3.059574,0.964317
1603,"(MACBOOK, 13-14"")","(QHD+, dedicated_gpu_Integrated)",0.074244,0.073389,0.051184,0.6894,9.393722,0.045735,2.983294,0.965206
1601,"(QHD+, 13-14"")","(MACBOOK, dedicated_gpu_Integrated)",0.071315,0.076745,0.051184,0.717707,9.351876,0.04571,3.270562,0.96165
1602,"(MACBOOK, dedicated_gpu_Integrated)","(QHD+, 13-14"")",0.076745,0.071315,0.051184,0.666932,9.351876,0.04571,2.788271,0.967305
321,"(MACBOOK, 13-14"")",(QHD+),0.074244,0.104929,0.068692,0.925226,8.817619,0.060902,11.970342,0.957693


In [30]:
# Example run (Apriori)
# experiment 1-2
frequent_1_2, rules_1_2 = run_rules_apriori(exp_1_2, min_support=0.05, min_confidence=0.6, min_lift=1.1)
print("\nTop rules - Experiment 1-2 (Apriori)")
display(rules_1_2)


Top rules - Experiment 1-2 (Apriori)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
834,(LPDDR5/LPDDR5X),"(dedicated_gpu_Integrated, gpu_tier_Mid_Range)",0.069302,0.084553,0.051245,0.739437,8.745198,0.045385,3.513335,0.951599
833,"(dedicated_gpu_Integrated, gpu_tier_Mid_Range)",(LPDDR5/LPDDR5X),0.084553,0.069302,0.051245,0.606061,8.745198,0.045385,2.362541,0.967453
2638,"(cpu_tier_Mid_Range, 13-14"")","(gpu_tier_Mid_Range, dedicated_gpu_Integrated)",0.074427,0.084553,0.050939,0.684426,8.094599,0.044646,2.900896,0.946938
2639,"(gpu_tier_Mid_Range, dedicated_gpu_Integrated)","(cpu_tier_Mid_Range, 13-14"")",0.084553,0.074427,0.050939,0.602453,8.094599,0.044646,2.328212,0.957413
2640,"(13-14"", gpu_tier_Mid_Range)","(cpu_tier_Mid_Range, dedicated_gpu_Integrated)",0.070766,0.105722,0.050939,0.719828,6.808663,0.043458,3.191883,0.918099
2637,"(13-14"", gpu_tier_Mid_Range, dedicated_gpu_Int...",(cpu_tier_Mid_Range),0.05948,0.14562,0.050939,0.85641,5.881138,0.042278,5.950148,0.882453
852,"(dedicated_gpu_Integrated, gpu_tier_Mid_Range)",(cpu_tier_Mid_Range),0.084553,0.14562,0.06979,0.825397,5.668163,0.057477,4.893268,0.899644
712,"(13-14"", gpu_tier_Mid_Range)",(cpu_tier_Mid_Range),0.070766,0.14562,0.054966,0.776724,5.333918,0.044661,3.826568,0.874398
2215,"(257-512GB, dedicated_gpu_Dedicated)","(gpu_tier_Mid_Range, FHD)",0.103038,0.117374,0.063995,0.621078,5.291426,0.051901,2.329304,0.90418
2675,"(14.1-15.6"", gpu_tier_Mid_Range)","(dedicated_gpu_Dedicated, FHD)",0.075098,0.163372,0.060273,0.8026,4.9127,0.048004,4.238225,0.861114


In [31]:
# Example run (Apriori)
# experiment 2-1
frequent_2_1, rules_2_1 = run_rules_apriori(exp_2_1, min_support=0.05, min_confidence=0.5, min_lift=1.05)
print("\nTop rules - Experiment 2-1 (Apriori)")
display(rules_2_1)


Top rules - Experiment 2-1 (Apriori)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
12752,"(QHD+, 1-4, dedicated_gpu_Integrated)","(MACBOOK, gpu_family_Apple_GPU)",0.060273,0.058992,0.054051,0.896761,15.201353,0.050495,9.11486,0.994136
12768,"(MACBOOK, gpu_family_Apple_GPU)","(QHD+, 1-4, dedicated_gpu_Integrated)",0.058992,0.060273,0.054051,0.916236,15.201353,0.050495,11.218713,0.992783
12764,"(QHD+, gpu_family_Apple_GPU)","(MACBOOK, 1-4, dedicated_gpu_Integrated)",0.054112,0.065947,0.054051,0.998873,15.146642,0.050482,828.505185,0.987409
12757,"(MACBOOK, 1-4, dedicated_gpu_Integrated)","(QHD+, gpu_family_Apple_GPU)",0.065947,0.054112,0.054051,0.819611,15.146642,0.050482,5.243616,0.99992
4455,"(MACBOOK, gpu_family_Apple_GPU)","(QHD+, 1-4)",0.058992,0.060578,0.054051,0.916236,15.124811,0.050477,11.215071,0.992429
12758,"(MACBOOK, dedicated_gpu_Integrated, gpu_family...","(QHD+, 1-4)",0.058992,0.060578,0.054051,0.916236,15.124811,0.050477,11.215071,0.992429
4452,"(QHD+, 1-4)","(MACBOOK, gpu_family_Apple_GPU)",0.060578,0.058992,0.054051,0.892246,15.124811,0.050477,8.732904,0.994105
12763,"(QHD+, 1-4)","(MACBOOK, dedicated_gpu_Integrated, gpu_family...",0.060578,0.058992,0.054051,0.892246,15.124811,0.050477,8.732904,0.994105
4454,"(MACBOOK, 1-4)","(QHD+, gpu_family_Apple_GPU)",0.066252,0.054112,0.054051,0.815838,15.076906,0.050466,5.136173,0.99992
12767,"(MACBOOK, 1-4)","(QHD+, dedicated_gpu_Integrated, gpu_family_Ap...",0.066252,0.054112,0.054051,0.815838,15.076906,0.050466,5.136173,0.99992


In [32]:
# Example run (Apriori)
# experiment 2-1
frequent_2_2, rules_2_2 = run_rules_apriori(exp_2_2, min_support=0.05, min_confidence=0.5, min_lift=1.05)
print("\nTop rules - Experiment 2-2 (Apriori)")
display(rules_2_2)


Top rules - Experiment 2-2 (Apriori)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
6801,"(QHD+, 1-4, dedicated_gpu_Integrated)",(gpu_family_Apple_GPU),0.060273,0.061859,0.054112,0.897773,14.513116,0.050383,9.177058,0.990817
6811,(gpu_family_Apple_GPU),"(QHD+, 1-4, dedicated_gpu_Integrated)",0.061859,0.060273,0.054112,0.874753,14.513116,0.050383,7.503015,0.992492
1626,"(QHD+, 1-4)",(gpu_family_Apple_GPU),0.060578,0.061859,0.054112,0.893253,14.440039,0.050364,8.78843,0.990767
6805,"(QHD+, 1-4)","(dedicated_gpu_Integrated, gpu_family_Apple_GPU)",0.060578,0.061859,0.054112,0.893253,14.440039,0.050364,8.78843,0.990767
1630,(gpu_family_Apple_GPU),"(QHD+, 1-4)",0.061859,0.060578,0.054112,0.874753,14.440039,0.050364,7.500579,0.99212
6809,"(dedicated_gpu_Integrated, gpu_family_Apple_GPU)","(QHD+, 1-4)",0.061859,0.060578,0.054112,0.874753,14.440039,0.050364,7.500579,0.99212
1635,(gpu_family_Apple_GPU),"(QHD+, dedicated_gpu_Integrated)",0.061859,0.073389,0.054112,0.874753,11.919334,0.049572,7.398292,0.976509
6808,"(1-4, gpu_family_Apple_GPU)","(QHD+, dedicated_gpu_Integrated)",0.061859,0.073389,0.054112,0.874753,11.919334,0.049572,7.398292,0.976509
1631,"(QHD+, dedicated_gpu_Integrated)",(gpu_family_Apple_GPU),0.073389,0.061859,0.054112,0.737323,11.919334,0.049572,3.571465,0.98866
6806,"(QHD+, dedicated_gpu_Integrated)","(1-4, gpu_family_Apple_GPU)",0.073389,0.061859,0.054112,0.737323,11.919334,0.049572,3.571465,0.98866


In [33]:
# Example run (FP-Growth)
# experiment 1-1
fp_frequent_1_1, fp_rules_1_1 = run_rules_fpgrowth(exp_1_1, min_support=0.05, min_confidence=0.6, min_lift=1.1)
print("\nTop rules - Experiment 1-1 (FP-Growth)")
display(fp_rules_1_1)


Top rules - Experiment 1-1 (FP-Growth)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
10453,"(QHD+, 13-14"")",(MACBOOK),0.071315,0.101452,0.068692,0.963216,9.494314,0.061457,24.42797,0.963377
10456,(MACBOOK),"(QHD+, 13-14"")",0.101452,0.071315,0.068692,0.67709,9.494314,0.061457,2.875983,0.995688
10461,"(QHD+, 13-14"", dedicated_gpu_Integrated)",(MACBOOK),0.053502,0.101452,0.051184,0.95667,9.429791,0.045756,20.737544,0.944485
10468,"(QHD+, dedicated_gpu_Integrated)",(MACBOOK),0.073389,0.101452,0.070095,0.955112,9.414431,0.06265,20.017654,0.964569
10471,(MACBOOK),"(QHD+, dedicated_gpu_Integrated)",0.101452,0.073389,0.070095,0.69092,9.414431,0.06265,2.997964,0.994694
10463,"(QHD+, dedicated_gpu_Integrated)","(MACBOOK, 13-14"")",0.073389,0.074244,0.051184,0.697423,9.393722,0.045735,3.059574,0.964317
10466,"(MACBOOK, 13-14"")","(QHD+, dedicated_gpu_Integrated)",0.074244,0.073389,0.051184,0.6894,9.393722,0.045735,2.983294,0.965206
10464,"(QHD+, 13-14"")","(MACBOOK, dedicated_gpu_Integrated)",0.071315,0.076745,0.051184,0.717707,9.351876,0.04571,3.270562,0.96165
10465,"(MACBOOK, dedicated_gpu_Integrated)","(QHD+, 13-14"")",0.076745,0.071315,0.051184,0.666932,9.351876,0.04571,2.788271,0.967305
10454,"(MACBOOK, 13-14"")",(QHD+),0.074244,0.104929,0.068692,0.925226,8.817619,0.060902,11.970342,0.957693


In [34]:
# Example run (FP-Growth)
# experiment 1-2
fp_frequent_1_2, fp_rules_1_2 = run_rules_fpgrowth(exp_1_2, min_support=0.05, min_confidence=0.6, min_lift=1.1)
print("\nTop rules - Experiment 1-1 (FP-Growth)")
display(fp_rules_1_1)


Top rules - Experiment 1-1 (FP-Growth)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
10453,"(QHD+, 13-14"")",(MACBOOK),0.071315,0.101452,0.068692,0.963216,9.494314,0.061457,24.42797,0.963377
10456,(MACBOOK),"(QHD+, 13-14"")",0.101452,0.071315,0.068692,0.67709,9.494314,0.061457,2.875983,0.995688
10461,"(QHD+, 13-14"", dedicated_gpu_Integrated)",(MACBOOK),0.053502,0.101452,0.051184,0.95667,9.429791,0.045756,20.737544,0.944485
10468,"(QHD+, dedicated_gpu_Integrated)",(MACBOOK),0.073389,0.101452,0.070095,0.955112,9.414431,0.06265,20.017654,0.964569
10471,(MACBOOK),"(QHD+, dedicated_gpu_Integrated)",0.101452,0.073389,0.070095,0.69092,9.414431,0.06265,2.997964,0.994694
10463,"(QHD+, dedicated_gpu_Integrated)","(MACBOOK, 13-14"")",0.073389,0.074244,0.051184,0.697423,9.393722,0.045735,3.059574,0.964317
10466,"(MACBOOK, 13-14"")","(QHD+, dedicated_gpu_Integrated)",0.074244,0.073389,0.051184,0.6894,9.393722,0.045735,2.983294,0.965206
10464,"(QHD+, 13-14"")","(MACBOOK, dedicated_gpu_Integrated)",0.071315,0.076745,0.051184,0.717707,9.351876,0.04571,3.270562,0.96165
10465,"(MACBOOK, dedicated_gpu_Integrated)","(QHD+, 13-14"")",0.076745,0.071315,0.051184,0.666932,9.351876,0.04571,2.788271,0.967305
10454,"(MACBOOK, 13-14"")",(QHD+),0.074244,0.104929,0.068692,0.925226,8.817619,0.060902,11.970342,0.957693


In [35]:
# Example run (FP-Growth)
# experiment 2-1
fp_frequent_2_1, fp_rules_2_1 = run_rules_fpgrowth(exp_2_1, min_support=0.05, min_confidence=0.5, min_lift=1.05)
print("\nTop rules - Experiment 2-1 (FP-Growth)")
display(fp_rules_2_1)


Top rules - Experiment 2-1 (FP-Growth)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
35108,"(QHD+, 1-4, dedicated_gpu_Integrated)","(MACBOOK, gpu_family_Apple_GPU)",0.060273,0.058992,0.054051,0.896761,15.201353,0.050495,9.11486,0.994136
35124,"(MACBOOK, gpu_family_Apple_GPU)","(QHD+, 1-4, dedicated_gpu_Integrated)",0.058992,0.060273,0.054051,0.916236,15.201353,0.050495,11.218713,0.992783
35120,"(QHD+, gpu_family_Apple_GPU)","(MACBOOK, 1-4, dedicated_gpu_Integrated)",0.054112,0.065947,0.054051,0.998873,15.146642,0.050482,828.505185,0.987409
35113,"(MACBOOK, 1-4, dedicated_gpu_Integrated)","(QHD+, gpu_family_Apple_GPU)",0.065947,0.054112,0.054051,0.819611,15.146642,0.050482,5.243616,0.99992
35097,"(MACBOOK, gpu_family_Apple_GPU)","(QHD+, 1-4)",0.058992,0.060578,0.054051,0.916236,15.124811,0.050477,11.215071,0.992429
35114,"(MACBOOK, dedicated_gpu_Integrated, gpu_family...","(QHD+, 1-4)",0.058992,0.060578,0.054051,0.916236,15.124811,0.050477,11.215071,0.992429
35094,"(QHD+, 1-4)","(MACBOOK, gpu_family_Apple_GPU)",0.060578,0.058992,0.054051,0.892246,15.124811,0.050477,8.732904,0.994105
35119,"(QHD+, 1-4)","(MACBOOK, dedicated_gpu_Integrated, gpu_family...",0.060578,0.058992,0.054051,0.892246,15.124811,0.050477,8.732904,0.994105
35096,"(MACBOOK, 1-4)","(QHD+, gpu_family_Apple_GPU)",0.066252,0.054112,0.054051,0.815838,15.076906,0.050466,5.136173,0.99992
35123,"(MACBOOK, 1-4)","(QHD+, dedicated_gpu_Integrated, gpu_family_Ap...",0.066252,0.054112,0.054051,0.815838,15.076906,0.050466,5.136173,0.99992


In [36]:
# Example run (FP-Growth)
# experiment 2-1
fp_frequent_2_2, fp_rules_2_2 = run_rules_fpgrowth(exp_2_2, min_support=0.05, min_confidence=0.5, min_lift=1.05)
print("\nTop rules - Experiment 2-2 (FP-Growth)")
display(fp_rules_2_2)


Top rules - Experiment 2-2 (FP-Growth)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
33453,"(QHD+, 1-4, dedicated_gpu_Integrated)",(gpu_family_Apple_GPU),0.060273,0.061859,0.054112,0.897773,14.513116,0.050383,9.177058,0.990817
33463,(gpu_family_Apple_GPU),"(QHD+, 1-4, dedicated_gpu_Integrated)",0.061859,0.060273,0.054112,0.874753,14.513116,0.050383,7.503015,0.992492
33443,"(QHD+, 1-4)",(gpu_family_Apple_GPU),0.060578,0.061859,0.054112,0.893253,14.440039,0.050364,8.78843,0.990767
33457,"(QHD+, 1-4)","(dedicated_gpu_Integrated, gpu_family_Apple_GPU)",0.060578,0.061859,0.054112,0.893253,14.440039,0.050364,8.78843,0.990767
33447,(gpu_family_Apple_GPU),"(QHD+, 1-4)",0.061859,0.060578,0.054112,0.874753,14.440039,0.050364,7.500579,0.99212
33461,"(dedicated_gpu_Integrated, gpu_family_Apple_GPU)","(QHD+, 1-4)",0.061859,0.060578,0.054112,0.874753,14.440039,0.050364,7.500579,0.99212
33452,(gpu_family_Apple_GPU),"(QHD+, dedicated_gpu_Integrated)",0.061859,0.073389,0.054112,0.874753,11.919334,0.049572,7.398292,0.976509
33460,"(1-4, gpu_family_Apple_GPU)","(QHD+, dedicated_gpu_Integrated)",0.061859,0.073389,0.054112,0.874753,11.919334,0.049572,7.398292,0.976509
33448,"(QHD+, dedicated_gpu_Integrated)",(gpu_family_Apple_GPU),0.073389,0.061859,0.054112,0.737323,11.919334,0.049572,3.571465,0.98866
33458,"(QHD+, dedicated_gpu_Integrated)","(1-4, gpu_family_Apple_GPU)",0.073389,0.061859,0.054112,0.737323,11.919334,0.049572,3.571465,0.98866


In [37]:
# export the results as csv to anlyze them
rules_1_1.to_csv('rules_1_1_apriori.csv', index=False)
rules_1_2.to_csv('rules_1_2_apriori.csv', index=False)
rules_2_1.to_csv('rules_2_1_apriori.csv', index=False)
rules_2_2.to_csv('rules_2_2_apriori.csv', index=False)
fp_rules_1_1.to_csv('rules_1_1_fpgrowth.csv', index=False)
fp_rules_1_2.to_csv('rules_1_2_fpgrowth.csv', index=False)
fp_rules_2_1.to_csv('rules_2_1_fpgrowth.csv', index=False)
fp_rules_2_2.to_csv('rules_2_2_fpgrowth.csv', index=False)
print("\nRules exported to CSV files.")


Rules exported to CSV files.
