Import required libraries for data manipulation and numerical operations

In [2]:
import pandas as pd
import numpy as np

# 1. Load Dataset

Upload dataset from Google Colab

In [5]:
# from google.colab import files
# uploaded = files.upload()

Read the uploaded CSV file into a pandas DataFrame

In [None]:
# df = pd.read_csv(next(iter(uploaded)))  # for Google Colab file upload
df = pd.read_csv("data_cleaned.csv")
dfcpus = pd.read_csv("cleaned_all_cpus_laptop.csv")
dfgpus = pd.read_csv("cleaned_gpus.csv")

# 2. Data Overview (EDA)

Column info

In [10]:
# display the columns of the dataframe
# display the columns of the original dataset
print("Columns in the dataframe (original dataset):")
print(df.columns.tolist())

print("\n-----------------------------------\n")

# display the columns of the cpu dataset
print("\nColumns in the cpu dataframe:")
print(dfcpus.columns.tolist())

print("\n-----------------------------------\n")

# display the columns of the gpu dataset
print("\nColumns in the gpu dataframe:")
print(dfgpus.columns.tolist())

Columns in the dataframe (original dataset):
['price_preview', 'created_at', 'city', 'spec_Etat', 'model_name', 'DEDICATED_GPU', 'CPU', 'RAM_SIZE', 'SSD_SIZE', 'HDD_SIZE', 'SCREEN_SIZE', 'SCREEN_FREQUENCY', 'SCREEN_RESOLUTION', 'RAM_TYPE', 'mapped_cpu_name', 'match_score', 'cores', 'cpu_mark', 'tdp', 'gpu_name']

-----------------------------------


Columns in the cpu dataframe:
['name', 'cpumark', 'tdp', 'cat', 'cores', 'gpu_name']

-----------------------------------


Columns in the gpu dataframe:
['gpu_name', 'g3d_mark', 'g2d_mark', 'tdp(w)']


dataset overview

In [12]:
# display an overview of the original dataframe
print(df.head(10))
print("\n-----------------------------------\n")
print("\nDataframe shape:\n")
print(f"The dataframe has {df.shape[0]} rows and {df.shape[1]} columns.")

print("\n-----------------------------------\n")

# display and overview of cpu dataframe
print(dfcpus.head(3))
print("\n-----------------------------------\n")
print("\nCPU Dataframe shape:\n")
print(f"The CPU dataframe has {dfcpus.shape[0]} rows and {dfcpus.shape[1]} columns.")

print("\n-----------------------------------\n")

# display an overview of gpu dataframe
print(dfgpus.head(3))
print("\n-----------------------------------\n")
print("\nGPU Dataframe shape:\n")
print(f"The GPU dataframe has {dfgpus.shape[0]} rows and {dfgpus.shape[1]} columns.")  

   price_preview                created_at            city      spec_Etat  \
0     75000000.0  2021 10 01T18:01:44.000Z        EL TAREF        BON TAT   
1     33500000.0  2021 11 10T21:24:14.000Z           COLLO  JAMAIS UTILIS   
2     17000000.0  2021 09 11T20:27:59.000Z        MECHERIA            NaN   
3     12000000.0  2025 03 06T00:28:39.000Z        ES SENIA            NaN   
4     11000000.0  2024 10 09T18:10:21.000Z      TIZI OUZOU        BON TAT   
5      9999999.0  2025 02 18T21:30:18.000Z      MOHAMMADIA            NaN   
6      9900000.0  2025 04 29T19:42:16.000Z  CHELGHOUM LAID            NaN   
7      9000000.0  2025 07 01T17:26:43.000Z    ALGER CENTRE        BON TAT   
8      8976378.0  2025 02 23T10:25:42.000Z      MOSTAGANEM  JAMAIS UTILIS   
9      8400000.0  2024 12 11T23:17:24.000Z  HAMMA BOUZIANE        BON TAT   

  model_name            DEDICATED_GPU                            CPU RAM_SIZE  \
0    IDEAPAD                      NaN             INTEL CORE I5 750S   

Display data types and info for screen-related columns

In [14]:
df[['SCREEN_SIZE','SCREEN_FREQUENCY','SCREEN_RESOLUTION']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16394 entries, 0 to 16393
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SCREEN_SIZE        14247 non-null  float64
 1   SCREEN_FREQUENCY   1053 non-null   object 
 2   SCREEN_RESOLUTION  6146 non-null   object 
dtypes: float64(1), object(2)
memory usage: 384.4+ KB


Missing values (%)
Calculate the percentage of missing values in each column

In [16]:
df[['SCREEN_SIZE','SCREEN_FREQUENCY','SCREEN_RESOLUTION']].isna().mean() * 100

SCREEN_SIZE          13.096255
SCREEN_FREQUENCY     93.576918
SCREEN_RESOLUTION    62.510675
dtype: float64

Descriptive statistics
Get descriptive statistics for SCREEN_SIZE

In [18]:
df['SCREEN_SIZE'].describe()

count    14247.000000
mean        14.724774
std          5.627484
min          1.000000
25%         14.000000
50%         14.000000
75%         15.600000
max        500.000000
Name: SCREEN_SIZE, dtype: float64

Value distributions
Display value distributions for resolution and frequency columns

In [20]:
df['SCREEN_RESOLUTION'].value_counts(dropna=False)
df['SCREEN_FREQUENCY'].value_counts(dropna=False)

SCREEN_FREQUENCY
NaN      15341
144Hz      430
120Hz      162
240Hz      145
165Hz      144
60Hz       119
360Hz       22
300Hz       12
90Hz         9
180Hz        2
64Hz         2
480Hz        1
244Hz        1
75Hz         1
24Hz         1
45Hz         1
2.4Hz        1
Name: count, dtype: int64

# 3. Drop SCREEN_FREQUENCY

Remove the SCREEN_FREQUENCY column as it has limited utility

In [23]:
df.drop(columns=['SCREEN_FREQUENCY'], inplace=True)

# 4. Clean SCREEN_SIZE

Convert and normalize SCREEN_SIZE values (replace commas with decimals and extract numeric values)

In [26]:
df['SCREEN_SIZE'] = (
    df['SCREEN_SIZE']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .str.extract(r'(\d+\.?\d*)')[0]
    .astype(float)
)

Remove impossible values

Replace screen sizes outside the valid range (10-20 inches) with NaN

In [29]:
df.loc[
    (df['SCREEN_SIZE'] < 10) | (df['SCREEN_SIZE'] > 20),
    'SCREEN_SIZE'
] = np.nan

show some stats about SCREEN_SIZE after normalization

Show value counts and statistics for normalized SCREEN_SIZE

In [32]:
pd.set_option('display.max_rows', 100)

#df['SCREEN_SIZE'].describe()
#df['SCREEN_SIZE'].mode()
#df['SCREEN_SIZE'].unique()
df['SCREEN_SIZE'].value_counts()



SCREEN_SIZE
14.000    4459
15.600    3128
15.000    1436
13.000    1402
16.000    1017
13.300     930
17.000     292
17.300     218
14.100     177
12.000     144
12.500     124
13.600     111
13.500      86
16.100      83
11.600      61
11.000      53
15.400      43
13.400      41
13.100      36
18.000      31
10.000      30
12.300      29
15.300      20
14.500      17
12.400      16
16.200      15
14.200      14
15.500      14
13.800      12
16.300      11
15.700      10
10.100      10
14.400       8
19.000       7
12.900       5
16.500       4
10.500       4
14.300       4
14.600       3
13.900       3
19.500       2
15.800       2
12.100       2
11.500       2
16.400       2
14.700       1
13.200       1
16.600       1
12.200       1
10.600       1
13.140       1
15.100       1
17.200       1
13.700       1
10.300       1
14.150       1
16.900       1
12.350       1
17.600       1
15.900       1
17.100       1
20.000       1
18.500       1
17.700       1
12.513       1
Name: count, 

Snap the values to the nearest canonical size

Define canonical screen sizes and snap values to the nearest standard size if within tolerance

In [35]:
canonical_sizes = np.array([
    11.6, 12.5, 13.3, 14.0, 15.0, 15.6, 16.0, 17.3      # we can add 14.1 and 16.1
])
# these standard sizes ~80% of the data
# Adding 14.1 & 16.1 improves coverage by ~1.9% only.

def snap_screen_size(x):
    if pd.isna(x):
        return np.nan
    diff = np.abs(canonical_sizes - x)
    min_diff = diff.min()
    min_diff = np.round(min_diff, 2)
    if min_diff <= 0.3:
        return canonical_sizes[diff.argmin()]
    return x  # keep rare but valid sizes

df['SCREEN_SIZE_SNAPPED'] = df['SCREEN_SIZE'].apply(snap_screen_size)


Analyze the coverage of canonical sizes and percentage of missing values

In [37]:
print(df['SCREEN_SIZE_SNAPPED'].value_counts())

is_canonical = df['SCREEN_SIZE_SNAPPED'].isin(canonical_sizes)
canonical_pct = is_canonical.mean() * 100

none_pct = df['SCREEN_SIZE_SNAPPED'].isna().mean() * 100

print(f"percentage of canonical sizes: {canonical_pct:.2f}%")
print(f"percentage of none values: {none_pct:.2f}%")


SCREEN_SIZE_SNAPPED
14.0    4671
15.6    3215
13.3    2608
15.0    1438
16.0    1129
17.3     513
12.5     172
12.0     144
11.6      63
11.0      53
18.0      31
10.0      30
14.5      17
10.1      10
14.4       8
19.0       7
12.9       5
16.5       4
10.5       4
14.6       3
19.5       2
12.1       2
16.4       2
10.6       1
18.5       1
10.3       1
16.9       1
20.0       1
17.7       1
16.6       1
Name: count, dtype: int64
percentage of canonical sizes: 84.23%
percentage of none values: 13.76%


Check value counts by model name to understand data distribution

In [39]:
print(df['model_name'].value_counts())

model_name
THINKPAD       2402
LATITUDE       2334
MACBOOK        1664
ELITEBOOK      1132
PAVILION       1126
VIVOBOOK        787
PROBOOK         737
INSPIRON        656
SURFACE         491
IDEAPAD         481
ASPIRE          341
XPS             287
STEALTH         257
PRECISION       248
VICTUS          236
TUF             216
VOSTRO          213
ROG             198
ZBOOK           193
LEGION          187
OMEN            176
ZENBOOK         171
NITRO           168
GALAXY          144
YOGA            139
THINKBOOK       135
ENVY            130
DYNABOOK        118
PREDATOR         74
KATANA           63
MAC              62
SWIFT            60
SPECTRE          36
ALIENWARE        35
AERO             33
IMAC             32
BLADE            30
VECTOR           20
TRAVELMATE       18
SPIN             10
STRIX             7
GF                4
SWORD             4
OPTIPLEX          4
COMPAQ            4
TRANSFORMER       3
Name: count, dtype: int64


Display row counts and percentage of missing values grouped by model name

In [41]:
summary = df.groupby('model_name').agg(
    total_rows=('SCREEN_SIZE', 'size'),
    nan_rows=('SCREEN_SIZE', lambda s: s.isna().sum())
)

summary['percentage_nan'] = (summary['nan_rows'] / summary['total_rows']) * 100
summary = summary.sort_values(by='total_rows', ascending=False)

summary


Unnamed: 0_level_0,total_rows,nan_rows,percentage_nan
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
THINKPAD,2402,274,11.407161
LATITUDE,2334,180,7.712082
MACBOOK,1664,158,9.495192
ELITEBOOK,1132,96,8.480565
PAVILION,1126,201,17.850799
VIVOBOOK,787,120,15.247776
PROBOOK,737,55,7.462687
INSPIRON,656,115,17.530488
SURFACE,491,62,12.627291
IDEAPAD,481,62,12.889813


Check snapped screen size values for a specific model (LATITUDE)

In [43]:
result = df.loc[df['model_name'] == "LATITUDE", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
14.0    1240
13.3     411
15.6     284
15.0     143
12.5      28
12.0      23
16.0       9
11.0       4
11.6       4
17.3       2
14.5       2
14.6       1
12.1       1
12.9       1
10.0       1
Name: count, dtype: int64


Fill missing SCREEN_SIZE values for LATITUDE using model-specific mode

In [45]:
# Calculate the mode for LATITUDE model
latitude_mode = df[df['model_name'] == 'LATITUDE']['SCREEN_SIZE_SNAPPED'].mode()

if len(latitude_mode) > 0:
    latitude_mode_value = latitude_mode[0]
    print(f"LATITUDE mode SCREEN_SIZE_SNAPPED: {latitude_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for LATITUDE with its mode
    df.loc[df['model_name'] == 'LATITUDE', 'SCREEN_SIZE_SNAPPED'] = df.loc[df['model_name'] == 'LATITUDE', 'SCREEN_SIZE_SNAPPED'].fillna(latitude_mode_value)

    print(f"Filled missing values for LATITUDE. Now LATITUDE has {df[df['model_name'] == 'LATITUDE']['SCREEN_SIZE_SNAPPED'].isna().sum()} missing values")
else:
    print("Warning: LATITUDE model has no non-missing SCREEN_SIZE_SNAPPED values")

LATITUDE mode SCREEN_SIZE_SNAPPED: 14.0
Filled missing values for LATITUDE. Now LATITUDE has 0 missing values


Check snapped screen size values for a specific model (THINKPAD)

In [47]:
result = df.loc[df['model_name'] == "THINKPAD", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
14.0    1132
13.3     328
15.6     294
16.0     129
15.0     124
12.5      40
12.0      36
17.3      20
11.6      16
11.0       4
16.4       1
10.3       1
19.5       1
19.0       1
10.1       1
Name: count, dtype: int64


Fill missing SCREEN_SIZE values for THINKPAD using model-specific mode

In [49]:
# Calculate the mode for THINKPAD model
thinkpad_mode = df[df['model_name'] == 'THINKPAD']['SCREEN_SIZE_SNAPPED'].mode()

if len(thinkpad_mode) > 0:
    thinkpad_mode_value = thinkpad_mode[0]
    print(f"THINKPAD mode SCREEN_SIZE_SNAPPED: {thinkpad_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for THINKPAD with its mode
    df.loc[df['model_name'] == 'THINKPAD', 'SCREEN_SIZE_SNAPPED'] = (
        df.loc[df['model_name'] == 'THINKPAD', 'SCREEN_SIZE_SNAPPED']
        .fillna(thinkpad_mode_value)
    )

    print(
        f"Filled missing values for THINKPAD. "
        f"Now THINKPAD has "
        f"{df[df['model_name'] == 'THINKPAD']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
        f"missing values"
    )
else:
    print("Warning: THINKPAD model has no non-missing SCREEN_SIZE_SNAPPED values")

THINKPAD mode SCREEN_SIZE_SNAPPED: 14.0
Filled missing values for THINKPAD. Now THINKPAD has 0 missing values


Check snapped screen size values for a specific model (MACBOOK)

In [51]:
result = df.loc[df['model_name'] == "MACBOOK", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
13.3    899
16.0    205
14.0    198
15.0    115
15.6     58
11.0     17
12.0      8
12.9      2
17.3      1
12.5      1
10.0      1
11.6      1
Name: count, dtype: int64


List all the cpus of MACBOOK laptops and the counts for each one 

In [53]:
pd.set_option('display.max_rows', 140)

macbook_cpus = df.loc[df['model_name'] == 'MACBOOK', 'CPU']
print(macbook_cpus.value_counts())

CPU
INTEL CORE I5                     254
APPLE M1                          248
APPLE M2                          211
APPLE M3                          151
INTEL CORE I7                     106
APPLE M1 PRO                       89
APPLE M4                           68
APPLE M3 PRO                       42
APPLE M2 PRO                       37
APPLE M1 MAX                       35
APPLE M3 MAX                       29
INTEL CORE I9                      28
APPLE M4 PRO                       28
APPLE M2 MAX                       23
INTEL CORE I5 1.8GHZ               13
INTEL CORE I5 1.6GHZ               13
INTEL CORE I5 2.3GHZ               12
INTEL CORE I3                      11
INTEL CORE I5 2.7GHZ                9
INTEL CORE I5 5350U                 8
INTEL CORE I7 9750H                 8
INTEL CORE I5 5257U                 8
INTEL CORE I5 8210Y                 8
INTEL CORE I7 2.6GHZ                7
INTEL CORE M3 8100Y                 7
8TH GEN INTEL CORE I5               6
APPLE M4

show the count of resolution for each cpu from macbook cpus

In [55]:
# show the count of resolution for each cpu from macbook cpus
macbook_df = df[df['model_name'] == 'MACBOOK']

for cpu in macbook_df['CPU'].unique():
    print(f"\n{cpu}:")
    print(macbook_df[macbook_df['CPU'] == cpu]['SCREEN_SIZE_SNAPPED'].value_counts())


INTEL CORE I5:
SCREEN_SIZE_SNAPPED
13.3    214
14.0      8
11.0      3
15.0      2
12.0      1
10.0      1
15.6      1
Name: count, dtype: int64

APPLE M1 MAX:
SCREEN_SIZE_SNAPPED
16.0    27
14.0     5
Name: count, dtype: int64

APPLE M2:
SCREEN_SIZE_SNAPPED
13.3    150
15.0     20
15.6     10
11.0      6
16.0      3
14.0      2
12.9      2
Name: count, dtype: int64

APPLE M3 MAX:
SCREEN_SIZE_SNAPPED
16.0    13
14.0    13
Name: count, dtype: int64

APPLE M1:
SCREEN_SIZE_SNAPPED
13.3    217
14.0      2
11.0      2
16.0      1
15.0      1
Name: count, dtype: int64

APPLE M4 MAX:
SCREEN_SIZE_SNAPPED
16.0    4
14.0    2
Name: count, dtype: int64

APPLE M2 MAX:
SCREEN_SIZE_SNAPPED
16.0    13
14.0     8
13.3     1
Name: count, dtype: int64

INTEL CORE I5 2310:
SCREEN_SIZE_SNAPPED
16.0    1
Name: count, dtype: int64

APPLE M4 PRO:
SCREEN_SIZE_SNAPPED
14.0    14
16.0    12
Name: count, dtype: int64

APPLE M4:
SCREEN_SIZE_SNAPPED
13.3    33
14.0    17
15.6     5
15.0     4
16.0     2
11.0     

Fill missing SCREEN_SIZE_SNAPPED values for MACBOOK using cpu name.

The Dynamic "Mode" Strategy
This script automates the mapping by calculating the most common screen size for every CPU present in the MACBOOK subset.

In [58]:
# 1. Create a mapping table: Most frequent Screen Size for every CPU
# We filter for MacBooks and drop rows where screen size is missing to find the 'Mode'
macbook_data = df[df['model_name'] == 'MACBOOK'].dropna(subset=['SCREEN_SIZE_SNAPPED'])

# This calculates the mode (most common value) for each CPU group
cpu_mode_mapping = macbook_data.groupby('CPU')['SCREEN_SIZE_SNAPPED'].agg(
    lambda x: x.mode().iloc[0] if not x.mode().empty else None
).to_dict()

# 2. Fill the missing values using the dynamic map
# 'mask' identifies exactly which rows need filling
mask = (df['model_name'] == 'MACBOOK') & (df['SCREEN_SIZE_SNAPPED'].isna())

# Map the CPU names in those rows to our calculated modes
df.loc[mask, 'SCREEN_SIZE_SNAPPED'] = df.loc[mask, 'CPU'].map(cpu_mode_mapping)

print(f"Filled missing values for {mask.sum()} MacBook records.")

Filled missing values for 158 MacBook records.


Refined Script with Keyword Fallback
This version handles the 130+ variations by falling back to general categories if the specific string doesn't have a known screen size.

In [60]:
def get_fallback_size(cpu_string):
    """Assigns a screen size based on architectural keywords if exact match fails."""
    cpu_string = str(cpu_string).upper()
    if 'M1 MAX' in cpu_string or 'M2 MAX' in cpu_string or 'M3 MAX' in cpu_string:
        return 16.0
    elif 'M1 PRO' in cpu_string or 'M2 PRO' in cpu_string:
        return 14.0 # Most common Pro size in newer models
    elif 'I9' in cpu_string:
        return 16.0
    elif 'M1' in cpu_string or 'M2' in cpu_string or 'M3' in cpu_string or 'I5' in cpu_string:
        return 13.3
    elif 'I7' in cpu_string:
        return 15.0
    return np.nan

# Apply the specific mapping first
df.loc[mask, 'SCREEN_SIZE_SNAPPED'] = df.loc[mask, 'CPU'].map(cpu_mode_mapping)

# Apply the fallback for any remaining NaNs in MacBooks
final_mask = (df['model_name'] == 'MACBOOK') & (df['SCREEN_SIZE_SNAPPED'].isna())
df.loc[final_mask, 'SCREEN_SIZE_SNAPPED'] = df.loc[final_mask, 'CPU'].apply(get_fallback_size)

print(
    f"Filled missing values for MACBOOK. "
    f"Now MACBOOK has "
    f"{df[df['model_name'] == 'MACBOOK']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
    f"missing values"
)

Filled missing values for MACBOOK. Now MACBOOK has 2 missing values


Fill missing SCREEN_SIZE_SNAPPED values for MACBOOK using model-specific mode

In [62]:
# Calculate the mode for MACBOOK model
# macbook_mode = df[df['model_name'] == 'MACBOOK']['SCREEN_SIZE_SNAPPED'].mode()
# 
# if len(macbook_mode) > 0:
#     macbook_mode_value = macbook_mode[0]
#     print(f"MACBOOK mode SCREEN_SIZE_SNAPPED: {macbook_mode_value}")
# 
#     # Fill missing SCREEN_SIZE_SNAPPED values for MACBOOK with its mode
#     df.loc[df['model_name'] == 'MACBOOK', 'SCREEN_SIZE_SNAPPED'] = (
#         df.loc[df['model_name'] == 'MACBOOK', 'SCREEN_SIZE_SNAPPED']
#         .fillna(macbook_mode_value)
#     )
# 
#     print(
#         f"Filled missing values for MACBOOK. "
#         f"Now MACBOOK has "
#         f"{df[df['model_name'] == 'MACBOOK']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
#         f"missing values"
#     )
# else:
#     print("Warning: MACBOOK model has no non-missing SCREEN_SIZE_SNAPPED values")

Check snapped screen size values for a specific model (ELITEBOOK)

In [64]:
result = df.loc[df['model_name'] == "ELITEBOOK", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
14.0    577
13.3    249
15.6    112
16.0     44
15.0     41
12.0      8
12.5      5
Name: count, dtype: int64


Fill missing SCREEN_SIZE_SNAPPED values for ELITEBOOK using model-specific mode

In [66]:
# Calculate the mode for ELITEBOOK model
elitebook_mode = df[df['model_name'] == 'ELITEBOOK']['SCREEN_SIZE'].mode()

if len(elitebook_mode) > 0:
    elitebook_mode_value = elitebook_mode[0]
    print(f"ELITEBOOK mode SCREEN_SIZE: {elitebook_mode_value}")

    # Fill missing SCREEN_SIZE values for ELITEBOOK with its mode
    df.loc[df['model_name'] == 'ELITEBOOK', 'SCREEN_SIZE'] = (
        df.loc[df['model_name'] == 'ELITEBOOK', 'SCREEN_SIZE']
        .fillna(elitebook_mode_value)
    )

    print(
        f"Filled missing values for ELITEBOOK. "
        f"Now ELITEBOOK has "
        f"{df[df['model_name'] == 'ELITEBOOK']['SCREEN_SIZE'].isna().sum()} "
        f"missing values"
    )
else:
    print("Warning: ELITEBOOK model has no non-missing SCREEN_SIZE values")

ELITEBOOK mode SCREEN_SIZE: 14.0
Filled missing values for ELITEBOOK. Now ELITEBOOK has 0 missing values


Check snapped screen size values for a specific model (PAVILION)

In [68]:
result = df.loc[df['model_name'] == "PAVILION", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
15.6    387
14.0    197
15.0    170
17.3     55
13.3     53
16.0     33
12.5     11
12.0      5
11.6      4
19.0      2
10.0      2
12.1      1
16.9      1
16.5      1
14.5      1
18.5      1
18.0      1
Name: count, dtype: int64


Fill missing SCREEN_SIZE_SNAPPED values for PAVILION using model-specific mode

In [70]:
# Calculate the mode for PAVILION model
pavilion_mode = df[df['model_name'] == 'PAVILION']['SCREEN_SIZE_SNAPPED'].mode()

if len(pavilion_mode) > 0:
    pavilion_mode_value = pavilion_mode[0]
    print(f"PAVILION mode SCREEN_SIZE_SNAPPED: {pavilion_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for PAVILION with its mode
    df.loc[df['model_name'] == 'PAVILION', 'SCREEN_SIZE_SNAPPED'] = (
        df.loc[df['model_name'] == 'PAVILION', 'SCREEN_SIZE_SNAPPED']
        .fillna(pavilion_mode_value)
    )

    print(
        f"Filled missing values for PAVILION. "
        f"Now PAVILION has "
        f"{df[df['model_name'] == 'PAVILION']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
        f"missing values"
    )
else:
    print("Warning: PAVILION model has no non-missing SCREEN_SIZE_SNAPPED values")


PAVILION mode SCREEN_SIZE_SNAPPED: 15.6
Filled missing values for PAVILION. Now PAVILION has 0 missing values


Check snapped screen size values for a specific model (PAVILION)

In [72]:
result = df.loc[df['model_name'] == "PAVILION", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
15.6    588
14.0    197
15.0    170
17.3     55
13.3     53
16.0     33
12.5     11
12.0      5
11.6      4
19.0      2
10.0      2
12.1      1
16.9      1
16.5      1
14.5      1
18.5      1
18.0      1
Name: count, dtype: int64


Fill missing SCREEN_SIZE_SNAPPED values for PAVILION using model-specific mode

In [74]:
# Calculate the mode for PAVILION model
pavilion_mode = df[df['model_name'] == 'PAVILION']['SCREEN_SIZE_SNAPPED'].mode()

if len(pavilion_mode) > 0:
    pavilion_mode_value = pavilion_mode[0]
    print(f"PAVILION mode SCREEN_SIZE_SNAPPED: {pavilion_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for PAVILION with its mode
    df.loc[df['model_name'] == 'PAVILION', 'SCREEN_SIZE_SNAPPED'] = (
        df.loc[df['model_name'] == 'PAVILION', 'SCREEN_SIZE_SNAPPED']
        .fillna(pavilion_mode_value)
    )

    print(
        f"Filled missing values for PAVILION. "
        f"Now PAVILION has "
        f"{df[df['model_name'] == 'PAVILION']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
        f"missing values"
    )
else:
    print("Warning: PAVILION model has no non-missing SCREEN_SIZE_SNAPPED values")

PAVILION mode SCREEN_SIZE_SNAPPED: 15.6
Filled missing values for PAVILION. Now PAVILION has 0 missing values


Check snapped screen size values for a specific model (INSPIRON)

In [76]:
result = df.loc[df['model_name'] == "INSPIRON", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
15.6    183
14.0    139
15.0    101
16.0     51
13.3     39
17.3     18
12.5      4
12.0      2
10.0      2
19.5      1
11.6      1
Name: count, dtype: int64


Check snapped screen size values for a specific model (VIVOBOOK)

In [78]:
result = df.loc[df['model_name'] == "VIVOBOOK", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
15.6    239
14.0    177
15.0    103
17.3     63
16.0     46
11.6     14
13.3     10
11.0      3
12.0      3
18.0      2
10.1      2
16.5      2
12.5      1
10.0      1
19.0      1
Name: count, dtype: int64


Check snapped screen size values for a specific model (PROBOOK)

In [80]:
result = df.loc[df['model_name'] == "PROBOOK", 'SCREEN_SIZE_SNAPPED']
print(result.value_counts())

SCREEN_SIZE_SNAPPED
14.0    262
15.6    231
15.0     64
13.3     55
16.0     43
17.3     10
11.6      8
12.0      4
11.0      4
12.5      1
Name: count, dtype: int64


Fill missing SCREEN_SIZE_SNAPPED values for PROBOOK using model-specific mode

In [82]:
# Calculate the mode for PROBOOK model
probook_mode = df[df['model_name'] == 'PROBOOK']['SCREEN_SIZE_SNAPPED'].mode()

if len(probook_mode) > 0:
    probook_mode_value = probook_mode[0]
    print(f"PROBOOK mode SCREEN_SIZE_SNAPPED: {probook_mode_value}")

    # Fill missing SCREEN_SIZE_SNAPPED values for PROBOOK with its mode
    df.loc[df['model_name'] == 'PROBOOK', 'SCREEN_SIZE_SNAPPED'] = (
        df.loc[df['model_name'] == 'PROBOOK', 'SCREEN_SIZE_SNAPPED']
        .fillna(probook_mode_value)
    )

    print(
        f"Filled missing values for PROBOOK. "
        f"Now PROBOOK has "
        f"{df[df['model_name'] == 'PROBOOK']['SCREEN_SIZE_SNAPPED'].isna().sum()} "
        f"missing values"
    )
else:
    print("Warning: PROBOOK model has no non-missing SCREEN_SIZE_SNAPPED values")

PROBOOK mode SCREEN_SIZE_SNAPPED: 14.0
Filled missing values for PROBOOK. Now PROBOOK has 0 missing values


Fill missing SCREEN_SIZE_SNAPPED values for all remaining models

In [84]:
# List of remaining model names to process
remaining_models = [
    'VIVOBOOK', 'INSPIRON', 'SURFACE', 'IDEAPAD', 'ASPIRE', 'XPS', 'STEALTH', 
    'PRECISION', 'VICTUS', 'TUF', 'VOSTRO', 'ROG', 'ZBOOK', 'LEGION', 'OMEN', 
    'ZENBOOK', 'NITRO', 'GALAXY', 'YOGA', 'THINKBOOK', 'ENVY', 'DYNABOOK', 
    'PREDATOR', 'KATANA', 'MAC', 'SWIFT', 'SPECTRE', 'ALIENWARE', 'AERO', 
    'IMAC', 'BLADE', 'VECTOR', 'TRAVELMATE', 'SPIN', 'STRIX', 'COMPAQ', 
    'GF', 'OPTIPLEX', 'SWORD', 'TRANSFORMER'
]

# Process each model
for model in remaining_models:
    # Calculate the mode for this model
    model_mode = df[df['model_name'] == model]['SCREEN_SIZE_SNAPPED'].mode()
    
    if len(model_mode) > 0:
        model_mode_value = model_mode[0]
        
        # Count missing values before filling
        missing_before = df[df['model_name'] == model]['SCREEN_SIZE_SNAPPED'].isna().sum()
        
        # Fill missing SCREEN_SIZE_SNAPPED values for this model with its mode
        df.loc[df['model_name'] == model, 'SCREEN_SIZE_SNAPPED'] = (
            df.loc[df['model_name'] == model, 'SCREEN_SIZE_SNAPPED']
            .fillna(model_mode_value)
        )
        
        # Count missing values after filling
        missing_after = df[df['model_name'] == model]['SCREEN_SIZE_SNAPPED'].isna().sum()
        
        if missing_before > 0:
            print(f"{model}: Filled {missing_before} missing values with mode {model_mode_value}. Remaining missing: {missing_after}")
    else:
        print(f"Warning: {model} has no non-missing SCREEN_SIZE_SNAPPED values")

print("\nAll remaining models processed!")

VIVOBOOK: Filled 120 missing values with mode 15.6. Remaining missing: 0
INSPIRON: Filled 115 missing values with mode 15.6. Remaining missing: 0
SURFACE: Filled 62 missing values with mode 13.3. Remaining missing: 0
IDEAPAD: Filled 62 missing values with mode 15.6. Remaining missing: 0
ASPIRE: Filled 66 missing values with mode 15.6. Remaining missing: 0
XPS: Filled 44 missing values with mode 13.3. Remaining missing: 0
STEALTH: Filled 58 missing values with mode 15.6. Remaining missing: 0
PRECISION: Filled 63 missing values with mode 15.6. Remaining missing: 0
VICTUS: Filled 60 missing values with mode 15.6. Remaining missing: 0
TUF: Filled 39 missing values with mode 15.6. Remaining missing: 0
VOSTRO: Filled 20 missing values with mode 15.6. Remaining missing: 0
ROG: Filled 41 missing values with mode 17.3. Remaining missing: 0
ZBOOK: Filled 28 missing values with mode 15.6. Remaining missing: 0
LEGION: Filled 45 missing values with mode 16.0. Remaining missing: 0
OMEN: Filled 40 mi

*
*
*   
*   
*
*
*
*
*
*

# 5. Clean SCREEN_RESOLUTION

Normalize text.
* Normalize SCREEN_RESOLUTION text (convert to lowercase and remove spaces)

In [88]:
df['SCREEN_RESOLUTION'] = (
    df['SCREEN_RESOLUTION']
    .astype(str)
    .str.lower()
    .str.replace(' ', '')
)

Display the frequency distribution of values in the SCREEN_RESOLUTION column after normalization.

In [90]:
pd.set_option('display.max_rows', 120)
df['SCREEN_RESOLUTION'].value_counts()

SCREEN_RESOLUTION
nan                10248
1920x1080           2740
1920x1080fhd        1916
1920x1200            239
fhd                  218
2560x1440            116
2560x1600            107
1366x768             102
2k                    99
3k                    86
3840x2160             80
fullhd                34
2560x1664             25
2880x1800             25
2256x1504             21
3024x1964             20
2.5k                  19
3840x2400             16
2880x1920             14
2.8k                  13
fhd+                  13
1440x900              12
1920x1280             11
4k                    10
3456x2234              9
2048x1080              9
3072x1920              8
qhd+                   8
2736x1824              7
1600x900               7
5120x2880              6
2880x1864              6
1536x1024              6
3koled                 6
1920x1200fhd           5
2400x1600              5
2240x1400              4
wuxga                  4
1280x800               4
3kretin

Map resolution values to standard categories (HD, FHD, QHD, 4K, ...)

In [92]:
# Normalize SCREEN_RESOLUTION into standardized resolution tiers
resolution_map = {
    # HD
    '1366x768': 'HD',
    '1280x720': 'HD',
    'hd': 'HD',

    # HD+
    '1440x900': 'HD+',
    '1600x900': 'HD+',
    '1536x1024': 'HD+',
    '1280x800': 'HD+',

    # FHD
    '1920x1080': 'FHD',
    '1920x1080fhd': 'FHD',
    'fullhd': 'FHD',
    'fhd': 'FHD',
    '1080p': 'FHD',
    'fhd1080p': 'FHD',
    '1920x1080fullhd': 'FHD',

    # WUXGA (FHD+ / 16:10)
    '1920x1200': 'WUXGA',
    '1920x1200fhd': 'WUXGA',
    '1920x1200fhd+': 'WUXGA',
    '1920x1200wuxga': 'WUXGA',
    '1920x1280': 'WUXGA',
    'fhd+': 'WUXGA',
    'fullhd+': 'WUXGA',
    'wuxga': 'WUXGA',

    # QHD / 2K
    '2560x1440': 'QHD',
    '2560x1440qhd': 'QHD',
    'qhd': 'QHD',
    'wqhd': 'QHD',
    '2k': 'QHD',
    'qhd2k': 'QHD',
    '1440p': 'QHD',
    '2048x1080': 'QHD',


    # QHD+ (16:10)
    '2560x1600': 'QHD+',
    '2560x1600qhd+': 'QHD+',
    '2400x1600': 'QHD+',
    '2240x1400': 'QHD+',
    '2560x1664': 'QHD+',
    '2256x1504': 'QHD+',
    'wqxga': 'QHD+',
    'wqxga+': 'QHD+',
    'qhd+': 'QHD+',
    '2.5k': 'QHD+',
    '2496x1664': 'QHD+',
    '2360x1640': 'QHD+',
    '2304x1536': 'QHD+',

    # 3K-class (high-density laptop panels)
    '2880x1800': '3K',
    '2880x1920': '3K',
    '3072x1920': '3K',
    '3000x2000': '3K',
    '3024x1964': '3K',
    '3200x2000': '3K',
    '2736x1824': '3K',
    '2736x1834': '3K',
    '2736x1823': '3K',
    '3456x2234': '3K',
    '3k': '3K',
    '2.8k': '3K',
    '2880x1864': '3K',
    '3koled': '3K',
    '2880x1864': '3K',
    '3kretina': '3K',

    # 4K / UHD
    '3840x2160': '4K',
    '3840x2400': '4K',
    '3456x2160': '4K',
    '3240x2160': '4K',
    '4k': '4K',
    '4kuhd': '4K',

    # 5K
    '5120x2880': '5K',
    '5k': '5K'
}

df['SCREEN_RESOLUTION_STD'] = (
    df['SCREEN_RESOLUTION']
    .str.lower()
    .str.strip()
    .map(resolution_map)
    .fillna(df['SCREEN_RESOLUTION'])
)


# resolution hierarchy (for modeling)
# HD < HD+ < FHD < WUXGA < QHD < QHD+ < 3K < 4K < 5K

Display the frequency distribution of the normalized SCREEN_RESOLUTION_STD values.

In [94]:
pd.set_option('display.max_rows', 120)
df['SCREEN_RESOLUTION_STD'].value_counts()

SCREEN_RESOLUTION_STD
nan              10248
FHD               4911
WUXGA              278
QHD                233
3K                 208
QHD+               205
4K                 113
HD                 107
HD+                 29
5K                   7
1920x120             2
2880x1620            2
1929x1080            2
retina3k             2
3072x1620            2
3120x2080            2
2160x1440            2
wqxga2k              2
2k8                  2
2960x1848            2
3200x1800            2
2.4k                 1
2964x1694            1
fhd+1200p            1
1080x1920            1
1920x1280qhd         1
1920x1200qhd         1
1920x1980            1
2388x1668            1
2000x1200            1
1920x180fhd          1
2304x1440            1
1800x1200            1
1336x768             1
2160x1350            1
fhd+wuxga            1
2400p                1
2520x1680            1
3kqhd+               1
qhd+4k               1
4480x2520            1
2084x1080            1
1920x1080fhd

set non standard SCREEN_RESOLUTION_STD values to nan

In [96]:
# set non standard SCREEN_RESOLUTION_STD to NaN
valid_resolutions = [
    'HD', 'HD+', 'FHD', 'WUXGA', 'QHD', 'QHD+', '3K', '4K', '5K'
]

df.loc[  # Use .loc to set values in the DataFrame where condition is met
    ~df['SCREEN_RESOLUTION_STD'].isin(valid_resolutions), 
    'SCREEN_RESOLUTION_STD'  # Column to update
] = np.nan 

# Print the frequency count of each unique value in SCREEN_RESOLUTION_STD after setting invalid ones to NaN
print(df['SCREEN_RESOLUTION_STD'].value_counts())  
# Calculate and display the percentage of missing (NaN) values in SCREEN_RESOLUTION_STD
df['SCREEN_RESOLUTION_STD'].isna().mean() * 100  

SCREEN_RESOLUTION_STD
FHD      4911
WUXGA     278
QHD       233
3K        208
QHD+      205
4K        113
HD        107
HD+        29
5K          7
Name: count, dtype: int64


62.846163230450166

Show the percentage of missing SCREEN_RESOLUTION_STD values for each model name

In [98]:
summary = df.groupby('model_name').agg(
    total_rows=('SCREEN_RESOLUTION_STD', 'size'),
    nan_rows=('SCREEN_RESOLUTION_STD', lambda s: s.isna().sum())
)

summary['percentage_nan'] = (summary['nan_rows'] / summary['total_rows']) * 100

mode_counts = (
    df.dropna(subset=['SCREEN_RESOLUTION_STD'])
      .groupby('model_name')['SCREEN_RESOLUTION_STD']
      .value_counts()
      .rename('mode_count')
      .reset_index()
      .sort_values(['model_name', 'mode_count'], ascending=[True, False])
      .drop_duplicates('model_name')
      .set_index('model_name')
      [['SCREEN_RESOLUTION_STD', 'mode_count']]
)

summary = summary.join(mode_counts).rename(columns={'SCREEN_RESOLUTION_STD': 'mode_resolution'})

summary = summary.sort_values(by='total_rows', ascending=False)

summary

Unnamed: 0_level_0,total_rows,nan_rows,percentage_nan,mode_resolution,mode_count
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
THINKPAD,2402,1419,59.07577,FHD,833.0
LATITUDE,2334,1195,51.199657,FHD,1097.0
MACBOOK,1664,1439,86.478365,QHD+,85.0
ELITEBOOK,1132,579,51.14841,FHD,520.0
PAVILION,1126,772,68.561279,FHD,309.0
VIVOBOOK,787,527,66.963151,FHD,214.0
PROBOOK,737,370,50.203528,FHD,335.0
INSPIRON,656,460,70.121951,FHD,171.0
SURFACE,491,320,65.173116,3K,49.0
IDEAPAD,481,292,60.706861,FHD,174.0


Fill missing SCREEN_RESOLUTION values with the specific mode for each model name

In [100]:
# fill missing SCREEN_RESOLUTION_STD by model_name mode
for model in summary.index:
    mode_value = summary.loc[model, 'mode_resolution']
    if pd.notna(mode_value):
        mask = (df['model_name'] == model) & (df['SCREEN_RESOLUTION_STD'].isna())
        df.loc[mask, 'SCREEN_RESOLUTION_STD'] = mode_value
        filled_count = mask.sum()
        if filled_count > 0:
            print(f"Filled {filled_count} missing values for {model} with mode {mode_value}")       
print("\nAll models processed for SCREEN_RESOLUTION_STD!")  

Filled 1419 missing values for THINKPAD with mode FHD
Filled 1195 missing values for LATITUDE with mode FHD
Filled 1439 missing values for MACBOOK with mode QHD+
Filled 579 missing values for ELITEBOOK with mode FHD
Filled 772 missing values for PAVILION with mode FHD
Filled 527 missing values for VIVOBOOK with mode FHD
Filled 370 missing values for PROBOOK with mode FHD
Filled 460 missing values for INSPIRON with mode FHD
Filled 320 missing values for SURFACE with mode 3K
Filled 292 missing values for IDEAPAD with mode FHD
Filled 246 missing values for ASPIRE with mode FHD
Filled 176 missing values for XPS with mode FHD
Filled 161 missing values for STEALTH with mode FHD
Filled 167 missing values for PRECISION with mode FHD
Filled 129 missing values for VICTUS with mode FHD
Filled 127 missing values for TUF with mode FHD
Filled 112 missing values for VOSTRO with mode FHD
Filled 122 missing values for ROG with mode FHD
Filled 126 missing values for ZBOOK with mode FHD
Filled 111 missin

# 6. Encode SCREEN_RESOLUTION (for ML)

Create numeric encodings for resolution categories for machine learning

In [103]:
# Encode SCREEN_RESOLUTION_STD into numerical values for modeling using this resolution hierarchy 
# HD < HD+ < FHD < WUXGA < QHD < QHD+ < 3K < 4K < 5K
resolution_encoding = {
    'HD': 1,
    'HD+': 2,
    'FHD': 3,
    'WUXGA': 4,
    'QHD': 5,
    'QHD+': 6,
    '3K': 7,
    '4K': 8,
    '5K': 9
}

df['SCREEN_RESOLUTION_ENC'] = df['SCREEN_RESOLUTION_STD'].map(resolution_encoding)

# 7. Final Check

Verify data types and display sample rows of cleaned data

In [106]:
df[['SCREEN_SIZE_SNAPPED','SCREEN_RESOLUTION_STD','SCREEN_RESOLUTION_ENC']].info()
df[['SCREEN_SIZE_SNAPPED','SCREEN_RESOLUTION_STD']].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16394 entries, 0 to 16393
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   SCREEN_SIZE_SNAPPED    16167 non-null  float64
 1   SCREEN_RESOLUTION_STD  16026 non-null  object 
 2   SCREEN_RESOLUTION_ENC  16026 non-null  float64
dtypes: float64(2), object(1)
memory usage: 384.4+ KB


Unnamed: 0,SCREEN_SIZE_SNAPPED,SCREEN_RESOLUTION_STD
0,14.0,FHD
1,15.6,4K
2,17.3,FHD
3,14.0,FHD
4,,


# 8. Save Clean Dataset

Export the cleaned dataset to a CSV file

In [109]:
df.to_csv("laptops_algeria_cleaned.csv", index=False)