In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# import seaborn as sns

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [2]:
data = pd.read_json("./data/raw.json")
data.set_index("product_id", inplace=True)

In [3]:
len(data)

1599

In [4]:
len(list(data.iterrows()))

1599

In [5]:
data.duplicated().sum()

5

In [6]:
data.drop_duplicates(inplace=True)

In [7]:
data.columns

Index(['mrp', 'model_name', 'screen_size', 'display_resolution', 'os',
       'hard_disk_type', 'hard_drive_size', 'ram_memory', 'processor_brand',
       'processor_name', 'processor_speed', 'processor_count', 'display_type',
       'product_dimensions', 'batteries', 'form_factor', 'audio_details',
       'speaker_details', 'connector_types', 'graphics chipset',
       'graphics type', 'graphics ram type', 'graphics details', 'brand',
       'no of 5 star', 'no of 4 star', 'no of 3 star', 'no of 2 star',
       'no of 1 star'],
      dtype='object')

In [8]:
na_counts = data.isna().sum().sort_values(ascending=False)
# filtered_columns = na_counts[na_counts > 100].index.tolist()
na_counts

display_type          1222
speaker_details        751
graphics chipset       386
audio_details          297
graphics details       254
display_resolution     251
graphics ram type      163
graphics type           14
mrp                      0
no of 2 star             0
no of 3 star             0
no of 4 star             0
no of 5 star             0
brand                    0
connector_types          0
batteries                0
form_factor              0
model_name               0
product_dimensions       0
processor_count          0
processor_speed          0
processor_name           0
processor_brand          0
ram_memory               0
hard_drive_size          0
hard_disk_type           0
os                       0
screen_size              0
no of 1 star             0
dtype: int64

In [9]:
data["brand"] = data["brand"].str.lower()
data["brand"].value_counts()

brand
asus                      370
dell                      328
lenovo                    309
hp                        294
acer                       80
msi                        75
samsung                    38
microsoft                  27
honor                      16
mi                          9
avita                       9
xiaomi                      8
lg                          6
in-viraat-vaio-appario      4
fujitsu                     4
agb                         3
redmi                       3
infinix                     2
huawei                      2
axl                         2
oboe                        1
coconics                    1
razer                       1
nexstgo                     1
dp                          1
Name: count, dtype: int64

In [10]:
data["form_factor"].value_counts()

form_factor
Laptop                              763
Ultra-Portable                      224
Netbook                             162
Convertible                          89
Gaming Laptop                        82
Traditional Laptop                   42
Thin & Light                         41
Notebook                             39
Chromebook                           28
Clamshell                            26
Gaming                               23
Thin and Light                       15
laptop                               12
Thin & light Laptop                  10
Compact                               6
Laptop, Chromebook                    4
2-In-1 Laptop Tablet, Chromebook      4
Thin and Light Laptop                 3
Ultrabook                             3
Dual-Screen Gaming Laptop             3
Detachable                            3
Traditional                           2
Compact, Ultrabook                    2
Traditional Laptops                   1
Laptop, Convertible         

In [11]:
simplified_form_factors = {
    "laptop": "Laptop",
    "Traditional": "Laptop",
    "Traditional Laptop": "Laptop",
    "Traditional Laptops": "Laptop",
    "Small Form Factor": "Laptop",
    "traditional laptop": "Laptop",
    "Standard laptop": "Laptop",
    'Gaming': 'Gaming Laptop'
}

data["form_factor"] = (
    data["form_factor"].map(simplified_form_factors).fillna(data["form_factor"])
)

In [12]:
data["form_factor"].value_counts()

form_factor
Laptop                              822
Ultra-Portable                      224
Netbook                             162
Gaming Laptop                       105
Convertible                          89
Thin & Light                         41
Notebook                             39
Chromebook                           28
Clamshell                            26
Thin and Light                       15
Thin & light Laptop                  10
Compact                               6
Laptop, Chromebook                    4
2-In-1 Laptop Tablet, Chromebook      4
Dual-Screen Gaming Laptop             3
Detachable                            3
Thin and Light Laptop                 3
Ultrabook                             3
Compact, Ultrabook                    2
Laptop, Convertible                   1
Convertible, Chromebook               1
Clamshell (Gaming Laptop)             1
2-In-1 Laptop Tablet                  1
Traditional laptop                    1
Name: count, dtype: int64

In [13]:
data["display_type"].value_counts()

display_type
LED                207
FHD                 85
LCD                 35
AMOLED              24
OLED                18
Pixel Sense          2
LED-Backlit LCD      1
Name: count, dtype: int64

In [14]:
def fillna_by_columns(row_to_fill: str, search_columns: list):
    for index, row in data[data[row_to_fill].isnull()].iterrows():
        filters = []
        for column in search_columns:
            filters.append(data[column] == row[column])

        manufacturer_df = data[np.all(filters, axis=0)]

        if len(manufacturer_df) <= 1:
            continue

        mode_result = manufacturer_df[row_to_fill].mode()
        if mode_result.empty:
            continue

        most_common_value = mode_result.values[0]
        data.at[index, row_to_fill] = most_common_value
        print(index, most_common_value)


fillna_by_columns("display_type", ["brand", "form_factor"])

B0BBMQ12Z2 LED
B0BXPWS41N LED
B099ZZX3QW FHD
B0BKT92YSL LCD
B0BLH4C9Y6 LED
B0BR45L4CQ OLED
B09XN8B8RN LCD
B0BRXZDLGG LED
B0C3MFSVZR LED
B0BTWDF3KB LED
B0C3D99LTP LED
B08M61GDYJ LED
B0C1N8MY5Q FHD
B0BY2JC6YM LED
B09PVDB3BN LED
B0C1GCY7ZX LED
B0B5RBWMP2 LED
B0C2YJFMWP LED
B0B31R74HP LED
B091YHJ1KC LED
B0BLNYWN7M LED
B0C3V2ML49 LED
B0BTWDLRBQ LED
B08Q7T6PTX LED
B09S3Y2C9F LED
B09VSTWG7Z AMOLED
B0BW92J1KM LCD
B09VJ32NNC FHD
B08Y96J1ZJ LED
B0BW9NMXP5 LCD
B0BVQS5K7B FHD
B0BWYQ8P7W LCD
B09LZ1R4X2 LED
B09M41GR4K LED
B0BW3CXL1B FHD
B07HDV3PJ7 LCD
B09BKDKNT3 OLED
B0C4TSS2XR FHD
B078T74GKF LED
B09F3QJF4W LED
B0B61GC7FP LED
B08TH79N2P LED
B0BTWCN881 LED
B09MKPZNHX LED
B09S3S77PY OLED
B09M423NVT LED
B0876H4VHJ LED
B09XXNKLBF LED
B0C4TVNXH2 FHD
B0BHYR44GT LED
B07YSTT8Z5 OLED
B0B65Y9PPM LCD
B09JSLMQP8 LED


B0C2YHVM7J LED
B0B6F5TDK4 LED
B09JZR85MV LED
B0BW39J8C3 FHD
B0BVMLP18M LED
B09NLTJ932 FHD
B0C37C3THC LED
B09WF3WKYF LED
B09MM4FPMR LED
B0B8NZMFZF LED
B09CRMSHD9 LED
B0BDMB2833 FHD
B0C4ZHFYJ6 LED
B085Q1X8R7 LED
B0B3HWHTH7 LCD
B0C1GFN51P LED
B0C1GGLHNV LED
B0C4ZJQ3D3 LED
B09GRMFRBW LED
B08DDJJJW7 LED
B0BQJD5P52 LED
B0C7H6SSC8 LED
B0BPT6PSCM LED
B07PC2LG1T LED
B0BWF381Z8 LED
B093PCT6Z1 LCD
B09HQ32GB2 LCD
B0C2Z6Y742 LED
B0C1GGLV9Z LED
B0B2WRR9HD LED
B09BCM46NS LCD
B0C1GH58K3 LED
B0C1GFC916 LED
B081P3QVWC LED
B09XXNBCJ7 LED
B09RMTMBSM FHD
B0BN4JZVRR LCD
B0BLP44F62 LED
B0BCKPX12F LED
B0C6L414JM LED
B0B1DTBVKW OLED
B092G3Q3QV LED
B09RMVDV9W LED
B08HSNRFTX LED
B0BW9Q8S57 LCD
B09TSRGZL3 LED
B09GRHWTQY OLED
B084SCJY1J LED
B0BD7TP2N4 LED
B0C1GH1K4H LED
B08J13GT5D OLED
B0BX944S9L LCD
B0C1434CCN LED
B0BQN2HV46 LED
B0BTWFKBDQ LED
B09XX2G3ND LCD
B0B2K86YZW LCD
B0BZYNJZWD LED
B0B31RV4M3 LED
B0BLJGL83W LCD
B093H9BJQH LED
B0C28FKJXG LCD
B07TSC356D OLED
B0B1J9JFRD FHD
B0B8JGHBS9 LED
B099F6BLDC LCD
B0BYD6

In [15]:
data["display_type"].value_counts()

display_type
LED                944
LCD                245
FHD                190
OLED                61
AMOLED              29
Pixel Sense          5
LED-Backlit LCD      1
Name: count, dtype: int64

In [16]:
data[["graphics chipset", "graphics details"]]

Unnamed: 0_level_0,graphics chipset,graphics details
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B0BV74GVWT,,
B0BBMQ12Z2,NVIDIA,NVIDIA GeForce RTX 3060
B0BXPWS41N,AMD,AMD Radeon™ Graphics
B09NQ9Z2S6,Intel,
B099ZZX3QW,Intel,Intel Iris Xe Graphics
B0BKT92YSL,Intel,NVIDIA GeForce RTX 3060
B0BLH4C9Y6,Intel,Intel UHD Graphics 600
B0BR45L4CQ,Intel,
B09XN8B8RN,Intel,
B09MFTGNZ4,Intel,Intel UHD Graphics


In [17]:
for index, row in data.iterrows():
    graphics_chipset = row["graphics chipset"]
    graphics_details = row["graphics details"]

    if pd.isnull(graphics_chipset) and not pd.isnull(graphics_details):
        first_word = graphics_details.split()[0]
        data.at[index, "graphics chipset"] = first_word

In [18]:
fillna_by_columns("graphics type", ["brand", "form_factor"])

B078T74GKF Integrated
B08FSBBGWL Integrated
B07SNCFD7D Integrated
B08B5YBHY5 Integrated
B087N3PM7Y Integrated
B07WZQFQTK Integrated
B07HPNT89Z Integrated
B092PZGYYV Integrated


In [19]:
data["graphics type"].value_counts()

graphics type
Integrated                                                                                   1107
Dedicated                                                                                     454
Intel UHD Graphics                                                                              8
Intel HD Graphics                                                                               4
AMD Radeon Graphics                                                                             3
Integrated Intel Iris Xe Graphics functions as UHD Graphics                                     2
Intel UHD GRaphics 600                                                                          2
Iris Xe graphics                                                                                1
AMD Radeon RX Vega                                                                              1
Intel Integrated Graphics                                                                       1
AMD Ra

In [20]:
simplified_types = {
    "Integrated": "Integrated",
    "Dedicated": "Dedicated",
    "Intel UHD Graphics": "Integrated",
    "Intel HD Graphics": "Integrated",
    "AMD Radeon Graphics": "Dedicated",
    "AMD Radeon HD": "Dedicated",
    "AMD Radeon RX Vega": "Dedicated",
    "AMD Radeon Vega": "Dedicated",
    "Intel Iris Xe graphics": "Integrated",
    "Intel Integrated Graphics": "Integrated",
    "Intel Iris Xe": "Integrated",
}

data["graphics type"] = (
    data["graphics type"].map(simplified_types).fillna(data["graphics type"])
)

In [21]:
data.isna().sum().sort_values(ascending=False)

speaker_details       751
audio_details         297
graphics details      254
display_resolution    251
graphics ram type     163
display_type          119
graphics chipset       41
graphics type           6
mrp                     0
no of 2 star            0
no of 3 star            0
no of 4 star            0
no of 5 star            0
brand                   0
connector_types         0
batteries               0
form_factor             0
model_name              0
product_dimensions      0
processor_count         0
processor_speed         0
processor_name          0
processor_brand         0
ram_memory              0
hard_drive_size         0
hard_disk_type          0
os                      0
screen_size             0
no of 1 star            0
dtype: int64

In [22]:
data[["form_factor", "display_type"]][data["display_type"].isna()]

Unnamed: 0_level_0,form_factor,display_type
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B09NQ9Z2S6,Chromebook,
B09MFTGNZ4,Laptop,
B08XTB1NNH,Compact,
B0BHYR5G1Z,Gaming Laptop,
B0BYKC7RDC,Netbook,
B0BLYX9Z1P,Netbook,
B07SGD19GL,Chromebook,
B0B5129PHL,Thin & light Laptop,
B09NPYS559,Convertible,
B08MLNJ7BK,Laptop,


In [23]:
fillna_by_columns("display_type", ["brand", "form_factor"])

In [24]:
data.dropna(subset=["display_type"], inplace=True)

In [25]:
data["graphics ram type"].value_counts()

graphics ram type
Shared                    550
GDDR6                     387
VRAM                      124
DDR4 SDRAM                 89
DDR5 SDRAM                 49
DDR3 SDRAM                 43
DRAM                       38
GDDR5                      12
GDDR4                      11
SDRAM                       7
WRAM                        6
DDR DRAM                    6
DDR SDRAM                   4
DIMM                        3
DDR5 RAM                    1
72-Pin EDO SIMM Memory      1
SODIMM                      1
Name: count, dtype: int64

In [26]:
len(data)

1475

In [27]:
mapping = {
    "Shared": "Shared",
    "GDDR6": "GDDR6",
    "VRAM": "VRAM",
    "DDR4 SDRAM": "DDR4 SDRAM",
    "DDR3 SDRAM": "DDR3 SDRAM",
    "DDR5 SDRAM": "DDR5 SDRAM",
    "DRAM": "DRAM",
    "GDDR5": "GDDR5",
    "GDDR4": "GDDR4",
    "WRAM": "Other",
    "SDRAM": "Other",
    "DDR SDRAM": "Other",
    "DDR DRAM": "Other",
    "DIMM": "Other",
    "72-Pin EDO SIMM Memory": "Other",
    "DDR5 RAM": "Other",
    "DDR3L-1600 SDRAM": "Other",
    "GDDR3": "Other",
    "SODIMM": "Other",
}

data["graphics ram type"] = data["graphics ram type"].map(mapping)

In [28]:
data["graphics ram type"].value_counts()

graphics ram type
Shared        550
GDDR6         387
VRAM          124
DDR4 SDRAM     89
DDR5 SDRAM     49
DDR3 SDRAM     43
DRAM           38
Other          29
GDDR5          12
GDDR4          11
Name: count, dtype: int64

In [29]:
fillna_by_columns("graphics ram type", ["brand", "form_factor", "graphics chipset"])

B09M41GR4K Shared
B078T74GKF VRAM
B08TH79N2P VRAM
B09DX8V2SR Shared
B07YSTT8Z5 DDR3 SDRAM
B0B4WK14KK Shared
B09CRMSHD9 Shared
B08DDJJJW7 Shared
B09BCM46NS GDDR6
B081P3QVWC VRAM
B092G3Q3QV VRAM
B09RMVDV9W Shared
B09Y5XMQKP Shared
B07TSC356D Shared
B09VNM698Z Shared
B08BY1J249 Shared
B08MQFBYYQ Shared
B09HQ6KN13 VRAM
B0B4N6JVMW VRAM
B09Z2NZGR6 GDDR6
B09PBTLW9F VRAM
B087N73SHT VRAM
B09V1DYYV6 DDR5 SDRAM
B09YHF7WHL Shared
B0B56B2QQ3 Shared
B07SNCFD7D DDR4 SDRAM
B08HWB94ZF VRAM
B0B752D4Z7 VRAM
B081TWCG7C VRAM
B07D4ZMXXF Shared
B08B5YBHY5 VRAM
B08WPKF2TT VRAM
B08259C169 VRAM
B0B58W9TH1 VRAM
B09G6W6C7Z Shared
B08KLM5VY6 VRAM
B0B3J21ZVB VRAM
B09VT6BVJF VRAM
B0746NCNSG DRAM
B076V3YMRY Shared
B08GJV73L6 Other
B09C386RGG VRAM
B08K8F7CPL VRAM
B087N3PM7Y VRAM
B08L5VBBCJ Other
B07WZQFQTK VRAM
B096W33V4T Shared
B07WFM7FHJ VRAM
B0891TPQ5V Shared
B088CDKJ7V DRAM
B09RQK4HJY Shared
B09DPWDWYD DDR3 SDRAM
B09FPBZQDL Shared
B07F32YFDL VRAM
B0B6F6PM6C VRAM
B0746NCNSG DRAM
B0943S58GC DDR4 SDRAM
B08KHWLFXJ Oth

In [30]:
data.isna().sum().sort_values(ascending=False)

speaker_details       692
audio_details         270
display_resolution    246
graphics details      235
graphics chipset       41
graphics ram type      12
graphics type           4
mrp                     0
form_factor             0
no of 2 star            0
no of 3 star            0
no of 4 star            0
no of 5 star            0
brand                   0
connector_types         0
batteries               0
model_name              0
product_dimensions      0
display_type            0
processor_count         0
processor_speed         0
processor_name          0
processor_brand         0
ram_memory              0
hard_drive_size         0
hard_disk_type          0
os                      0
screen_size             0
no of 1 star            0
dtype: int64

In [31]:
data["display_resolution"].value_counts()

display_resolution
1920 x 1080 pixels              281
1920 x 1080 Pixels              233
1920 X 1080 (FHD) Pixels         76
1920x1080                        52
1080p                            39
HD (1366 x 768) Pixels           35
1366 x 768 Pixels                33
1920 x 1200 pixels               31
1920 x 1080                      30
2880 x 1800                      25
1920X1080 Pixels                 24
1366 x 768                       23
2560 x 1440 Pixels               18
1366x768                         18
1920*1080                        18
1920x1080 Pixels                 16
2560 x 1600 pixels               16
1366 x 768 pixels                16
1920 x 1200 Pixels               16
2560 x 1440 pixels               15
2880 x 1800 Pixels               14
2880 x 1800 pixels               14
HD (1366 X 768) Pixels           14
1920 x 1080 (Full HD)            10
1366 X 768 (HD) Pixels            9
Full HD (1920 X 1080) Pixels      9
2560 x 1600 Pixels                7
1080p, 12

In [32]:
def extract_screen_res(row):
    resolution = str(row["display_resolution"])

    if pd.isna(resolution):
        return None, None

    matches = re.findall(r"(\d+)\s*x\s*(\d+)", resolution)

    if matches:
        first_number, second_number = matches[0]
        return first_number.strip(), second_number.strip()

    return None, None

In [33]:
data[["screen_res_w", "screen_res_h"]] = data.apply(
    extract_screen_res, axis=1, result_type="expand"
)

In [34]:
data.dropna(subset=["display_resolution"], inplace=True)
data.drop(columns=["display_resolution"], inplace=True)

In [35]:
len(data)

1229

In [36]:
data.dropna(subset=["screen_res_h"], inplace=True)

In [37]:
len(data)

994

In [38]:
data.drop(columns=["speaker_details", "audio_details"], inplace=True)

In [39]:
def extract_dimensions(row):
    dimensions = str(row["product_dimensions"])

    if pd.isna(dimensions):
        return None
    try:
        seperations = [
            value.split()[0] for value in dimensions.split(";")[0].split(" x ")
        ]
        length, width, height = sorted(map(float, seperations), reverse=True)

        weight, type = str(dimensions.split(";")[-1]).split()
        if "k" not in type.lower():
            weight = float(weight) / 1000
        weight = float(weight)

        return [length, width, height, weight]
    except:
        return None

In [40]:
data[["length", "width", "height", "weight"]] = data.apply(
    extract_dimensions, axis=1, result_type="expand"
)

In [41]:
data.dropna(inplace=True)

In [42]:
len(data)

853

In [43]:
data.isna().sum().sort_values(ascending=False)

mrp                   0
model_name            0
height                0
width                 0
length                0
screen_res_h          0
screen_res_w          0
no of 1 star          0
no of 2 star          0
no of 3 star          0
no of 4 star          0
no of 5 star          0
brand                 0
graphics details      0
graphics ram type     0
graphics type         0
graphics chipset      0
connector_types       0
form_factor           0
batteries             0
product_dimensions    0
display_type          0
processor_count       0
processor_speed       0
processor_name        0
processor_brand       0
ram_memory            0
hard_drive_size       0
hard_disk_type        0
os                    0
screen_size           0
weight                0
dtype: int64

In [44]:
data.columns

Index(['mrp', 'model_name', 'screen_size', 'os', 'hard_disk_type',
       'hard_drive_size', 'ram_memory', 'processor_brand', 'processor_name',
       'processor_speed', 'processor_count', 'display_type',
       'product_dimensions', 'batteries', 'form_factor', 'connector_types',
       'graphics chipset', 'graphics type', 'graphics ram type',
       'graphics details', 'brand', 'no of 5 star', 'no of 4 star',
       'no of 3 star', 'no of 2 star', 'no of 1 star', 'screen_res_w',
       'screen_res_h', 'length', 'width', 'height', 'weight'],
      dtype='object')

In [45]:
def get_screen_size_in(inch: str):
    if pd.isna(inch):
        return None

    try:
        num, type = str(inch).split()
        if "cm" in type.lower():
            num = float(num) / 2.54
    except:
        num = str(inch).strip()
    return round(float(num), 1)


data["screen_size"] = data["screen_size"].apply(get_screen_size_in)

In [46]:
data["hard_drive_size"]

product_id
B0BXPWS41N          512 GB
B099ZZX3QW            1 TB
B0BKT92YSL            1 TB
B0BLH4C9Y6          256 GB
B0BRXZDLGG            1 TB
B0C3MFSVZR          512 GB
B0BTWDF3KB            1 TB
B0C3D99LTP          512 GB
B08M61GDYJ          512 GB
B0C1N8MY5Q          512 GB
B0C1GCY7ZX          512 GB
B09WRS73T6          512 GB
B0C2YJFMWP          512 GB
B0B31R74HP          512 GB
B091YHJ1KC          512 GB
B0BLNYWN7M          512 GB
B0C3V2ML49          512 GB
B0BTWDLRBQ          512 GB
B08Q7T6PTX            1 TB
B09S3Y2C9F            1 TB
B09VJ32NNC          512 GB
B08Y96J1ZJ          512 GB
B0BVQS5K7B          512 GB
B0BWYQ8P7W          512 GB
B0BW3CXL1B            1 TB
B07HDV3PJ7          512 GB
B09BKDKNT3          128 GB
B0B61GC7FP          512 GB
B08TH79N2P          256 GB
B0BTWCN881          512 GB
B0876H4VHJ          256 GB
B09XXNKLBF          512 GB
B07YSTT8Z5          128 GB
B0B1F21WVP          512 GB
B09JSLMQP8          320 GB
B0C2YHVM7J          512 GB
B0B6F5TDK4       

In [47]:
def get_hard_drive(row):
    inch = row["hard_drive_size"]
    print(inch)

    if pd.isna(inch):
        return None, None

    try:
        num, type = str(inch).split()
        return [int(num), type.strip().lower()]
    except:
        return None, None


data[["hard_drive_size_value", "hard_drive_size_unit"]] = data.apply(
    get_hard_drive, axis=1, result_type="expand"
)

data[["hard_drive_size_value", "hard_drive_size_unit"]]

512 GB
1 TB
1 TB
256 GB
1 TB
512 GB
1 TB
512 GB
512 GB
512 GB
512 GB
512 GB
512 GB
512 GB
512 GB
512 GB
512 GB
512 GB
1 TB
1 TB
512 GB
512 GB
512 GB
512 GB
1 TB
512 GB
128 GB
512 GB
256 GB
512 GB
256 GB
512 GB
128 GB
512 GB
320 GB
512 GB
512 GB
1 TB
512 GB
1 TB
500 GB
512 GB
512 GB
64 GB
512 GB
1 TB
512 GB
256 GB
512 GB
16 GB
512 GB
256
1 TB
512 GB
512 GB
512 GB
512 GB
512 GB
256 GB
512 GB
512 GB
512 GB
1 TB
1 TB
512 GB
64 GB
512 GB
500 GB
1 TB
128 GB
64.0, 1.0 GB
256 GB
512 GB
1 TB
512 GB
512 GB
512 GB
512 GB
512 GB
512 GB
512 GB
256 GB
512 GB
512 GB
1 TB
256 GB
8 GB
512 GB
256
1 TB
512 GB
256
512 GB
1 TB
512 GB
128 GB
512 GB
1 TB
128 GB
512 GB
512 GB
1 TB
1 TB
512 GB
512 GB
512 GB
512 GB
1550 GB
1 TB
512 GB
512 GB
512 GB
1 TB
512 GB
256 GB
512 GB
512 GB
128 GB
512 GB
512 GB
1 TB
512 GB
512 GB
512 GB
512 GB
512 GB
1 TB
512 GB
512 GB
128 GB
1 TB
512 GB
1 TB
1 TB
512 GB
1 TB
1 TB
512 GB
512 GB
1 TB
512 GB
320 GB
512 GB
512 GB
1 TB
512 GB
512 GB
128 GB
512 GB
512 GB
512 GB
512 GB
256 GB


Unnamed: 0_level_0,hard_drive_size_value,hard_drive_size_unit
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B0BXPWS41N,512.0,gb
B099ZZX3QW,1.0,tb
B0BKT92YSL,1.0,tb
B0BLH4C9Y6,256.0,gb
B0BRXZDLGG,1.0,tb
B0C3MFSVZR,512.0,gb
B0BTWDF3KB,1.0,tb
B0C3D99LTP,512.0,gb
B08M61GDYJ,512.0,gb
B0C1N8MY5Q,512.0,gb


In [48]:
data.isna().sum().sort_values(ascending=False)

hard_drive_size_unit     15
hard_drive_size_value    15
no of 1 star              0
graphics details          0
brand                     0
no of 5 star              0
no of 4 star              0
no of 3 star              0
no of 2 star              0
screen_res_w              0
model_name                0
screen_res_h              0
length                    0
width                     0
height                    0
weight                    0
graphics ram type         0
mrp                       0
graphics chipset          0
connector_types           0
form_factor               0
batteries                 0
product_dimensions        0
display_type              0
processor_count           0
processor_speed           0
processor_name            0
processor_brand           0
ram_memory                0
hard_drive_size           0
hard_disk_type            0
os                        0
screen_size               0
graphics type             0
dtype: int64

In [49]:
data.dropna(inplace=True)

In [50]:
len(data)

838

In [51]:
data["ram_memory"]

product_id
B0BXPWS41N     16 GB
B099ZZX3QW      8 GB
B0BKT92YSL     16 GB
B0BLH4C9Y6      8 GB
B0BRXZDLGG     16 GB
B0C3MFSVZR      8 GB
B0BTWDF3KB     16 GB
B0C3D99LTP     16 GB
B08M61GDYJ    512 GB
B0C1N8MY5Q     16 GB
B0C1GCY7ZX      8 GB
B09WRS73T6      8 GB
B0C2YJFMWP     16 GB
B0B31R74HP     16 GB
B091YHJ1KC      8 GB
B0BLNYWN7M     16 GB
B0C3V2ML49     16 GB
B0BTWDLRBQ     16 GB
B08Q7T6PTX    256 GB
B09S3Y2C9F     16 GB
B09VJ32NNC     16 GB
B08Y96J1ZJ     16 GB
B0BVQS5K7B     16 GB
B0BWYQ8P7W      8 GB
B0BW3CXL1B     32 GB
B07HDV3PJ7     16 GB
B09BKDKNT3      4 GB
B0B61GC7FP      8 GB
B08TH79N2P      4 GB
B0BTWCN881     16 GB
B0876H4VHJ    256 GB
B09XXNKLBF     16 GB
B07YSTT8Z5    128 GB
B0B1F21WVP      8 GB
B09JSLMQP8      4 GB
B0C2YHVM7J      8 GB
B0B6F5TDK4      8 GB
B0BW39J8C3     16 GB
B0BVMLP18M      8 GB
B0C37C3THC     16 GB
B0B8NZMFZF      8 GB
B09CRMSHD9     32 GB
B0B912J6VT      8 GB
B085Q1X8R7      4 GB
B0C1GFN51P     16 GB
B0C1GGLHNV     32 GB
B09GRMFRBW      8 GB
B0

In [52]:
def extract_number(col_value):
    if pd.isnull(col_value):
        return None

    string = col_value.split()[0]
    return int(float(string))


data["ram_memory"] = data["ram_memory"].apply(extract_number).astype(int)

In [53]:
data.drop(
    columns=["product_dimensions", "hard_drive_size", "processor_speed"], inplace=True
)
data[["processor_brand", "processor_name", "processor_count"]]

Unnamed: 0_level_0,processor_brand,processor_name,processor_count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B0BXPWS41N,AMD,Ryzen 7,8
B099ZZX3QW,Intel,Core i5,1
B0BKT92YSL,Intel,Core i7,1
B0BLH4C9Y6,Intel,Pentium,4
B0BRXZDLGG,Intel,Core i7,10
B0C3MFSVZR,Intel,Core i3,1
B0BTWDF3KB,AMD,Ryzen 7,8
B0C3D99LTP,Intel,Core i7,1
B08M61GDYJ,Intel,Intel Core i5,1
B0C1N8MY5Q,Intel,Core i5,1


In [54]:
data.columns

Index(['mrp', 'model_name', 'screen_size', 'os', 'hard_disk_type',
       'ram_memory', 'processor_brand', 'processor_name', 'processor_count',
       'display_type', 'batteries', 'form_factor', 'connector_types',
       'graphics chipset', 'graphics type', 'graphics ram type',
       'graphics details', 'brand', 'no of 5 star', 'no of 4 star',
       'no of 3 star', 'no of 2 star', 'no of 1 star', 'screen_res_w',
       'screen_res_h', 'length', 'width', 'height', 'weight',
       'hard_drive_size_value', 'hard_drive_size_unit'],
      dtype='object')

In [55]:
data.drop(columns=["screen_size"], inplace=True)

In [56]:
data["hard_disk_type"].value_counts()

hard_disk_type
SSD                            748
HDD                             46
Solid State Drive               11
Hybrid                          10
Embedded MultiMediaCard          7
Emmc                             5
SSHD                             4
Solid State Hard Drive           3
Hybrid Drive                     2
HDD, Solid State Hard Drive      1
Flash Memory                     1
Name: count, dtype: int64

In [57]:
mapping = {
    "SSD": "SSD",
    "HDD": "HDD",
    "Solid State Drive": "SSD",
    "Hybrid": "Hybrid",
    "Embedded MultiMediaCard": "eMMC",
    "Emmc": "eMMC",
    "SSHD": "Hybrid",
    "Solid State Hard Drive": "SSD",
    "Hybrid Drive": "Hybrid",
    "HDD, Solid State Hard Drive": "Hybrid",
    "Flash Memory": "SSD",
}

# Apply the mapping to the "hard_disk_type" column
data["hard_disk_type"] = data["hard_disk_type"].map(mapping)

In [58]:
data["hard_disk_type"].value_counts()

hard_disk_type
SSD       763
HDD        46
Hybrid     17
eMMC       12
Name: count, dtype: int64

In [59]:
data[["processor_brand", "processor_name"]]

Unnamed: 0_level_0,processor_brand,processor_name
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B0BXPWS41N,AMD,Ryzen 7
B099ZZX3QW,Intel,Core i5
B0BKT92YSL,Intel,Core i7
B0BLH4C9Y6,Intel,Pentium
B0BRXZDLGG,Intel,Core i7
B0C3MFSVZR,Intel,Core i3
B0BTWDF3KB,AMD,Ryzen 7
B0C3D99LTP,Intel,Core i7
B08M61GDYJ,Intel,Intel Core i5
B0C1N8MY5Q,Intel,Core i5


In [60]:
# def transform_processor(row):
#     brand, name = row['processor_brand'], row['processor_name']
#     return " ".join(sorted(list(set(brand.split() + name.split())))).lower()

# data["processor_name"] = data.apply(transform_processor , axis=1)
# data.drop(columns=["processor_brand"], inplace=True)

In [61]:
def clean_battery_type(value):
    lower_value = str(value).lower()

    if "polymer" in lower_value or "phosphate" in lower_value:
        return "Lithium Polymer"

    if "ion" in lower_value or "cobalt" in lower_value:
        return "Lithium Ion"

    if "metal" in lower_value:
        return "Lithium Metal"

    return np.nan


data["battery_type"] = data["batteries"].apply(clean_battery_type)

In [62]:
data.drop(
    columns=[
        "batteries",
        "connector_types",
        "graphics chipset",
        "graphics type",
        "graphics ram type",
        "graphics details",
        "brand",
        "model_name",
        "processor_name",
    ],
    inplace=True,
)

In [63]:
data["battery_type"].value_counts()

battery_type
Lithium Ion        508
Lithium Polymer    304
Lithium Metal       19
Name: count, dtype: int64

In [64]:
def update_os(os):
    return " ".join(os.split()[:2]).lower().strip()


data["os"] = data["os"].apply(update_os)

In [65]:
data["os"].unique()

array(['windows 11', 'windows 10', 'windows', 'windows 7', 'chrome os',
       'dos', 'mac os'], dtype=object)

In [66]:
data["processor_brand"].value_counts()

processor_brand
Intel       578
AMD         254
NVIDIA        3
MediaTek      2
IBM           1
Name: count, dtype: int64

In [67]:
len(data)

838

In [68]:
data["screen_res_w"] = data["screen_res_w"].astype(int)
data["screen_res_h"] = data["screen_res_h"].astype(int)

In [69]:
def encode_columns(data: pd.DataFrame, columns: dict[str, str]) -> pd.DataFrame:
    df = data.copy()

    for col, encoding_type in columns.items():
        if encoding_type == "one_hot":
            column = df.pop(col)
            one_hot = pd.crosstab((s := column.explode()).index, s)
            new_col_names = [
                f"{''.join(map(lambda x: x[0], col.split('_'))) }_{val}"
                for val in one_hot.columns
            ]
            one_hot.columns = new_col_names
            df = df.join(one_hot)
        elif encoding_type == "label":
            le = LabelEncoder()
            df[f"{col}"] = le.fit_transform(df[col].astype(str))
            print(
                {index: label for index, label in enumerate(le.classes_)},
                "-",
                {label: index for index, label in enumerate(le.classes_)},
            )

    return df


columns_to_encode = {
    "os": "label",
    "hard_disk_type": "label",
    "processor_brand": "label",
    "display_type": "label",
    "form_factor": "label",
    "hard_drive_size_unit": "label",
    "battery_type": "label",
}

data = encode_columns(data, columns_to_encode)

{0: 'chrome os', 1: 'dos', 2: 'mac os', 3: 'windows', 4: 'windows 10', 5: 'windows 11', 6: 'windows 7'} - {'chrome os': 0, 'dos': 1, 'mac os': 2, 'windows': 3, 'windows 10': 4, 'windows 11': 5, 'windows 7': 6}
{0: 'HDD', 1: 'Hybrid', 2: 'SSD', 3: 'eMMC'} - {'HDD': 0, 'Hybrid': 1, 'SSD': 2, 'eMMC': 3}
{0: 'AMD', 1: 'IBM', 2: 'Intel', 3: 'MediaTek', 4: 'NVIDIA'} - {'AMD': 0, 'IBM': 1, 'Intel': 2, 'MediaTek': 3, 'NVIDIA': 4}
{0: 'AMOLED', 1: 'FHD', 2: 'LCD', 3: 'LED', 4: 'OLED', 5: 'Pixel Sense'} - {'AMOLED': 0, 'FHD': 1, 'LCD': 2, 'LED': 3, 'OLED': 4, 'Pixel Sense': 5}
{0: 'Chromebook', 1: 'Clamshell', 2: 'Convertible', 3: 'Gaming Laptop', 4: 'Laptop', 5: 'Netbook', 6: 'Notebook', 7: 'Thin & Light', 8: 'Thin and Light', 9: 'Ultra-Portable'} - {'Chromebook': 0, 'Clamshell': 1, 'Convertible': 2, 'Gaming Laptop': 3, 'Laptop': 4, 'Netbook': 5, 'Notebook': 6, 'Thin & Light': 7, 'Thin and Light': 8, 'Ultra-Portable': 9}
{0: 'gb', 1: 'tb'} - {'gb': 0, 'tb': 1}
{0: 'Lithium Ion', 1: 'Lithium Met

In [70]:
data.dtypes

mrp                        int64
os                         int32
hard_disk_type             int32
ram_memory                 int32
processor_brand            int32
processor_count            int64
display_type               int32
form_factor                int32
no of 5 star               int64
no of 4 star               int64
no of 3 star               int64
no of 2 star               int64
no of 1 star               int64
screen_res_w               int32
screen_res_h               int32
length                   float64
width                    float64
height                   float64
weight                   float64
hard_drive_size_value    float64
hard_drive_size_unit       int32
battery_type               int32
dtype: object

In [71]:
data.columns

Index(['mrp', 'os', 'hard_disk_type', 'ram_memory', 'processor_brand',
       'processor_count', 'display_type', 'form_factor', 'no of 5 star',
       'no of 4 star', 'no of 3 star', 'no of 2 star', 'no of 1 star',
       'screen_res_w', 'screen_res_h', 'length', 'width', 'height', 'weight',
       'hard_drive_size_value', 'hard_drive_size_unit', 'battery_type'],
      dtype='object')

In [72]:
def label_smartphones(df):
    # Define the thresholds for success
    thresholds = {
        "no_ratings": 1000,
        "avg_rating": 4.2,
    }

    # Define the weightage for the Average Rating feature
    rating_weightage = 0.5
    no_rating_weightage = 0.8

    # Create a new column called "Label"
    df["is_success"] = False

    df["no_ratings"] = (
        df["no of 5 star"]
        + df["no of 4 star"]
        + df["no of 3 star"]
        + df["no of 2 star"]
        + df["no of 1 star"]
    )

    # Calculate average rating
    df["avg_rating"] = (
        df["no of 5 star"] * 5
        + df["no of 4 star"] * 4
        + df["no of 3 star"] * 3
        + df["no of 2 star"] * 2
        + df["no of 1 star"]
    ) / df["no_ratings"]

    # Iterate over each row and apply the labeling logic
    for index, row in df.iterrows():
        percent = 0

        if row["no_ratings"] < 50 or row["avg_rating"] < 3:
            df.at[index, "is_success"] = False
            continue

        # Assign the label based on the overall value of percent
        df.at[index, "is_success"] = True

    df.drop(
        columns=[
            "no of 5 star",
            "no of 4 star",
            "no of 3 star",
            "no of 2 star",
            "no of 1 star",
            "avg_rating",
            "no_ratings",
        ]
    )
    return df


data = label_smartphones(data)

In [73]:
data.columns

Index(['mrp', 'os', 'hard_disk_type', 'ram_memory', 'processor_brand',
       'processor_count', 'display_type', 'form_factor', 'no of 5 star',
       'no of 4 star', 'no of 3 star', 'no of 2 star', 'no of 1 star',
       'screen_res_w', 'screen_res_h', 'length', 'width', 'height', 'weight',
       'hard_drive_size_value', 'hard_drive_size_unit', 'battery_type',
       'is_success', 'no_ratings', 'avg_rating'],
      dtype='object')

In [74]:
data.drop(
    columns=[
        "no of 5 star",
        "no of 4 star",
        "no of 3 star",
        "no of 2 star",
        "no of 1 star",
        "no_ratings",
        "avg_rating",
    ], inplace=True
)

In [75]:
data.to_csv("./data/processed.csv")