In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# import seaborn as sns

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Planning Out Make DataSet how to scrape and what to scrape


In [2]:
data = pd.read_json("../../data/raw-v2.json")
data.set_index("product_id", inplace=True)
len(list(data.iterrows()))

701

In [3]:
data.duplicated().sum()

49

In [4]:
duplicated_rows = data[data.duplicated(keep=False)].sort_values("product_id")

# Display the duplicated rows
print("Duplicated Rows:")
print(duplicated_rows)

Duplicated Rows:
              mrp                 model_name  no of 5 star  no of 4 star   
product_id                                                                 
B07WFPMJCH  13999                    IQOO Z6          4911          2706  \
B07WGPKNGT  27999              IQOO Neo 7 5G          1581           643   
B07WHRNZ89  27999              IQOO Neo 7 5G          1581           643   
B07WHSR1NR  13999                    IQOO Z6          4911          2706   
B08VB2MRF8  34999   Samsung Galaxy S20 FE 5G         18327          7455   
B08VB2MRF8  34999   Samsung Galaxy S20 FE 5G         18330          7456   
B08VB2MRF8  34999   Samsung Galaxy S20 FE 5G         18312          7449   
B08VB57558  34999   Samsung Galaxy S20 FE 5G         18327          7455   
B08VB57558  34999   Samsung Galaxy S20 FE 5G         18330          7456   
B08VB57558  34999   Samsung Galaxy S20 FE 5G         18312          7449   
B09ZBFD6TJ  21999     Realme narzo 50 Pro 5G          4041          233

In [5]:
data.drop_duplicates(inplace=True)

In [6]:
len(list(data.iterrows()))

652

In [7]:
data.columns

Index(['mrp', 'model_name', 'no of 5 star', 'no of 4 star', 'no of 3 star',
       'no of 2 star', 'no of 1 star', 'os', 'ram', 'inbuilt_storage',
       'dimensions', 'weight', 'battery_power', 'battery_type', 'camera',
       'warranty', 'form_factor', 'manufacturer'],
      dtype='object')

In [8]:
na_counts = data.isna().sum().sort_values(ascending=False)
# filtered_columns = na_counts[na_counts > 100].index.tolist()
na_counts

battery_type       19
warranty           18
form_factor         7
mrp                 0
model_name          0
camera              0
battery_power       0
weight              0
dimensions          0
inbuilt_storage     0
ram                 0
os                  0
no of 1 star        0
no of 2 star        0
no of 3 star        0
no of 4 star        0
no of 5 star        0
manufacturer        0
dtype: int64

In [9]:
len(data.columns)

18

In [10]:
data.columns

Index(['mrp', 'model_name', 'no of 5 star', 'no of 4 star', 'no of 3 star',
       'no of 2 star', 'no of 1 star', 'os', 'ram', 'inbuilt_storage',
       'dimensions', 'weight', 'battery_power', 'battery_type', 'camera',
       'warranty', 'form_factor', 'manufacturer'],
      dtype='object')

In [11]:
# only thing needed from mini table is as backup

In [12]:
mrp = "mrp"
os = ["OS", "Operating System"]
inbuilt_storage = ["Inbuilt Storage (in GB)", "Memory Storage Capacity"]
ram = "RAM"
dimensions = ["Item Dimensions", "Product Dimensions"]
weight = ["Item Weight", "Product Dimensions"]
battery_power = ["Battery Power Rating", "Battery Power", "Battery Power (In mAH)"]
battery_type = ["Batteries", "Battery type"]  # fillna_by_manufacture
camera = ["Other camera features", "Camera Description"]
warranty = "Warranty Description"  # or else 0
manufacturer = "Manufacturer"  # .lower().split()[0] # later remove
form_factor = "Form factor"

# needed for labeling
stars = ["no of 5 star", "no of 4 star", "no of 3 star", "no of 2 star", "no of 1 star"]

In [13]:
to_remove = [
    "product_id" "Item model number",
    "Battery Power Rating",  # in mAh better
    "Whats in the box",
    "What's in the box",
    "Special features",
    "Network Service Provider",
    "Other camera features",  # i will use Camera Description
    "Country of Origin",
    "Other display features",  # ITS ALL THE SAME
    "Model Name",
    "Brand",
    "Colour",
    "GPS",
    "Wireless communication technologies",  # explore and understand
    "Connectivity technologies",  # explore and understand
    "Cellular Technology",
]

In [14]:
data = data[
    [
        # "product_id",
        "mrp",
        "model_name",
        "no of 5 star",
        "no of 4 star",
        "no of 3 star",
        "no of 2 star",
        "no of 1 star",
        "os",
        "ram",
        "inbuilt_storage",
        "dimensions",
        "weight",
        "battery_power",
        "battery_type",
        "camera",
        "warranty",
        "form_factor",
        "manufacturer",
    ]
]

## STARS

In [15]:
data[
    ["no of 5 star", "no of 4 star", "no of 3 star", "no of 2 star", "no of 1 star"]
] = data[
    ["no of 5 star", "no of 4 star", "no of 3 star", "no of 2 star", "no of 1 star"]
].fillna(
    0
)

In [16]:
data.columns

Index(['mrp', 'model_name', 'no of 5 star', 'no of 4 star', 'no of 3 star',
       'no of 2 star', 'no of 1 star', 'os', 'ram', 'inbuilt_storage',
       'dimensions', 'weight', 'battery_power', 'battery_type', 'camera',
       'warranty', 'form_factor', 'manufacturer'],
      dtype='object')

In [17]:
data["manufacturer"].value_counts()

manufacturer
iQOO                                                                                                  75
Redmi                                                                                                 71
Samsung                                                                                               63
OPPO Mobiles India Pvt Ltd                                                                            63
Lava                                                                                                  36
Oppo Mobiles India Private Limited                                                                    32
Vivo                                                                                                  28
OPPO MOBILES INDIA PVT                                                                                19
SAMSUNG                                                                                               18
Dixon Technologies (India) Ltd.,Plot No.6,

In [18]:
"""
Find these and extract the previos word otherwise just take the first word of the manufacturer
Mobiles India
Technologies
Technology India
MOBILE
MOBILE DEVICES
Products
INTERNATIONAL
INDUSTRIES
Devices
"""

data["manufacturer"] = data["manufacturer"].str.lower().str.split().str[0]
data['manufacturer'] = data['manufacturer'].str.replace('s', 's-mobile')
data['manufacturer'] = data['manufacturer'].str.replace('g', 'g-mobile')
data["manufacturer"].value_counts()

manufacturer
oppo                            119
s-mobileams-mobileung-mobile    102
iqoo                             75
redmi                            71
xiaomi                           49
vivo                             44
lava                             44
g-mobile                         28
dixon                            17
oneplus-mobile                   16
for                              11
s-mobile                          8
nokia                             6
micromax                          6
g-mobileeneric                    6
g-mobile-mobile                   6
padg-mobileet                     5
1                                 4
lyf                               4
bhag-mobilewati                   4
s-mobileky                        4
bharat                            3
united                            3
jmax                              2
infinix                           2
jio                               2
tecno                             2
itel           

## use form FACTOR

In [19]:
data["form_factor"].value_counts()

form_factor
Bar                                                                    349
Smartphone                                                             117
Touchscreen Phone                                                       50
Slate                                                                   40
Touch                                                                   39
Foldable Case                                                           14
Slider                                                                  10
PALM HELD                                                                8
Phablet                                                                  3
Foldable Screen                                                          3
SmartPhone                                                               3
Palm Hended                                                              2
smartphone                                                               2
Bar ; Side Fi

In [20]:
data["form_factor"] = data["form_factor"].str.replace("Screen Touch", "Touch")
data["form_factor"] = data["form_factor"].str.lower().str.split().str[0].replace('bar,', 'bar')

In [21]:
data["form_factor"].value_counts()

form_factor
bar            352
smartphone     122
touchscreen     50
slate           40
touch           40
foldable        17
palm            10
slider          10
phablet          3
flip             1
Name: count, dtype: int64

## Battery Type

In [22]:
def clean_battery_type(value):
    lower_value = str(value).lower()

    if "polymer" in lower_value or "phosphate" in lower_value:
        return "Lithium Polymer"
    
    if "ion" in lower_value or "cobalt" in lower_value:
        return "Lithium Ion"

    return np.nan


data["battery_type"] = data["battery_type"].apply(clean_battery_type)

In [23]:
def fillna_by_manufacture(row_to_fill: str):
    for index, row in data[data[row_to_fill].isnull()].iterrows():
        manufacturer = row["manufacturer"]
        manufacturer_df = data[data["manufacturer"] == manufacturer]

        if len(manufacturer_df) <= 1:
            continue

        mode_result = manufacturer_df[row_to_fill].mode()
        if mode_result.empty:
            continue

        most_common_screen_type = mode_result.values[0]
        data.at[index, row_to_fill] = most_common_screen_type
        print(index, most_common_screen_type)


fillna_by_manufacture("battery_type")

B0BS3S3QXP Lithium Polymer
B0BZSLM45M Lithium Ion
B0BZSLM45M Lithium Ion
B0C1SXGSH4 Lithium Polymer
B0C1SG2JRF Lithium Polymer
B0BJK5C7ZW Lithium Polymer
B0B6RR8GZQ Lithium Polymer
B07X97GXJJ Lithium Ion
B0777GJ99W Lithium Ion
B0C46B4GLM Lithium Ion
B0BS3S3QXP Lithium Polymer
B0C46B4GLM Lithium Ion
B07X97GXJJ Lithium Ion
B0C1SXGSH4 Lithium Polymer
B0B59VZB13 Lithium Polymer
B07WHQPCGY Lithium Ion
B0B6RR8GZQ Lithium Polymer
B0BS3S3QXP Lithium Polymer
B0B8HJVLBK Lithium Polymer
B08444SXZJ Lithium Ion
B0C1SXGSH4 Lithium Polymer


In [24]:
data.dropna(
    subset=["battery_type", "form_factor"],
    inplace=True,
)

In [25]:
data.isna().sum().sort_values(ascending=False)

warranty           15
mrp                 0
model_name          0
form_factor         0
camera              0
battery_type        0
battery_power       0
weight              0
dimensions          0
inbuilt_storage     0
ram                 0
os                  0
no of 1 star        0
no of 2 star        0
no of 3 star        0
no of 4 star        0
no of 5 star        0
manufacturer        0
dtype: int64

In [26]:
data.dtypes

mrp                 int64
model_name         object
no of 5 star        int64
no of 4 star        int64
no of 3 star        int64
no of 2 star        int64
no of 1 star        int64
os                 object
ram                object
inbuilt_storage    object
dimensions         object
weight             object
battery_power      object
battery_type       object
camera             object
warranty           object
form_factor        object
manufacturer       object
dtype: object

## battery_power, inbuilt_storage, ram and weight

In [27]:
data["weight"]

product_id
B09TWH8YHM       215 grams
B0B4F38D6K       207 grams
B0B56YRDDT       194 grams
B0BZ466BWW       190 grams
B0BYN48MQW       192 grams
B0BMQSF1M4       189 grams
B0BBH4C5KT       201 grams
B0BZ48VZMR       190 grams
B0BBM7L888       182 grams
B09WQYFLRX       195 grams
B0BMGB2TPR       188 grams
B0BZCSNTT4       206 grams
B0B4F2TTTS       207 grams
B0BSTVXFWD       207 grams
B09ZBF5Y4J       195 grams
B0BBFJLP21       200 grams
B0BNC7C9W4       203 grams
B07WGPKNGT       193 grams
B0B3CPQ5PF       190 grams
B07WHSR1NR       194 grams
B082DSTWZ1       192 grams
B0BBN3WF7V       192 grams
B0BY8JZ22K       195 grams
B0C14MRRVN       193 grams
B0BW5SL3V1       179 grams
B07WFPMQB1       173 grams
B0BSNNWX8R       205 grams
B0BS193NXQ       197 grams
B0BQ3PYMCZ       188 grams
B09ZBFD6TJ       181 grams
B09LHX1YFX       195 grams
B082F2T5PQ       188 grams
B0BTLJM8MS       190 grams
B085J1QWFV       196 grams
B09G38DT4B       158 grams
B0BS3RBVSL         0.53 kg
B0BX9YZNNM       

In [28]:
def extract_number(col_value):
    if pd.isnull(col_value):
        return None

    string = col_value.split()[0]
    return int(float(string))


def extract_grams(row_value):
    if pd.isnull(row_value):
        return None

    string, type = str(row_value).split()
    if type == "kg":
        string = float(string) * 1000
    string = int(float(string))
    return string

In [29]:
data["battery_power"] = data["battery_power"].apply(extract_number).astype(int)
data["inbuilt_storage"] = data["inbuilt_storage"].apply(extract_number).astype(int)
data["ram"] = data["ram"].apply(extract_number).astype(int)
data["weight"] = data["weight"].apply(extract_grams)

## LENGTH,WIDTH AND HEIGHT

In [30]:
data["dimensions"]

product_id
B09TWH8YHM                 0.9 x 16.5 x 7.7 cm
B0B4F38D6K                 0.9 x 7.7 x 16.5 cm
B0B56YRDDT                 16.5 x 7.6 x 0.9 cm
B0BZ466BWW                 16.6 x 7.6 x 0.8 cm
B0BYN48MQW                 16.9 x 7.6 x 0.9 cm
B0BMQSF1M4                 16.4 x 7.6 x 0.8 cm
B0BBH4C5KT                 16.4 x 0.9 x 7.6 cm
B0BZ48VZMR                 16.6 x 7.6 x 0.8 cm
B0BBM7L888                 0.9 x 7.6 x 16.4 cm
B09WQYFLRX                 7.6 x 0.9 x 16.4 cm
B0BMGB2TPR                 0.9 x 7.6 x 16.4 cm
B0BZCSNTT4                 0.9 x 7.7 x 16.7 cm
B0B4F2TTTS                     50 x 50 x 28 cm
B0BSTVXFWD                 16.5 x 7.6 x 0.9 cm
B09ZBF5Y4J                 18.4 x 9.8 x 6.5 cm
B0BBFJLP21                 16.4 x 0.9 x 7.6 cm
B0BNC7C9W4                 16.5 x 7.6 x 0.9 cm
B07WGPKNGT                 16.5 x 7.7 x 0.9 cm
B0B3CPQ5PF                 7.3 x 0.8 x 15.9 cm
B07WHSR1NR                 16.4 x 7.6 x 0.8 cm
B082DSTWZ1                 16.6 x 7.6 x 0.8 cm
B0

In [31]:
def extract_dimensions(row):
    dimensions = str(row["dimensions"])

    if pd.isna(dimensions):
        return None

    seperations = [value.split()[0] for value in dimensions.split(" x ")]
    length, width, height = sorted(map(float, seperations), reverse=True)

    if height >= 4:
        height /= 10

    if width > 15:
        width /= 10
    
    if length > 25:
        length /= 10

    return [length, width, height]

In [32]:
data[["length", "width", "height"]] = data.apply(
    extract_dimensions, axis=1, result_type="expand"
)

In [33]:
data[["length", "width", "height", "dimensions"]]

Unnamed: 0_level_0,length,width,height,dimensions
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B09TWH8YHM,16.5,7.7,0.9,0.9 x 16.5 x 7.7 cm
B0B4F38D6K,16.5,7.7,0.9,0.9 x 7.7 x 16.5 cm
B0B56YRDDT,16.5,7.6,0.9,16.5 x 7.6 x 0.9 cm
B0BZ466BWW,16.6,7.6,0.8,16.6 x 7.6 x 0.8 cm
B0BYN48MQW,16.9,7.6,0.9,16.9 x 7.6 x 0.9 cm
B0BMQSF1M4,16.4,7.6,0.8,16.4 x 7.6 x 0.8 cm
B0BBH4C5KT,16.4,7.6,0.9,16.4 x 0.9 x 7.6 cm
B0BZ48VZMR,16.6,7.6,0.8,16.6 x 7.6 x 0.8 cm
B0BBM7L888,16.4,7.6,0.9,0.9 x 7.6 x 16.4 cm
B09WQYFLRX,16.4,7.6,0.9,7.6 x 0.9 x 16.4 cm


In [34]:
data[["length", "width", "height"]].describe()

Unnamed: 0,length,width,height
count,636.0,636.0,636.0
mean,16.280252,7.640613,0.868956
std,1.518573,1.052114,0.267598
min,5.0,1.5,0.0
25%,16.1,7.5,0.8
50%,16.4,7.6,0.9
75%,16.5,7.7,0.9
max,25.0,15.0,3.6


In [35]:
third_quartile = data["length"].quantile(0.95)
second_quartile = data["length"].quantile(0.5)
data[(second_quartile < data["length"]) & (data["length"] < third_quartile)][["length"]]

Unnamed: 0_level_0,length
product_id,Unnamed: 1_level_1
B09TWH8YHM,16.5
B0B4F38D6K,16.5
B0B56YRDDT,16.5
B0BZ466BWW,16.6
B0BYN48MQW,16.9
B0BZ48VZMR,16.6
B0BZCSNTT4,16.7
B0BSTVXFWD,16.5
B0BNC7C9W4,16.5
B07WGPKNGT,16.5


## OS INFO

In [36]:
def extract_os_info(row):
    os_info = str(row["os"]).lower()

    # Split the os info by a comma, strip each item, and remove duplicates
    os_items = list(set([item.strip() for item in os_info.split(",")]))

    raw_name = os_items[0].strip()

    for item in os_items:
        if "android" not in item or "based on android" in item or "on android" in item:
            raw_name = item.strip()

    # name = os_info.split()[0]
    # go_edition = int("go edition" in os_info)

    try:
        match = re.search(r"\d+(\.\d+)?", raw_name)
        assert match
        version = float(match.group())
        name = raw_name[: raw_name.find(str(int(version)))].strip()
    except:
        version = np.inf
        name = os_info.split()[0]

    if "android" in name:
        name = name.split()[0]

    return name, version  # , go_edition


data[["os_name", "os_version"]] = data.apply(  # , "os_go?"
    extract_os_info, axis=1, result_type="expand"
)
data["os_name"] = data["os_name"].replace("s", "s30+")

In [37]:
data[["os_name", "os_version", "os"]]

Unnamed: 0_level_0,os_name,os_version,os
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B09TWH8YHM,android,12.0,Android 12.0
B0B4F38D6K,android,12.0,Android 12.0
B0B56YRDDT,android,12.0,Android 12 HiOS 8.6
B0BZ466BWW,android,13.0,Android 13.0
B0BYN48MQW,miui,13.0,"MIUI 13, Android 12.0"
B0BMQSF1M4,android,12.0,Android 12.0
B0BBH4C5KT,miui,13.0,MIUI 13
B0BZ48VZMR,android,13.0,Android 13.0
B0BBM7L888,android,11.0,Android 11.0
B09WQYFLRX,oxygenos,inf,OxygenOS


In [38]:
data["os_name"].value_counts()

os_name
android        408
funtouch os     86
miui            67
oxygenos        48
hios            19
windows          6
s30+             2
Name: count, dtype: int64

## Warranty

In [39]:
def extract_phone_warranty(value):
    duration = 0  # Default value

    match = re.search(r"(\d+)\s*(year|month|yr|mo)", str(value), re.IGNORECASE)
    if match:
        duration_value = int(match.group(1))
        duration_unit = match.group(2).lower()

        if duration_unit in ["year", "yr"]:
            duration = duration_value * 12
        elif duration_unit in ["month", "mo"]:
            duration = duration_value

    return duration


data["phone_warranty (months)"] = data["warranty"].apply(extract_phone_warranty)

In [40]:
data[["warranty", "phone_warranty (months)"]].dropna()

Unnamed: 0_level_0,warranty,phone_warranty (months)
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B09TWH8YHM,1 year manufacturer warranty for device and 6 ...,12
B0B4F38D6K,1 year manufacturer warranty for device and 6 ...,12
B0B56YRDDT,12 Months warranty,12
B0BZ466BWW,1 Year manufacturer Warranty for handset & 6 m...,12
B0BYN48MQW,1 year manufacturer warranty for device and 6 ...,12
B0BMQSF1M4,1 year manufacturer warranty on handset & 6 mo...,12
B0BBH4C5KT,1 year manufacturer warranty for device and 6 ...,12
B0BZ48VZMR,1 Year manufacturer Warranty for handset & 6 m...,12
B0BBM7L888,1 Year manufacturer Warranty for handset & 6 m...,12
B09WQYFLRX,1 year manufacturer warranty for device and 6 ...,12


In [41]:
data["phone_warranty (months)"].describe()

count    636.000000
mean      11.364780
std        2.842138
min        0.000000
25%       12.000000
50%       12.000000
75%       12.000000
max       24.000000
Name: phone_warranty (months), dtype: float64

## Camera

In [42]:
def camera_features(desc):
    features = {}

    if pd.isna(desc):
        return None
    
    features["camera_count"] = 2

    # Number of cameras
    if "quad" in desc.lower():
        features["camera_count"] += 4
    if "triple" in desc.lower():
        features["camera_count"] += 3
    if "dual" in desc.lower():
        features["camera_count"] += 2

    features["has_front_camera_details"] = int("mp" in desc.lower()) if "front" in desc.lower() else 0  # default is no specifics
    features["has_rear_camera_details"] = int("mp" in desc.lower()) if "rear" in desc.lower() else 0  # default is no specifics

    # Check for specific features
    features["cam_has_AI"] = int(
        "ai" in desc.lower() or features["has_rear_camera_details"]
    )
    features["cam_has_OIS"] = int(
        "ois" in desc.lower() or features["has_rear_camera_details"]
    )
    features["cam_has_Zoom"] = int(
        "zoom" in desc.lower() or features["has_rear_camera_details"]
    )
    features["cam_has_HDR"] = int(
        "hdr" in desc.lower() or features["has_rear_camera_details"]
    )
    features["cam_has_Macro"] = int(
        "macro" in desc.lower() or features["has_rear_camera_details"]
    )
    features["cam_has_Portrait"] = int(
        "portrait" in desc.lower() or features["has_rear_camera_details"]
    )

    # Camera resolution
    match = re.search(r"(\d+)MP", desc)
    features["main_camera_MP"] = int(match.group(1)) if match else 12  # default is 12MP

    del features["has_rear_camera_details"], features["has_front_camera_details"]

    return pd.Series(features)

In [43]:
camera_features_df = data["camera"].apply(camera_features)

In [44]:
camera_features_df.describe()

Unnamed: 0,camera_count,cam_has_AI,cam_has_OIS,cam_has_Zoom,cam_has_HDR,cam_has_Macro,cam_has_Portrait,main_camera_MP
count,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0
mean,2.503145,0.210692,0.209119,0.207547,0.207547,0.205975,0.210692,20.056604
std,1.196447,0.408121,0.407,0.40587,0.40587,0.40473,0.408121,17.944094
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0
75%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0
max,11.0,1.0,1.0,1.0,1.0,1.0,1.0,108.0


In [45]:
data = pd.concat([data, camera_features_df], axis=1)

## Deletion

In [46]:
data.columns

Index(['mrp', 'model_name', 'no of 5 star', 'no of 4 star', 'no of 3 star',
       'no of 2 star', 'no of 1 star', 'os', 'ram', 'inbuilt_storage',
       'dimensions', 'weight', 'battery_power', 'battery_type', 'camera',
       'warranty', 'form_factor', 'manufacturer', 'length', 'width', 'height',
       'os_name', 'os_version', 'phone_warranty (months)', 'camera_count',
       'cam_has_AI', 'cam_has_OIS', 'cam_has_Zoom', 'cam_has_HDR',
       'cam_has_Macro', 'cam_has_Portrait', 'main_camera_MP'],
      dtype='object')

In [47]:
data.drop(
    columns=[
        "model_name",
        "dimensions",
        "manufacturer",
        "warranty",
        "camera", "os"
    ],
    inplace=True,
)

In [48]:
data.columns

Index(['mrp', 'no of 5 star', 'no of 4 star', 'no of 3 star', 'no of 2 star',
       'no of 1 star', 'ram', 'inbuilt_storage', 'weight', 'battery_power',
       'battery_type', 'form_factor', 'length', 'width', 'height', 'os_name',
       'os_version', 'phone_warranty (months)', 'camera_count', 'cam_has_AI',
       'cam_has_OIS', 'cam_has_Zoom', 'cam_has_HDR', 'cam_has_Macro',
       'cam_has_Portrait', 'main_camera_MP'],
      dtype='object')

In [49]:
print(data.describe())

                 mrp   no of 5 star  no of 4 star  no of 3 star  no of 2 star   
count     636.000000     636.000000     636.00000    636.000000    636.000000  \
mean    21808.328616    6011.894654    2785.25000    996.319182    343.328616   
std     17992.044321   19481.296991    9402.25012   3634.452196   1085.614072   
min      2790.000000       0.000000       0.00000      0.000000      0.000000   
25%      9499.000000      97.750000      38.75000     17.000000      8.000000   
50%     15990.000000     785.000000     287.00000    111.000000     41.000000   
75%     27999.000000    2671.000000    1297.25000    516.000000    216.000000   
max    164999.000000  171808.000000   84283.00000  35658.000000   9725.000000   

       no of 1 star         ram  inbuilt_storage      weight  battery_power   
count    636.000000  636.000000       636.000000  636.000000     636.000000  \
mean     895.006289   16.136792       101.202830  213.179245    4739.965409   
std     2559.590229   38.924783  

In [50]:
(
    data["no of 5 star"]
    + data["no of 4 star"]
    + data["no of 3 star"]
    + data["no of 2 star"]
    + data["no of 1 star"]
).describe()

count       636.000000
mean      11031.798742
std       36076.848649
min           0.000000
25%         191.750000
50%        1337.000000
75%        5401.500000
max      324165.000000
dtype: float64

In [51]:
def label_smartphones(df):
    # Define the thresholds for success
    thresholds = {
        "ram": 4,
        "battery_power": 4000,
        "phone_warranty (months)": 6,
        "inbuilt_storage": 64,
        "no_ratings": 1000,
        "camera_count": 2,
        "avg_rating": 4,
    }

    # Define the weightage for the Average Rating feature
    rating_weightage = 0.3
    no_rating_weightage = 0.5

    # Create a new column called "Label"
    df["is_success"] = False

    df["no_ratings"] = (
        df["no of 5 star"]
        + df["no of 4 star"]
        + df["no of 3 star"]
        + df["no of 2 star"]
        + df["no of 1 star"]
    )

    # Calculate average rating
    df["avg_rating"] = (
        df["no of 5 star"] * 5
        + df["no of 4 star"] * 4
        + df["no of 3 star"] * 3
        + df["no of 2 star"] * 2
        + df["no of 1 star"]
    ) / df["no_ratings"]

    # Iterate over each row and apply the labeling logic
    for index, row in df.iterrows():
        percent = 0

        # Calculate percent based on the difference from the thresholds
        for feature, threshold in thresholds.items():
            if feature == "avg_rating":
                # Apply weightage to the Average Rating feature
                diff = row[feature] - threshold
                percent += rating_weightage * (
                    diff / 5
                )
            if feature == "no_ratings":
                # Apply weightage to the Average Rating feature
                diff = row[feature] - threshold
                percent += no_rating_weightage * (
                    diff / 5
                )
            elif row[feature] >= threshold:
                percent += 0.1
            else:
                percent -= 0.1

        # Assign the label based on the overall value of percent
        df.at[index, "is_success"] = percent >= 0

    df.drop(columns=["no of 5 star", "no of 4 star", "no of 3 star", "no of 2 star", "no of 1 star", "avg_rating", "no_ratings"])
    return df

In [52]:
def encode_columns(data: pd.DataFrame, columns: dict[str, str]) -> pd.DataFrame:
    df = data.copy()

    for col, encoding_type in columns.items():
        if encoding_type == "one_hot":
            column = df.pop(col)
            one_hot = pd.crosstab((s := column.explode()).index, s)
            new_col_names = [
                f"{''.join(map(lambda x: x[0], col.split('_'))) }_{val}"
                for val in one_hot.columns
            ]
            one_hot.columns = new_col_names
            df = df.join(one_hot)
        elif encoding_type == "label":
            le = LabelEncoder()
            df[f"{col}"] = le.fit_transform(df[col].astype(str))
            print(
                {index: label for index, label in enumerate(le.classes_)},
                "-",
                {label: index for index, label in enumerate(le.classes_)},
            )

    return df

columns_to_encode = {
    "battery_type": "label",
    "form_factor": "label",
    "os_name": "label",
}

data = encode_columns(data, columns_to_encode)

{0: 'Lithium Ion', 1: 'Lithium Polymer'} - {'Lithium Ion': 0, 'Lithium Polymer': 1}
{0: 'bar', 1: 'flip', 2: 'foldable', 3: 'palm', 4: 'phablet', 5: 'slate', 6: 'slider', 7: 'smartphone', 8: 'touch', 9: 'touchscreen'} - {'bar': 0, 'flip': 1, 'foldable': 2, 'palm': 3, 'phablet': 4, 'slate': 5, 'slider': 6, 'smartphone': 7, 'touch': 8, 'touchscreen': 9}
{0: 'android', 1: 'funtouch os', 2: 'hios', 3: 'miui', 4: 'oxygenos', 5: 's30+', 6: 'windows'} - {'android': 0, 'funtouch os': 1, 'hios': 2, 'miui': 3, 'oxygenos': 4, 's30+': 5, 'windows': 6}


In [53]:
data = label_smartphones(data)

In [54]:
data.dtypes

mrp                          int64
no of 5 star                 int64
no of 4 star                 int64
no of 3 star                 int64
no of 2 star                 int64
no of 1 star                 int64
ram                          int32
inbuilt_storage              int32
weight                       int64
battery_power                int32
battery_type                 int32
form_factor                  int32
length                     float64
width                      float64
height                     float64
os_name                      int32
os_version                 float64
phone_warranty (months)      int64
camera_count                 int64
cam_has_AI                   int64
cam_has_OIS                  int64
cam_has_Zoom                 int64
cam_has_HDR                  int64
cam_has_Macro                int64
cam_has_Portrait             int64
main_camera_MP               int64
is_success                    bool
no_ratings                   int64
avg_rating          

In [55]:
data.is_success.value_counts()

is_success
True     368
False    268
Name: count, dtype: int64

In [56]:
len(list(data.iterrows()))

636

In [57]:
data.to_csv("../../data/processed-v2.csv")