### Import Libraries

In [196]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

---
Load Dataset

In [197]:
mob = pd.read_csv('./Mobiles Dataset (2025).csv', encoding='latin1')
mob.head()

Unnamed: 0,Company Name,Model Name,Mobile Weight,RAM,Front Camera,Back Camera,Processor,Battery Capacity,Screen Size,Launched Price (Pakistan),Launched Price (India),Launched Price (China),Launched Price (USA),Launched Price (Dubai),Launched Year
0,Apple,iPhone 16 128GB,174g,6GB,12MP,48MP,A17 Bionic,"3,600mAh",6.1 inches,"PKR 224,999","INR 79,999","CNY 5,799",USD 799,"AED 2,799",2024
1,Apple,iPhone 16 256GB,174g,6GB,12MP,48MP,A17 Bionic,"3,600mAh",6.1 inches,"PKR 234,999","INR 84,999","CNY 6,099",USD 849,"AED 2,999",2024
2,Apple,iPhone 16 512GB,174g,6GB,12MP,48MP,A17 Bionic,"3,600mAh",6.1 inches,"PKR 244,999","INR 89,999","CNY 6,499",USD 899,"AED 3,199",2024
3,Apple,iPhone 16 Plus 128GB,203g,6GB,12MP,48MP,A17 Bionic,"4,200mAh",6.7 inches,"PKR 249,999","INR 89,999","CNY 6,199",USD 899,"AED 3,199",2024
4,Apple,iPhone 16 Plus 256GB,203g,6GB,12MP,48MP,A17 Bionic,"4,200mAh",6.7 inches,"PKR 259,999","INR 94,999","CNY 6,499",USD 949,"AED 3,399",2024


---
Overview

In [198]:
mob.tail()

Unnamed: 0,Company Name,Model Name,Mobile Weight,RAM,Front Camera,Back Camera,Processor,Battery Capacity,Screen Size,Launched Price (Pakistan),Launched Price (India),Launched Price (China),Launched Price (USA),Launched Price (Dubai),Launched Year
925,Poco,Pad 5G 128GB,571g,8GB,8MP,8MP,Snapdragon 7s Gen 2,"10,000mAh",12.1 inches,"PKR 66,220","INR 23,999","CNY 2,099",USD 280,"AED 1,029",2024
926,Poco,Pad 5G 256GB,571g,8GB,8MP,8MP,Snapdragon 7s Gen 2,"10,000mAh",12.1 inches,"PKR 71,220","INR 25,999","CNY 2,299",USD 300,"AED 1,099",2024
927,Samsung,Galaxy Z Fold6 256GB,239g,12GB,"10MP, 4MP (UDC)",50MP,Snapdragon 8 Gen 3,4400mAh,7.6 inches,"PKR 604,999","INR 164,999","¥13,999","USD 1,899","AED 7,199",2024
928,Samsung,Galaxy Z Fold6 512GB,239g,12GB,"10MP, 4MP (UDC)",50MP,Snapdragon 8 Gen 3,4400mAh,7.6 inches,"PKR 544,999","INR 176,999","CNY 15,999",USD 1719,"AED 7,699",2024
929,Samsung,Galaxy Z Fold6 1TB,239g,12GB,"10MP, 4MP (UDC)",50MP,Snapdragon 8 Gen 3,4400mAh,7.6 inches,Not available,"INR 200,999","CNY 17,999","USD 2,259","AED 8,699",2024


In [199]:
mob.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Company Name               930 non-null    object
 1   Model Name                 930 non-null    object
 2   Mobile Weight              930 non-null    object
 3   RAM                        930 non-null    object
 4   Front Camera               930 non-null    object
 5   Back Camera                930 non-null    object
 6   Processor                  930 non-null    object
 7   Battery Capacity           930 non-null    object
 8   Screen Size                930 non-null    object
 9   Launched Price (Pakistan)  930 non-null    object
 10  Launched Price (India)     930 non-null    object
 11  Launched Price (China)     930 non-null    object
 12  Launched Price (USA)       930 non-null    object
 13  Launched Price (Dubai)     930 non-null    object
 14  Launched Y

In [200]:
mob['Company Name'].unique()

array(['Apple', 'Samsung', 'OnePlus', 'Vivo', 'iQOO', 'Oppo', 'Realme',
       'Xiaomi', 'Lenovo', 'Motorola', 'Huawei', 'Nokia', 'Sony',
       'Google', 'Tecno', 'Infinix', 'Honor', 'POCO', 'Poco'],
      dtype=object)

In [201]:
# Poco to POCO
mob[mob['Company Name'] == 'Poco']['Company Name'] = 'POCO'

In [202]:
mob['Launched Year'].unique()

array([2024, 2023, 2022, 2021, 2020, 2019, 2017, 2018, 2016, 2014, 2025])

---
Couple of things to change.
1. Model Name to have its name only. And the storage in another column
2. Mobile Weight without g, and change type to float.
3. RAM without gb, and change type to float
4. Screen Size remove inches and (internal or external or unfolded)
5. In Prices, remove the currency, and change type to float.


In [204]:
import re
storage = mob["Model Name"].str.extract(r'(\d+)\s*(gb|tb)', flags=re.IGNORECASE)
storage

Unnamed: 0,0,1
0,128,GB
1,256,GB
2,512,GB
3,128,GB
4,256,GB
...,...,...
925,128,GB
926,256,GB
927,256,GB
928,512,GB


---
For the storage, some phones has a 1TB, so convert it to 1024GB for each TB.

In [205]:
mob["Storage"] = pd.to_numeric(storage[0], errors="coerce")
unit = storage[1].str.lower()
mob.loc[unit == "tb", "Storage"] *= 1024

In [206]:
mob["Model Name"] = mob["Model Name"].str.replace(
    r'\s*\d+\s*(gb|tb)',
    '',
    regex=True,
    flags=re.IGNORECASE
).str.strip()

mob["Storage"] = mob["Storage"].astype("Int64")
mob.head()

Unnamed: 0,Company Name,Model Name,Mobile Weight,RAM,Front Camera,Back Camera,Processor,Battery Capacity,Screen Size,Launched Price (Pakistan),Launched Price (India),Launched Price (China),Launched Price (USA),Launched Price (Dubai),Launched Year,Storage
0,Apple,iPhone 16,174g,6GB,12MP,48MP,A17 Bionic,"3,600mAh",6.1 inches,"PKR 224,999","INR 79,999","CNY 5,799",USD 799,"AED 2,799",2024,128
1,Apple,iPhone 16,174g,6GB,12MP,48MP,A17 Bionic,"3,600mAh",6.1 inches,"PKR 234,999","INR 84,999","CNY 6,099",USD 849,"AED 2,999",2024,256
2,Apple,iPhone 16,174g,6GB,12MP,48MP,A17 Bionic,"3,600mAh",6.1 inches,"PKR 244,999","INR 89,999","CNY 6,499",USD 899,"AED 3,199",2024,512
3,Apple,iPhone 16 Plus,203g,6GB,12MP,48MP,A17 Bionic,"4,200mAh",6.7 inches,"PKR 249,999","INR 89,999","CNY 6,199",USD 899,"AED 3,199",2024,128
4,Apple,iPhone 16 Plus,203g,6GB,12MP,48MP,A17 Bionic,"4,200mAh",6.7 inches,"PKR 259,999","INR 94,999","CNY 6,499",USD 949,"AED 3,399",2024,256


In [207]:
#RAM to have it separately
mob.loc[mob['RAM'] == '8GB / 12GB', 'RAM'] = '12GB'

ram = mob['RAM'].str.replace(
    r'\s*(GB)',
    '',
    regex=True,
    flags=re.IGNORECASE
).str.strip()

mob['RAM'] = ram.astype(float)
mob['RAM'].unique()

array([ 6. ,  8. ,  4. ,  3. , 12. ,  2. ,  1.5, 16. , 10. ,  1. ])

In [208]:
#Mobile Weight to be separate.
test = mob['Mobile Weight'].str.replace(
    r'\s*(g)',
    '',
    regex=True,
    flags=re.IGNORECASE
).str.strip()


mob['Mobile Weight'] = test.astype(float)

In [210]:
mob["Screen Inches"] = pd.to_numeric(
    mob["Screen Size"].str.extract(r'(\d+\.?\d*)')[0],
    errors="coerce"
)

In [211]:
#The screen size can be (main, external, internal, unfold) so we will add it in separate column.
mob["Screen Role"] = "standard"

mob.loc[mob["Screen Size"].str.contains("external", case=False, na=False),
        "Screen Role"] = "external"

mob.loc[mob["Screen Size"].str.contains("internal", case=False, na=False),
        "Screen Role"] = "internal"

mob.loc[mob["Screen Size"].str.contains("main|unfold", case=False, na=False),
        "Screen Role"] = "main / unfolded"

mob.drop(columns=['Screen Size'], inplace = True)

In [213]:
#Remove mAh from the battery capacity.
bat = mob['Battery Capacity'].str.replace(
    r'\s*(mAh)',
    '',
    regex=True,
    flags=re.IGNORECASE
).str.strip()

mob['Battery Capacity'] = bat.str.replace(',', '').astype('int')

In [214]:
#Remove the currency in Country prices, and convert to float
price_cols = [
    'Launched Price (China)',
    'Launched Price (Pakistan)',
    'Launched Price (India)',
    'Launched Price (USA)',
    'Launched Price (Dubai)'
]

for col in price_cols:
    mob[col] = (
        mob[col]
        .astype(str)
        .str.replace(r'[^0-9.]', '', regex=True)
        .replace('', np.nan)
        .astype(float)
    )


In [226]:
# There is one field in the India price, where it is empty, so we add nan instead.
# And there is a price in usd that is 39622, which is not realistic so it is converted to 396.22
import numpy as np
mob['Launched Price (Pakistan)'] = mob['Launched Price (India)'].replace('', np.nan)
mob['Launched Price (USA)'].loc[(mob['Launched Price (USA)'] == 39622.)] = 396.22

In [230]:
mob.to_csv(
    "cleaned_mobile_dataset.csv",
    index=False,
    encoding="utf-8"
)

In [228]:
mob.head()

Unnamed: 0,Company Name,Model Name,Mobile Weight,RAM,Front Camera,Back Camera,Processor,Battery Capacity,Launched Price (Pakistan),Launched Price (India),Launched Price (China),Launched Price (USA),Launched Price (Dubai),Launched Year,Storage,Screen Inches,Screen Role
0,Apple,iPhone 16,174.0,6.0,12MP,48MP,A17 Bionic,3600,79999.0,79999.0,5799.0,799.0,2799.0,2024,128,6.1,standard
1,Apple,iPhone 16,174.0,6.0,12MP,48MP,A17 Bionic,3600,84999.0,84999.0,6099.0,849.0,2999.0,2024,256,6.1,standard
2,Apple,iPhone 16,174.0,6.0,12MP,48MP,A17 Bionic,3600,89999.0,89999.0,6499.0,899.0,3199.0,2024,512,6.1,standard
3,Apple,iPhone 16 Plus,203.0,6.0,12MP,48MP,A17 Bionic,4200,89999.0,89999.0,6199.0,899.0,3199.0,2024,128,6.7,standard
4,Apple,iPhone 16 Plus,203.0,6.0,12MP,48MP,A17 Bionic,4200,94999.0,94999.0,6499.0,949.0,3399.0,2024,256,6.7,standard
