In [22]:
import re

import numpy as np
import pandas as pd

In [23]:
# Lift pandas' display limits: passing None removes the cap on the number of
# columns shown, the number of rows shown, and the width of each cell's text
# (so long strings are not cut off with "...").
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [24]:
# Load 'smartprix_mobiles_cleaned_final.csv' into `df` with pandas' default
# parsing options. The path is relative, so it is resolved against the
# current working directory.
df = pd.read_csv('smartprix_mobiles_cleaned_final.csv')

In [25]:
# Display the column labels of `df` to see which fields are available.
df.columns

Index(['Product_Name', 'brand_name', 'Price', 'Rating', 'Spec_Score',
       'has_dual_sim', 'has_3g', 'has_4g', 'has_5g', 'has_nfc',
       'has_ir_blaster', 'processor_name', 'processor_brand', 'cores',
       'cpu_speed_ghz', 'ram_gb', 'rom_gb', 'battery_capacity_mah',
       'fast_charging_available', 'fast_charging_speed_w',
       'display_size_inches', 'resolution_width', 'resolution_height',
       'display_frequency_hz', 'front_camera_mp', 'front_camera_secondary_mp',
       'rear_camera_mp', 'rear_camera_secondary_mp', 'rear_camera_tertiary_mp',
       'rear_camera_count', 'card_supported', 'operating_system',
       'Product_Link', 'Image_URL', 'Specifications', 'device_type'],
      dtype='object')

In [26]:
# Count how often each distinct `cores` label occurs
# (value_counts skips missing values by default).
df.cores.value_counts()

cores
Octa Core      899
Hexa Core       37
Single Core     13
Deca Core       11
Nine Core        3
Quad Core        2
Name: count, dtype: int64

In [27]:
# Lookup table from textual core-count labels to the number of CPU cores
# each label stands for.
core_mapping = dict(zip(
    ('Single Core', 'Dual Core', 'Quad Core', 'Hexa Core', 'Octa Core', 'Nine Core', 'Deca Core'),
    (1, 2, 4, 6, 8, 9, 10),
))

In [28]:
# Convert the textual core labels to numeric core counts; labels that are not
# keys of `core_mapping` (and missing values) come out as NaN.
# The dtype guard keeps this cell safe to re-run: mapping an already-numeric
# column would match no keys and silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['cores']):
    df['cores'] = df['cores'].map(core_mapping)


---

In [29]:
# Extract OS type - handle all variants
def extract_os_type(os_string):
    """Classify a raw operating-system description into an OS family.

    Parameters
    ----------
    os_string : object
        Raw operating-system value; may be missing (None / NaN). Non-string
        values are converted with ``str()`` before matching.

    Returns
    -------
    str
        ``'Unknown'`` for missing input; otherwise the first family whose
        marker occurs in the text, tested in the order Android, KaiOS, iOS
        (markers ``'iOS'`` or ``'iPhone'``), Symbian, Windows; ``'Other'``
        when no marker matches. Matching is a case-sensitive substring test.
    """
    if pd.isna(os_string):
        return 'Unknown'
    os_string = str(os_string).strip()
    if 'Android' in os_string:
        return 'Android'
    # KaiOS has to be tested before iOS: the text "KaiOS" itself contains the
    # substring "iOS", so checking iOS first would label KaiOS devices as iOS.
    elif 'KaiOS' in os_string:
        return 'KaiOS'
    elif 'iOS' in os_string or 'iPhone' in os_string:
        return 'iOS'
    elif 'Symbian' in os_string:
        return 'Symbian'
    elif 'Windows' in os_string:
        return 'Windows'
    else:
        return 'Other'

# Derive the OS family of every row from its raw operating-system text.
df['os_type'] = df['operating_system'].map(extract_os_type)


In [30]:
# Extract OS version (numeric only)
def extract_os_version(os_string):
    """Extract the numeric OS version that follows a ``v`` in the OS text.

    Parameters
    ----------
    os_string : object
        Raw operating-system value; may be missing (None / NaN). Non-string
        values are converted with ``str()`` before matching.

    Returns
    -------
    float
        The first ``v<digits>[.<digits>]`` occurrence parsed as a float
        (e.g. ``'Android v4.4.2'`` -> ``4.4``; only the first two version
        components are kept). ``np.nan`` when the input is missing or
        contains no such pattern.
    """
    if pd.isna(os_string):
        return np.nan
    match = re.search(r'v(\d+\.?\d*)', str(os_string))
    if match is None:
        return np.nan
    # The pattern guarantees at least one digit and at most one dot, so the
    # captured text is always parseable by float(); no exception handling
    # is needed here.
    return float(match.group(1))

# Derive the numeric OS version of every row from its raw operating-system text.
df['os_version'] = df['operating_system'].map(extract_os_version)

---

In [31]:
print("\n[STEP 6] Dropping unnecessary columns...")

# The two URL columns are not needed, the specification text is redundant,
# and the raw OS string has been replaced by os_type and os_version.
columns_to_drop = ['Product_Link', 'Image_URL', 'Specifications', 'operating_system']

# drop() returns a new frame, so `df` itself keeps all of its columns.
df_eda = df.drop(labels=columns_to_drop, axis='columns')
print(f"✓ Dropped {len(columns_to_drop)} columns")
print(f"  Remaining columns: {df_eda.shape[1]}")


[STEP 6] Dropping unnecessary columns...
✓ Dropped 4 columns
  Remaining columns: 34


---

In [32]:
# Count missing values per column.
# Note: this inspects the original `df` (which still holds the columns that
# were dropped when `df_eda` was created), not `df_eda`.
df.isnull().sum()

Product_Name                  0
brand_name                    0
Price                         0
Rating                        0
Spec_Score                    0
has_dual_sim                  0
has_3g                        0
has_4g                        0
has_5g                        0
has_nfc                       0
has_ir_blaster                0
processor_name               50
processor_brand              50
cores                        55
cpu_speed_ghz                77
ram_gb                        4
rom_gb                        6
battery_capacity_mah          0
fast_charging_available       0
fast_charging_speed_w        53
display_size_inches           0
resolution_width              9
resolution_height             9
display_frequency_hz          0
front_camera_mp              62
front_camera_secondary_mp     0
rear_camera_mp                4
rear_camera_secondary_mp      0
rear_camera_tertiary_mp       0
rear_camera_count             0
card_supported                0
operatin

In [33]:
# Subset of `df` containing only the rows where `rear_camera_mp` is missing.
temp_df = df.loc[df['rear_camera_mp'].isna()]

In [34]:
# ============================================================================
# STEP 7: SET CORRECT DATA TYPES
# ============================================================================
print("\n[STEP 7] Setting correct data types...")

# Whole-number features, to be stored as 32-bit integers.
int_columns = [
    'Price', 'Spec_Score',
    'has_3g', 'has_4g', 'has_5g', 'has_nfc', 'has_ir_blaster',
    'battery_capacity_mah', 'fast_charging_available',
    'display_frequency_hz', 'rear_camera_count', 'card_supported',
]

# A single astype call with a per-column mapping converts all listed columns.
df_eda = df_eda.astype({name: 'int32' for name in int_columns})




[STEP 7] Setting correct data types...


In [35]:
# Features that may hold fractional values, to be stored as 32-bit floats.
float_columns = [
    'Rating', 'cpu_speed_ghz', 'ram_gb', 'rom_gb', 'cores',
    'fast_charging_speed_w',
    'display_size_inches', 'resolution_width', 'resolution_height',
    'front_camera_mp', 'front_camera_secondary_mp',
    'rear_camera_mp', 'rear_camera_secondary_mp', 'rear_camera_tertiary_mp',
    'os_version',
]

# Convert every listed column to float32 in one astype call.
df_eda = df_eda.astype(dict.fromkeys(float_columns, 'float32'))

In [36]:
# Label-valued features, to be stored with the pandas 'category' dtype.
categorical_columns = [
    'brand_name',
    'processor_name',
    'processor_brand',
    'device_type',
    'os_type',
]

# Convert every listed column to 'category' in one astype call.
df_eda = df_eda.astype(dict.fromkeys(categorical_columns, 'category'))

# Summarise how many columns were assigned to each dtype group.
print(f"✓ Set data types:")
print(f"  - Integer columns: {len(int_columns)}")
print(f"  - Float columns: {len(float_columns)}")
print(f"  - Categorical columns: {len(categorical_columns)}")

✓ Set data types:
  - Integer columns: 12
  - Float columns: 15
  - Categorical columns: 5


---
---
---

In [37]:
print("\n[STEP 8] Data quality checks and corrections...")

# Upper plausibility bounds (chosen from domain knowledge). Any value strictly
# above its bound is treated as invalid and replaced with NaN.
upper_bounds = {
    'os_version': 30,
    'cpu_speed_ghz': 10,
    'display_size_inches': 10,
}
for feature, bound in upper_bounds.items():
    df_eda.loc[df_eda[feature] > bound, feature] = np.nan

print(f"✓ Fixed outliers and invalid values")


[STEP 8] Data quality checks and corrections...
✓ Fixed outliers and invalid values


In [38]:
# Manual correction: overwrite ram_gb and rom_gb for the row labelled 824.
# NOTE(review): the row label is hard-coded, so this depends on the row order
# of the input CSV — confirm it still targets the intended device.
df_eda.loc[824,['ram_gb', 'rom_gb']] = [0.031, 3.0]

In [39]:
# Manual correction: overwrite ram_gb and rom_gb for the row labelled 390.
# NOTE(review): the row label is hard-coded, so this depends on the row order
# of the input CSV — confirm it still targets the intended device.
df_eda.loc[390,['ram_gb', 'rom_gb']] = [3.0, 32.0]

In [40]:
# Write the prepared frame to 'mobiles_eda_ready.csv' without the index column.
# NOTE(review): CSV carries no dtype information, so the int32 / float32 /
# category dtypes set earlier will presumably need re-applying on load.
df_eda.to_csv('mobiles_eda_ready.csv', index=False)