In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the scraped phones dataset and preview its first two rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
raw_csv_path = r'C:\Users\KAREN J FERNANDES\anaconda3\Files\Projects\Flipkart Web Scraping\data_extraction_phones_dataset.csv'
df = pd.read_csv(raw_csv_path)
df.head(2)

Unnamed: 0.1,Unnamed: 0,id,title,price,original_price,discount,star,ratings_reviews,description,brand,...,screen_size_inch,display,ram,storage,expandable_storage,color,processor,battery,rating,review
0,0,0,"Motorola Edge 50 Fusion (Hot Pink, 128 GB)","₹22,999","₹25,999",11% off,4.5,"24,505 Ratings & 1,465 Reviews",8 GB RAM | 128 GB ROM17.02 cm (6.7 inch) Full ...,Motorola,...,6.7,Full HD+,8 GB RAM,128 GB,,Hot Pink,7s Gen 2 Processor,5000 mAh Battery,"24,505 Ratings","1,465 Reviews"
1,1,1,"Motorola G34 5G (Ice Blue, 128 GB)","₹11,999","₹14,999",20% off,4.2,"1,06,420 Ratings & 7,385 Reviews",8 GB RAM | 128 GB ROM16.51 cm (6.5 inch) HD+ D...,Motorola,...,6.5,HD+,8 GB RAM,128 GB,,Ice Blue,Snapdragon 695 5G Processor,5000 mAh Battery,"1,06,420 Ratings","7,385 Reviews"


In [3]:
# Drop the stale index columns left over from earlier CSV round-trips.
# The raw file carries both 'Unnamed: 0.1' and 'Unnamed: 0' (see the preview
# output of the load cell), so both are removed here. errors='ignore' keeps
# the cell safe to re-run after the columns are already gone.
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [4]:
# Title-case the product titles.
# Note: str.title() also lowercases interior capitals, so "128 GB" becomes "128 Gb".
title_text = df['title']
df['title'] = title_text.str.title()

In [5]:
# Strip the rupee sign and the thousands separators from both price columns,
# leaving digit-only strings (e.g. "₹22,999" -> "22999").
for price_col in ('price', 'original_price'):
    df[price_col] = df[price_col].str.replace(r'[₹,]','', regex=True)

In [6]:
# Remove the "% off" label so only the discount number remains (e.g. "11% off" -> "11").
# The pattern has no regex metacharacters, so a literal replace is equivalent.
df['discount'] = df['discount'].str.replace('% off', '', regex=False)

In [7]:
# Title-case the brand names so spelling variants such as "SAMSUNG" and "Samsung" agree.
brand_text = df['brand']
df['brand'] = brand_text.str.title()

In [8]:
# Title-case the model names.
# Note: str.title() turns "iPhone" into "Iphone"; any later matching on model
# names must therefore be case-insensitive.
model_text = df['model']
df['model'] = model_text.str.title()

In [9]:
# Coerce screen_size_inch to a numeric dtype; entries that cannot be parsed become NaN.
df['screen_size_inch'] = df['screen_size_inch'].pipe(pd.to_numeric, errors='coerce')

# Define function to fill display based on screen size
def assign_display_type(row):
    """Return the display type for a row, inferring it from screen size when missing.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'display' and 'screen_size_inch'.

    Returns
    -------
    The existing 'display' value when it is present. Otherwise a label chosen
    from the screen size in inches: under 3 -> 'QVGA', 3 to under 5.5 -> 'HD',
    5.5 to under 6 -> 'HD+', 6 and above -> 'Full HD+'. If the screen size is
    missing too, the (missing) display value is returned unchanged.
    """
    display = row['display']
    if not pd.isna(display):
        return display

    screen_size = row['screen_size_inch']
    if pd.isna(screen_size):
        # Every comparison against NaN is False, so without this guard an
        # unknown screen size would fall through to 'Full HD+'.
        return display

    if screen_size < 3:
        return 'QVGA'
    if screen_size < 5.5:
        return 'HD'
    if screen_size < 6:
        return 'HD+'
    return 'Full HD+'

# Run the screen-size heuristic over every row to fill the missing display values.
df['display'] = df.apply(assign_display_type, axis='columns')

In [10]:
# Remove the " RAM" label (e.g. "8 GB RAM" -> "8 GB").
# The pattern has no regex metacharacters, so a literal replace is equivalent.
df['ram'] = df['ram'].str.replace(' RAM', '', regex=False)

In [11]:
# Per the author's research, rows with no listed storage have a storage value
# equal to their 'ram' value, so missing 'storage' entries are taken from 'ram'.
storage_missing = df['storage'].isna()
df['storage'] = df['storage'].mask(storage_missing, df['ram'])

In [12]:
# Define a function to update processor information
def extract_processor(model):
    """Look up the processor label for a known phone model.

    The model text is searched case-insensitively against an ordered list of
    model names; the first one found wins, so more specific names are listed
    before the shorter names they contain.

    Returns the processor label, or None when `model` is missing or matches
    none of the known names.
    """
    if pd.isna(model):
        return None

    known_processors = (
        ('Apple iPhone 15 Plus', 'A16 Bionic Processor'),
        ('Apple iPhone 15', 'A16 Bionic Processor'),
        ('Apple iPhone 14 Plus', 'A15 Bionic Processor'),
        ('SAMSUNG Galaxy M14 4G', 'Snapdragon 680 Processor'),
        ('SAMSUNG Galaxy M34 5G', 'Exynos 1280 Processor'),
        ('OnePlus Nord CE 3 Lite 5G', 'Qualcomm Snapdragon 695 Processor'),
        ('Realme Narzo N63 4G', 'Mediatek Processor'),
        ('Realme Narzo N63', 'Unisoc Processor'),
        ('Redmi 13C 5G', 'Mediatek Processor'),
    )
    for name_pattern, processor in known_processors:
        if re.search(name_pattern, model, re.IGNORECASE):
            return processor
    return None

# Fill only the rows whose 'processor' is missing; existing values are kept as they are.
def _fill_missing_processor(row):
    """Return the row's processor, looking it up from the model when it is missing."""
    current = row['processor']
    if pd.isna(current):
        return extract_processor(row['model'])
    return current

df['processor'] = df.apply(_fill_missing_processor, axis=1)

In [13]:
# "NA Processor" is a placeholder rather than a real chip name, so blank it out.
# The pattern has no regex metacharacters, so a literal replace is equivalent.
df['processor'] = df['processor'].str.replace('NA Processor', '', regex=False)

In [14]:
# Define a function to update battery information
def extract_battery(model):
    """Look up the battery label for a known phone model.

    The model text is searched case-insensitively against an ordered list of
    model names; the first one found wins, so 'Apple iPhone 15 Plus' is listed
    before the shorter 'Apple iPhone 15'.

    Returns the battery label, or None when `model` is missing or matches none
    of the known names.
    """
    if pd.isna(model):
        return None

    known_batteries = (
        ('Apple iPhone 15 Plus', '4383 mAh Battery'),
        ('Apple iPhone 15', '3349 mAh Battery'),
        ('Apple iPhone 14 Plus', '3279 mAh Battery'),
    )
    for name_pattern, battery in known_batteries:
        if re.search(name_pattern, model, re.IGNORECASE):
            return battery
    return None

# Fill only the rows whose 'battery' is missing; existing values are kept as they are.
def _fill_missing_battery(row):
    """Return the row's battery, looking it up from the model when it is missing."""
    current = row['battery']
    if pd.isna(current):
        return extract_battery(row['model'])
    return current

df['battery'] = df.apply(_fill_missing_battery, axis=1)

In [15]:
# Remove the "Lithium Ion" label from the battery text, in any letter case.
battery_text = df['battery']
df['battery'] = battery_text.str.replace(r'(?i)Lithium Ion', '', regex = True)

In [16]:
# Reduce values like "24,505 Ratings" to the bare digits "24505" so the column
# can be cast to int later.
# A character class such as [Ratings,] deletes individual letters rather than
# the word and leaves trailing whitespace, so the comma and the label are
# removed explicitly and the result is stripped.
df['rating'] = (
    df['rating']
    .str.replace(',', '', regex=False)
    .str.replace(r'\s*Ratings?', '', regex=True)
    .str.strip()
)

In [17]:
# Reduce values like "1,465 Reviews" to the bare digits "1465" so the column
# can be cast to int later.
# A character class such as [Reviews,] deletes individual letters rather than
# the word and leaves trailing whitespace, so the comma and the label are
# removed explicitly and the result is stripped.
df['review'] = (
    df['review']
    .str.replace(',', '', regex=False)
    .str.replace(r'\s*Reviews?', '', regex=True)
    .str.strip()
)

In [18]:
# Inspect the column dtypes. At this point the cleaned price, original_price,
# discount, rating and review columns are still stored as text (object).
df.dtypes

id                      int64
title                  object
price                  object
original_price         object
discount               object
star                  float64
ratings_reviews        object
description            object
brand                  object
model                  object
camera                 object
screen_size_cm        float64
screen_size_inch      float64
display                object
ram                    object
storage                object
expandable_storage     object
color                  object
processor              object
battery                object
rating                 object
review                 object
dtype: object

In [19]:
# Cast the cleaned, digit-only text columns to int.
# Like the per-column loop this replaces, the cast raises if any value is
# missing or non-numeric.
columns = ['price','original_price','discount','rating','review']

df = df.astype({name: int for name in columns})

In [20]:
# Make sure screen_size_cm is float64. The dtypes listing already reports
# float64 for this column, so in practice this is a no-op safeguard.
df['screen_size_cm'] = df['screen_size_cm'].astype('float64')

In [21]:
# Save the cleaned dataset to CSV.
# index=False keeps the row index out of the file; writing it is what produces
# the stray 'Unnamed: 0' column when the CSV is read back.
df.to_csv('cleaned_phones_dataset.csv', index=False)

In [22]:
# List the columns of the cleaned frame as a final check.
df.columns

Index(['id', 'title', 'price', 'original_price', 'discount', 'star',
       'ratings_reviews', 'description', 'brand', 'model', 'camera',
       'screen_size_cm', 'screen_size_inch', 'display', 'ram', 'storage',
       'expandable_storage', 'color', 'processor', 'battery', 'rating',
       'review'],
      dtype='object')