In [1]:
import pandas as pd
import numpy as np
import re

import warnings
# NOTE(review): this silences every warning for the whole session, including any
# pandas deprecation or chained-assignment warnings the cells below may raise.
warnings.filterwarnings('ignore')

In [2]:
# Read the phones dataset CSV into a DataFrame and preview the first two rows.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where that file exists.
csv_path = r'C:\Users\Karen Fernandes\anaconda3\Files\Projects\Web Scraping\data_extraction_phones_dataset.csv'
df = pd.read_csv(csv_path)
df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,price,original_price,discount,star,ratings_reviews,description,brand,...,screen_size_inch,display,ram,storage,expandable_storage,color,processor,battery,rating,review
0,0,0,"Apple iPhone 15 (Black, 128 GB)","₹64,999","₹79,600",18% off,4.6,"47,150 Ratings & 2,494 Reviews",128 GB ROM15.49 cm (6.1 inch) Super Retina XDR...,Apple,...,6.1,Super Retina XDR,,128 GB,,Black,"A16 Bionic Chip, 6 Core Processor Processor",,"47,150 Ratings","2,494 Reviews"
1,1,1,"MOTOROLA Edge 50 (Koala Grey, 256 GB)","₹27,999","₹32,999",15% off,4.3,"1,465 Ratings & 101 Reviews",8 GB RAM | 256 GB ROM16.94 cm (6.67 inch) Disp...,MOTOROLA,...,6.67,,8 GB RAM,256 GB,,Koala Grey,Snapdragon 7 Gen 1 Accelerated Edition Processor,5000 mAh Battery,"1,465 Ratings",101 Reviews


In [3]:
# Inspect the dimensions (rows, columns) of the freshly loaded frame before any cleaning
df.shape

(480, 23)

In [4]:
# Discard the two leftover 'Unnamed: 0*' columns
# (they look like row indices written out by earlier CSV saves — TODO confirm).
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

In [5]:
# Title-case the listing titles (each word capitalised, remaining letters lower-cased)
df = df.assign(title=df['title'].str.title())

In [6]:
# Strip the rupee sign and the thousands separators so both price columns hold digits only
df['price'] = (
    df['price']
    .str.replace('₹', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['original_price'] = (
    df['original_price']
    .str.replace('₹', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [7]:
# Remove the literal '% off' suffix, leaving only the discount percentage digits
df['discount'] = df['discount'].str.replace('% off', '', regex=False)

In [8]:
# Title-case the brand names (e.g. 'MOTOROLA' becomes 'Motorola')
df = df.assign(brand=df['brand'].str.title())

In [9]:
# Title-case the model names
df = df.assign(model=df['model'].str.title())

In [10]:
# Rear camera: prefer the megapixel figure explicitly labelled 'Rear Camera';
# when that label is absent, fall back to the first '<n>MP' found in the camera text.
labelled_rear = df['camera'].str.extract(r'(\d+MP) Rear Camera', expand=False)
first_megapixel = df['camera'].str.extract(r'(\d+MP)', expand=False)
df['rear_camera'] = labelled_rear.fillna(first_megapixel)

In [11]:
# Front camera: prefer the megapixel figure explicitly labelled 'Front Camera';
# when that label is absent, fall back to the first '<n>MP' that follows a '+ ' in the camera text.
labelled_front = df['camera'].str.extract(r'(\d+MP) Front Camera', expand=False)
after_plus = df['camera'].str.extract(r'\+ (\d+MP)', expand=False)
df['front_camera'] = labelled_front.fillna(after_plus)

In [12]:
# Drop the 'MP' unit from both camera columns so only the megapixel number remains
df['front_camera'] = df['front_camera'].str.replace('MP', '', regex=False)
df['rear_camera'] = df['rear_camera'].str.replace('MP', '', regex=False)

In [13]:
# Remove the ' GB RAM' suffix so the column keeps only the gigabyte figure
df['ram'] = df['ram'].str.replace(' GB RAM', '', regex=False)

In [14]:
# Drop the listing whose RAM is given as '32 MB RAM': it is not a GB figure, so it
# survives the ' GB RAM' clean-up as text and could not be cast to int later.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not operate on a slice of another frame.
df = df[df['ram'] != '32 MB RAM'].copy()

In [15]:
# Remove the ' GB' unit from storage so only the number is left
df['storage'] = df['storage'].str.replace(' GB', '', regex=False)

In [16]:
# The expandable_storage column is not used further, so remove it
df = df.drop(columns='expandable_storage')

In [17]:
# Normalise battery descriptions: keep the '<n> mAh' capacity and drop any words
# sitting between it and 'Battery', giving a uniform '<n> mAh Battery'.
battery_pattern = r'(\d+ mAh).*?Battery'
df['battery'] = df['battery'].str.replace(battery_pattern, r'\1 Battery', regex=True)

In [18]:
# Strip the ' mAh Battery' suffix so only the capacity digits remain (cast to int in a later cell)
df['battery'] = df['battery'].str.replace(' mAh Battery', '', regex=False)

In [19]:
# Reduce values such as '47,150 Ratings' to plain digits ('47150') ready for an int cast.
# The alternation removes thousands separators and the word 'Rating(s)' together with
# the space before it. A character class such as [Ratings,] would instead match single
# letters and leave a trailing space in every value.
df['rating'] = df['rating'].str.replace(r',|\s*Ratings?', '', regex=True)

In [20]:
# Reduce values such as '2,494 Reviews' to plain digits ('2494') ready for an int cast.
# The alternation removes thousands separators and the word 'Review(s)' together with
# the space before it. A character class such as [Reviews,] would instead match single
# letters and leave a trailing space in every value.
df['review'] = df['review'].str.replace(r',|\s*Reviews?', '', regex=True)

In [21]:
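# Disabled: fill missing front-camera values with '0 MP' (kept for reference only).
# NOTE: by this point 'MP' has already been stripped from front_camera and the column is
# later cast to int, so re-enabling this line as written would need '0' rather than '0 MP'.
# df['front_camera'] = df['front_camera'].fillna('0 MP')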

In [22]:
# Fill missing display types with the most frequent value (the mode, not the median).
# The result is assigned back rather than calling fillna(inplace=True) on the column
# selection: the in-place form acts on an intermediate object, is deprecated, and
# does not update df under pandas copy-on-write.
most_frequent = df['display'].mode()[0]
df['display'] = df['display'].fillna(most_frequent)

In [23]:
# Make sure the centimetre screen size is stored as a float column
df = df.astype({'screen_size_cm': float})

In [24]:
# Collapse every run of whitespace inside string cells into a single space
df = df.replace(to_replace=r'\s+', value=' ', regex=True)

In [25]:
# Inspect column dtypes and non-null counts to see which columns still contain missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 479 entries, 0 to 479
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             479 non-null    object 
 1   price             476 non-null    object 
 2   original_price    452 non-null    object 
 3   discount          450 non-null    object 
 4   star              476 non-null    float64
 5   ratings_reviews   476 non-null    object 
 6   description       479 non-null    object 
 7   brand             479 non-null    object 
 8   model             479 non-null    object 
 9   camera            477 non-null    object 
 10  screen_size_cm    479 non-null    float64
 11  screen_size_inch  479 non-null    float64
 12  display           479 non-null    object 
 13  ram               455 non-null    object 
 14  storage           476 non-null    object 
 15  color             473 non-null    object 
 16  processor         381 non-null    object 
 17  ba

In [26]:
# Keep only the rows that have a value in every listed column, then renumber the rows from 0
columns = ['price', 'original_price', 'discount', 'star', 'ratings_reviews', 'color', 'camera', 'ram', 'storage', 'processor', 'battery', 'rating', 'review', 'front_camera', 'rear_camera']
df = df.dropna(subset=columns).reset_index(drop=True)

In [27]:
# Dimensions (rows, columns) remaining after the rows with missing values were dropped
df.shape

(315, 22)

In [28]:
# Cast the cleaned, digit-only columns (prices, discount, counts, battery, memory, cameras) to int
columns = ['price','original_price','discount','rating','review','battery','ram','storage','front_camera','rear_camera']

df = df.astype({name: int for name in columns})

In [29]:
# Re-check dtypes and non-null counts after the integer conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             315 non-null    object 
 1   price             315 non-null    int64  
 2   original_price    315 non-null    int64  
 3   discount          315 non-null    int64  
 4   star              315 non-null    float64
 5   ratings_reviews   315 non-null    object 
 6   description       315 non-null    object 
 7   brand             315 non-null    object 
 8   model             315 non-null    object 
 9   camera            315 non-null    object 
 10  screen_size_cm    315 non-null    float64
 11  screen_size_inch  315 non-null    float64
 12  display           315 non-null    object 
 13  ram               315 non-null    int64  
 14  storage           315 non-null    int64  
 15  color             315 non-null    object 
 16  processor         315 non-null    object 
 1

In [30]:
# List the column names of the cleaned frame
df.columns

Index(['title', 'price', 'original_price', 'discount', 'star',
       'ratings_reviews', 'description', 'brand', 'model', 'camera',
       'screen_size_cm', 'screen_size_inch', 'display', 'ram', 'storage',
       'color', 'processor', 'battery', 'rating', 'review', 'rear_camera',
       'front_camera'],
      dtype='object')

In [31]:
# Save the cleaned dataset. index=False keeps the row index out of the file, so
# reading it back does not produce an extra 'Unnamed: 0' column.
df.to_csv('cleaned_phones_dataset.csv', index=False)