In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the scraped phone listings and preview the first two rows.
# The path is an absolute, machine-specific location; adjust it when running elsewhere.
phones_csv = r'C:\Users\Karen Fernandes\anaconda3\Files\Projects\Web Scraping\phones.csv'
df = pd.read_csv(phones_csv)
df.head(n=2)

Unnamed: 0.1,Unnamed: 0,title,price,original_price,discount,star,review,description
0,0,"Apple iPhone 15 (Black, 128 GB)","₹64,999","₹79,600",18% off,4.6,"47,150 Ratings & 2,494 Reviews",128 GB ROM15.49 cm (6.1 inch) Super Retina XDR...
1,1,"MOTOROLA Edge 50 (Koala Grey, 256 GB)","₹27,999","₹32,999",15% off,4.3,"1,465 Ratings & 101 Reviews",8 GB RAM | 256 GB ROM16.94 cm (6.67 inch) Disp...


In [3]:
# (number of rows, number of columns) of the loaded frame
df.shape

(480, 8)

In [4]:
# Rename 'review' to 'ratings_reviews' so the name 'review' is free for a column derived later
df = df.rename(columns={'review': 'ratings_reviews'})

In [5]:
# brand = first whitespace-separated token of the title
title_tokens = df['title'].str.split()
df['brand'] = title_tokens.str.get(0)

In [6]:
# Model name = everything in the title before the first '('.
# expand=False returns a Series for the single capture group, and strip() removes the
# whitespace that sits between the model name and the opening parenthesis.
df['model'] = df['title'].str.extract(r'(^[^\(]*)', expand=False).str.strip()

In [7]:
def extract_camera(description):
    """Pull the camera specification text out of a product description.

    The patterns below are tried in priority order; the first pattern that
    matches anywhere in the description determines the result.

    Parameters
    ----------
    description : str
        Free-text product description. Missing (NaN/None) or other non-string
        values are treated as "no camera information".

    Returns
    -------
    pandas.Series
        One-element Series holding the matched text, or ``None`` when nothing
        matched. The Series wrapper is kept so callers relying on that shape
        (e.g. ``Series.apply`` producing a single-column frame) keep working.
    """
    # Ordered by priority: earlier entries win over later ones
    camera_patterns = (
        r'(\d+MP(?: \([\w\s]+\))?(?: \+ [\w\s]+)* \| \d+MP Front Camera)',  # General MP info with Front Camera
        r'(\d+MP(?: \([\w\s]+\))?(?: \+ [\w\s]+)* (?:Rear Camera) \| \d+MP Front Camera)',  # Rear Camera with Front Camera
        r'(\d+MP(?: \([\w\s]+\))?(?: \+ \d+MP)+ \| \d+MP Front Camera)',  # Multiple MP values
        r'(\d+MP(?: \([\w\s]+\))?(?: \+ [\w\s]+)+ \| \d+MP Front Camera)',  # With terms like "AI Lens"
        r'(\d+MP(?: \([\w\s]+\))?(?: \+ \d+MP)* \| \d+MP Front Camera)',  # Includes zero MP values
        r'(\d+\s*MP\s*Rear\s*Camera)',  # Simple Rear Camera format
        r'(\d+MP(?: \([\w\s]+\))?(?: \+ \d+MP)?(?: \([\w\s]+\))? \| \d+MP Front Camera)',  # General case with optional details
        r'(\d+MP(?: \+ \d+MP)* \| \d+\.\d+MP Front Camera)',  # Decimal point for front camera
        r'(\d+\s*MP\s*Front\s*Camera)',  # 0MP Front Camera
    )

    # A NaN description cannot be searched; report "nothing found" instead of raising
    if not isinstance(description, str):
        return pd.Series([None])

    # re.search returns the first occurrence, same as Series.str.extract did per row,
    # without building a temporary Series for every pattern
    for camera_pattern in camera_patterns:
        found = re.search(camera_pattern, description)
        if found is not None:
            return pd.Series([found.group(1)])

    return pd.Series([None])

# Run the extractor on every description; because it returns a one-element Series per row,
# apply() assembles the results into a single-column frame that becomes the 'camera' column
camera_frame = df['description'].apply(extract_camera)
df['camera'] = camera_frame

In [8]:
# Screen size appears in the description as "<x> cm (<y> inch)":
# capture group 0 -> centimetres, capture group 1 -> inches (both kept as text)
pattern = r'(?i)(\d+\.?\d*)\s*cm\s*\((\d+\.?\d*)\s*inch\)'
screen_sizes = df['description'].str.extract(pattern)
df['screen_size_cm'] = screen_sizes[0]
df['screen_size_inch'] = screen_sizes[1]

In [9]:
# Classify the display type mentioned in a description
def extract_display(description):
    """Return the display label found in ``description``.

    The search is case-insensitive and the rules are checked in order, so
    'Full HD+' is recognised before the shorter 'HD+' and 'HD'.

    Parameters
    ----------
    description : str
        Free-text product description; a missing value yields ``None``.

    Returns
    -------
    str or None
        One of 'Full HD+', 'HD+', 'HD', 'Super Retina XDR', 'QVGA', or
        ``None`` when no rule matches.
    """
    if pd.isna(description):
        return None

    # (regex, label) pairs, most specific HD variant first
    display_rules = (
        (r'Full HD\+', 'Full HD+'),                          # Full HD+ Display
        (r'HD\+', 'HD+'),                                    # HD+ Display
        (r'HD', 'HD'),                                       # HD Display
        (r'Super Retina XDR Display', 'Super Retina XDR'),   # Super Retina XDR Display
        (r'QVGA Display', 'QVGA'),                           # QVGA Display
    )
    matching_labels = (
        label
        for keyword, label in display_rules
        if re.search(keyword, description, re.IGNORECASE)
    )
    return next(matching_labels, None)

# Label every row with its display type in a new 'display' column
df['display'] = df['description'].map(extract_display)

In [10]:
# extracting and creating a new ram column from description column
def extract_ram(description):
    """Return the RAM specification found in ``description``.

    GB values take priority over MB values; within a pattern the first
    occurrence in the text is returned.

    Parameters
    ----------
    description : str
        Free-text product description. Missing (NaN/None) or other non-string
        values are treated as "no RAM information".

    Returns
    -------
    str or None
        Matched text such as '8 GB RAM' or '32 MB RAM', or ``None`` when no
        pattern matches.
    """
    # A NaN description cannot be searched; report "nothing found" instead of raising
    if not isinstance(description, str):
        return None

    # Ordered by priority
    ram_patterns = (
        r'(\d+\s*GB\s*RAM)',  # GB Ram
        r'(\d+\s*MB\s*RAM)',  # MB Ram
    )
    for ram_pattern in ram_patterns:
        found = re.search(ram_pattern, description)
        if found is not None:
            return found.group(1)

    return None

# Derive the 'ram' column by running the extractor over each description
df['ram'] = df['description'].map(extract_ram)

In [11]:
# extracting and creating a new storage column from title column
# The unit may be GB or TB, so titles such as "(Black, 1 TB)" are captured as well;
# expand=False returns a Series for the single capture group.
df['storage'] = df['title'].str.extract(r'(\d+\s*(?:GB|TB))', expand=False)

In [12]:
# extracting expandable storage from description column
def extract_storage(description):
    """Return the expandable-storage limit found in ``description``.

    Looks for the phrase "Expandable Upto <n> TB" first, then
    "Expandable Upto <n> GB", and returns only the capacity part.

    Parameters
    ----------
    description : str
        Free-text product description. Missing (NaN/None) or other non-string
        values are treated as "no expandable storage information".

    Returns
    -------
    str or None
        Capacity text such as '1 TB' or '512 GB', or ``None`` when the phrase
        is absent.
    """
    # A NaN description cannot be searched; report "nothing found" instead of raising
    if not isinstance(description, str):
        return None

    # Ordered by priority
    storage_patterns = (
        r'Expandable\s*Upto\s*(\d+\s*TB)',  # TB
        r'Expandable\s*Upto\s*(\d+\s*GB)',  # GB
    )
    for storage_pattern in storage_patterns:
        found = re.search(storage_pattern, description)
        if found is not None:
            return found.group(1)

    return None

# Fill 'expandable_storage' with the capacity extracted from each description
df['expandable_storage'] = df['description'].map(extract_storage)

In [13]:
# extracting color from title column
# The colour is the text inside the parentheses that precedes the storage size,
# e.g. "(Koala Grey, 256 GB)". Any number of digits and both GB and TB are accepted so
# variants like "8 GB" or "1 TB" still yield a colour; expand=False returns a Series.
df['color'] = df['title'].str.extract(r'\(([^)]+), \d+ (?:GB|TB)\)', expand=False)

In [14]:
# extracting and creating a new processor column from description column
def extract_processor(description):
    """Return the processor text found in ``description``.

    Two patterns are tried in order: the text between 'Battery' and
    'Processor', then a run of letters/digits/spaces/commas/hyphens ending in
    'Processor'. Whatever precedes the last 'Camera' in the capture is
    discarded and the remainder is stripped of surrounding whitespace.

    Parameters
    ----------
    description : str
        Free-text product description. Missing (NaN/None) or other non-string
        values are treated as "no processor information".

    Returns
    -------
    str or None
        Processor text such as 'A16 Bionic Chip, 6 Core Processor', or
        ``None`` when no pattern matches.
    """
    # A NaN description cannot be searched; report "nothing found" instead of raising
    if not isinstance(description, str):
        return None

    # Ordered by priority
    processor_patterns = (
        r'Battery\s*(.*?Processor)',
        r'(\b[A-Za-z0-9\s,-]*\bProcessor)',  # A16 Bionic Chip, 6 Core Processor
    )
    for processor_pattern in processor_patterns:
        found = re.search(processor_pattern, description)
        if found is not None:
            # The capture may still begin with camera text; keep only what follows the last 'Camera'
            return found.group(1).split('Camera')[-1].strip()

    return None

# Build the 'processor' column from the per-row extraction result
df['processor'] = df['description'].map(extract_processor)

In [15]:
# extracting and creating a new battery column from description column
def extract_battery(description):
    """Return the battery specification found in ``description``.

    The patterns are tried in priority order, starting with the plain
    "<n> mAh Battery" form and followed by the chemistry-qualified variants;
    the first pattern that matches determines the result.

    Parameters
    ----------
    description : str
        Free-text product description. Missing (NaN/None) or other non-string
        values are treated as "no battery information".

    Returns
    -------
    str or None
        Matched text such as '5000 mAh Battery', or ``None`` when no pattern
        matches.
    """
    # A NaN description cannot be searched; report "nothing found" instead of raising
    if not isinstance(description, str):
        return None

    # Ordered by priority
    battery_patterns = (
        r'(\d+\s*mAh\s*Battery)',  # 5000 mAh Battery
        r'(\d+\s*mAh\s*Lithium\s*Ion\s*Battery)',  # 800 mAh Lithium Ion Battery
        r'(\d+\s*mAh\s*Lithium\s*ion\s*Battery)',  # 6000 mAh Lithium ion Battery
        r'(\d+\s*mAh\s*Lithium-ion\s*Polymer\s*Battery)',  # 4800 mAh Lithium-ion Polymer Battery
        r'(\d+\s*mAh\s*Lithium\s*Ion\s*Polymer\s*Battery)',  # 4800 mAh Lithium Ion Polymer Battery
        r'(\d+\s*mAh\s*Li-ion\s*Polymer\s*Battery)',  # 5000 mAh Li-ion Polymer Battery
        r'(\d+\s*mAh\s*Li-ion\s*Battery)',  # 1000 mAh Li-ion Battery
        r'(\d+\s*mAh\s*Li-Ion\s*Battery)',  # 1000 mAh Li-Ion Battery
        r'(\d+\s*mAh\s*LiPo\s*Battery)',  # 5000 mAh LiPo Battery
        r'(\d+\s*mAh\s*Li-Polymer\s*Battery)',  # 5000 mAh Li-Polymer Battery
    )
    for battery_pattern in battery_patterns:
        found = re.search(battery_pattern, description)
        if found is not None:
            return found.group(1)

    return None

# Populate 'battery' by extracting the battery text from each description
df['battery'] = df['description'].map(extract_battery)

In [16]:
# 'rating' keeps everything up to and including the word "Ratings" from ratings_reviews
ratings_text = df['ratings_reviews'].str.extract(r'(.*Ratings)', expand=False)
df['rating'] = ratings_text

In [17]:
# 'review' keeps the part after the '&' up to and including the word "Reviews"
reviews_text = df['ratings_reviews'].str.extract(r'&\s*(.*Reviews)', expand=False)
df['review'] = reviews_text

In [18]:
# Save the enriched frame to CSV in the current working directory.
# The row index is written too (to_csv default), so it shows up as an unnamed first column on reload.
df.to_csv(path_or_buf='data_extraction_phones_dataset.csv')

In [19]:
# List the column labels of the frame after all extraction steps
df.columns

Index(['Unnamed: 0', 'title', 'price', 'original_price', 'discount', 'star',
       'ratings_reviews', 'description', 'brand', 'model', 'camera',
       'screen_size_cm', 'screen_size_inch', 'display', 'ram', 'storage',
       'expandable_storage', 'color', 'processor', 'battery', 'rating',
       'review'],
      dtype='object')