In [1]:
import os

In [2]:
pwd

'c:\\Users\\ten\\Desktop\\Laptop_Price\\research\\testing'

In [3]:
# Move from research/testing (the directory shown by `pwd` above) two levels up,
# so the relative config/params/schema paths used below resolve.
# NOTE(review): not idempotent - re-running this cell moves up two more levels.
os.chdir("../../")

In [20]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation stage."""
    # Directory for this stage; created by ConfigurationManager.get_data_validation_config.
    root_dir: Path
    # File that validate_all_columns overwrites with "Validation status: <bool>".
    STATUS_FILE: str
    # Despite the name, this is passed directly to pd.read_csv as the raw input CSV.
    unzip_data_dir: Path
    # Mapping taken from schema.COLUMNS; only its keys (column names) are consulted.
    all_schema: dict
    # CSV written by advanced_processing and read back by validate_all_columns.
    preprocessed_data:Path

In [21]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [22]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Load config, params and schema, then create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Loaded in this order: config, params, schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes below this directory, so make sure it exists.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation settings and create the stage directory.

        Returns:
            A ``DataValidationConfig`` filled from the ``data_validation``
            section of the config file and the ``COLUMNS`` section of the
            schema file.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            unzip_data_dir=validation_section.unzip_data_dir,
            all_schema=column_schema,
            preprocessed_data=validation_section.preprocessed_data,
        )

In [13]:
import os
from mlProject import logger
import pandas as pd
import numpy as np

In [11]:
import re

def parse_processor_name(processor_name):
    """Split a laptop processor description into structured fields.

    Three name shapes are recognised, each anchored at the start of the string:

    * ``"<N>th Gen <Intel|AMD> <family> [<word>]"``, for example
      ``"12th Gen Intel Core i5 1235U"``. ``version`` is the first word after
      the family (``"i5"`` here) and may be an empty string.
    * ``"Apple <M1|M2 ...>"``.
    * ``"Intel <Celeron|Pentium|Atom> <version>"`` with no generation prefix.

    Args:
        processor_name: Raw processor string. Non-string values (such as a NaN
            from an empty CSV cell) are treated as unparseable.

    Returns:
        A dict with the keys ``generation``, ``company``, ``model_type`` and
        ``version``, or ``None`` when the name matches none of the shapes.
        ``generation`` is ``None`` for the generation-less Intel shape.
    """
    if not isinstance(processor_name, str):
        return None

    # All ordinal suffixes are accepted, including "nd" ("2nd Gen ...").
    generation_match = re.match(
        r'(\d+)(?:st|nd|rd|th) Gen (Intel|AMD) '
        r'(Core|i\d+|Celeron|Pentium|Atom|Ryzen|Athlon) ?(\w*)',
        processor_name,
    )
    if generation_match:
        generation, company, model_type, version = generation_match.groups()
        return {
            'generation': generation,
            'company': company,
            'model_type': model_type,
            'version': version,
        }

    apple_match = re.match(r'(Apple) (M1|M2(?: Pro)?(?: Max)?)', processor_name)
    if apple_match:
        company, version = apple_match.groups()
        return {
            # Apple names carry no "Nth Gen" prefix; '1' is a fixed placeholder.
            'generation': '1',
            'company': company,
            # Chip family is the first word of the match: "M2 Pro" -> "M2".
            'model_type': version.split()[0],
            'version': version,
        }

    intel_match = re.match(r'(Intel) (Celeron|Pentium|Atom) (\w+)', processor_name)
    if intel_match:
        # This pattern has exactly three groups: company, family, version.
        company, model_type, version = intel_match.groups()
        return {
            'generation': None,
            'company': company,
            'model_type': model_type,
            'version': version,
        }

    return None


import re

def get_gpu_type(gpu_name):
    """Return a coarse GPU label found anywhere in a GPU description.

    Patterns are tried in a fixed order (NVIDIA/AMD, Apple, Intel, ARM) and the
    first one found in the text wins. The product part of the match (for
    example ``"Radeon"`` or ``"Iris Xe Graphics"``) is preferred; when the
    pattern's product part is optional and absent, the vendor name is returned.

    Args:
        gpu_name: Raw GPU description string.

    Returns:
        The product or vendor label, or ``None`` when no pattern is found.
    """
    # Each pattern captures (vendor, product); the product may be optional.
    vendor_patterns = (
        r'(NVIDIA|AMD)\s*(Radeon)?',
        r'(Apple)\s*(Integrated Graphics)',
        r'(Intel)\s*(Iris Xe Graphics|UHD Graphics|HD Graphics|Graphics)?',
        r'(ARM)\s*(Mali G\d+)',
    )

    for pattern in vendor_patterns:
        found = re.search(pattern, gpu_name)
        if found is None:
            continue
        vendor, product = found.groups()
        if product:
            return product
        return vendor or None

    return None

import re

def extract_cores_threads(cpu_name):
    """Pull the core and thread counts out of a CPU description.

    The core count may be written as digits (``"10 Cores"``) or as one of the
    words Dual/Quad/Hexa/Octa (``"Hexa Core"``); the thread count must be
    digits (``"12 Threads"``).

    Args:
        cpu_name: Raw CPU description string.

    Returns:
        A ``(cores, threads)`` tuple of ints; a count that is not mentioned
        is reported as 0.
    """
    word_to_count = {'Dual': 2, 'Quad': 4, 'Hexa': 6, 'Octa': 8}

    core_hit = re.search(r'(\d+|Dual|Quad|Hexa|Octa)\s*Cores?', cpu_name)
    thread_hit = re.search(r'(\d+)\s*Threads?', cpu_name)

    if core_hit is None:
        core_count = 0
    else:
        token = core_hit.group(1)
        # Word tokens map through the table; digit tokens fall through to int().
        core_count = int(word_to_count.get(token, token))

    thread_count = 0 if thread_hit is None else int(thread_hit.group(1))

    return core_count, thread_count

In [36]:
class DataValiadtion:
    """Feature-engineers the raw laptop CSV and validates the result's columns.

    ``advanced_processing`` reads the raw CSV, derives processor/GPU/CPU
    features, cleans the remaining columns and writes the pre-processed CSV.
    ``validate_all_columns`` reads that file back and checks its columns
    against the schema.

    Args:
        config: Paths and column schema for this stage.
    """

    # Columns that hold integer values and are stored as int64.
    _INT_COLUMNS = ['Ram', 'ROM', 'ROM_type', 'processor_gen']

    def __init__(self, config: DataValidationConfig):
        self.config = config

    @staticmethod
    def _fill_with_mode(series):
        """Return ``series`` with missing values replaced by its most frequent value."""
        modes = series.mode()
        # An all-missing series has no mode; return it untouched rather than
        # failing on ``mode()[0]``.
        if modes.empty:
            return series
        return series.fillna(modes.iloc[0])

    @staticmethod
    def _rom_to_gb(size):
        """Convert a storage string such as ``'512GB'`` or ``'1TB'`` to gigabytes."""
        if 'GB' in size:
            return int(size.split('GB')[0])
        return int(size.split('TB')[0]) * 1024

    def validate_all_columns(self)-> bool:
        """Check that every column of the pre-processed CSV is in the schema.

        The outcome is written once to ``config.STATUS_FILE`` as
        ``"Validation status: <bool>"`` and also printed.

        Returns:
            ``True`` only when the file has at least one column and all of its
            columns are schema keys; ``False`` otherwise.
        """
        # advanced_processing saves the frame together with its index, so the
        # first CSV column is that index, not a feature; read it back as the
        # index so it is not mistaken for an unexpected column.
        data = pd.read_csv(self.config.preprocessed_data, index_col=0)
        schema_columns = set(self.config.all_schema.keys())

        unexpected = [col for col in data.columns if col not in schema_columns]
        # A single unknown column fails the whole check; a later valid column
        # must not flip the status back to True.
        validation_status = len(data.columns) > 0 and not unexpected
        if unexpected:
            logger.info(f"Columns missing from schema: {unexpected}")

        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        print(validation_status)
        return validation_status

    def advanced_processing(self)-> bool:
        """Derive features from the raw CSV and save the pre-processed CSV.

        Reads ``config.unzip_data_dir`` with ``pd.read_csv``, adds
        ``processor_brand``/``processor_gen``/``processor_version``,
        ``gpu_type`` and ``cpu_core``/``cpu_threads``, drops the source
        columns, converts ``Ram``/``ROM``/``ROM_type`` to integers, fills
        missing values with the most frequent value, normalises the ``OS``
        spelling and writes the result to ``config.preprocessed_data``.

        Returns:
            ``True`` once the file has been written.

        Raises:
            KeyError: If an expected input column is absent.
        """
        df = pd.read_csv(self.config.unzip_data_dir)

        # Step 1: split the processor description into brand/gen/version/model.
        processor_rows = []
        for processor in df['processor']:
            parsed = parse_processor_name(processor)
            if parsed is None:
                processor_rows.append([None, None, None, None])
            else:
                processor_rows.append([
                    parsed['company'],
                    parsed['generation'],
                    parsed['version'],
                    parsed['model_type'],
                ])
        # adding new columns (feature engineering)
        df[['processor_brand', 'processor_gen', 'processor_version', 'processor_model']] = processor_rows

        # Step 2: reduce the GPU description to a coarse type label.
        df['gpu_type'] = [get_gpu_type(gpu) for gpu in df['GPU']]

        # Step 3: extract core and thread counts from the CPU description.
        df[['cpu_core', 'cpu_threads']] = [extract_cores_threads(cpu) for cpu in df['CPU']]

        # Remove the raw source columns and helper columns that are not kept.
        data = df.drop(
            ['Unnamed: 0.1', 'Unnamed: 0', 'name', 'processor', 'CPU', 'Ram_type', 'GPU', 'processor_model'],
            axis=1,
        )

        # Ram: "8GB" -> 8
        data['Ram'] = data['Ram'].apply(lambda ram: int(ram.split('GB')[0]))

        # ROM: "512GB" -> 512, "1TB" -> 1024
        data['ROM'] = data['ROM'].apply(self._rom_to_gb)

        # ROM_type: 1 for SSD, 0 for anything else
        data['ROM_type'] = data['ROM_type'].apply(lambda kind: 1 if 'SSD' in kind else 0)

        # Missing values are assigned back explicitly; an in-place fillna on a
        # selected column is not guaranteed to modify the parent frame.
        data['processor_gen'] = self._fill_with_mode(data['processor_gen'])
        data['processor_brand'] = self._fill_with_mode(data['processor_brand'])

        # A missing processor_version takes the most frequent version of the
        # same processor_brand.
        for brand in data['processor_brand'].dropna().unique():
            brand_rows = data['processor_brand'] == brand
            data.loc[brand_rows, 'processor_version'] = self._fill_with_mode(
                data.loc[brand_rows, 'processor_version']
            )

        data['gpu_type'] = self._fill_with_mode(data['gpu_type'])

        # Two OS labels contain a doubled space; collapse it.
        data['OS'] = data['OS'].replace({
            'Windows 11  OS': 'Windows 11 OS',
            'Windows 10  OS': 'Windows 10 OS',
        })

        # These columns hold numbers but may still be object dtype.
        data = data.astype({column: 'int64' for column in self._INT_COLUMNS})

        print(data.isnull().sum())
        print(data.columns)
        logger.info("Advanced pre processing is done")

        # The index is written as the first, unnamed CSV column;
        # validate_all_columns reads it back as the index.
        data.to_csv(self.config.preprocessed_data)

        logger.info("data file saved to given path")
        return True



In [37]:
# Run the stage end to end: load the configuration, build the pre-processed
# CSV, then check its columns against the schema.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValiadtion(config=data_validation_config)
    # Must run first: it writes the file that validate_all_columns reads.
    data_validation.advanced_processing()
    data_validation.validate_all_columns()
except Exception as e:
    # Re-raised unchanged; no extra handling happens here.
    raise e

[2023-12-10 18:04:23,775: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-10 18:04:23,787: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-10 18:04:23,791: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-12-10 18:04:23,793: INFO: common: created directory at: artifacts]
[2023-12-10 18:04:23,794: INFO: common: created directory at: artifacts/data_validation]


brand                0
price                0
spec_rating          0
Ram                  0
ROM                  0
ROM_type             0
display_size         0
resolution_width     0
resolution_height    0
OS                   0
warranty             0
processor_brand      0
processor_gen        0
processor_version    0
gpu_type             0
cpu_core             0
cpu_threads          0
dtype: int64
Index(['brand', 'price', 'spec_rating', 'Ram', 'ROM', 'ROM_type',
       'display_size', 'resolution_width', 'resolution_height', 'OS',
       'warranty', 'processor_brand', 'processor_gen', 'processor_version',
       'gpu_type', 'cpu_core', 'cpu_threads'],
      dtype='object')
[2023-12-10 18:04:23,851: INFO: 1150478305: Advanced pre processing is done]
[2023-12-10 18:04:23,859: INFO: 1150478305: data file saved to given path]
True
