In [7]:
import numpy as np
import pandas as pd

In [15]:
# Load the raw laptop listings from the project's data folder.
# Forward slashes are used because they are accepted as path separators on
# Windows as well as on POSIX systems, whereas a backslash is only a separator
# on Windows.
data = pd.read_csv('./data/laptopData.csv')
data.head(10)

In [9]:
# Print a summary of the frame: its columns, non-null counts and dtypes.
data.info()

In [10]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an
# earlier export -- confirm against the CSV).
data = data.drop(columns='Unnamed: 0')

In [11]:
# Number of missing values in each column.
data.isna().sum()

In [13]:
# Show up to ten rows that contain at least one missing value.
# The sample size is capped at the number of such rows so the cell does not
# raise when fewer than ten exist, and the fixed seed makes the displayed rows
# the same on every run.
rows_with_missing = data[data.isna().any(axis=1)]
rows_with_missing.sample(n=min(10, len(rows_with_missing)), random_state=0)

In [14]:
# Keep only rows without any missing value, then re-count the missing values
# per column to confirm the result.
data = data.dropna(how='any')
data.isna().sum()

In [39]:
import re

# Matches a standalone "<digits>x<digits>" token such as "1920x1080".
pattern_1 = r'\b\d+x\d+\b'


def return_resolution(sr: str):
    """Return the first ``<width>x<height>`` token found in ``sr``.

    Args:
        sr: Free-text screen description, e.g. ``'Full HD 1920x1080'``.

    Returns:
        The matched substring, e.g. ``'1920x1080'``.

    Raises:
        AttributeError: If ``sr`` contains no such token (the search
            yields ``None``, which has no ``group`` method).
    """
    found = re.compile(pattern_1).search(sr)
    return found.group(0)


def drop_GB(ram: str):
    """Return ``ram`` without its last two characters (e.g. ``'8GB'`` -> ``'8'``).

    The suffix is not inspected: any string loses its final two characters,
    so inputs of length two or less come back as the empty string.
    """
    return ram[:-2]


def drop_kg(weight: str):
    """Return ``weight`` without its last two characters (e.g. ``'1.37kg'`` -> ``'1.37'``).

    The suffix is not inspected: any string loses its final two characters,
    so inputs of length two or less come back as the empty string.
    """
    return weight[:-2]


def find_speed_cpu(cpu: str):
    """Return the last whitespace-separated token of ``cpu`` with ``'GHz'`` removed.

    Example: ``'Intel Core i5 2.3GHz'`` -> ``'2.3'``. The result is a string.

    Raises:
        IndexError: If ``cpu`` is empty or contains only whitespace.
    """
    return cpu.split()[-1].replace('GHz', '')


def find_mark_cpu(cpu:str):
    """Return the first whitespace-separated token of ``cpu``.

    Example: ``'Intel Core i5 2.3GHz'`` -> ``'Intel'``.

    Raises:
        IndexError: If ``cpu`` is empty or contains only whitespace.
    """
    # Only the leading token is needed, so splitting once is enough.
    return cpu.split(maxsplit=1)[0]


def find_size_memory(memory: str):
    """Return the total storage capacity described by ``memory``, in GB.

    ``memory`` may describe several drives separated by ``+``, for example
    ``'128GB SSD +  1TB HDD'``. For each part, the first whitespace-separated
    token is read as the size: a ``TB`` suffix is converted with a factor of
    1024, a ``GB`` suffix is taken as is. Parts that are empty or whose size
    token carries neither suffix (such as the ``'?'`` placeholder) add nothing.

    Args:
        memory: Storage description string.

    Returns:
        The summed capacity in GB as a float; ``0.0`` when no part could be
        interpreted.

    Raises:
        ValueError: If a size token ends in ``TB``/``GB`` but the remainder
            is not a valid number.
    """
    total_memory = 0.0
    for memory_element in memory.split('+'):
        tokens = memory_element.split()
        if not tokens:
            # Empty part (e.g. empty input or a dangling '+'): nothing to add.
            continue
        size_token = tokens[0]
        if size_token.endswith('TB'):
            total_memory += 1024 * float(size_token[:-2])
        elif size_token.endswith('GB'):
            total_memory += float(size_token[:-2])
        # Unrecognised tokens are skipped so they cannot wipe out the sizes
        # already accumulated from other drives.
    # Return only after every drive has been visited; returning inside the
    # loop would report the first drive alone.
    return total_memory

def find_kind_memory(memory: str):
    """Return the second whitespace-separated token of ``memory``.

    Example: ``'128GB SSD'`` -> ``'SSD'``. The literal ``'?'`` maps to
    ``'unknown'``. When several drives are listed, only the token following
    the first size is returned.

    Raises:
        IndexError: If ``memory`` is not ``'?'`` and has fewer than two tokens.
    """
    if memory == '?':
        return 'unknown'
    return memory.split()[1]


# Derive the cleaned and engineered columns from the raw text columns.
# 'Cpu' and 'Memory' are each read twice here and removed further down.
data = data.assign(
    Ram=lambda x: x['Ram'].map(drop_GB),
    Weight=lambda x: x['Weight'].map(drop_kg),
    ScreenResolution=lambda x: x['ScreenResolution'].map(return_resolution),
    Cpu_speed=lambda x: x['Cpu'].map(find_speed_cpu),
    Cpu_mark=lambda x: x['Cpu'].map(find_mark_cpu),
    Memory_GB=lambda x: x['Memory'].map(find_size_memory),
    Memory_kind=lambda x: x['Memory'].map(find_kind_memory))

# Rows whose stripped Ram/Weight came back empty, or whose Inches holds the
# '?' placeholder, cannot be cast to numbers and are filtered out first.
valid_rows = (
    (data['Ram'] != '')
    & (data['Weight'] != '')
    & (data['Inches'] != '?')
)

# A single chained expression builds a fresh frame. This avoids assigning
# columns into a boolean-filtered frame (which can emit SettingWithCopyWarning)
# and avoids mutating the frame with inplace=True.
data = (
    data.loc[valid_rows]
    .astype({'Ram': int, 'Weight': float, 'Inches': float, 'Cpu_speed': float})
    .drop(columns=['Cpu', 'Memory'])
)

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1271 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1271 non-null   object 
 1   TypeName          1271 non-null   object 
 2   Inches            1271 non-null   float64
 3   ScreenResolution  1271 non-null   object 
 4   Ram               1271 non-null   int32  
 5   Gpu               1271 non-null   object 
 6   OpSys             1271 non-null   object 
 7   Weight            1271 non-null   float64
 8   Price             1271 non-null   float64
 9   Cpu_speed         1271 non-null   float64
 10  Cpu_mark          1271 non-null   object 
 11  Memory_GB         1271 non-null   float64
 12  Memory_kind       1271 non-null   object 
dtypes: float64(5), int32(1), object(7)
memory usage: 134.1+ KB


In [40]:
# Display the first three rows of the frame.
data.head(3)

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Ram,Gpu,OpSys,Weight,Price,Cpu_speed,Cpu_mark,Memory_GB,Memory_kind
0,Apple,Ultrabook,13.3,2560x1600,8,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,2.3,Intel,128.0,SSD
1,Apple,Ultrabook,13.3,1440x900,8,Intel HD Graphics 6000,macOS,1.34,47895.5232,1.8,Intel,128.0,Flash
2,HP,Notebook,15.6,1920x1080,8,Intel HD Graphics 620,No OS,1.86,30636.0,2.5,Intel,256.0,SSD


In [41]:
# Move "Price" to the last column position, leaving the other columns in
# their current order.
column_order = data.columns.tolist()
column_order.append(column_order.pop(column_order.index("Price")))
data = data.reindex(columns=column_order)
data.head(10)

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Ram,Gpu,OpSys,Weight,Cpu_speed,Cpu_mark,Memory_GB,Memory_kind,Price
0,Apple,Ultrabook,13.3,2560x1600,8,Intel Iris Plus Graphics 640,macOS,1.37,2.3,Intel,128.0,SSD,71378.6832
1,Apple,Ultrabook,13.3,1440x900,8,Intel HD Graphics 6000,macOS,1.34,1.8,Intel,128.0,Flash,47895.5232
2,HP,Notebook,15.6,1920x1080,8,Intel HD Graphics 620,No OS,1.86,2.5,Intel,256.0,SSD,30636.0
3,Apple,Ultrabook,15.4,2880x1800,16,AMD Radeon Pro 455,macOS,1.83,2.7,Intel,512.0,SSD,135195.336
4,Apple,Ultrabook,13.3,2560x1600,8,Intel Iris Plus Graphics 650,macOS,1.37,3.1,Intel,256.0,SSD,96095.808
5,Acer,Notebook,15.6,1366x768,4,AMD Radeon R5,Windows 10,2.1,3.0,AMD,500.0,HDD,21312.0
6,Apple,Ultrabook,15.4,2880x1800,16,Intel Iris Pro Graphics,Mac OS X,2.04,2.2,Intel,256.0,Flash,114017.6016
7,Apple,Ultrabook,13.3,1440x900,8,Intel HD Graphics 6000,macOS,1.34,1.8,Intel,256.0,Flash,61735.536
8,Asus,Ultrabook,14.0,1920x1080,16,Nvidia GeForce MX150,Windows 10,1.3,1.8,Intel,512.0,SSD,79653.6
9,Acer,Ultrabook,14.0,1920x1080,8,Intel UHD Graphics 620,Windows 10,1.6,1.6,Intel,256.0,SSD,41025.6


In [42]:
# Write the frame to "cleaning_data.csv" (path relative to the working
# directory) without the index column.
data.to_csv("cleaning_data.csv", index=False)