## Essentials Data Cleaning
##### Goal: To clean irrelevant information from the dataset.
##### Dataset: Laptop/CPU product information for retail trade.

In [255]:
# import packages
import pandas as pd

In [256]:
# import datafile
data = pd.read_csv("uncleaned.csv")
data.head(2)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232


In [258]:
# about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1273 non-null   float64
 1   Company           1273 non-null   object 
 2   TypeName          1273 non-null   object 
 3   Inches            1273 non-null   object 
 4   ScreenResolution  1273 non-null   object 
 5   Cpu               1273 non-null   object 
 6   Ram               1273 non-null   object 
 7   Memory            1273 non-null   object 
 8   Gpu               1273 non-null   object 
 9   OpSys             1273 non-null   object 
 10  Weight            1273 non-null   object 
 11  Price             1273 non-null   float64
dtypes: float64(2), object(10)
memory usage: 122.3+ KB


In [261]:
# data shape
data.shape

(1303, 12)

In [262]:
# How many missing values are there, in each column? 
print(data.isnull().sum())

Unnamed: 0          30
Company             30
TypeName            30
Inches              30
ScreenResolution    30
Cpu                 30
Ram                 30
Memory              30
Gpu                 30
OpSys               30
Weight              30
Price               30
dtype: int64


In [265]:
# How many missing values are there, in each row?
print(data.duplicated().sum())

29


#### Editing the dataset

In [268]:
# Replace strange marks eg. ? with NaN
data.replace("?", pd.NA, inplace= True)

In [270]:
# Find the critical columns for future data handling.
critical_cols = ["Company", "Cpu", "Ram", "Memory", "Gpu", "OpSys", "Weight","Price"]

# Drop any rows missing data in critical_cols.
data = data.dropna(subset=critical_cols)

In [272]:
# Drop duplicates in dataset.
data = data.drop_duplicates()

In [274]:
# What does the dataset look like now?
print("What does the new cleaned data look like? ")
print(data.shape)

What does the new cleaned data look like? 
(1271, 12)


#### Standardizing the dataset

In [279]:
# Remove unncessary information of kg with nothing.
data["Weight"] = data["Weight"].replace("kg","", regex=True)

# Covert string to float
# data["Weight"] = pd.to_numeric(data["Weight"],errors='coerce')

data.head(2)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232


In [281]:
# Round decimal prices to 2 decimal places.
data["Price"] = data["Price"].round(2)

In [283]:
# Fill any missing data with 0.
data["Weight"].fillna(0, inplace = True)
data["Price"].fillna(0, inplace = True)
data.head(2)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.68
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.52


In [293]:
# Remove any spacing i.e Windows 10 to Windows_10
data["OpSys"] = data["OpSys"].str.lower().str.replace(" ", "_")

In [295]:
# What is the current data shape?
data.shape

(1271, 12)

In [299]:
# What does the data look like now?
data.tail(2)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
1301,1301.0,HP,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,windows_10,2.19,40705.92
1302,1302.0,Asus,Notebook,15.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,4GB,500GB HDD,Intel HD Graphics,windows_10,2.2,19660.32


In [303]:
# Extract CPU speed from CPU column to numerical only.

def extract_cpu(cpu_info):
    try:
        return float(cpu_info.split()[-1][:-3]) #turn 1.6GHZ to 1.6 only.
    except Exception as e:
        return None


In [309]:
# Create a new column for the CPU numerical 
data["CPU_Speed"] = data["Cpu"].apply(extract_cpu)
print("Current data shape: ", data.shape)

Current data shape:  (1271, 13)


In [313]:
# Standardize our memory, change the GB & TB to MB standardize 

def convert_memory(memory):
    try:
        if "GB" in memory:
            return int(memory.replace("GB","")) *1024
        elif "TB" in memory:
            return int(memory.replace("TB","")) *1024*1024
    except Exception as e:
        return None

In [315]:
# Create new column for the MB datasize
data["Memory_MB"] = data["Memory"].apply(convert_memory)

In [317]:
# Fill in any missing data with 0 for the new columns.
data["CPU_Speed"].fillna(0, inplace = True)
data["Memory_MB"].fillna(0, inplace = True)

In [331]:
# Fill any missing data from other columns.
data["Inches"].fillna(0, inplace = True)

#### Final Testing

In [327]:
# What is the final shape?
data.shape

(1271, 14)

In [333]:
# What are the missing values within each column after cleaning?
data.isnull().sum()

Unnamed: 0          0
Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
CPU_Speed           0
Memory_MB           0
dtype: int64

In [335]:
# What are the number of duplicated rows after cleaning?
data.duplicated().sum()

0

In [337]:
# What are the remaining data types?
data.dtypes

Unnamed: 0          float64
Company              object
TypeName             object
Inches               object
ScreenResolution     object
Cpu                  object
Ram                  object
Memory               object
Gpu                  object
OpSys                object
Weight               object
Price               float64
CPU_Speed           float64
Memory_MB             int64
dtype: object

In [339]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1271 entries, 0 to 1302
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1271 non-null   float64
 1   Company           1271 non-null   object 
 2   TypeName          1271 non-null   object 
 3   Inches            1271 non-null   object 
 4   ScreenResolution  1271 non-null   object 
 5   Cpu               1271 non-null   object 
 6   Ram               1271 non-null   object 
 7   Memory            1271 non-null   object 
 8   Gpu               1271 non-null   object 
 9   OpSys             1271 non-null   object 
 10  Weight            1271 non-null   object 
 11  Price             1271 non-null   float64
 12  CPU_Speed         1271 non-null   float64
 13  Memory_MB         1271 non-null   int64  
dtypes: float64(3), int64(1), object(10)
memory usage: 148.9+ KB
