# Priprava in čiščenje podatkov


In [3]:
import pandas as pd
import numpy as np

# Copy-on-Write will become the default in pandas 3.0. We recommend turning it on now to benefit from all improvements.
pd.options.mode.copy_on_write = True

In [2]:
!head -n 5 data/INPUT_laptops.csv

Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM, Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,"1339,69"
Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,"898,94"
HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,"575,00"
Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,"2537,45"


## Reading CSV Files with Encodings

In [6]:
laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")
laptops.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


In [7]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


## Cleaning Column Names

In [21]:
def clean_dataframe_columns(column_name: str) -> str:
    column_name = column_name.replace("(", "").replace(")", "")
    column_name = column_name.replace("Operating System", "os")
    return column_name.strip().replace(" ", "_").lower()

laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")
# Cleaning Column Names
laptops.columns = [clean_dataframe_columns(laptop) for laptop in laptops.columns]

laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


## Converting String Columns to Numeric

In [36]:
def clean_dataframe_columns(column_name: str) -> str:
    column_name = column_name.replace("(", "").replace(")", "")
    column_name = column_name.replace("Operating System", "os")
    return column_name.strip().replace(" ", "_").lower()

laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")
# Cleaning Column Names
laptops.columns = [clean_dataframe_columns(laptop) for laptop in laptops.columns]

# Converting String Columns to Numeric
# Column - screen_size
laptops["screen_size"] = laptops["screen_size"].str.replace('"', "").astype("float")
# Column - ram
laptops["ram"] = laptops["ram"].str.replace("GB", "").astype("int")
# Column - weight
laptops["weight"] = laptops["weight"].str.replace("kg", "").str.replace("s", "").astype("float")
# Column - price_euros
laptops["price_euros"] = laptops["price_euros"].str.replace(",", ".").astype("float")
laptops = laptops.rename(columns={"ram": "ram_gb", "screen_size": "screen_size_inches", "weight": "weight_kg"})

laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight_kg,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37,1339.69
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34,898.94


## Extracting Values from Strings

In [46]:
def clean_dataframe_columns(column_name: str) -> str:
    column_name = column_name.replace("(", "").replace(")", "")
    column_name = column_name.replace("Operating System", "os")
    return column_name.strip().replace(" ", "_").lower()

laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")
# Cleaning Column Names
laptops.columns = [clean_dataframe_columns(laptop) for laptop in laptops.columns]

# Converting String Columns to Numeric
# Column - screen_size
laptops["screen_size"] = laptops["screen_size"].str.replace('"', "").astype("float")
# Column - ram
laptops["ram"] = laptops["ram"].str.replace("GB", "").astype("int")
# Column - weight
laptops["weight"] = laptops["weight"].str.replace("kg", "").str.replace("s", "").astype("float")
# Column - price_euros
laptops["price_euros"] = laptops["price_euros"].str.replace(",", ".").astype("float")
laptops = laptops.rename(columns={"ram": "ram_gb", "screen_size": "screen_size_inches", "weight": "weight_kg"})

# Extracting Values from Strings
laptops["gpu_manufacturer"] = laptops["gpu"].str.split().str[0]
laptops['cpu_manufacturer'] = laptops["cpu"].str.split().str[0]

laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight_kg,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37,1339.69,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34,898.94,Intel,Intel


## Correcting Bad Values - map() method

In [51]:
def clean_dataframe_columns(column_name: str) -> str:
    column_name = column_name.replace("(", "").replace(")", "")
    column_name = column_name.replace("Operating System", "os")
    return column_name.strip().replace(" ", "_").lower()

laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")
# Cleaning Column Names
laptops.columns = [clean_dataframe_columns(laptop) for laptop in laptops.columns]

# Converting String Columns to Numeric
# Column - screen_size
laptops["screen_size"] = laptops["screen_size"].str.replace('"', "").astype("float")
# Column - ram
laptops["ram"] = laptops["ram"].str.replace("GB", "").astype("int")
# Column - weight
laptops["weight"] = laptops["weight"].str.replace("kg", "").str.replace("s", "").astype("float")
# Column - price_euros
laptops["price_euros"] = laptops["price_euros"].str.replace(",", ".").astype("float")
laptops = laptops.rename(columns={"ram": "ram_gb", "screen_size": "screen_size_inches", "weight": "weight_kg"})

# Extracting Values from Strings
laptops["gpu_manufacturer"] = laptops["gpu"].str.split().str[0]
laptops['cpu_manufacturer'] = laptops["cpu"].str.split().str[0]

# Correcting Bad Values - map() method - če vrednost pozabimo postane nan
mapping_os = {
    "macOS": "mac",
    "No OS": None,
    "Windows": "windows",
    "Mac OS": "mac",
    "Linux": "linux",
    "Android": "android",
    "Chrome OS": "chrome_os"
}
laptops["os"] = laptops["os"].map(mapping_os)

laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight_kg,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,mac,,1.37,1339.69,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,mac,,1.34,898.94,Intel,Intel


## Introduction to Missing Data

In [54]:
laptops.isnull().sum()

manufacturer            0
model_name              0
category                0
screen_size_inches      0
screen                  0
cpu                     0
ram_gb                  0
storage                 0
gpu                     0
os                     66
os_version            170
weight_kg               0
price_euros             0
gpu_manufacturer        0
cpu_manufacturer        0
dtype: int64

In [55]:
laptops["os_version"].value_counts(dropna=False)

os_version
10      1072
NaN      170
7         45
X          8
10 S       8
Name: count, dtype: int64

In [57]:
laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()

os
linux        62
chrome_os    27
mac          13
android       2
Name: count, dtype: int64

In [60]:
def clean_dataframe_columns(column_name: str) -> str:
    column_name = column_name.replace("(", "").replace(")", "")
    column_name = column_name.replace("Operating System", "os")
    return column_name.strip().replace(" ", "_").lower()

laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")
# Cleaning Column Names
laptops.columns = [clean_dataframe_columns(laptop) for laptop in laptops.columns]

# Converting String Columns to Numeric
# Column - screen_size
laptops["screen_size"] = laptops["screen_size"].str.replace('"', "").astype("float")
# Column - ram
laptops["ram"] = laptops["ram"].str.replace("GB", "").astype("int")
# Column - weight
laptops["weight"] = laptops["weight"].str.replace("kg", "").str.replace("s", "").astype("float")
# Column - price_euros
laptops["price_euros"] = laptops["price_euros"].str.replace(",", ".").astype("float")
laptops = laptops.rename(columns={"ram": "ram_gb", "screen_size": "screen_size_inches", "weight": "weight_kg"})

# Extracting Values from Strings
laptops["gpu_manufacturer"] = laptops["gpu"].str.split().str[0]
laptops['cpu_manufacturer'] = laptops["cpu"].str.split().str[0]

# Correcting Bad Values - map() method - če vrednost pozabimo postane nan
mapping_os = {
    "macOS": "mac",
    "No OS": None,
    "Windows": "windows",
    "Mac OS": "mac",
    "Linux": "linux",
    "Android": "android",
    "Chrome OS": "chrome_os"
}
laptops["os"] = laptops["os"].map(mapping_os)

# Missing data
laptops.loc[(laptops["os"] == "mac") & laptops["os_version"].isnull(), "os_version"] = "X"

laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight_kg,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,mac,X,1.37,1339.69,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,mac,X,1.34,898.94,Intel,Intel


## Removing Duplicates

In [62]:
laptops.duplicated().value_counts()

False    1275
True       28
Name: count, dtype: int64

In [75]:
def clean_dataframe_columns(column_name: str) -> str:
    column_name = column_name.replace("(", "").replace(")", "")
    column_name = column_name.replace("Operating System", "os")
    return column_name.strip().replace(" ", "_").lower()

laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")
# Cleaning Column Names
laptops.columns = [clean_dataframe_columns(laptop) for laptop in laptops.columns]

# Converting String Columns to Numeric
# Column - screen_size
laptops["screen_size"] = laptops["screen_size"].str.replace('"', "").astype("float")
# Column - ram
laptops["ram"] = laptops["ram"].str.replace("GB", "").astype("int")
# Column - weight
laptops["weight"] = laptops["weight"].str.replace("kg", "").str.replace("s", "").astype("float")
# Column - price_euros
laptops["price_euros"] = laptops["price_euros"].str.replace(",", ".").astype("float")
laptops = laptops.rename(columns={"ram": "ram_gb", "screen_size": "screen_size_inches", "weight": "weight_kg"})

# Extracting Values from Strings
laptops["gpu_manufacturer"] = laptops["gpu"].str.split().str[0]
laptops['cpu_manufacturer'] = laptops["cpu"].str.split().str[0]

# Correcting Bad Values - map() method - če vrednost pozabimo postane nan
mapping_os = {
    "macOS": "mac",
    "No OS": None,
    "Windows": "windows",
    "Mac OS": "mac",
    "Linux": "linux",
    "Android": "android",
    "Chrome OS": "chrome_os"
}
laptops["os"] = laptops["os"].map(mapping_os)

# Missing data
laptops.loc[(laptops["os"] == "mac") & laptops["os_version"].isnull(), "os_version"] = "X"

# Removing Duplicates
print(f"Pred odstranitvijo duplikatov: {laptops.shape[0]}")
laptops = laptops.drop_duplicates()
print(f"Po odstranitvi duplikatov: {laptops.shape[0]}")

# Replacing Values
laptops = laptops.replace("MSI", "Micro-Star International")

# Extract the screen resolution from the screen column
resolution = laptops["screen"].str.split(" ").str[-1].str.split("x")
laptops["screen_width_px"] = resolution.str[0].astype("int")
laptops["screen_hight_px"] = resolution.str[1].astype("int")  
# Dropping Columns - screen
laptops = laptops.drop(columns=["screen"])

laptops.head(2)

Pred odstranitvijo duplikatov: 1303
Po odstranitvi duplikatov: 1275


Unnamed: 0,manufacturer,model_name,category,screen_size_inches,cpu,ram_gb,storage,gpu,os,os_version,weight_kg,price_euros,gpu_manufacturer,cpu_manufacturer,screen_width_px,screen_hight_px
0,Apple,MacBook Pro,Ultrabook,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,mac,X,1.37,1339.69,Intel,Intel,2560,1600
1,Apple,Macbook Air,Ultrabook,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,mac,X,1.34,898.94,Intel,Intel,1440,900


## Vaje za analizo

#### Naloga 1: Kateri je najcenejši laptop (top 5), ki ima scren_size več ali enko 15 inchov ter ima najmanj 8 GB RAMa?

In [79]:
cols_to_show = ['manufacturer', 'model_name', 'price_euros', 'ram_gb', 'screen_size_inches']

In [96]:
laptops.loc[(laptops["screen_size_inches"] >= 15) & (laptops["ram_gb"] >= 8), cols_to_show].sort_values(by="price_euros").head(1)

Unnamed: 0,manufacturer,model_name,price_euros,ram_gb,screen_size_inches
783,Lenovo,IdeaPad 110-15IBR,329.0,8,15.6


#### Naloga 2: Določite razliko v popvrečni ceni za latope s procesorjem Intel in AMD

In [93]:
intel_mean_price = laptops.loc[laptops["cpu_manufacturer"] == "Intel", "price_euros"].mean()
amd_mean_price = laptops.loc[laptops["cpu_manufacturer"] == "AMD", "price_euros"].mean()
print(f"AMD je v povprečju cenejši za {intel_mean_price - amd_mean_price: .2f}€")

AMD je v povprečju cenejši za  602.74€


#### Naloga 3: Kateri laptop ima največ RAMa?

In [95]:
laptops[laptops['ram_gb'] == laptops['ram_gb'].max()]

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,cpu,ram_gb,storage,gpu,os,os_version,weight_kg,price_euros,gpu_manufacturer,cpu_manufacturer,screen_width_px,screen_hight_px
1066,Asus,ROG G701VO,Gaming,17.3,Intel Core i7 6820HK 2.7GHz,64,1TB SSD,Nvidia GeForce GTX 980,windows,10,3.58,3975.0,Nvidia,Intel,1920,1080
