# Končna analiza

In [1]:
import pandas as pd 
import numpy as np

In [4]:
def clean_column_name(name: str):
    """Return a normalised version of a raw column header.

    The header is stripped of surrounding whitespace and lower-cased, the
    phrase "operating system" is shortened to "os", every remaining space
    becomes an underscore, and parentheses are removed.
    """
    cleaned = name.strip().lower().replace("operating system", "os")
    # Apply the single-character substitutions in the same order each time:
    # spaces first, then the opening and closing parentheses.
    for old, new in ((" ", "_"), ("(", ""), (")", "")):
        cleaned = cleaned.replace(old, new)
    return cleaned

# Lookup table of operating-system labels: every listed name maps to itself,
# except "Mac OS", which is folded into the "macOS" spelling.
mapping_dict = {
    os_name: ("macOS" if os_name == "Mac OS" else os_name)
    for os_name in (
        'Android',
        'Chrome OS',
        'Linux',
        'Mac OS',
        'No OS',
        'Windows',
        'macOS',
    )
}

In [16]:
# Load the raw export and normalise every header with clean_column_name.
laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")
laptops.columns = list(map(clean_column_name, laptops.columns))

# CLEANING
# Strip the unit markers so each measurement can be converted to a number.
laptops["screen_size"] = (
    laptops["screen_size"].str.replace('"', '').astype("float")
)
laptops["ram"] = laptops["ram"].str.replace("GB", "").astype("int")
laptops["weight"] = (
    laptops["weight"]
    .str.replace("kgs", "")  # remove the longer suffix first, then the shorter one
    .str.replace("kg", "")
    .astype("float")
)
# Move the unit into the column name now that the values are numeric.
laptops = laptops.rename(
    columns={
        "ram": "ram_gb",
        "screen_size": "screen_size_inches",
        "weight": "weight_kg",
    }
)
# Swap the comma for a dot so the price text converts to float.
laptops["price_euros"] = (
    laptops["price_euros"].str.replace(",", ".").astype("float")
)

# Extract the screen resolution: the last space-separated token of the
# screen column is split on "x" into width and height.
resolution = laptops["screen"].str.split(" ").str[-1].str.split("x")
laptops["screen_width_px"] = resolution.str.get(0).astype("int")
laptops["screen_high_px"] = resolution.str.get(1).astype("int")
# Drop the original screen column.
laptops = laptops.drop(columns=["screen"])

# Extract the processor speed: last space-separated token of the cpu column,
# with the "GHz" suffix removed.
laptops["cpu_speed_ghz"] = (
    laptops["cpu"]
    .str.split(" ")
    .str[-1]
    .str.replace("GHz", "")
    .astype("float")
)

# Extracting Values from Strings: the first whitespace-separated word of the
# cpu / gpu description is stored as the manufacturer.
laptops["cpu_manufacturer"] = laptops["cpu"].str.split().str.get(0)
laptops["gpu_manufacturer"] = laptops["gpu"].str.split().str.get(0)

# Correcting Bad Values - map() method
# NOTE: map() yields NaN for any os value that is not a key of mapping_dict.
laptops["os"] = laptops["os"].map(mapping_dict)

# Filling Missing Values: every macOS row gets version "X"; any version that
# is still missing afterwards is labelled "Version Unknown".
is_macos = laptops["os"] == "macOS"
laptops.loc[is_macos, "os_version"] = "X"
laptops.loc[laptops["os_version"].isna(), "os_version"] = "Version Unknown"

# Removing Duplicates (compared across all columns present at this point).
laptops = laptops.drop_duplicates()

# Replacing Values: applies to every cell in the frame equal to "MSI".
laptops = laptops.replace("MSI", "Micro-Star")

# Dropping Columns
laptops = laptops.drop(columns=["category", "gpu"])

laptops.head()

Unnamed: 0,manufacturer,model_name,screen_size_inches,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,screen_width_px,screen_high_px,cpu_speed_ghz,cpu_manufacturer,gpu_manufacturer
0,Apple,MacBook Pro,13.3,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37,1339.69,2560,1600,2.3,Intel,Intel
1,Apple,Macbook Air,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34,898.94,1440,900,1.8,Intel,Intel
2,HP,250 G6,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,No OS,Version Unknown,1.86,575.0,1920,1080,2.5,Intel,Intel
3,Apple,MacBook Pro,15.4,Intel Core i7 2.7GHz,16,512GB SSD,macOS,X,1.83,2537.45,2880,1800,2.7,Intel,AMD
4,Apple,MacBook Pro,13.3,Intel Core i5 3.1GHz,8,256GB SSD,macOS,X,1.37,1803.6,2560,1600,3.1,Intel,Intel


In [19]:
# Write the cleaned laptops frame to data/clean_laptops.json; no `orient` is
# passed, so pandas' default JSON layout is used.
laptops.to_json("data/clean_laptops.json")