# Priprava in čiščenje podatkov

## Get the data - Reading CSV Files with Encodings

In [1]:
import pandas as pd 
import numpy as np

In [2]:
!head -n 3 data/INPUT_laptops.csv

Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM, Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,"1339,69"
Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,"898,94"


In [3]:
laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")

In [4]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


    df = pd.read_csv("filename.csv", encoding="some_encoding")

In [5]:
laptops.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


## Cleaning Column Names

In [6]:
#laptops[" Storage"]

In [7]:
laptops.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)'],
      dtype='object')

In [8]:
def clean_column_name(name: str):
    name = name.strip()
    name = name.lower()
    name = name.replace("operating system", "os")
    name = name.replace(" ", "_")
    name = name.replace("(", "")
    name = name.replace(")", "")
    return name

new_columns_names = [clean_column_name(c) for c in laptops.columns]
laptops.columns = new_columns_names

In [9]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


In [10]:
from enum import Enum

class Laptops(Enum):
    MODEL_NAME = "model_name"
    CATEGORY = "category"


In [11]:
Laptops.MODEL_NAME

<Laptops.MODEL_NAME: 'model_name'>

In [12]:
laptops[Laptops.MODEL_NAME.value]

0                               MacBook Pro
1                               Macbook Air
2                                    250 G6
3                               MacBook Pro
4                               MacBook Pro
                       ...                 
1298                         Yoga 500-14ISK
1299                         Yoga 900-13ISK
1300                     IdeaPad 100S-14IBR
1301    15-AC110nv (i7-6500U/6GB/1TB/Radeon
1302    X553SA-XX031T (N3050/4GB/500GB/W10)
Name: model_name, Length: 1303, dtype: object

## Converting String Columns to Numeric

In [13]:
laptops.iloc[:5, 2:5]

Unnamed: 0,category,screen_size,screen
0,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600
1,Ultrabook,"13.3""",1440x900
2,Notebook,"15.6""",Full HD 1920x1080
3,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800
4,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600



<p><img alt="string to numeric cleaning workflow" src="images/cleaning_workflow.svg"></p>


In [14]:
laptops["screen_size"].dtype

dtype('O')

In [15]:
laptops["screen_size"].unique()

array(['13.3"', '15.6"', '15.4"', '14.0"', '12.0"', '11.6"', '17.3"',
       '10.1"', '13.5"', '12.5"', '13.0"', '18.4"', '13.9"', '12.3"',
       '17.0"', '15.0"', '14.1"', '11.3"'], dtype=object)


<p></p><center><img alt="vectorized_string_methods" src="images/Syntax.png"></center><p></p>





In [16]:
laptops["screen_size"] = laptops["screen_size"].str.replace('"', '')
laptops["screen_size"] = laptops["screen_size"].astype("float")

In [17]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


In [18]:
laptops["ram"].unique()

array(['8GB', '16GB', '4GB', '2GB', '12GB', '6GB', '32GB', '24GB', '64GB'],
      dtype=object)

In [19]:
laptops["ram"] = laptops["ram"].str.replace("GB", "")
laptops["ram"] = laptops["ram"].astype("int")

In [20]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


In [21]:
laptops.dtypes

manufacturer     object
model_name       object
category         object
screen_size     float64
screen           object
cpu              object
ram               int64
storage          object
gpu              object
os               object
os_version       object
weight           object
price_euros      object
dtype: object

## Renaming Columns

In [22]:
laptops.rename(columns={"ram": "ram_gb"}, inplace=True)

In [23]:
laptops.head(1)

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969


In [24]:
laptops.rename(columns={"screen_size": "screen_size_inches"}, inplace=True)

In [25]:
laptops.head(1)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969


In [26]:
laptops["ram_gb"].describe()

count    1303.000000
mean        8.382195
std         5.084665
min         2.000000
25%         4.000000
50%         8.000000
75%         8.000000
max        64.000000
Name: ram_gb, dtype: float64

## Extracting Values from Strings

In [27]:
laptops["cpu_manufacturer"] = laptops["cpu"].str.split().str[0]

In [28]:
laptops["gpu_manufacturer"] = laptops["gpu"].str.split().str[0]

In [30]:
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,cpu_manufacturer,gpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894,Intel,Intel
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500,Intel,Intel
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745,Intel,AMD
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360,Intel,Intel


In [31]:
laptops["gpu_manufacturer"].value_counts()

Intel     722
Nvidia    400
AMD       180
ARM         1
Name: gpu_manufacturer, dtype: int64

## Correcting Bad Values - map() method

In [34]:
laptops["os"].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          13
Mac OS          8
Android         2
Name: os, dtype: int64

In [39]:
s = pd.Series(['pair', 'oranje', 'bananna', 'oranje', 'oranje', 'oranje'])

In [40]:
s

0       pair
1     oranje
2    bananna
3     oranje
4     oranje
5     oranje
dtype: object

In [41]:
corrections = {
    "oranje": "orange",
    "bananna": "banana"
}

In [42]:
s.map(corrections)

0       NaN
1    orange
2    banana
3    orange
4    orange
5    orange
dtype: object

In [43]:
mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}

In [44]:
laptops["os"] = laptops["os"].map(mapping_dict)

In [45]:
laptops["os"].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          21
Android         2
Name: os, dtype: int64

## Introduction to Missing Data 

In [46]:
None

### Trade-Offs in Missing Data Conventions


### Missing Data in Pandas




### None: Pythonic missing data


In [48]:
vals1 = np.array([1, None, 2, 3])

In [49]:
vals1.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

### NaN: Missing numerical data



In [50]:
vals2 = np.array([1, np.nan, 3, 4])

In [51]:
vals2

array([ 1., nan,  3.,  4.])

In [52]:
vals2.dtype

dtype('float64')

In [53]:
np.nan + 1

nan

In [54]:
np.nan * 0

nan

In [58]:
np.nansum(vals2)

8.0

### NaN and None in Pandas



In [61]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64


<table>
<thead><tr>
<th>Typeclass</th>
<th>Conversion When Storing NAs</th>
<th>NA Sentinel Value</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>floating</code></td>
<td>No change</td>
<td><code>np.nan</code></td>
</tr>
<tr>
<td><code>object</code></td>
<td>No change</td>
<td><code>None</code> or <code>np.nan</code></td>
</tr>
<tr>
<td><code>integer</code></td>
<td>Cast to <code>float64</code></td>
<td><code>np.nan</code></td>
</tr>
<tr>
<td><code>boolean</code></td>
<td>Cast to <code>object</code></td>
<td><code>None</code> or <code>np.nan</code></td>
</tr>
</tbody>
</table>


### Operating on Null Values



- `isnull()`: Generate a boolean mask indicating missing values
- `notnull()`: Opposite of isnull()
- `dropna()`: Return a filtered version of the data
- `fillna()`: Return a copy of the data with missing values filled or imputed



#### Detecting null values



In [64]:
data = pd.Series([1, np.nan, 'hello', None, -1, 0, -9999, "NA"])

In [65]:
data

0        1
1      NaN
2    hello
3     None
4       -1
5        0
6    -9999
7       NA
dtype: object

In [66]:
data.isnull()

0    False
1     True
2    False
3     True
4    False
5    False
6    False
7    False
dtype: bool

In [67]:
data[data.notnull()]

0        1
2    hello
4       -1
5        0
6    -9999
7       NA
dtype: object

#### Dropping null values


In [72]:
data.dropna().reset_index(drop=True)

0        1
1    hello
2       -1
3        0
4    -9999
5       NA
dtype: object

In [73]:
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])

In [74]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [75]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [76]:
df.dropna(axis="columns")

Unnamed: 0,2
0,2
1,5
2,6


In [77]:
df[3] = np.nan

In [78]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [80]:
df.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [84]:
df.dropna(axis="columns", thresh=3)

Unnamed: 0,2
0,2
1,5
2,6


#### Filling null values



In [85]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))

In [86]:
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [87]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [88]:
data.fillna(method="ffill")

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [89]:
data.fillna(method="bfill")

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [90]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [93]:
df.fillna(method="ffill", axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


## Dropping Missing Values

In [95]:
laptops.isnull().sum()

manufacturer            0
model_name              0
category                0
screen_size_inches      0
screen                  0
cpu                     0
ram_gb                  0
storage                 0
gpu                     0
os                      0
os_version            170
weight                  0
price_euros             0
cpu_manufacturer        0
gpu_manufacturer        0
dtype: int64

In [98]:
laptops.shape

(1303, 15)

In [99]:
laptops.dropna(axis=0).shape

(1133, 15)

In [100]:
laptops.dropna(axis=1).shape

(1303, 14)

## Filling Missing Values

In [103]:
laptops["os_version"].value_counts(dropna=False)

10      1072
NaN      170
7         45
X          8
10 S       8
Name: os_version, dtype: int64

In [106]:
laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()

No OS        66
Linux        62
Chrome OS    27
macOS        13
Android       2
Name: os, dtype: int64

In [110]:
laptops.loc[laptops["os"] == "macOS", ["os", "os_version"]].head(10)

Unnamed: 0,os,os_version
0,macOS,
1,macOS,
3,macOS,
4,macOS,
6,macOS,X
7,macOS,
12,macOS,
14,macOS,
15,macOS,
17,macOS,


In [111]:
laptops.loc[laptops["os"] == "macOS", "os_version"] = "X"

In [113]:
laptops.loc[laptops["os_version"].isnull(), "os_version"] = "Version Unknown"

In [114]:
laptops.isnull().sum()

manufacturer          0
model_name            0
category              0
screen_size_inches    0
screen                0
cpu                   0
ram_gb                0
storage               0
gpu                   0
os                    0
os_version            0
weight                0
price_euros           0
cpu_manufacturer      0
gpu_manufacturer      0
dtype: int64

## Removing Duplicates

In [115]:
laptops.duplicated().head()

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [116]:
laptops.duplicated().value_counts()

False    1275
True       28
dtype: int64

In [118]:
laptops[laptops.duplicated()].head()

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,cpu_manufacturer,gpu_manufacturer
1275,Asus,ZenBook UX305CA-UBM1,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core M 6Y30 0.9GHz,8,512GB SSD,Intel HD Graphics 515,Windows,10,1.2kg,72900,Intel,Intel
1276,Dell,Inspiron 3567,Notebook,15.6,1366x768,Intel Core i3 7100U 2.4GHz,6,1TB HDD,Intel HD Graphics 620,Windows,10,2.3kg,45900,Intel,Intel
1277,Acer,Aspire ES1-531,Notebook,15.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,4,500GB HDD,Intel HD Graphics 400,Linux,Version Unknown,2.4kg,28900,Intel,Intel
1278,Dell,Inspiron 3552,Notebook,15.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,500GB HDD,Intel HD Graphics,Windows,10,2.20kg,37900,Intel,Intel
1279,Lenovo,IdeaPad Y700-15ISK,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,8,1TB HDD,Nvidia GeForce GTX 960M,Windows,10,2.6kg,89900,Intel,Nvidia


In [119]:
laptops[laptops["model_name"] == "ZenBook UX305CA-UBM1"]

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,cpu_manufacturer,gpu_manufacturer
1261,Asus,ZenBook UX305CA-UBM1,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core M 6Y30 0.9GHz,8,512GB SSD,Intel HD Graphics 515,Windows,10,1.2kg,72900,Intel,Intel
1275,Asus,ZenBook UX305CA-UBM1,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core M 6Y30 0.9GHz,8,512GB SSD,Intel HD Graphics 515,Windows,10,1.2kg,72900,Intel,Intel
1289,Asus,ZenBook UX305CA-UBM1,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core M 6Y30 0.9GHz,8,512GB SSD,Intel HD Graphics 515,Windows,10,1.2kg,72900,Intel,Intel


In [120]:
laptops.shape

(1303, 15)

In [121]:
laptops.drop_duplicates(inplace=True)

In [122]:
laptops.shape

(1275, 15)

In [125]:
len(laptops["model_name"].unique())

618

## Replacing Values

In [126]:
laptops["manufacturer"].value_counts()

Dell         291
Lenovo       289
HP           268
Asus         152
Acer         101
MSI           54
Toshiba       48
Apple         21
Samsung        9
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: manufacturer, dtype: int64

In [127]:
laptops.replace("MSI", "Micro-Star", inplace=True)

In [128]:
laptops["manufacturer"].value_counts()

Dell          291
Lenovo        289
HP            268
Asus          152
Acer          101
Micro-Star     54
Toshiba        48
Apple          21
Samsung         9
Razer           7
Mediacom        7
Microsoft       6
Xiaomi          4
Vero            4
Chuwi           3
Google          3
Fujitsu         3
LG              3
Huawei          2
Name: manufacturer, dtype: int64

## Dropping Columns 

In [129]:
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,cpu_manufacturer,gpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,X,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,X,1.34kg,89894,Intel,Intel
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,Version Unknown,1.86kg,57500,Intel,Intel
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,X,1.83kg,253745,Intel,AMD
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,X,1.37kg,180360,Intel,Intel


In [130]:
laptops.drop(columns=["category", "gpu"], inplace=True)

In [131]:
laptops.head()

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight,price_euros,cpu_manufacturer,gpu_manufacturer
0,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34kg,89894,Intel,Intel
2,HP,250 G6,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,No OS,Version Unknown,1.86kg,57500,Intel,Intel
3,Apple,MacBook Pro,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,macOS,X,1.83kg,253745,Intel,AMD
4,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,macOS,X,1.37kg,180360,Intel,Intel


## Vaja

In [137]:
laptops["weight"] = laptops["weight"].str.replace("kgs", "").str.replace("kg", "").astype("float")
laptops.rename({"weight": "weight_kg"}, axis=1, inplace=True)

In [139]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,cpu_manufacturer,gpu_manufacturer
0,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37,133969,Intel,Intel
1,Apple,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34,89894,Intel,Intel


### Convert the price_euros column to a numeric dtype.

In [141]:
laptops["price_euros"] = laptops["price_euros"].str.replace(",", ".").astype("float")

### Extract the screen resolution from the screen column.

In [148]:
resolution = laptops["screen"].str.split(" ").str[-1].str.split("x")

In [149]:
resolution

0       [2560, 1600]
1        [1440, 900]
2       [1920, 1080]
3       [2880, 1800]
4       [2560, 1600]
            ...     
1270    [1920, 1080]
1271    [3200, 1800]
1272     [1366, 768]
1273     [1366, 768]
1274     [1366, 768]
Name: screen, Length: 1275, dtype: object

In [151]:
laptops['screen_width_px'] = resolution.[0].astype("int")
laptops['screen_high_px'] = resolution.str[1].astype("int")
# odstranite stolpec screen
laptops.drop(columns=["screen"], inplace=True)

### Extract the processor speed from the cpu column.

In [155]:
laptops.head()

Unnamed: 0,manufacturer,model_name,screen_size_inches,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,cpu_manufacturer,gpu_manufacturer,screen_width_px,screen_high_px
0,Apple,MacBook Pro,13.3,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37,1339.69,Intel,Intel,2560,1600
1,Apple,Macbook Air,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34,898.94,Intel,Intel,1440,900
2,HP,250 G6,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,No OS,Version Unknown,1.86,575.0,Intel,Intel,1920,1080
3,Apple,MacBook Pro,15.4,Intel Core i7 2.7GHz,16,512GB SSD,macOS,X,1.83,2537.45,Intel,AMD,2880,1800
4,Apple,MacBook Pro,13.3,Intel Core i5 3.1GHz,8,256GB SSD,macOS,X,1.37,1803.6,Intel,Intel,2560,1600


In [164]:
laptops['cpu_speed_ghz'] = laptops["cpu"].str.split(" ") \
              .str[-1] \
              .str.replace("GHz", "") \
              .astype("float")

In [165]:
laptops.head()

Unnamed: 0,manufacturer,model_name,screen_size_inches,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,cpu_manufacturer,gpu_manufacturer,screen_width_px,screen_high_px,cpu_speed_ghz
0,Apple,MacBook Pro,13.3,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37,1339.69,Intel,Intel,2560,1600,2.3
1,Apple,Macbook Air,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34,898.94,Intel,Intel,1440,900,1.8
2,HP,250 G6,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,No OS,Version Unknown,1.86,575.0,Intel,Intel,1920,1080,2.5
3,Apple,MacBook Pro,15.4,Intel Core i7 2.7GHz,16,512GB SSD,macOS,X,1.83,2537.45,Intel,AMD,2880,1800,2.7
4,Apple,MacBook Pro,13.3,Intel Core i5 3.1GHz,8,256GB SSD,macOS,X,1.37,1803.6,Intel,Intel,2560,1600,3.1


In [208]:
storage_regex = "(?P<size_disk1>\d+)(?P<unit_disk1>[T,G]B)\s*(?P<type_disk1>[\w ]+)\+? *(?P<size_disk2>\d+)?(?P<unit_disk2>[T,G]B)? *(?P<type_disk2>[\w ]+)?"
disk_storage = laptops["storage"].str.extract(storage_regex).tail(30)

In [209]:
disk_storage.loc[disk_storage["unit_disk1"] == "TB", "size_disk1"] = disk_storage["size_disk1"].astype("int") * 1000

In [213]:
disk_storage.loc[disk_storage["unit_disk2"] == "TB", "size_disk2"] = disk_storage["size_disk2"].astype("float") * 1000

In [223]:
disk_storage.loc[disk_storage["size_disk2"].isnull(), "size_disk2"] = 0

In [224]:
disk_storage["storage_size_gb"] = disk_storage["size_disk1"].astype("float") + disk_storage["size_disk2"].astype("float")

In [230]:
disk_storage.loc[disk_storage["type_disk2"].isnull(), "type_disk2"] = ""
disk_storage["storage_type"] =  disk_storage["type_disk1"].str.strip().str.cat(disk_storage["type_disk2"], sep="/")

In [234]:
disk_storage["storage_size_gb"]

1245    1256.0
1246     500.0
1247    1256.0
1248     256.0
1249     256.0
1250     500.0
1251     500.0
1252     128.0
1253     256.0
1254     500.0
1255    1000.0
1256    1128.0
1257     500.0
1258    1000.0
1259    1128.0
1260     128.0
1261     512.0
1262    1000.0
1263     500.0
1264     500.0
1265    1000.0
1266    1000.0
1267    1000.0
1268      32.0
1269     500.0
1270     128.0
1271     512.0
1272      64.0
1273    1000.0
1274     500.0
Name: storage_size_gb, dtype: float64

In [231]:
laptops["storage_size_gb"] = disk_storage["storage_size_gb"]
laptops["storage_type"] = disk_storage["storage_type"]

In [232]:
laptops.head()

Unnamed: 0,manufacturer,model_name,screen_size_inches,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,cpu_manufacturer,gpu_manufacturer,screen_width_px,screen_high_px,cpu_speed_ghz,storage_size_gb,storage_type
0,Apple,MacBook Pro,13.3,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37,1339.69,Intel,Intel,2560,1600,2.3,,
1,Apple,Macbook Air,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34,898.94,Intel,Intel,1440,900,1.8,,
2,HP,250 G6,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,No OS,Version Unknown,1.86,575.0,Intel,Intel,1920,1080,2.5,,
3,Apple,MacBook Pro,15.4,Intel Core i7 2.7GHz,16,512GB SSD,macOS,X,1.83,2537.45,Intel,AMD,2880,1800,2.7,,
4,Apple,MacBook Pro,13.3,Intel Core i5 3.1GHz,8,256GB SSD,macOS,X,1.37,1803.6,Intel,Intel,2560,1600,3.1,,


## Save clean data to CSV file

## Analiza

### Are laptops made by Apple more expensive than those made by other manufacturers?


In [240]:
laptops.groupby("manufacturer")["price_euros"].mean().sort_values(ascending=False)

manufacturer
Razer         3346.142857
LG            2099.000000
Micro-Star    1728.908148
Google        1677.666667
Microsoft     1612.308333
Apple         1564.198571
Huawei        1424.000000
Samsung       1413.444444
Toshiba       1267.812500
Dell          1199.225120
Xiaomi        1133.462500
Asus          1123.829737
Lenovo        1093.862215
HP            1080.314664
Fujitsu        729.000000
Acer           633.464455
Chuwi          314.296667
Mediacom       295.000000
Vero           217.425000
Name: price_euros, dtype: float64

### What is the best value laptop with a screen size of 15" or more?
            

In [244]:
laptops[laptops["screen_size_inches"] >= 15].sort_values(by="price_euros").head()

Unnamed: 0,manufacturer,model_name,screen_size_inches,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,cpu_manufacturer,gpu_manufacturer,screen_width_px,screen_high_px,cpu_speed_ghz,storage_size_gb,storage_type
290,Acer,Chromebook C910-C2ST,15.6,Intel Celeron Dual Core 3205U 1.5GHz,2,16GB SSD,Chrome OS,Version Unknown,2.19,199.0,Intel,Intel,1366,768,1.5,,
1102,Acer,Chromebook 15,15.6,Intel Celeron Dual Core 3205U 1.5GHz,4,16GB SSD,Chrome OS,Version Unknown,2.2,209.0,Intel,Intel,1366,768,1.5,,
555,Asus,A541NA-GO342 (N3350/4GB/500GB/Linux),15.6,Intel Celeron Dual Core N3350 1.1GHz,4,500GB HDD,Linux,Version Unknown,2.0,224.0,Intel,Intel,1366,768,1.1,,
30,Chuwi,"LapBook 15.6""",15.6,Intel Atom x5-Z8300 1.44GHz,4,64GB Flash Storage,Windows,10,1.89,244.99,Intel,Intel,1920,1080,1.44,,
483,Chuwi,"Lapbook 15,6",15.6,Intel Atom x5-Z8350 1.44GHz,4,64GB Flash Storage,Windows,10,1.89,248.9,Intel,Intel,1920,1080,1.44,,


### Which laptop has the most RAM?

In [246]:
laptops[laptops["ram_gb"] == laptops["ram_gb"].max()]

Unnamed: 0,manufacturer,model_name,screen_size_inches,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,cpu_manufacturer,gpu_manufacturer,screen_width_px,screen_high_px,cpu_speed_ghz,storage_size_gb,storage_type
1066,Asus,ROG G701VO,17.3,Intel Core i7 6820HK 2.7GHz,64,1TB SSD,Windows,10,3.58,3975.0,Intel,Nvidia,1920,1080,2.7,,
