# Priprava in čiščenje podatkov

## Get the data - Reading CSV Files with Encodings

In [2]:
import pandas as pd 
import numpy as np

    df = pd.read_csv("filename.csv", encoding="some_encoding")

In [5]:
# Load the raw laptop listings; the file is decoded as Latin-1 via `encoding`
laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")

In [6]:
laptops.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


In [7]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


## Cleaning Column Names

In [10]:
#laptops[" Storage"]

In [23]:
# Normalise every header: trim surrounding whitespace, shorten
# "Operating System" to "os", turn spaces into underscores, drop the
# parentheses and lowercase the result.
new_columns = [
    header.strip()
    .replace("Operating System", "os")
    .replace(" ", "_")
    .replace("(", "")
    .replace(")", "")
    .lower()
    for header in laptops.columns
]

print(new_columns)
laptops.columns = new_columns

['manufacturer', 'model_name', 'category', 'screen_size', 'screen', 'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight', 'price_euros']


In [24]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


In [26]:
# A shorter way: wrap the cleaning steps in a reusable function

def clean_col(column):
    """Return a normalised version of a column label.

    The label is stripped of surrounding whitespace, "Operating System" is
    shortened to "os", spaces become underscores, parentheses are removed
    and the result is lowercased.
    """
    cleaned = column.strip()
    # Order matters: shorten "Operating System" before spaces are replaced
    for old, new in (
        ("Operating System", "os"),
        (" ", "_"),
        ("(", ""),
        (")", ""),
    ):
        cleaned = cleaned.replace(old, new)
    return cleaned.lower()

# Run every existing header through clean_col and install the cleaned labels
laptops.columns = list(map(clean_col, laptops.columns))

In [27]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


## Converting String Columns to Numeric


<p><img alt="string to numeric cleaning workflow" src="images/cleaning_workflow.svg"></p>


In [30]:
print(laptops["screen_size"].dtype)

object


In [31]:
laptops["screen_size"].unique()

array(['13.3"', '15.6"', '15.4"', '14.0"', '12.0"', '11.6"', '17.3"',
       '10.1"', '13.5"', '12.5"', '13.0"', '18.4"', '13.9"', '12.3"',
       '17.0"', '15.0"', '14.1"', '11.3"'], dtype=object)


<p></p><center><img alt="vectorized_string_methods" src="images/Syntax.png"></center><p></p>





In [33]:
# Remove the inch mark (") from every value so only the number text remains
laptops["screen_size"] = laptops["screen_size"].str.replace(pat='"', repl="")

In [34]:
laptops["screen_size"].unique()

array(['13.3', '15.6', '15.4', '14.0', '12.0', '11.6', '17.3', '10.1',
       '13.5', '12.5', '13.0', '18.4', '13.9', '12.3', '17.0', '15.0',
       '14.1', '11.3'], dtype=object)

In [35]:
# The values are now plain number strings, so they can be cast to floats
laptops["screen_size"] = laptops["screen_size"].astype(float)

In [36]:
print(laptops["screen_size"].dtype)

float64


In [38]:
# Put the unit into the column name now that the values no longer carry it
laptops.rename(columns={"screen_size": "screen_size_inches"}, inplace=True)

In [39]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


In [47]:
# Exercise: clean the ram column -> convert it to an int dtype and rename it to ram_gb
ram_without_unit = laptops["ram"].str.replace("GB", "")
laptops["ram"] = ram_without_unit.astype(int)
laptops.rename({"ram": "ram_gb"}, axis="columns", inplace=True)
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


In [48]:
# Summary statistics of the cleaned RAM column (count, mean, std, quartiles, min/max)
laptops["ram_gb"].describe()

count    1303.000000
mean        8.382195
std         5.084665
min         2.000000
25%         4.000000
50%         8.000000
75%         8.000000
max        64.000000
Name: ram_gb, dtype: float64

## Extracting Values from Strings

In [54]:
laptops["cpu"].head(10)

0          Intel Core i5 2.3GHz
1          Intel Core i5 1.8GHz
2    Intel Core i5 7200U 2.5GHz
3          Intel Core i7 2.7GHz
4          Intel Core i5 3.1GHz
5       AMD A9-Series 9420 3GHz
6          Intel Core i7 2.2GHz
7          Intel Core i5 1.8GHz
8    Intel Core i7 8550U 1.8GHz
9    Intel Core i5 8250U 1.6GHz
Name: cpu, dtype: object

In [55]:
# The first whitespace-separated token of each gpu/cpu description is
# stored in a new "<part>_manufacturer" column (gpu first, then cpu).
for part in ("gpu", "cpu"):
    laptops[part + "_manufacturer"] = laptops[part].str.split().str[0]

In [56]:
laptops.head(3)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894,Intel,Intel
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500,Intel,Intel


In [57]:
laptops["gpu_manufacturer"].value_counts()

Intel     722
Nvidia    400
AMD       180
ARM         1
Name: gpu_manufacturer, dtype: int64

## Correcting Bad Values - map() method

In [58]:
laptops["os"].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          13
Mac OS          8
Android         2
Name: os, dtype: int64

In [59]:
s = pd.Series(['pair', 'oranje', 'bananna', 'oranje', 'oranje', 'oranje'])

In [60]:
s

0       pair
1     oranje
2    bananna
3     oranje
4     oranje
5     oranje
dtype: object

In [61]:
corrections = {
    "oranje": "orange",
    "bananna": "banana"
}

In [62]:
s.map(corrections)

0       NaN
1    orange
2    banana
3    orange
4    orange
5    orange
dtype: object

In [63]:
# Labels that are already correct map to themselves ...
mapping_dict = {
    label: label
    for label in ("Android", "Chrome OS", "Linux", "No OS", "Windows", "macOS")
}
# ... and the one inconsistent spelling is folded into "macOS"
mapping_dict["Mac OS"] = "macOS"

In [64]:
# Replace each label via mapping_dict. Series.map turns any value that is not
# a key of the dictionary into NaN, so the mapping has to list every label.
laptops["os"] = laptops["os"].map(mapping_dict)

In [65]:
laptops["os"].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          21
Android         2
Name: os, dtype: int64

## Introduction to Missing Data 

### Trade-Offs in Missing Data Conventions


### Missing Data in Pandas




### None: Pythonic missing data


In [68]:
vals1 = np.array([1,2,None,4,5])
vals1

array([1, 2, None, 4, 5], dtype=object)

In [69]:
# Summing an object array that contains None fails; catch the error and show
# its message so the notebook still runs top to bottom.
try:
    vals1.sum()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

### NaN: Missing numerical data



In [70]:
vals2 = np.array([1, np.nan, 3, 4])

In [71]:
vals2

array([ 1., nan,  3.,  4.])

In [72]:
vals2.dtype

dtype('float64')

In [74]:
np.nan + 1

nan

In [75]:
np.nan * 0

nan

In [76]:
vals2 + 3

array([ 4., nan,  6.,  7.])

In [77]:
vals2.sum()

nan

In [79]:
np.nansum(vals2)

8.0

In [80]:
np.nanmax(vals2)

4.0

In [81]:
np.nanmin(vals2)

1.0

### NaN and None in Pandas



In [82]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64


<table>
<thead><tr>
<th>Typeclass</th>
<th>Conversion When Storing NAs</th>
<th>NA Sentinel Value</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>floating</code></td>
<td>No change</td>
<td><code>np.nan</code></td>
</tr>
<tr>
<td><code>object</code></td>
<td>No change</td>
<td><code>None</code> or <code>np.nan</code></td>
</tr>
<tr>
<td><code>integer</code></td>
<td>Cast to <code>float64</code></td>
<td><code>np.nan</code></td>
</tr>
<tr>
<td><code>boolean</code></td>
<td>Cast to <code>object</code></td>
<td><code>None</code> or <code>np.nan</code></td>
</tr>
</tbody>
</table>


### Operating on Null Values



- `isnull()`: Generate a boolean mask indicating missing values
- `notnull()`: Opposite of isnull()
- `dropna()`: Return a filtered version of the data
- `fillna()`: Return a copy of the data with missing values filled or imputed



#### Detecting null values



In [83]:
data = pd.Series([1, np.nan, 'hello', None])

In [84]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [85]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [86]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [87]:
data[data.notnull()]

0        1
2    hello
dtype: object

#### Dropping null values


In [89]:
data.dropna()

0        1
2    hello
dtype: object

In [90]:
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])

In [91]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [93]:
df.dropna(axis="index")

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [94]:
df.dropna(axis="columns")

Unnamed: 0,2
0,2
1,5
2,6


In [95]:
df[3] = np.nan

In [96]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [98]:
df.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [99]:
df.dropna(axis="rows", thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


#### Filling null values



In [100]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))

In [101]:
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [102]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [103]:
# Forward-fill: each missing value takes the previous valid value
# (ffill() replaces the deprecated fillna(method="ffill") spelling)
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [104]:
# Back-fill: each missing value takes the next valid value
# (bfill() replaces the deprecated fillna(method="bfill") spelling)
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [105]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [107]:
# Forward-fill along each row (left to right); a missing value in the first
# column has nothing to its left and therefore stays NaN.
df.ffill(axis="columns")

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


## Dropping Missing Values

In [108]:
laptops.isnull().sum()

manufacturer            0
model_name              0
category                0
screen_size_inches      0
screen                  0
cpu                     0
ram_gb                  0
storage                 0
gpu                     0
os                      0
os_version            170
weight                  0
price_euros             0
gpu_manufacturer        0
cpu_manufacturer        0
dtype: int64

In [109]:
laptops.shape

(1303, 15)

In [110]:
laptops.dropna(axis=0).shape

(1133, 15)

In [111]:
laptops.dropna(axis=1).shape

(1303, 14)

## Filling Missing Values

In [113]:
laptops["os_version"].value_counts(dropna=False)

10      1072
NaN      170
7         45
X          8
10 S       8
Name: os_version, dtype: int64

In [116]:
laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()

No OS        66
Linux        62
Chrome OS    27
macOS        13
Android       2
Name: os, dtype: int64

In [121]:
# Mac: give every macOS row the version label "X"
is_macos = laptops["os"] == "macOS"
laptops.loc[is_macos, "os_version"] = "X"

In [122]:
laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()

No OS        66
Linux        62
Chrome OS    27
Android       2
Name: os, dtype: int64

In [125]:
# Rows for these systems get an explicit placeholder instead of NaN
unversioned_os = ["Linux", "Chrome OS", "Android", "No OS"]
laptops.loc[laptops["os"].isin(unversioned_os), "os_version"] = "Version Unknown"
# no more missing values
laptops.isnull().sum()

manufacturer          0
model_name            0
category              0
screen_size_inches    0
screen                0
cpu                   0
ram_gb                0
storage               0
gpu                   0
os                    0
os_version            0
weight                0
price_euros           0
gpu_manufacturer      0
cpu_manufacturer      0
dtype: int64

## Removing Duplicates

In [128]:
laptops.duplicated().head()

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [129]:
laptops.duplicated().value_counts()

False    1275
True       28
dtype: int64

In [131]:
#laptops[laptops.duplicated()]

In [132]:
laptops.shape

(1303, 15)

In [133]:
# Remove fully identical rows in place, keeping the first occurrence of each
laptops.drop_duplicates(keep="first", inplace=True)

In [134]:
laptops.shape

(1275, 15)

## Replacing Values

In [135]:
laptops["manufacturer"].value_counts()

Dell         291
Lenovo       289
HP           268
Asus         152
Acer         101
MSI           54
Toshiba       48
Apple         21
Samsung        9
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: manufacturer, dtype: int64

In [136]:
# Spell out the abbreviation in the manufacturer column only; a frame-wide
# replace would also rewrite any other column whose value is exactly "MSI".
laptops["manufacturer"] = laptops["manufacturer"].replace(
    "MSI", "Micro-Star International"
)

In [137]:
laptops["manufacturer"].value_counts()

Dell                        291
Lenovo                      289
HP                          268
Asus                        152
Acer                        101
Micro-Star International     54
Toshiba                      48
Apple                        21
Samsung                       9
Razer                         7
Mediacom                      7
Microsoft                     6
Xiaomi                        4
Vero                          4
Chuwi                         3
Google                        3
Fujitsu                       3
LG                            3
Huawei                        2
Name: manufacturer, dtype: int64

## Dropping Columns 

In [138]:
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,X,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,X,1.34kg,89894,Intel,Intel
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,Version Unknown,1.86kg,57500,Intel,Intel
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,X,1.83kg,253745,AMD,Intel
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,X,1.37kg,180360,Intel,Intel


In [139]:
# Remove the category and gpu columns in place
laptops.drop(["category", "gpu"], axis="columns", inplace=True)

In [140]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34kg,89894,Intel,Intel


## Vaja

In [144]:
# Drop the unit suffix ("kgs" first, then "kg") and parse the rest as floats
weight_text = laptops["weight"].str.replace("kgs", "")
weight_text = weight_text.str.replace("kg", "")
laptops["weight"] = weight_text.astype(float)

In [145]:
# Put the unit into the column name now that the values are plain numbers
laptops.rename(columns={"weight": "weight_kg"}, inplace=True)

In [146]:
laptops.dtypes

manufacturer           object
model_name             object
screen_size_inches    float64
screen                 object
cpu                    object
ram_gb                  int64
storage                object
os                     object
os_version             object
weight_kg             float64
price_euros            object
gpu_manufacturer       object
cpu_manufacturer       object
dtype: object

### Convert the price_euros column to a numeric dtype.

In [147]:
# Swap the comma for a decimal point so the price text can be cast to float
price_text = laptops["price_euros"].str.replace(",", ".")
laptops["price_euros"] = price_text.astype(float)

In [148]:
laptops['price_euros'].head()

0    1339.69
1     898.94
2     575.00
3    2537.45
4    1803.60
Name: price_euros, dtype: float64

In [149]:
laptops.dtypes

manufacturer           object
model_name             object
screen_size_inches    float64
screen                 object
cpu                    object
ram_gb                  int64
storage                object
os                     object
os_version             object
weight_kg             float64
price_euros           float64
gpu_manufacturer       object
cpu_manufacturer       object
dtype: object

### Extract the screen resolution from the screen column.

In [150]:
laptops['screen'].head()

0    IPS Panel Retina Display 2560x1600
1                              1440x900
2                     Full HD 1920x1080
3    IPS Panel Retina Display 2880x1800
4    IPS Panel Retina Display 2560x1600
Name: screen, dtype: object

In [154]:
# Take the last space-separated token of each screen description
# (e.g. "2560x1600") and split it on "x" into its two parts
resolution_text = laptops["screen"].str.split(" ").str[-1]
resolution_splited = resolution_text.str.split("x")

In [155]:
# First part of the split resolution -> width, second part -> height (as ints)
laptops["screen_width_px"] = resolution_splited.str.get(0).astype(int)
laptops["screen_high_px"] = resolution_splited.str.get(1).astype(int)

In [157]:
laptops.dtypes

manufacturer           object
model_name             object
screen_size_inches    float64
screen                 object
cpu                    object
ram_gb                  int64
storage                object
os                     object
os_version             object
weight_kg             float64
price_euros           float64
gpu_manufacturer       object
cpu_manufacturer       object
screen_width_px         int64
screen_high_px          int64
dtype: object

In [158]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,gpu_manufacturer,cpu_manufacturer,screen_width_px,screen_high_px
0,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37,1339.69,Intel,Intel,2560,1600
1,Apple,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34,898.94,Intel,Intel,1440,900


### Extract the processor speed from the cpu column.

In [159]:
laptops["cpu"]

0                       Intel Core i5 2.3GHz
1                       Intel Core i5 1.8GHz
2                 Intel Core i5 7200U 2.5GHz
3                       Intel Core i7 2.7GHz
4                       Intel Core i5 3.1GHz
                        ...                 
1270              Intel Core i7 6500U 2.5GHz
1271              Intel Core i7 6500U 2.5GHz
1272    Intel Celeron Dual Core N3050 1.6GHz
1273              Intel Core i7 6500U 2.5GHz
1274    Intel Celeron Dual Core N3050 1.6GHz
Name: cpu, Length: 1275, dtype: object

In [165]:
# The clock speed is the last whitespace-separated token (e.g. "2.3GHz");
# strip the unit and parse the remainder as a float
cpu_speed_text = laptops["cpu"].str.split().str[-1]
laptops["cpu_speed_ghz"] = cpu_speed_text.str.replace("GHz", "").astype(float)

In [166]:
laptops.head()

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,gpu_manufacturer,cpu_manufacturer,screen_width_px,screen_high_px,cpu_speed_ghz
0,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37,1339.69,Intel,Intel,2560,1600,2.3
1,Apple,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34,898.94,Intel,Intel,1440,900,1.8
2,HP,250 G6,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,No OS,Version Unknown,1.86,575.0,Intel,Intel,1920,1080,2.5
3,Apple,MacBook Pro,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,macOS,X,1.83,2537.45,AMD,Intel,2880,1800,2.7
4,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,macOS,X,1.37,1803.6,Intel,Intel,2560,1600,3.1


## Save clean data to CSV file

In [167]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file
laptops.to_csv("data/OUTPUT_laptops_cleaned.csv", index=False)

## Analiza

### Are laptops made by Apple more expensive than those made by other manufacturers?


In [168]:
# Mean price per manufacturer, computed in a single grouped pass instead of
# building one boolean mask per manufacturer in a Python loop.
# sort=False keeps the manufacturers in order of first appearance; the result
# stays a plain {manufacturer: mean_price} dict.
man_dict = (
    laptops.groupby("manufacturer", sort=False)["price_euros"]
    .mean()
    .to_dict()
)

In [173]:
pd.Series(man_dict).sort_values(ascending=False)

Razer                       3346.142857
LG                          2099.000000
Micro-Star International    1728.908148
Google                      1677.666667
Microsoft                   1612.308333
Apple                       1564.198571
Huawei                      1424.000000
Samsung                     1413.444444
Toshiba                     1267.812500
Dell                        1199.225120
Xiaomi                      1133.462500
Asus                        1123.829737
Lenovo                      1093.862215
HP                          1080.314664
Fujitsu                      729.000000
Acer                         633.464455
Chuwi                        314.296667
Mediacom                     295.000000
Vero                         217.425000
dtype: float64

### What is the best value laptop with a screen size of 15" or more?
            

In [177]:
# Laptops with a screen of at least 15 inches, cheapest first (top five rows)
large_screen = laptops["screen_size_inches"] >= 15
laptops[large_screen].sort_values(by="price_euros").head()

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,gpu_manufacturer,cpu_manufacturer,screen_width_px,screen_high_px,cpu_speed_ghz
290,Acer,Chromebook C910-C2ST,15.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,2,16GB SSD,Chrome OS,Version Unknown,2.19,199.0,Intel,Intel,1366,768,1.5
1102,Acer,Chromebook 15,15.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,4,16GB SSD,Chrome OS,Version Unknown,2.2,209.0,Intel,Intel,1366,768,1.5
555,Asus,A541NA-GO342 (N3350/4GB/500GB/Linux),15.6,1366x768,Intel Celeron Dual Core N3350 1.1GHz,4,500GB HDD,Linux,Version Unknown,2.0,224.0,Intel,Intel,1366,768,1.1
30,Chuwi,"LapBook 15.6""",15.6,Full HD 1920x1080,Intel Atom x5-Z8300 1.44GHz,4,64GB Flash Storage,Windows,10,1.89,244.99,Intel,Intel,1920,1080,1.44
483,Chuwi,"Lapbook 15,6",15.6,Full HD 1920x1080,Intel Atom x5-Z8350 1.44GHz,4,64GB Flash Storage,Windows,10,1.89,248.9,Intel,Intel,1920,1080,1.44


### Which laptop has the most RAM?

In [178]:
# Every row whose RAM equals the largest value found in the column
most_ram = laptops["ram_gb"].max()
laptops[laptops["ram_gb"] == most_ram]

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight_kg,price_euros,gpu_manufacturer,cpu_manufacturer,screen_width_px,screen_high_px,cpu_speed_ghz
1066,Asus,ROG G701VO,17.3,IPS Panel Full HD 1920x1080,Intel Core i7 6820HK 2.7GHz,64,1TB SSD,Windows,10,3.58,3975.0,Nvidia,Intel,1920,1080,2.7
