# Priprava in čiščenje podatkov

## Get the data - Reading CSV Files with Encodings

In [172]:
import pandas as pd 
import numpy as np

In [173]:
!head -n 3 data/INPUT_laptops.csv

Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM, Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,"1339,69"
Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,"898,94"


In [174]:
# Load the raw laptops CSV, passing the character encoding explicitly
# (presumably the file is not valid UTF-8 — TODO confirm).
laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")

General form — pass the file's character encoding to `read_csv` explicitly:

    df = pd.read_csv("filename.csv", encoding="some_encoding")

In [175]:
laptops.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


In [176]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


## Cleaning Column Names

In [177]:
# The raw "Storage" label carries a leading space (see the info() listing),
# so the column could only be selected by typing that space exactly:
#laptops[" Storage"]

In [178]:
laptops.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)'],
      dtype='object')

In [179]:
# Independent copy, so the column-renaming experiment does not modify `laptops`.
laptops_test = laptops.copy()

In [180]:
# Overwrite all 13 column labels at once with the placeholder names A..M.
laptops_test.columns = list("ABCDEFGHIJKLM")

In [181]:
laptops_test.head(2)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


In [182]:
def clean_col(col_name):
    """Normalize a raw column label into a snake_case identifier.

    Surrounding whitespace is stripped, "Operating System" is shortened to
    "os", spaces become underscores, parentheses are removed and the result
    is lower-cased.
    """
    shortened = col_name.strip().replace("Operating System", "os")
    underscored = shortened.replace(" ", "_")
    # Remove both kinds of parenthesis in a single pass.
    without_parens = underscored.translate(str.maketrans("", "", "()"))
    return without_parens.lower()

# Apply the cleaner to every column label.
laptops.columns = list(map(clean_col, laptops.columns))

laptops.head(1)

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969


## Converting String Columns to Numeric

In [183]:
laptops.iloc[:5, 2:5]

Unnamed: 0,category,screen_size,screen
0,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600
1,Ultrabook,"13.3""",1440x900
2,Notebook,"15.6""",Full HD 1920x1080
3,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800
4,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600



<p><img alt="string to numeric cleaning workflow" src="images/cleaning_workflow.svg"></p>


In [184]:
print(laptops["screen_size"].dtype)

object


In [185]:
laptops["screen_size"].unique()

array(['13.3"', '15.6"', '15.4"', '14.0"', '12.0"', '11.6"', '17.3"',
       '10.1"', '13.5"', '12.5"', '13.0"', '18.4"', '13.9"', '12.3"',
       '17.0"', '15.0"', '14.1"', '11.3"'], dtype=object)

In [186]:
laptops["ram"].unique()

array(['8GB', '16GB', '4GB', '2GB', '12GB', '6GB', '32GB', '24GB', '64GB'],
      dtype=object)


<p></p><center><img alt="vectorized_string_methods" src="images/Syntax.png"></center><p></p>





In [187]:
laptops["screen_size"].str[:-1]

0       13.3
1       13.3
2       15.6
3       15.4
4       13.3
        ... 
1298    14.0
1299    13.3
1300    14.0
1301    15.6
1302    15.6
Name: screen_size, Length: 1303, dtype: object

In [188]:
# Remove the inch mark (") so only the numeric text remains; the pattern is a
# plain literal, so regex matching is switched off explicitly.
laptops["screen_size"] = laptops["screen_size"].str.replace('"', '', regex=False)

In [189]:
laptops["screen_size"].unique()

array(['13.3', '15.6', '15.4', '14.0', '12.0', '11.6', '17.3', '10.1',
       '13.5', '12.5', '13.0', '18.4', '13.9', '12.3', '17.0', '15.0',
       '14.1', '11.3'], dtype=object)

In [190]:
# Remove the literal "GB" unit so only the numeric text remains.
laptops["ram"] = laptops["ram"].str.replace('GB', '', regex=False)

In [191]:
laptops["ram"].unique()

array(['8', '16', '4', '2', '12', '6', '32', '24', '64'], dtype=object)

In [192]:
laptops.dtypes

manufacturer    object
model_name      object
category        object
screen_size     object
screen          object
cpu             object
ram             object
storage         object
gpu             object
os              object
os_version      object
weight          object
price_euros     object
dtype: object

In [193]:
# The cleaned strings (e.g. "13.3") can now be converted to floating point.
laptops["screen_size"] = laptops["screen_size"].astype(float)

In [194]:
# The cleaned strings (e.g. "8") can now be converted to integers.
laptops["ram"] = laptops["ram"].astype(int)

In [195]:
laptops.dtypes

manufacturer     object
model_name       object
category         object
screen_size     float64
screen           object
cpu              object
ram               int64
storage          object
gpu              object
os               object
os_version       object
weight           object
price_euros      object
dtype: object

## Renaming Columns

In [196]:
# Put the unit in the column name now that the values are plain numbers.
laptops = laptops.rename(columns={"screen_size": "screen_size_inches"})

In [197]:
# Put the unit in the column name now that the values are plain numbers.
laptops = laptops.rename(columns={"ram": "ram_gb"})

In [198]:
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size_inches',
       'screen', 'cpu', 'ram_gb', 'storage', 'gpu', 'os', 'os_version',
       'weight', 'price_euros'],
      dtype='object')

In [199]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


In [200]:
laptops["ram_gb"].describe()

count    1303.000000
mean        8.382195
std         5.084665
min         2.000000
25%         4.000000
50%         8.000000
75%         8.000000
max        64.000000
Name: ram_gb, dtype: float64

## Extracting Values from Strings

In [201]:
laptops["gpu"].head()

0    Intel Iris Plus Graphics 640
1          Intel HD Graphics 6000
2           Intel HD Graphics 620
3              AMD Radeon Pro 455
4    Intel Iris Plus Graphics 650
Name: gpu, dtype: object

In [202]:
# The first whitespace-separated word of the GPU description is the vendor.
laptops["gpu_manufacturer"] = laptops["gpu"].str.split(n=1).str.get(0)

In [203]:
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,gpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894,Intel
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500,Intel
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745,AMD
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360,Intel


In [204]:
laptops["gpu_manufacturer"].value_counts()

Intel     722
Nvidia    400
AMD       180
ARM         1
Name: gpu_manufacturer, dtype: int64

In [205]:
# The first whitespace-separated word of the CPU description is the vendor.
laptops["cpu_manufacturer"] = laptops["cpu"].str.split(n=1).str.get(0)

In [206]:
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894,Intel,Intel
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500,Intel,Intel
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745,AMD,Intel
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360,Intel,Intel


## Correcting Bad Values - map() method

In [207]:
laptops["os"].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          13
Mac OS          8
Android         2
Name: os, dtype: int64

In [208]:
# Small example Series of misspelled fruit names for the Series.map() demo.
s = pd.Series(["pair", "oranje", "bananna"] + ["oranje"] * 3)

In [209]:
s

0       pair
1     oranje
2    bananna
3     oranje
4     oranje
5     oranje
dtype: object

In [210]:
# Misspelled value -> corrected value.
corrections = dict(pair="pear", oranje="orange", bananna="banana")

In [211]:
# Replace each misspelled value with its entry in `corrections`.
s = s.map(corrections)

In [212]:
s

0      pear
1    orange
2    banana
3    orange
4    orange
5    orange
dtype: object

In [213]:
# Applying the same mapping a second time: the already-corrected values are
# not keys of `corrections`, so every element becomes NaN (see the output below).
s = s.map(corrections)

In [214]:
s

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: object

In [215]:
# Lookup table for Series.map(): every label maps to itself, except that the
# "Mac OS" spelling is folded into "macOS".
mapping_dict = {
    **{label: label for label in ("Android", "Chrome OS", "Linux", "No OS", "Windows", "macOS")},
    "Mac OS": "macOS",
}

In [216]:
# Normalize the OS labels through the lookup table; every existing label is
# listed as a key in `mapping_dict`, and "Mac OS" is merged into "macOS".
laptops["os"] = laptops["os"].map(mapping_dict)

In [217]:
laptops["os"].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          21
Android         2
Name: os, dtype: int64

## Introduction to Missing Data 

### Trade-Offs in Missing Data Conventions


### Missing Data in Pandas




### None: Pythonic missing data


In [218]:
# An array containing None; NumPy stores it with dtype=object (see the output below).
vals1 = np.array([1,2,None,3,4])

In [219]:
vals1

array([1, 2, None, 3, 4], dtype=object)

In [220]:
%timeit np.arange(1E6, dtype="object").sum()

83.9 ms ± 160 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [221]:
%timeit np.arange(1E6, dtype="int").sum()

1.49 ms ± 59.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [222]:
# Summing an object array that contains None fails. The error is the point of
# this cell, so catch it and print the message instead of letting the
# exception stop a top-to-bottom run of the notebook.
try:
    vals1.sum()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

### NaN: Missing numerical data



In [223]:
# An array containing np.nan; it is stored as float64 (see the dtype output below).
vals2 = np.array([1, np.nan, 3, 4])

In [224]:
vals2.dtype

dtype('float64')

In [225]:
vals2 + 1

array([ 2., nan,  4.,  5.])

In [226]:
1 + np.nan

nan

In [227]:
0 * np.nan

nan

In [228]:
vals2.sum()

nan

In [229]:
np.nansum(vals2)

8.0

### NaN and None in Pandas



In [230]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [231]:
# Integer Series used to show what happens when a missing value is assigned to it.
x = pd.Series(range(5), dtype=int)

In [232]:
x

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [233]:
# Assign a missing value into the integer Series; in the recorded output below
# the value is stored as NaN and the dtype has become float64.
x[0] = None

In [234]:
x

0    NaN
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64


<table>
<thead><tr>
<th>Typeclass</th>
<th>Conversion When Storing NAs</th>
<th>NA Sentinel Value</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>floating</code></td>
<td>No change</td>
<td><code>np.nan</code></td>
</tr>
<tr>
<td><code>object</code></td>
<td>No change</td>
<td><code>None</code> or <code>np.nan</code></td>
</tr>
<tr>
<td><code>integer</code></td>
<td>Cast to <code>float64</code></td>
<td><code>np.nan</code></td>
</tr>
<tr>
<td><code>boolean</code></td>
<td>Cast to <code>object</code></td>
<td><code>None</code> or <code>np.nan</code></td>
</tr>
</tbody>
</table>


### Operating on Null Values



- `isnull()`: Generate a boolean mask indicating missing values
- `notnull()`: Opposite of `isnull()`
- `dropna()`: Return a filtered version of the data
- `fillna()`: Return a copy of the data with missing values filled or imputed



#### Detecting null values



In [235]:
# Mixed-content Series holding both kinds of missing marker (np.nan and None);
# it is stored with dtype object (see the output below).
data = pd.Series([1, np.nan, 'hello', None])

In [236]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [237]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [238]:
data[data.notnull()]

0        1
2    hello
dtype: object

#### Dropping null values


In [239]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [240]:
data.dropna()

0        1
2    hello
dtype: object

In [241]:
# dataframe

In [242]:
# 3x3 example frame: one missing value in column 0, one in column 1 and
# none in column 2.
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)

In [243]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [244]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [245]:
df.dropna(axis="columns")

Unnamed: 0,2
0,2
1,5
2,6


In [246]:
# Add a column that consists only of missing values.
df[3] = np.nan

In [247]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [248]:
df.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [249]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [250]:
# Drop every row that does not contain at least 3 non-null values.
df.dropna(axis="rows", thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


#### Filling null values



In [251]:
# Labelled example Series; both np.nan and None show up as NaN in the
# float64 output below.
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))

In [252]:
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [253]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [254]:
# Forward fill: propagate the previous valid value into each gap.
# Series.ffill() replaces fillna(method="ffill"), whose `method` argument is
# deprecated in recent pandas releases.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [255]:
# Backward fill: propagate the next valid value back into each gap.
# Series.bfill() replaces fillna(method="bfill"), whose `method` argument is
# deprecated in recent pandas releases.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [256]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [257]:
# Forward fill along each row (axis=1): a gap takes the value from the column
# to its left; a gap in the first column has nothing to copy and stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill").
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


## Dropping Missing Values

In [258]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894,Intel,Intel


In [259]:
laptops.isnull().sum()

manufacturer            0
model_name              0
category                0
screen_size_inches      0
screen                  0
cpu                     0
ram_gb                  0
storage                 0
gpu                     0
os                      0
os_version            170
weight                  0
price_euros             0
gpu_manufacturer        0
cpu_manufacturer        0
dtype: int64

In [260]:
laptops.shape

(1303, 15)

In [261]:
laptops.dropna(axis=0).shape

(1133, 15)

In [262]:
laptops.dropna(axis=1).shape

(1303, 14)

## Filling Missing Values

In [263]:
laptops["os_version"].value_counts(dropna=False)

10      1072
NaN      170
7         45
X          8
10 S       8
Name: os_version, dtype: int64

In [264]:
laptops.loc[laptops["os_version"].isnull(), "os"].value_counts(dropna=False)

No OS        66
Linux        62
Chrome OS    27
macOS        13
Android       2
Name: os, dtype: int64

In [265]:
laptops.loc[laptops["os"] == "macOS", ['os', 'os_version']].head()

Unnamed: 0,os,os_version
0,macOS,
1,macOS,
3,macOS,
4,macOS,
6,macOS,X


In [266]:
# Fill the missing os_version values of macOS laptops with "X".
# The null check keeps any version that is already present from being overwritten.
laptops.loc[(laptops["os"] == "macOS") & laptops["os_version"].isnull(), "os_version"] = "X"

In [267]:
laptops.loc[laptops["os_version"].isnull(), "os"].value_counts(dropna=False)

No OS        66
Linux        62
Chrome OS    27
Android       2
Name: os, dtype: int64

In [268]:
# Fill the missing os_version values of "No OS" laptops with a placeholder.
# The null check keeps any version that is already present from being overwritten.
laptops.loc[(laptops["os"] == "No OS") & laptops["os_version"].isnull(), "os_version"] = "Version Unknown"

In [269]:
laptops.loc[laptops["os_version"].isnull(), "os"].value_counts(dropna=False)

Linux        62
Chrome OS    27
Android       2
Name: os, dtype: int64

In [270]:
# Fill the missing os_version values of Linux laptops.
# NOTE(review): "Ubuntu" is an assumed distribution, not something the data states — confirm.
# The null check keeps any version that is already present from being overwritten.
laptops.loc[(laptops["os"] == "Linux") & laptops["os_version"].isnull(), "os_version"] = "Ubuntu"

In [271]:
# Fill the missing os_version values of Chrome OS laptops with a placeholder.
# The null check keeps any version that is already present from being overwritten.
laptops.loc[(laptops["os"] == "Chrome OS") & laptops["os_version"].isnull(), "os_version"] = "Version Unknown"

In [272]:
# Fill the missing os_version values of Android laptops with a placeholder.
# The null check keeps any version that is already present from being overwritten.
laptops.loc[(laptops["os"] == "Android") & laptops["os_version"].isnull(), "os_version"] = "Version Unknown"

In [273]:
laptops.loc[laptops["os_version"].isnull(), "os"].value_counts(dropna=False)

Series([], Name: os, dtype: int64)

In [274]:
laptops["os_version"].value_counts(dropna=False)

10                 1072
Version Unknown      95
Ubuntu               62
7                    45
X                    21
10 S                  8
Name: os_version, dtype: int64

## Removing Duplicates

In [275]:
laptops.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1298     True
1299     True
1300     True
1301     True
1302     True
Length: 1303, dtype: bool

In [276]:
laptops.duplicated().value_counts()

False    1275
True       28
dtype: int64

In [277]:
laptops[laptops.duplicated()].head(1)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer
1275,Asus,ZenBook UX305CA-UBM1,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core M 6Y30 0.9GHz,8,512GB SSD,Intel HD Graphics 515,Windows,10,1.2kg,72900,Intel,Intel


In [278]:
# Model name of the first duplicated row shown above; used to list all of its copies.
zenbook_model = 'ZenBook UX305CA-UBM1'

In [279]:
laptops[laptops["model_name"] == zenbook_model]

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer
1261,Asus,ZenBook UX305CA-UBM1,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core M 6Y30 0.9GHz,8,512GB SSD,Intel HD Graphics 515,Windows,10,1.2kg,72900,Intel,Intel
1275,Asus,ZenBook UX305CA-UBM1,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core M 6Y30 0.9GHz,8,512GB SSD,Intel HD Graphics 515,Windows,10,1.2kg,72900,Intel,Intel
1289,Asus,ZenBook UX305CA-UBM1,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core M 6Y30 0.9GHz,8,512GB SSD,Intel HD Graphics 515,Windows,10,1.2kg,72900,Intel,Intel


In [280]:
laptops.shape

(1303, 15)

In [281]:
# Keep the first occurrence of each fully identical row and discard the rest.
laptops = laptops.drop_duplicates()

In [282]:
laptops.shape

(1275, 15)

## Replacing Values

In [283]:
laptops["manufacturer"].value_counts()

Dell         291
Lenovo       289
HP           268
Asus         152
Acer         101
MSI           54
Toshiba       48
Apple         21
Samsung        9
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: manufacturer, dtype: int64

In [285]:
# Spell out the manufacturer abbreviation. The replacement is limited to the
# manufacturer column, because a DataFrame-wide replace would also rewrite any
# cell equal to "MSI" in every other column.
laptops["manufacturer"] = laptops["manufacturer"].replace("MSI", "Micro-Star International")

In [286]:
laptops["manufacturer"].value_counts()

Dell                        291
Lenovo                      289
HP                          268
Asus                        152
Acer                        101
Micro-Star International     54
Toshiba                      48
Apple                        21
Samsung                       9
Razer                         7
Mediacom                      7
Microsoft                     6
Xiaomi                        4
Vero                          4
Chuwi                         3
Google                        3
Fujitsu                       3
LG                            3
Huawei                        2
Name: manufacturer, dtype: int64

## Dropping Columns 

In [287]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,X,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,X,1.34kg,89894,Intel,Intel


In [288]:
# Remove the two columns that are no longer needed.
laptops = laptops.drop(columns=["category", "gpu"])

In [289]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34kg,89894,Intel,Intel


## Vaja

In [290]:
# Exercise: strip the "kg" suffix, rename the column to weight_kg and convert it to float.
laptops["weight"].head(10)

0    1.37kg
1    1.34kg
2    1.86kg
3    1.83kg
4    1.37kg
5     2.1kg
6    2.04kg
7    1.34kg
8     1.3kg
9     1.6kg
Name: weight, dtype: object

In [291]:
# Remove the unit suffix ("kgs" first, then "kg") as plain literals and
# convert the remaining text to float.
laptops["weight"] = (
    laptops["weight"]
    .str.replace("kgs", "", regex=False)
    .str.replace("kg", "", regex=False)
    .astype(float)
)

In [292]:
laptops["weight"].dtype

dtype('float64')

### Convert the price_euros column to a numeric dtype.

In [300]:
# Prices use a decimal comma ("1339,69"); swap it for a dot and convert to float.
# Casting to str first keeps the cell re-runnable: once the column is already
# float, the .str accessor would otherwise raise.
laptops["price_euros"] = (
    laptops["price_euros"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .astype("float")
)

In [301]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer,cpu_speed_ghz,storage_size_gb
0,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37,1339.69,Intel,Intel,2.3,128
1,Apple,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34,898.94,Intel,Intel,1.8,128


### Extract the screen resolution from the screen column.

In [302]:
laptops["screen"].head()

0    IPS Panel Retina Display 2560x1600
1                              1440x900
2                     Full HD 1920x1080
3    IPS Panel Retina Display 2880x1800
4    IPS Panel Retina Display 2560x1600
Name: screen, dtype: object

In [305]:
# The resolution is the last space-separated token of the screen description
# (e.g. "2560x1600"); split it at "x" into a [width, height] list per row.
resolution_splitted = (
    laptops["screen"]
    .str.split(" ")
    .str.get(-1)
    .str.split("x")
)

In [306]:
# Store the two halves of the resolution as integer pixel counts.
laptops = laptops.assign(
    screen_width_px=resolution_splitted.str.get(0).astype("int"),
    screen_high_px=resolution_splitted.str.get(1).astype("int"),
)

In [307]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer,cpu_speed_ghz,storage_size_gb,screen_width_px,screen_high_px
0,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37,1339.69,Intel,Intel,2.3,128,2560,1600
1,Apple,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34,898.94,Intel,Intel,1.8,128,1440,900


### Extract the processor speed from the cpu column.

In [293]:
# The clock speed is the last space-separated token of the CPU description
# (e.g. "2.3GHz"); strip the literal unit and store the number as float.
laptops["cpu_speed_ghz"] = (
    laptops["cpu"]
    .str.split(" ")
    .str[-1]
    .str.replace("GHz", "", regex=False)
    .astype("float")
)

In [294]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer,cpu_speed_ghz
0,Apple,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,macOS,X,1.37,133969,Intel,Intel,2.3
1,Apple,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,X,1.34,89894,Intel,Intel,1.8


### Extract the storage size from the storage column.

In [295]:
laptops["storage"].unique()

array(['128GB SSD', '128GB Flash Storage', '256GB SSD', '512GB SSD',
       '500GB HDD', '256GB Flash Storage', '1TB HDD',
       '32GB Flash Storage', '128GB SSD +  1TB HDD',
       '256GB SSD +  256GB SSD', '64GB Flash Storage',
       '256GB SSD +  1TB HDD', '256GB SSD +  2TB HDD', '32GB SSD',
       '2TB HDD', '64GB SSD', '1TB Hybrid', '512GB SSD +  1TB HDD',
       '1TB SSD', '256GB SSD +  500GB HDD', '128GB SSD +  2TB HDD',
       '512GB SSD +  512GB SSD', '16GB SSD', '16GB Flash Storage',
       '512GB SSD +  256GB SSD', '512GB SSD +  2TB HDD',
       '64GB Flash Storage +  1TB HDD', '1GB SSD', '1TB HDD +  1TB HDD',
       '32GB HDD', '1TB SSD +  1TB HDD', '512GB Flash Storage',
       '128GB HDD', '240GB SSD', '8GB SSD', '508GB Hybrid',
       '512GB SSD +  1TB Hybrid', '256GB SSD +  1TB Hybrid'], dtype=object)

In [297]:
# Only the first token of the storage description is used (the first drive,
# e.g. "128GB" from "128GB SSD +  1TB HDD"). "GB" is removed and "TB" is
# rewritten as "000", so 1TB is counted as 1000 GB.
laptops["storage_size_gb"] = (
    laptops["storage"]
    .str.split(" ")
    .str[0]
    .str.replace("GB", "")
    .str.replace("TB", "000")
    .astype("int")
)


laptops["storage_size_gb"].unique()

array([ 128,  256,  512,  500, 1000,   32,   64, 2000,   16,    1,  240,
          8,  508])

In [308]:
laptops.dtypes

manufacturer           object
model_name             object
screen_size_inches    float64
screen                 object
cpu                    object
ram_gb                  int64
storage                object
os                     object
os_version             object
weight                float64
price_euros           float64
gpu_manufacturer       object
cpu_manufacturer       object
cpu_speed_ghz         float64
storage_size_gb         int64
screen_width_px         int64
screen_high_px          int64
dtype: object

## Save clean data to CSV file

In [313]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
laptops.to_csv("data/OUTPUT_laptops_cleaned.csv", index=False)

## Analiza

### Are laptops made by Apple more expensive than those made by other manufacturers?


In [329]:
# Preferred solution: mean price per manufacturer, most expensive first.
laptops.groupby('manufacturer')['price_euros'].mean().sort_values(ascending=False)

manufacturer
Razer                       3346.142857
LG                          2099.000000
Micro-Star International    1728.908148
Google                      1677.666667
Microsoft                   1612.308333
Apple                       1564.198571
Huawei                      1424.000000
Samsung                     1413.444444
Toshiba                     1267.812500
Dell                        1199.225120
Xiaomi                      1133.462500
Asus                        1123.829737
Lenovo                      1093.862215
HP                          1080.314664
Fujitsu                      729.000000
Acer                         633.464455
Chuwi                        314.296667
Mediacom                     295.000000
Vero                         217.425000
Name: price_euros, dtype: float64

In [328]:
# Alternative using only the tools covered so far: average the price per
# manufacturer with a boolean filter instead of groupby.
results = {
    manufacturer_name: laptops.loc[laptops["manufacturer"] == manufacturer_name, "price_euros"].mean()
    for manufacturer_name in laptops["manufacturer"].unique()
}

# (manufacturer, mean price) pairs, most expensive first.
sorted(results.items(), key=lambda item: item[1], reverse=True)

[('Razer', 3346.1428571428573),
 ('LG', 2099.0),
 ('Micro-Star International', 1728.9081481481483),
 ('Google', 1677.6666666666667),
 ('Microsoft', 1612.3083333333334),
 ('Apple', 1564.1985714285713),
 ('Huawei', 1424.0),
 ('Samsung', 1413.4444444444443),
 ('Toshiba', 1267.8125),
 ('Dell', 1199.2251202749142),
 ('Xiaomi', 1133.4625),
 ('Asus', 1123.8297368421051),
 ('Lenovo', 1093.8622145328723),
 ('HP', 1080.3146641791045),
 ('Fujitsu', 729.0),
 ('Acer', 633.4644554455446),
 ('Chuwi', 314.2966666666667),
 ('Mediacom', 295.0),
 ('Vero', 217.425)]

In [335]:
pd.Series(results).sort_values(ascending=False)

Razer                       3346.142857
LG                          2099.000000
Micro-Star International    1728.908148
Google                      1677.666667
Microsoft                   1612.308333
Apple                       1564.198571
Huawei                      1424.000000
Samsung                     1413.444444
Toshiba                     1267.812500
Dell                        1199.225120
Xiaomi                      1133.462500
Asus                        1123.829737
Lenovo                      1093.862215
HP                          1080.314664
Fujitsu                      729.000000
Acer                         633.464455
Chuwi                        314.296667
Mediacom                     295.000000
Vero                         217.425000
dtype: float64

### What is the best value laptop with a screen size of 15" or more?
            

In [336]:
# Columns displayed in the screen-size / price comparison that follows.
cols_to_show = ['manufacturer', 'model_name', 'price_euros', 'screen_size_inches']

In [340]:
# Laptops with a screen of 15 inches or more, sorted by ascending price; the
# five cheapest are shown, i.e. "best value" is read here as lowest price.
laptops.loc[laptops["screen_size_inches"] >= 15, cols_to_show].sort_values(by="price_euros").head()

Unnamed: 0,manufacturer,model_name,price_euros,screen_size_inches
290,Acer,Chromebook C910-C2ST,199.0,15.6
1102,Acer,Chromebook 15,209.0,15.6
555,Asus,A541NA-GO342 (N3350/4GB/500GB/Linux),224.0,15.6
30,Chuwi,"LapBook 15.6""",244.99,15.6
483,Chuwi,"Lapbook 15,6",248.9,15.6


### Which laptop has the most RAM?

In [344]:
# Every row whose RAM equals the column maximum (ties would all be shown).
laptops.loc[laptops["ram_gb"].eq(laptops["ram_gb"].max())]

Unnamed: 0,manufacturer,model_name,screen_size_inches,screen,cpu,ram_gb,storage,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer,cpu_speed_ghz,storage_size_gb,screen_width_px,screen_high_px
1066,Asus,ROG G701VO,17.3,IPS Panel Full HD 1920x1080,Intel Core i7 6820HK 2.7GHz,64,1TB SSD,Windows,10,3.58,3975.0,Nvidia,Intel,2.7,1000,1920,1080
