# Priprava in čiščenje podatkov

## Get the data - Reading CSV Files with Encodings

In [1]:
import pandas as pd 
import numpy as np

In [3]:
!head -n 3 data/INPUT_laptops.csv

Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM, Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,"1339,69"
Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,"898,94"


In [37]:
# Load the raw laptop data; the file is decoded as Latin-1, passed explicitly
# instead of relying on read_csv's default encoding.
laptops = pd.read_csv(
    "data/INPUT_laptops.csv",
    encoding="Latin-1",
)

    df = pd.read_csv("filename.csv", encoding="some_encoding")

In [38]:
laptops.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


In [39]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


## Cleaning Column Names

In [40]:
#laptops[" Storage"]

In [41]:
laptops.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)'],
      dtype='object')

In [42]:
laptops_test = laptops.copy()

In [43]:
# Overwrite all 13 column labels of the scratch copy with the placeholder
# letters A..M (assigning to .columns replaces every label at once).
laptops_test.columns = list("ABCDEFGHIJKLM")

In [44]:
laptops_test.head(2)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


In [45]:
def clean_col(col_name):
    """Turn a raw column label into a lowercase snake_case name.

    Surrounding whitespace is trimmed, the phrase "Operating System" is
    shortened to "os", spaces become underscores, parentheses are removed
    and the result is lowercased.

    Parameters
    ----------
    col_name : str
        Column label as it appears in the raw file.

    Returns
    -------
    str
        The cleaned label.
    """
    # The abbreviation has to happen before lowercasing: str.replace is
    # case-sensitive and the pattern is written in title case.
    label = col_name.strip().replace("Operating System", "os")
    for old, new in ((" ", "_"), ("(", ""), (")", "")):
        label = label.replace(old, new)
    return label.lower()

# Run every existing header through clean_col and assign the cleaned labels back.
laptops.columns = list(map(clean_col, laptops.columns))

laptops.head(1)

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969


## Converting String Columns to Numeric

In [46]:
laptops.iloc[:5, 2:5]

Unnamed: 0,category,screen_size,screen
0,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600
1,Ultrabook,"13.3""",1440x900
2,Notebook,"15.6""",Full HD 1920x1080
3,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800
4,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600



<p><img alt="string to numeric cleaning workflow" src="images/cleaning_workflow.svg"></p>


In [47]:
print(laptops["screen_size"].dtype)

object


In [49]:
laptops["screen_size"].unique()

array(['13.3"', '15.6"', '15.4"', '14.0"', '12.0"', '11.6"', '17.3"',
       '10.1"', '13.5"', '12.5"', '13.0"', '18.4"', '13.9"', '12.3"',
       '17.0"', '15.0"', '14.1"', '11.3"'], dtype=object)

In [30]:
laptops["ram"].unique()

array(['8GB', '16GB', '4GB', '2GB', '12GB', '6GB', '32GB', '24GB', '64GB'],
      dtype=object)


<p></p><center><img alt="vectorized_string_methods" src="images/Syntax.png"></center><p></p>





In [50]:
laptops["screen_size"].str[:-1]

0       13.3
1       13.3
2       15.6
3       15.4
4       13.3
        ... 
1298    14.0
1299    13.3
1300    14.0
1301    15.6
1302    15.6
Name: screen_size, Length: 1303, dtype: object

In [51]:
# Remove the inch mark (") so that only the numeric text remains.
# regex=False states explicitly that the pattern is a literal string, so the
# result does not depend on the pandas version's default for `regex`.
laptops["screen_size"] = laptops["screen_size"].str.replace('"', '', regex=False)

In [52]:
laptops["screen_size"].unique()

array(['13.3', '15.6', '15.4', '14.0', '12.0', '11.6', '17.3', '10.1',
       '13.5', '12.5', '13.0', '18.4', '13.9', '12.3', '17.0', '15.0',
       '14.1', '11.3'], dtype=object)

In [53]:
# Remove the "GB" unit so that only the numeric text remains.
# regex=False states explicitly that the pattern is a literal string, so the
# result does not depend on the pandas version's default for `regex`.
laptops["ram"] = laptops["ram"].str.replace('GB', '', regex=False)

In [54]:
laptops["ram"].unique()

array(['8', '16', '4', '2', '12', '6', '32', '24', '64'], dtype=object)

In [56]:
laptops.dtypes

manufacturer    object
model_name      object
category        object
screen_size     object
screen          object
cpu             object
ram             object
storage         object
gpu             object
os              object
os_version      object
weight          object
price_euros     object
dtype: object

In [57]:
# The cleaned strings contain decimal values (e.g. '13.3'), so cast to float.
laptops["screen_size"] = laptops["screen_size"].astype(float)

In [58]:
# The cleaned strings are whole numbers (e.g. '8', '16'), so cast to int.
laptops["ram"] = laptops["ram"].astype(int)

In [59]:
laptops.dtypes

manufacturer     object
model_name       object
category         object
screen_size     float64
screen           object
cpu              object
ram               int64
storage          object
gpu              object
os               object
os_version       object
weight           object
price_euros      object
dtype: object

## Renaming Columns

In [60]:
# Put the unit that was stripped from the values into the column name.
# Reassigning the result (instead of inplace=True) and using `columns=`
# (instead of a positional dict with axis=1) keeps the step explicit and chainable.
laptops = laptops.rename(columns={"screen_size": "screen_size_inches"})

In [61]:
# Put the unit that was stripped from the values into the column name.
# Reassigning the result (instead of inplace=True) and using `columns=`
# (instead of a positional dict with axis=1) keeps the step explicit and chainable.
laptops = laptops.rename(columns={"ram": "ram_gb"})

In [62]:
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size_inches',
       'screen', 'cpu', 'ram_gb', 'storage', 'gpu', 'os', 'os_version',
       'weight', 'price_euros'],
      dtype='object')

In [63]:
laptops.head(2)

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894


In [64]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric RAM column.
laptops["ram_gb"].describe()

count    1303.000000
mean        8.382195
std         5.084665
min         2.000000
25%         4.000000
50%         8.000000
75%         8.000000
max        64.000000
Name: ram_gb, dtype: float64

## Extracting Values from Strings

In [65]:
laptops["gpu"].head()

0    Intel Iris Plus Graphics 640
1          Intel HD Graphics 6000
2           Intel HD Graphics 620
3              AMD Radeon Pro 455
4    Intel Iris Plus Graphics 650
Name: gpu, dtype: object

In [66]:
# Split each GPU description on whitespace and keep the first token
# (e.g. "Intel Iris Plus Graphics 640" -> "Intel").
laptops["gpu_manufacturer"] = laptops["gpu"].str.split().str.get(0)

In [67]:
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,gpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894,Intel
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500,Intel
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745,AMD
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360,Intel


In [68]:
laptops["gpu_manufacturer"].value_counts()

Intel     722
Nvidia    400
AMD       180
ARM         1
Name: gpu_manufacturer, dtype: int64

In [69]:
# Split each CPU description on whitespace and keep the first token
# (e.g. "Intel Core i5 2.3GHz" -> "Intel").
laptops["cpu_manufacturer"] = laptops["cpu"].str.split().str.get(0)

In [70]:
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,gpu_manufacturer,cpu_manufacturer
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969,Intel,Intel
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894,Intel,Intel
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500,Intel,Intel
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745,AMD,Intel
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360,Intel,Intel


## Correcting Bad Values - map() method

In [71]:
laptops["os"].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          13
Mac OS          8
Android         2
Name: os, dtype: int64

In [72]:
# Toy Series of misspelled fruit names used to try out Series.map().
s = pd.Series(["pair", "oranje", "bananna"] + ["oranje"] * 3)

In [73]:
s

0       pair
1     oranje
2    bananna
3     oranje
4     oranje
5     oranje
dtype: object

In [74]:
# Lookup table: misspelled value -> corrected value.
corrections = dict(pair="pear", oranje="orange", bananna="banana")

In [75]:
# Replace every value by its entry in `corrections` (looked up by key).
s = s.map(corrections)

In [76]:
s

0      pear
1    orange
2    banana
3    orange
4    orange
5    orange
dtype: object

In [77]:
# Applying the same mapping a second time: the corrected spellings are not keys
# of `corrections`, so map() finds no match and every value becomes NaN.
s = s.map(corrections)

In [78]:
s

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: object

In [79]:
# Series.map() yields NaN for any value that is not a key of the dict, so the
# labels that are already fine are mapped to themselves; only "Mac OS" changes.
mapping_dict = {
    label: label
    for label in ("Android", "Chrome OS", "Linux", "No OS", "Windows", "macOS")
}
mapping_dict["Mac OS"] = "macOS"

In [80]:
laptops["os"] = laptops["os"].map(mapping_dict)

In [81]:
laptops["os"].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          21
Android         2
Name: os, dtype: int64

## Introduction to Missing Data 

### Trade-Offs in Missing Data Conventions


### Missing Data in Pandas




### None: Pythonic missing data


In [82]:
vals1 = np.array([1,2,None,3,4])

In [83]:
vals1

array([1, 2, None, 3, 4], dtype=object)

In [84]:
%timeit np.arange(1E6, dtype="object").sum()

137 ms ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [85]:
%timeit np.arange(1E6, dtype="int").sum()

1.51 ms ± 58 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [86]:
# Summing an object array that contains None fails with a TypeError. The error
# is the point of this cell, so it is caught and printed; an uncaught exception
# would stop "Restart Kernel -> Run All" at this cell.
try:
    vals1.sum()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

### NaN: Missing numerical data



In [87]:
vals2 = np.array([1, np.nan, 3, 4])

In [88]:
vals2.dtype

dtype('float64')

In [89]:
vals2 + 1

array([ 2., nan,  4.,  5.])

In [90]:
1 + np.nan

nan

In [91]:
0 * np.nan

nan

In [92]:
vals2.sum()

nan

In [94]:
np.nansum(vals2)

8.0

### NaN and None in Pandas



In [95]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [96]:
x = pd.Series(range(5), dtype=int)

In [97]:
x

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [98]:
# Assign None into an integer Series: as the following output shows, the value is
# stored as NaN and the Series dtype changes from int64 to float64.
x[0] = None

In [99]:
x

0    NaN
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64


<table>
<thead><tr>
<th>Typeclass</th>
<th>Conversion When Storing NAs</th>
<th>NA Sentinel Value</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>floating</code></td>
<td>No change</td>
<td><code>np.nan</code></td>
</tr>
<tr>
<td><code>object</code></td>
<td>No change</td>
<td><code>None</code> or <code>np.nan</code></td>
</tr>
<tr>
<td><code>integer</code></td>
<td>Cast to <code>float64</code></td>
<td><code>np.nan</code></td>
</tr>
<tr>
<td><code>boolean</code></td>
<td>Cast to <code>object</code></td>
<td><code>None</code> or <code>np.nan</code></td>
</tr>
</tbody>
</table>


### Operating on Null Values



- `isnull()`: Generate a boolean mask indicating missing values
- `notnull()`: Opposite of `isnull()`
- `dropna()`: Return a filtered version of the data
- `fillna()`: Return a copy of the data with missing values filled or imputed



#### Detecting null values



In [100]:
data = pd.Series([1, np.nan, 'hello', None])

In [101]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [102]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [103]:
data[data.notnull()]

0        1
2    hello
dtype: object

#### Dropping null values


In [77]:
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])

#### Filling null values



In [85]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))

## Dropping Missing Values

## Filling Missing Values

## Removing Duplicates

In [3]:
zenbook_model = 'ZenBook UX305CA-UBM1'

## Replacing Values

## Dropping Columns 

## Vaja

### Convert the price_euros column to a numeric dtype.

### Extract the screen resolution from the screen column.

### Extract the processor speed from the cpu column.

## Save clean data to CSV file

## Analiza

### Are laptops made by Apple more expensive than those made by other manufacturers?


### What is the best value laptop with a screen size of 15" or more?
            

### Which laptop has the most RAM?