# Priprava in čiščenje podatkov

## Get the data - Reading CSV Files with Encodings

In [2]:
import pandas as pd 
import numpy as np


In [4]:
!head -n 3 data\INPUT_laptops.csv

'head' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
with open("data\INPUT_laptops.csv") as f:
    for _ in range(3): # Preberi prve 3 vrstice datoteke
        print(f.readline())

Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM, Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)

Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,"1339,69"

Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,"898,94"



In [3]:
#df=pd.read_csv("data\INPUT_laptops.csv") #UnicodeDecodeError zaradi napačnega (utf-8) encodinga

    df = pd.read_csv("filename.csv", encoding="some_encoding")

In [4]:
laptops=pd.read_csv("data/INPUT_laptops.csv",encoding="Latin-1")

In [5]:
#laptops=pd.read_csv("data/INPUT_laptops.csv",encoding='utf-8') #UTF8 tip encodinga ne prebere datoteke

In [16]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


In [61]:
laptops=pd.read_csv("data/INPUT_laptops.csv",encoding="Latin-1",delimiter=",")
#Dodan parameter "delimiter", ki pove kako csv dokument loči posamezne elemente

In [6]:
laptops.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


## Cleaning Column Names

In [7]:
laptops_test=laptops.copy()

In [8]:
#laptops_test.columns=['A','B','C'] #Treba spremenit enako število elementov, kot je vseh stolpcev

In [12]:
laptops_test.columns = ['A', 'B', 'C', 'D', 'E',
                        'F', 'G', 'H', 'I', 'J',
                        'K', 'L', 'M']

In [13]:
laptops_test.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M'], dtype='object')

In [14]:
laptops_test=laptops.copy()
new_columns=[]

for c in laptops_test.columns:
    clean_c=c.strip()
    #clean_c=c.strip('M') 
    clean_c=c.replace('(Euros)','€')
    clean_c=c.lower()
    new_columns.append(clean_c)

In [15]:
laptops_test.columns=new_columns

In [16]:
laptops_test.columns

Index(['manufacturer', 'model name', 'category', 'screen size', 'screen',
       'cpu', 'ram', ' storage', 'gpu', 'operating system',
       'operating system version', 'weight', 'price (euros)'],
      dtype='object')

In [20]:
#Dva načina za dokaz, da je Python "case-sensitive"
x="Evro"
y="evro"
x==y

In [23]:
if x == y:
    print("True")
else:
    print("False")

False


In [24]:
def clean_col(col):
    col=col.strip()
    col=col.replace('Operating System','os')
    col=col.replace(' ','_')
    col=col.replace('(','')
    col=col.replace(')','')
    col=col.lower()
    return col

In [25]:
new_columns=[]
for c in laptops.columns:
    clean_c=clean_col(c)
    new_columns.append(clean_c)

In [26]:
new_columns

['manufacturer',
 'model_name',
 'category',
 'screen_size',
 'screen',
 'cpu',
 'ram',
 'storage',
 'gpu',
 'os',
 'os_version',
 'weight',
 'price_euros']

In [27]:
laptops.columns=new_columns

In [28]:
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

## Converting String Columns to Numeric


<p><img alt="string to numeric cleaning workflow" src="images/cleaning_workflow.svg"></p>


In [29]:
laptops.iloc[:5,2:5]

Unnamed: 0,category,screen_size,screen
0,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600
1,Ultrabook,"13.3""",1440x900
2,Notebook,"15.6""",Full HD 1920x1080
3,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800
4,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600


In [33]:
print(laptops['screen_size'].dtype)

object


In [34]:
laptops['screen_size'].unique()

array(['13.3"', '15.6"', '15.4"', '14.0"', '12.0"', '11.6"', '17.3"',
       '10.1"', '13.5"', '12.5"', '13.0"', '18.4"', '13.9"', '12.3"',
       '17.0"', '15.0"', '14.1"', '11.3"'], dtype=object)

In [35]:
new_columns_com=[c.replace('"','') for c in laptops['screen_size']] #List comprehension

In [36]:
new_cols=pd.Series(new_columns_com)

In [37]:
new_cols.unique()

array(['13.3', '15.6', '15.4', '14.0', '12.0', '11.6', '17.3', '10.1',
       '13.5', '12.5', '13.0', '18.4', '13.9', '12.3', '17.0', '15.0',
       '14.1', '11.3'], dtype=object)

In [38]:
laptops['screen_size']=new_cols.astype('float')

In [39]:
laptops['screen_size']

0       13.3
1       13.3
2       15.6
3       15.4
4       13.3
        ... 
1298    14.0
1299    13.3
1300    14.0
1301    15.6
1302    15.6
Name: screen_size, Length: 1303, dtype: float64

### Vectorized String Methods


<p></p><center><img alt="vectorized_string_methods" src="images/Syntax.png"></center><p></p>





In [40]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   manufacturer  1303 non-null   object 
 1   model_name    1303 non-null   object 
 2   category      1303 non-null   object 
 3   screen_size   1303 non-null   float64
 4   screen        1303 non-null   object 
 5   cpu           1303 non-null   object 
 6   ram           1303 non-null   object 
 7   storage       1303 non-null   object 
 8   gpu           1303 non-null   object 
 9   os            1303 non-null   object 
 10  os_version    1133 non-null   object 
 11  weight        1303 non-null   object 
 12  price_euros   1303 non-null   object 
dtypes: float64(1), object(12)
memory usage: 132.5+ KB


In [41]:
laptops['price_euros'].str.replace(',','.')

0       1339.69
1        898.94
2        575.00
3       2537.45
4       1803.60
         ...   
1298     638.00
1299    1499.00
1300     229.00
1301     764.00
1302     369.00
Name: price_euros, Length: 1303, dtype: object

In [42]:
laptops['price_euros'].str.replace(',','.').astype('float')

0       1339.69
1        898.94
2        575.00
3       2537.45
4       1803.60
         ...   
1298     638.00
1299    1499.00
1300     229.00
1301     764.00
1302     369.00
Name: price_euros, Length: 1303, dtype: float64

In [43]:
laptops['ram'].str.replace('GB','').astype('int')

0        8
1        8
2        8
3       16
4        8
        ..
1298     4
1299    16
1300     2
1301     6
1302     4
Name: ram, Length: 1303, dtype: int32

<table class="tg">
<tbody><tr>
<th>Method</th>
<th>Description</th>
</tr>
<tr>
<td>Series.str.split()</td>
<td>Splits each element in the Series.</td>
</tr>
<tr>
<td>Series.str.strip()</td>
<td>Strips whitespace from each string in the Series.</td>
</tr>
<tr>
<td>Series.str.lower()</td>
<td>Converts strings in the Series to lowercase.</td>
</tr>
<tr>
<td>Series.str.upper()</td>
<td>Converts strings in the Series to uppercase.</td>
</tr>
<tr>
<td>Series.str.get()</td>
<td>Retrieves the ith element of each element in the Series.</td>
</tr>
<tr>
<td><span style="font-weight:300;font-style:normal">Series.str.replace()</span></td>
<td><span style="font-weight:300;font-style:normal">Replaces a regex or string in the Series with another string.</span></td>
</tr>
<tr>
<td>Series.str.cat()</td>
<td>Concatenates strings in a Series.</td>
</tr>
<tr>
<td><span style="font-weight:300;font-style:normal">Series.str.extract()</span></td>
<td><span style="font-weight:300;font-style:normal">Extracts substrings from the Series matching a regex pattern.</span></td>
</tr>
</tbody></table>

<div class="alert alert-block alert-info">
<b>Vaja: </b> From CPU column extract processor frequency and change its dtype to float
</div>

## Renaming Columns

## Extracting Values from Strings

## Correcting Bad Values - map() method

In [None]:
s = pd.Series(['pair', 'oranje', 'bananna', 'oranje', 'oranje', 'oranje'])

In [None]:
corrections = {
    "pair": "pear",
    "oranje": "orange",
    "bananna": "banana"
}

In [None]:
mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}