# Del 9: Priprava in čiščenje podatkov - napredno

## Working With Strings In Pandas

### Data

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# read the data sets into pandas
world_dev = pd.read_csv("data/World_dev.csv")
happiness2015 = pd.read_csv('data/World_Happiness_2015.csv')

In [6]:
merged = pd.merge(left=happiness2015,
                 right=world_dev,
                 how="left",
                 left_on="Country",
                 right_on="ShortName")


col_renaming = {'SourceOfMostRecentIncomeAndExpenditureData': 'IESurvey'}
merged.rename(col_renaming, axis=1, inplace=True)

In [7]:
merged.head(2)

Unnamed: 0,Country,Region_x,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),...,GovernmentAccountingConcept,ImfDataDisseminationStandard,LatestPopulationCensus,LatestHouseholdSurvey,IESurvey,VitalRegistrationComplete,LatestAgriculturalCensus,LatestIndustrialData,LatestTradeData,LatestWaterWithdrawalData
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,...,Consolidated central government,Special Data Dissemination Standard (SDDS),2010,,"Expenditure survey/budget survey (ES/BS), 2004",Yes,2008,2010.0,2013.0,2000.0
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,...,Consolidated central government,Special Data Dissemination Standard (SDDS),2011,,"Integrated household survey (IHS), 2010",Yes,2010,2005.0,2013.0,2005.0


### Using Apply to Transform Strings

In [12]:
currency_categories = merged["CurrencyUnit"].str.lower().str.split().str.get(-1)
currency_categories.value_counts()

euro        20
franc       16
dollar      12
dinar        8
peso         7
            ..
rupiah       1
lira         1
naira        1
ngultrum     1
afghani      1
Name: CurrencyUnit, Length: 71, dtype: int64

## Regular Expressions in Pandas

In [13]:
import re

In [15]:
print(re.match(r"and", "hand"))

None


In [16]:
print(re.match(r"and", "and"))

<re.Match object; span=(0, 3), match='and'>


In [17]:
print(re.search(r"and", "hand"))

<re.Match object; span=(1, 4), match='and'>


In [20]:
print(re.match(r"and", "andh"))

<re.Match object; span=(0, 3), match='and'>


### Regex in pandas

In [22]:
merged.head(2)

Unnamed: 0,Country,Region_x,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),...,GovernmentAccountingConcept,ImfDataDisseminationStandard,LatestPopulationCensus,LatestHouseholdSurvey,IESurvey,VitalRegistrationComplete,LatestAgriculturalCensus,LatestIndustrialData,LatestTradeData,LatestWaterWithdrawalData
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,...,Consolidated central government,Special Data Dissemination Standard (SDDS),2010,,"Expenditure survey/budget survey (ES/BS), 2004",Yes,2008,2010.0,2013.0,2000.0
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,...,Consolidated central government,Special Data Dissemination Standard (SDDS),2011,,"Integrated household survey (IHS), 2010",Yes,2010,2005.0,2013.0,2005.0


In [24]:
merged["SpecialNotes"].head(10)

0                                                  NaN
1                                                  NaN
2                                                  NaN
3                                                  NaN
4    Fiscal year end: March 31; reporting period fo...
5    A simple multiplier is used to convert the nat...
6    A simple multiplier is used to convert the nat...
7    Fiscal year end: June 30; reporting period for...
8    Fiscal year end: March 31; reporting period fo...
9    Fiscal year end: June 30; reporting period for...
Name: SpecialNotes, dtype: object

In [30]:
merged["SpecialNotes"].str.contains(r"[]iscal year")

0        NaN
1        NaN
2        NaN
3        NaN
4       True
       ...  
153    False
154      NaN
155      NaN
156      NaN
157    False
Name: SpecialNotes, Length: 158, dtype: object

### Finding Specific Words in Strings

<img src="./images/set_syntax_breakdown.svg">

<img src="./images/basic_match_2.svg">

### Import new dataset

In [31]:
hn = pd.read_csv('data/hacker_news.csv')

In [33]:
hn.head()

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
0,12224879,Interactive Dynamic Video,http://www.interactivedynamicvideo.com/,386,52,ne0phyte,8/4/2016 11:52
1,11964716,Florida DJs May Face Felony for April Fools' W...,http://www.thewire.com/entertainment/2013/04/f...,2,1,vezycash,6/23/2016 22:20
2,11919867,Technology ventures: From Idea to Enterprise,https://www.amazon.com/Technology-Ventures-Ent...,3,1,hswarna,6/17/2016 0:01
3,10301696,Note by Note: The Making of Steinway L1037 (2007),http://www.nytimes.com/2007/11/07/movies/07ste...,8,2,walterbell,9/30/2015 4:12
4,10482257,Title II kills investment? Comcast and other I...,http://arstechnica.com/business/2015/10/comcas...,53,22,Deinos,10/31/2015 9:48


In [34]:
# Želimo pridobiti vse Python novice
hn[hn["title"].str.contains(r"[Py]ython")].head()

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
102,10974870,From Python to Lua: Why We Switched,https://www.distelli.com/blog/using-lua-for-ou...,243,188,chase202,1/26/2016 18:17
103,11244541,Ubuntu 16.04 LTS to Ship Without Python 2,http://news.softpedia.com/news/ubuntu-16-04-lt...,2,1,_snydly,3/8/2016 10:39
144,10963528,Create a GUI Application Using Qt and Python i...,http://digitalpeer.com/s/c63e,21,1,zoodle,1/24/2016 19:01
196,10716331,How I Solved GCHQ's Xmas Card with Python and ...,http://matthewearl.github.io/2015/12/10/gchq-x...,6,1,kipi,12/11/2015 10:38
436,11895088,"Unikernel Power Comes to Java, Node.js, Go, an...",http://www.infoworld.com/article/3082051/open-...,3,1,syslandscape,6/13/2016 16:23


In [36]:
# Želimo pridobiti vse novice z email besedo
hn[hn["title"].str.contains(r"e-?mail")].head()

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
119,10603601,Show HN: Send an email from your shell to your...,https://ping.registryd.com,4,1,ybrs,11/20/2015 20:23
313,10736929,Disposable emails for safe spam free shopping,http://couponinbox.com,1,1,genesem,12/15/2015 10:20
1361,11079401,Ask HN: Doing cold emails? helps us prove this...,,8,12,going_to_800,2/11/2016 10:48
1750,12021044,"Protect yourself from spam, bots and phishing ...",http://die.life?x=3,4,2,code2crud,7/2/2016 2:04
2421,10934913,Ashley Madison hack treating email,http://pastebin.com/V5tmcFXq,7,4,hippich,1/19/2016 23:24


In [46]:
# Želimo pridobiti vse email naslove
hn.loc[hn["title"].str.contains(r"[\w\d\-\.]+a?@[\w]+\.\w+"), "title"].head(10)

1057    Site for generating bids and invoices- User:te...
Name: title, dtype: object

In [48]:
hn.loc[1057, "title"]

'Site for generating bids and invoices- User:test000@bidvoice.co PW:Test_000'

In [60]:
# želimo ugotoviti vse tipe podatkovnih baz
hn["title"].str.extract(r"(\w+SQL\w*)", flags=re.I)[0].value_counts()

PostgreSQL    27
NoSQL         16
MySQL         12
MemSQL         1
SparkSQL       1
nosql          1
mySql          1
CloudSQL       1
Name: 0, dtype: int64

In [65]:
# Izbiranje URLjev

hn["url"].str.extract(r"(?P<protocol>https?):\/\/(?P<domain>[\w\.]+)\/?(?P<path>[^\s]*)")

Unnamed: 0,protocol,domain,path
0,http,www.interactivedynamicvideo.com,
1,http,www.thewire.com,entertainment/2013/04/florida-djs-april-fools-water-joke/63798/
2,https,www.amazon.com,Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429
3,http,www.nytimes.com,2007/11/07/movies/07stein.html?_r=0
4,http,arstechnica.com,business/2015/10/comcast-and-other-isps-boost-network-investment-despite-net-neutrality/
...,...,...,...
20094,https,puri.sm,philosophy/how-purism-avoids-intels-active-management-technology/
20095,https,medium.com,@zreitano/the-yc-application-broken-down-and-translated-e4c0f5235081
20096,http,blog.darknedgy.net,technology/2016/01/01/0/
20097,https,medium.com,@benjiwheeler/how-product-hunt-really-works-d8fdcda1da74


In [62]:
#pd.options.display.max_colwidth = 150

In [77]:
import pandas as pd

# Create a pandas dataframe with a column of URLs
df = pd.DataFrame({'url': ['https://www.example.com/path1', 'https://www.example.com/path2', 'https://www.example.com/path3']})

# Extract the protocol, domain, and path from the URLs
df = df['url'].str.extract(r'(https?)://([^/]+)/(.*)')

df.rename(columns={'protocol': 'Protocol', 'domain': 'Domain', 'path': 'Path'}, inplace=True)

In [78]:
df

Unnamed: 0,0,1,2
0,https,www.example.com,path1
1,https,www.example.com,path2
2,https,www.example.com,path3


## Working With Missing Data

### Introduction

In [None]:
happiness2015 = pd.read_csv('data/wh_2015.csv') 
happiness2016 = pd.read_csv('data/wh_2016.csv') 
happiness2017 = pd.read_csv('data/wh_2017.csv')

In [None]:
shape_2015 = happiness2015.shape
shape_2016 = happiness2016.shape
shape_2017 = happiness2017.shape

In [None]:
shape_2015

In [None]:
shape_2016

In [None]:
shape_2017

### Identifying Missing Values

### Correcting Data Cleaning Errors that Result in Missing Values

### Visualizing Missing Data

### Using Data From Additional Sources to Fill in Missing Values

In [None]:
regions2015 = happiness2015[['COUNTRY', 'REGION']].copy()
regions2016 = happiness2016[['COUNTRY', 'REGION']].copy()


### Identifying Duplicates Values

### Correcting Duplicates Values

### Handle Missing Values by Dropping Columns

In [None]:
columns_to_drop = ['LOWER CONFIDENCE INTERVAL', 'STANDARD ERROR', 
                   'UPPER CONFIDENCE INTERVAL', 'WHISKER HIGH', 
                   'WHISKER LOW']

### Analyzing Missing Data

### Handling Missing Values with Imputation

### Dropping Rows

## Identifying Hidden Missing Data

### Primer: Happiness 2015

In [None]:
happiness2015 = pd.read_csv('data/wh_2015_special.csv')

### Primer: Diabetes

In [None]:
diabetes = pd.read_csv('data/pima-indians-diabetes_data.csv')

#### Analyzing missingness percentage

## Andvance Visualization of Missing Data

In [None]:
# Import missingno as msno
import missingno as msno
import matplotlib.pyplot as plt
%matplotlib inline

### Missingness Patterns

## Handle Missing Values

### Dropping Rows

### Imputation Techniques

#### Mean & median imputation


#### Mode and constant imputation

#### Visualize imputations

In [None]:



imputations = {'Mean Imputation': diabetes_mean, 'Median Imputation': diabetes_median, 
               'Most Frequent Imputation': diabetes_mode, 'Constant Imputation': diabetes_constant}

