<a href="https://colab.research.google.com/github/krakowiakpawel9/ml_course/blob/master/cont/002_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn  
Website: [https://scikit-learn.org/](https://scikit-learn.org/)

In [0]:
# Optional: uncomment to upgrade scikit-learn in the running kernel's environment.
# %pip install --upgrade scikit-learn

In [1]:
import numpy as np
import pandas as pd
import sklearn
# Display the installed scikit-learn version.
sklearn.__version__

'0.22.1'

In [13]:
def fetch_financial_data(company='AMZN'):
    """
    Download stock quotations for one ticker from the Stooq data source.

    Parameters
    ----------
    company : str, default 'AMZN'
        Ticker symbol handed to the reader as ``name``.

    Returns
    -------
    The object produced by ``pandas_datareader.data.DataReader`` for the
    requested ticker.
    """
    # Imported inside the function, so pandas_datareader is only required
    # when quotations are actually requested.
    from pandas_datareader.data import DataReader

    quotes = DataReader(name=company, data_source='stooq')
    return quotes

# Fetch the quotations and keep only the first five rows as a small sample.
# `.copy()` turns the slice into an independent frame: without it, adding
# columns to `df` later assigns into a slice of the fetched frame and pandas
# raises SettingWithCopyWarning.
df = fetch_financial_data()
df = df[:5].copy()
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-09,1909.89,1917.82,1895.8,1901.05,3174962.0
2020-01-08,1898.04,1911.0,1886.44,1891.97,3511966.0
2020-01-07,1904.5,1913.89,1892.04,1906.86,4134010.0
2020-01-06,1860.0,1903.69,1860.0,1902.88,4065698.0
2020-01-03,1864.5,1886.2,1864.5,1874.97,3766604.0


In [6]:
# Optional step: move the index into a regular column and fall back to a
# plain integer index. Kept disabled because a later cell derives the
# day / month / year features from `df.index`, which needs the index intact.
# df = df.reset_index()
df

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2020-01-09,1909.89,1917.82,1895.8,1901.05,3174962.0
1,2020-01-08,1898.04,1911.0,1886.44,1891.97,3511966.0
2,2020-01-07,1904.5,1913.89,1892.04,1906.86,4134010.0
3,2020-01-06,1860.0,1903.69,1860.0,1902.88,4065698.0
4,2020-01-03,1864.5,1886.2,1864.5,1874.97,3766604.0


In [7]:
# Print a concise summary of the frame: index type, columns, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
Date      5 non-null datetime64[ns]
Open      5 non-null float64
High      5 non-null float64
Low       5 non-null float64
Close     5 non-null float64
Volume    5 non-null float64
dtypes: datetime64[ns](1), float64(5)
memory usage: 368.0 bytes


In [19]:
# Derive calendar features from the frame's index, one column per component.
date_index = df.index
df['day'] = date_index.day
df['month'] = date_index.month
df['year'] = date_index.year
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-09,1909.89,1917.82,1895.8,1901.05,3174962.0,9,1,2020
2020-01-08,1898.04,1911.0,1886.44,1891.97,3511966.0,8,1,2020
2020-01-07,1904.5,1913.89,1892.04,1906.86,4134010.0,7,1,2020
2020-01-06,1860.0,1903.69,1860.0,1902.88,4065698.0,6,1,2020
2020-01-03,1864.5,1886.2,1864.5,1874.97,3766604.0,3,1,2020


In [36]:
# Small single-column frame holding seven height values.
df = pd.DataFrame({
    'height': [175., 178.5, 185., 191., 184.5, 183., 168.],
})
df

Unnamed: 0,height
0,175.0
1,178.5
2,185.0
3,191.0
4,184.5
5,183.0
6,168.0


In [37]:
# Discretise the heights into three bins; with an integer `bins`, pd.cut
# picks the interval edges from the observed range of the values.
df['height_cat'] = pd.cut(df['height'], bins=3)
df

Unnamed: 0,height,height_cat
0,175.0,"(167.977, 175.667]"
1,178.5,"(175.667, 183.333]"
2,185.0,"(183.333, 191.0]"
3,191.0,"(183.333, 191.0]"
4,184.5,"(183.333, 191.0]"
5,183.0,"(175.667, 183.333]"
6,168.0,"(167.977, 175.667]"


In [38]:
# Discretise the heights with hand-picked edges instead of automatic ones.
height_edges = (160, 175, 180, 195)
df['height_cat'] = pd.cut(df['height'], bins=height_edges)
df

Unnamed: 0,height,height_cat
0,175.0,"(160, 175]"
1,178.5,"(175, 180]"
2,185.0,"(180, 195]"
3,191.0,"(180, 195]"
4,184.5,"(180, 195]"
5,183.0,"(180, 195]"
6,168.0,"(160, 175]"


In [43]:
# Discretise the heights with explicit edges and give every interval a
# readable label (one label per interval, in ascending order).
height_edges = (160, 175, 180, 195)
height_labels = ['small', 'medium', 'high']
df['height_cat'] = pd.cut(df['height'], bins=height_edges, labels=height_labels)
df

Unnamed: 0,height,height_cat
0,175.0,small
1,178.5,medium
2,185.0,high
3,191.0,high
4,184.5,high
5,183.0,high
6,168.0,small


In [45]:
# One-hot encode the categorical column(s) of the frame. drop_first=True
# omits the first category level, and the generated columns are named
# 'height_<level>'.
pd.get_dummies(data=df, prefix='height', drop_first=True)

Unnamed: 0,height,height_medium,height_high
0,175.0,0,0
1,178.5,1,0
2,185.0,0,1
3,191.0,0,1
4,184.5,0,1
5,183.0,0,1
6,168.0,0,0


In [58]:
# Three rows, each holding a list of language codes.
df = pd.DataFrame({
    'lang': [
        ['PL', 'ENG'],
        ['GER', 'ENG', 'PL', 'FRA'],
        ['RUS'],
    ],
})
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,[RUS]


In [59]:
# Count how many entries each row's list contains.
df['lang_number'] = df['lang'].str.len()
df

Unnamed: 0,lang,lang_number
0,"[PL, ENG]",2
1,"[GER, ENG, PL, FRA]",4
2,[RUS],1


In [61]:
# Binary indicator: 1 when 'PL' appears in the row's list, otherwise 0.
df['PL_flag'] = df['lang'].map(lambda langs: int('PL' in langs))
df

Unnamed: 0,lang,lang_number,PL_flag
0,"[PL, ENG]",2,1
1,"[GER, ENG, PL, FRA]",4,1
2,[RUS],1,0


In [63]:
# Three website addresses to be split into name and extension.
df = pd.DataFrame({
    'website': ['wp.pl', 'onet.pl', 'google.com'],
})
df

Unnamed: 0,website
0,wp.pl
1,onet.pl
2,google.com


In [68]:
# Break every address at the dots; expand=True spreads the pieces over
# integer-labelled columns (0, 1, ...), which are then copied into `df`.
new = df['website'].str.split('.', expand=True)
df['portal'], df['extension'] = new[0], new[1]
df

Unnamed: 0,website,portal,extension
0,wp.pl,wp,pl
1,onet.pl,onet,pl
2,google.com,google,com
