# Working with Features
---

### Categorical features

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [2]:
sns.load_dataset?

[0;31mSignature:[0m [0msns[0m[0;34m.[0m[0mload_dataset[0m[0;34m([0m[0mname[0m[0;34m,[0m [0mcache[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mdata_home[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkws[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Load a dataset from the online repository (requires internet).

Parameters
----------
name : str
    Name of the dataset (`name`.csv on
    https://github.com/mwaskom/seaborn-data).  You can obtain list of
    available datasets using :func:`get_dataset_names`
cache : boolean, optional
    If True, then cache data locally and use the cache on subsequent calls
data_home : string, optional
    The directory in which to cache data. By default, uses ~/seaborn-data/
kws : dict, optional
    Passed to pandas.read_csv
[0;31mFile:[0m      ~/miniconda3/envs/pyml/lib/python3.7/site-packages/seaborn/utils.py
[0;31mType:[0m      function


In [48]:
data = [
    {"price": 8_500_000, "rooms": 3, "loc": "Kathmandu"},
    {"price": 7_000_000, "rooms": 4, "loc": "Biratnagar"},
    {"price": 9_850_000, "rooms": 3, "loc": "Pokhara"},
    {"price": 7_500_000, "rooms": 5, "loc": "Birgunj"},
    {"price": 5_000_000, "rooms": 3, "loc": "Birgunj"},
]

In [49]:
pd.DataFrame(data)

Unnamed: 0,loc,price,rooms
0,Kathmandu,8500000,3
1,Biratnagar,7000000,4
2,Pokhara,9850000,3
3,Birgunj,7500000,5
4,Birgunj,5000000,3


In [4]:
from sklearn.feature_extraction import DictVectorizer

In [5]:
DictVectorizer?

[0;31mInit signature:[0m
[0mDictVectorizer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;34m<[0m[0;32mclass[0m [0;34m'numpy.float64'[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mseparator[0m[0;34m=[0m[0;34m'='[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msparse[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msort[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Transforms lists of feature-value mappings to vectors.

This transformer turns lists of mappings (dict-like objects) of feature
names to feature values into Numpy arrays or scipy.sparse matrices for use
with scikit-learn estimators.

When feature values are strings, this transformer will do a binary one-hot
(aka one-of-K) coding: one boolean-valued feature is constructed for each
of the possible string values that the feature can take on. For instance,
a feature "f" that can

In [50]:
vec = DictVectorizer(sparse=False, dtype=int)
vec_1 = vec.fit_transform(data)
vec_1

array([[      0,       0,       1,       0, 8500000,       3],
       [      1,       0,       0,       0, 7000000,       4],
       [      0,       0,       0,       1, 9850000,       3],
       [      0,       1,       0,       0, 7500000,       5],
       [      0,       1,       0,       0, 5000000,       3]],
      dtype=int64)

In [7]:
vec.feature_names_

['loc=Biratnagar',
 'loc=Birgunj',
 'loc=Kathmandu',
 'loc=Pokhara',
 'price',
 'rooms']

In [8]:
vec.get_feature_names()

['loc=Biratnagar',
 'loc=Birgunj',
 'loc=Kathmandu',
 'loc=Pokhara',
 'price',
 'rooms']

In [51]:
pd.DataFrame(vec_1, columns=vec.get_feature_names())

Unnamed: 0,loc=Biratnagar,loc=Birgunj,loc=Kathmandu,loc=Pokhara,price,rooms
0,0,0,1,0,8500000,3
1,1,0,0,0,7000000,4
2,0,0,0,1,9850000,3
3,0,1,0,0,7500000,5
4,0,1,0,0,5000000,3


### Text Features

In [9]:
sample = [
    "this is broken",
    "does not work after couple of days",
    "is not what is advertised"
]

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
CountVectorizer?

[0;31mInit signature:[0m
[0mCountVectorizer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0minput[0m[0;34m=[0m[0;34m'content'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m=[0m[0;34m'utf-8'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdecode_error[0m[0;34m=[0m[0;34m'strict'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstrip_accents[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlowercase[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpreprocessor[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtokenizer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstop_words[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtoken_pattern[0m[0;34m=[0m[0;34m'(?u)\\b\\w\\w+\\b'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mngram_range[0m[0;34m=[0m[0;34m([0m[0;36m1[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m

In [12]:
vec = CountVectorizer()
X = vec.fit_transform(sample)
X

<3x12 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [16]:
X.toarray()

array([[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0],
       [0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 0]], dtype=int64)

In [17]:
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

Unnamed: 0,advertised,after,broken,couple,days,does,is,not,of,this,what,work
0,0,0,1,0,0,0,1,0,0,1,0,0
1,0,1,0,1,1,1,0,1,1,0,0,1
2,1,0,0,0,0,0,2,1,0,0,1,0


#### TfIdf

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

Unnamed: 0,advertised,after,broken,couple,days,does,is,not,of,this,what,work
0,0.0,0.0,0.622766,0.0,0.0,0.0,0.47363,0.0,0.0,0.622766,0.0,0.0
1,0.0,0.389888,0.0,0.389888,0.389888,0.389888,0.0,0.29652,0.389888,0.0,0.0,0.389888
2,0.452123,0.0,0.0,0.0,0.0,0.0,0.687703,0.343851,0.0,0.0,0.452123,0.0


### Derived feature

In [20]:
x = np.arange(1, 9)
x

array([1, 2, 3, 4, 5, 6, 7, 8])

In [21]:
X = x[:, np.newaxis]
X

array([[1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8]])

In [22]:
from sklearn.preprocessing import PolynomialFeatures

In [23]:
PolynomialFeatures?

[0;31mInit signature:[0m
[0mPolynomialFeatures[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdegree[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minteraction_only[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minclude_bias[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0morder[0m[0;34m=[0m[0;34m'C'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Generate polynomial and interaction features.

Generate a new feature matrix consisting of all polynomial combinations
of the features with degree less than or equal to the specified degree.
For example, if an input sample is two dimensional and of the form
[a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].

Parameters
----------
degree : integer
    The degree of the polynomial features. Default = 2.

interaction_only : boolean, default = False
    If true, only interaction features are pr

In [26]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X2 = poly.fit_transform(X)
X2

array([[ 1.,  1.],
       [ 2.,  4.],
       [ 3.,  9.],
       [ 4., 16.],
       [ 5., 25.],
       [ 6., 36.],
       [ 7., 49.],
       [ 8., 64.]])

In [27]:
poly = PolynomialFeatures(degree=2, include_bias=True)
X2_ = poly.fit_transform(X)
X2_

array([[ 1.,  1.,  1.],
       [ 1.,  2.,  4.],
       [ 1.,  3.,  9.],
       [ 1.,  4., 16.],
       [ 1.,  5., 25.],
       [ 1.,  6., 36.],
       [ 1.,  7., 49.],
       [ 1.,  8., 64.]])

In [28]:
poly = PolynomialFeatures(degree=3, include_bias=False)
X3 = poly.fit_transform(X)
X3

array([[  1.,   1.,   1.],
       [  2.,   4.,   8.],
       [  3.,   9.,  27.],
       [  4.,  16.,  64.],
       [  5.,  25., 125.],
       [  6.,  36., 216.],
       [  7.,  49., 343.],
       [  8.,  64., 512.]])

### Handling Missing data

In [29]:
nan = np.nan
XX = np.array([
    [nan, 0,   3],
    [4,   5,   9],
    [8,   7,   2],
    [4,   nan, 6],
    [8,   8,   1]
])
XX

array([[nan,  0.,  3.],
       [ 4.,  5.,  9.],
       [ 8.,  7.,  2.],
       [ 4., nan,  6.],
       [ 8.,  8.,  1.]])

In [30]:
yy = np.array([12, 18, -3, 3, -1])

In [31]:
from sklearn.preprocessing import Imputer

In [32]:
Imputer?

[0;31mInit signature:[0m [0mImputer[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Imputation transformer for completing missing values.

Read more in the :ref:`User Guide <imputation>`.

Parameters
----------
missing_values : integer or "NaN", optional (default="NaN")
    The placeholder for the missing values. All occurrences of
    `missing_values` will be imputed. For missing values encoded as np.nan,
    use the string value "NaN".

strategy : string, optional (default="mean")
    The imputation strategy.

    - If "mean", then replace missing values using the mean along
      the axis.
    - If "median", then replace missing values using the median along
      the axis.
    - If "most_frequent", then replace missing using the most frequent
      value along the axis.

axis : integer, optional (default=0)
    The axis along which to impute.

    - If `axis=0`, then impute along col

In [33]:
Imputer(strategy="mean").fit_transform(XX)

array([[6., 0., 3.],
       [4., 5., 9.],
       [8., 7., 2.],
       [4., 5., 6.],
       [8., 8., 1.]])

In [34]:
Imputer(strategy="median").fit_transform(XX)

array([[6., 0., 3.],
       [4., 5., 9.],
       [8., 7., 2.],
       [4., 6., 6.],
       [8., 8., 1.]])

In [35]:
Imputer(strategy="most_frequent").fit_transform(XX)

array([[4., 0., 3.],
       [4., 5., 9.],
       [8., 7., 2.],
       [4., 0., 6.],
       [8., 8., 1.]])

### Feature pipeline

In [37]:
from sklearn.linear_model import LinearRegression

In [36]:
from sklearn.pipeline import make_pipeline

In [38]:
model = make_pipeline(
    Imputer(strategy="mean"),
    PolynomialFeatures(degree=2),
    LinearRegression()
)

In [39]:
model.fit(XX, yy)

Pipeline(memory=None,
         steps=[('imputer',
                 Imputer(axis=0, copy=True, missing_values='NaN',
                         strategy='mean', verbose=0)),
                ('polynomialfeatures',
                 PolynomialFeatures(degree=2, include_bias=True,
                                    interaction_only=False, order='C')),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [40]:
model.predict(XX)

array([12., 18., -3.,  3., -1.])