# Feature engineering

## 1. Categorical data

In [1]:
# Toy housing records: one dict per listing, with two numeric fields
# ('price', 'room') and one string-valued field ('neighborhood').
data = [
    dict(price=price, room=room, neighborhood=neighborhood)
    for price, room, neighborhood in (
        (850000, 4, 'Queen Anne'),
        (700000, 3, 'Fremont'),
        (650000, 3, 'Wallingford'),
        (600000, 2, 'Fremont'),
    )
]

In [2]:
from sklearn.feature_extraction import DictVectorizer

# Turn the list of dicts into a dense integer matrix; the string-valued
# 'neighborhood' field is expanded into one indicator column per value.
vec = DictVectorizer(dtype=int, sparse=False)
vec.fit_transform(data)

array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]], dtype=int32)

In [3]:
# Column labels of the encoded matrix. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2; `get_feature_names_out()` replaces it and
# returns a NumPy array, so convert it to a plain list for display.
vec.get_feature_names_out().tolist()

['neighborhood=Fremont',
 'neighborhood=Queen Anne',
 'neighborhood=Wallingford',
 'price',
 'room']

In [5]:
# Same encoding as before, but returned as a sparse matrix so that the many
# zeros produced by one-hot encoding are not stored explicitly.
vec = DictVectorizer(
    dtype=int,
    sparse=True,
)
vec.fit_transform(data)

<4x5 sparse matrix of type '<class 'numpy.int32'>'
	with 12 stored elements in Compressed Sparse Row format>

## 2. Text features

In [8]:
# Three short documents used as a toy corpus for the text vectorizers.
# (The stray leading space in the last document has been removed.)
sample = ['problem of evil', 'evil queen', 'horizon problem']

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of `sample` and build the per-document word-count matrix.
vec = CountVectorizer()
X = vec.fit_transform(sample)

In [11]:
# Display the repr of the count matrix (a sparse matrix, so only its summary is shown).
X

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [12]:
import pandas as pd

# Densify the count matrix and label its columns with the learned vocabulary.
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# is the supported replacement.
pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())

Unnamed: 0,evil,horizon,of,problem,queen
0,1,0,1,1,0
1,1,0,0,0,1
2,0,1,0,1,0


### TF-IDF (term frequency–inverse document frequency)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Weight each word count by how rare the word is across the documents,
# then show the result as a labelled table.
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# is the supported replacement.
pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())

Unnamed: 0,evil,horizon,of,problem,queen
0,0.517856,0.0,0.680919,0.517856,0.0
1,0.605349,0.0,0.0,0.0,0.795961
2,0.0,0.795961,0.0,0.605349,0.0


## 3. Missing values

In [18]:
import numpy as np
from numpy import nan

# Small feature matrix (5 samples x 3 features) with two missing entries,
# marked with NaN, and the matching target vector.
X = np.array([
    [nan, 0,   3],
    [2,   7,   9],
    [3,   5,   2],
    [4,   nan, 6],
    [8,   8,   1],
])
y = np.array([14, 16, -1, 8, -5])

In [19]:
# Imputation: replace every missing value with the mean of its column.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
from sklearn.impute import SimpleImputer

# Alias kept so that any cell still referring to the old class name keeps working.
Imputer = SimpleImputer

imp = SimpleImputer(strategy='mean')
X2 = imp.fit_transform(X)
X2

array([[4.25, 0.  , 3.  ],
       [2.  , 7.  , 9.  ],
       [3.  , 5.  , 2.  ],
       [4.  , 5.  , 6.  ],
       [8.  , 8.  , 1.  ]])

In [21]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the imputed features, then predict on the same rows.
model = LinearRegression()
model.fit(X2, y)
model.predict(X2)

array([12.85969275, 14.22112069, -0.58568837, 11.21946747, -5.71459254])

## 4. Feature pipelines

In [24]:
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# `SimpleImputer` is imported here so this cell does not depend on the old name.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Chain the three steps into a single estimator: fill missing values with the
# column mean, expand to degree-2 polynomial features, then fit a linear model.
model = make_pipeline(
    SimpleImputer(strategy='mean'),
    PolynomialFeatures(degree=2),
    LinearRegression(),
)

# The pipeline accepts the raw X (NaNs included) because imputation is its first step.
model.fit(X, y)
print(y)
print(model.predict(X))

In [25]:
# Refit the pipeline on the raw data, then print the inputs followed by the
# predictions the pipeline makes for them.
model.fit(X, y)
print(X, model.predict(X), sep='\n')

[[nan  0.  3.]
 [ 2.  7.  9.]
 [ 3.  5.  2.]
 [ 4. nan  6.]
 [ 8.  8.  1.]]
[14. 16. -1.  8. -5.]
