In [3]:
import numpy as np
import pandas as pd

In [4]:
# importing some data from vdp's handbook

data = [
    {'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
    {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
    {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
    {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'}
]

In [5]:
## feature engineering

# one-hot encoding
# super useful to represent thoughts, ideas, and dictionary-type stuff in a form
# that computers like (such as 1s and 0s instead of "cats" and "dogs")

# scikit's DictVectorizer can do this for us

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False, dtype=int)
vec.fit_transform(data)

array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]])

In [8]:
# above: all integer formats have been left alone (like price and rooms)
# but the neighborhood has been converted to one-hot

# we can get a quick idea of what's present again if we need to with this
vec.get_feature_names_out()

array(['neighborhood=Fremont', 'neighborhood=Queen Anne',
       'neighborhood=Wallingford', 'price', 'rooms'], dtype=object)

In [9]:
## Text Features

# importing some data from vdp

sample = ['problem of evil',
          'evil queen',
          'horizon problem']

In [10]:
# we can count the occurence of each word in earch entry using scikit's CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
X = vec.fit_transform(sample)
X

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [12]:
# using pandas to create a dataframe of X

pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())

Unnamed: 0,evil,horizon,of,problem,queen
0,1,0,1,1,0
1,1,0,0,0,1
2,0,1,0,1,0


In [14]:
# the above method is cool and all, but puts too much weight on words that appear hyper-frequently (ex: 'the')
# so we can use 'frequence-inverse document frequency (TF-IDF)

from sklearn.feature_extraction.text import TfidfTransformer
vec = TfidfTransformer()
X = vec.fit_transform(sample)
pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())

ValueError: Expected 2D array, got 1D array instead:
array=['problem of evil' 'evil queen' 'horizon problem'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.