## 6.1 管道和复合估算器

### 6.1.4 异构数据的列转换器

In [4]:
import pandas as pd

# Toy book catalogue, written one record per row:
# (city, title, expert_rating, user_rating).
books = [
    ("London", "His Last Bow", 5, 4),
    ("London", "How Watson Learned the Trick", 3, 5),
    ("Paris", "A Moveable Feast", 4, 4),
    ("Sallisaw", "The Grapes of Wrath", 5, 3),
]
columns = ["city", "title", "expert_rating", "user_rating"]

# Pivot the row tuples into one list per column so the frame is built from
# a column-name -> values mapping.
X = pd.DataFrame(
    {name: list(values) for name, values in zip(columns, zip(*books))}
)

In [5]:
# Show the frame: two text columns (city, title) and two integer rating columns.
X

Unnamed: 0,city,title,expert_rating,user_rating
0,London,His Last Bow,5,4
1,London,How Watson Learned the Trick,3,5
2,Paris,A Moveable Feast,4,4
3,Sallisaw,The Grapes of Wrath,5,3


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

# One sub-transformer per text column.
city_encoder = OneHotEncoder(dtype="int")  # one integer indicator column per city
title_vectorizer = CountVectorizer()  # bag-of-words token counts for the title

# Column selection follows scikit-learn's rules: a list (["city"]) passes a
# 2-D selection to the encoder, while a bare string ("title") passes the
# single 1-D column that CountVectorizer expects.
column_trans = ColumnTransformer(
    transformers=[
        ("city_category", city_encoder, ["city"]),
        ("title_bow", title_vectorizer, "title"),
    ],
    # Columns not named above (the two rating columns) are discarded.
    remainder="drop",
)

# Fit in place; as the last expression, the fitted transformer is displayed.
column_trans.fit(X)

In [12]:
from sklearn import set_config
# Ask scikit-learn to display estimators with their HTML diagram
# representation when shown in a Jupyter context.
set_config(display="diagram")
column_trans

In [8]:
# List the output feature names; each is prefixed with the name of the
# transformer that produced it ("city_category__", "title_bow__").
column_trans.get_feature_names_out()

array(['city_category__city_London', 'city_category__city_Paris',
       'city_category__city_Sallisaw', 'title_bow__bow',
       'title_bow__feast', 'title_bow__grapes', 'title_bow__his',
       'title_bow__how', 'title_bow__last', 'title_bow__learned',
       'title_bow__moveable', 'title_bow__of', 'title_bow__the',
       'title_bow__trick', 'title_bow__watson', 'title_bow__wrath'],
      dtype=object)

In [11]:
# Apply the fitted transformers to X and convert the result to a dense array
# so every encoded row is printed in full (one row per book).
column_trans.transform(X).toarray()

array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1]], dtype=int64)