In [2]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC

In [3]:
categories = ['sci.med', 'sci.space']
X_train, y_train = fetch_20newsgroups(random_state=1,
                                      subset='train',
                                      categories=categories,
                                      remove=('footers', 'quotes'),
                                      return_X_y=True)
X_test, y_test = fetch_20newsgroups(random_state=1,
                                    subset='test',
                                    categories=categories,
                                    remove=('footers', 'quotes'),
                                    return_X_y=True)

print(X_train[0])

From: mccall@mksol.dseg.ti.com (fred j mccall 575-3539)
Subject: Re: Metric vs English
Article-I.D.: mksol.1993Apr6.131900.8407
Organization: Texas Instruments Inc
Lines: 31




American, perhaps, but nothing military about it.  I learned (mostly)
slugs when we talked English units in high school physics and while
the teacher was an ex-Navy fighter jock the book certainly wasn't
produced by the military.

[Poundals were just too flinking small and made the math come out
funny; sort of the same reason proponents of SI give for using that.] 

-- 
"Insisting on perfect safety is for people who don't have the balls to live
 in the real world."   -- Mary Shafer, NASA Ames Dryden


In [4]:
def subject_body_extractor(posts):
    #用两列构造对象dtype数组
    #第一列=“主题”，第二列=“主体”
    features = np.empty(shape=(len(posts), 2), dtype=object)
    for i, text in enumerate(posts):
        # 临时变量“ _”存储“ \ n \ n”
        headers, _, body = text.partition('\n\n')
        # 将正文存储在第二栏中
        features[i, 1] = body

        prefix = 'Subject:'
        sub = ''
        # 在第一栏中的“主题：”之后保存文本
        for line in headers.split('\n'):
            if line.startswith(prefix):
                sub = line[len(prefix):]
                break
        features[i, 0] = sub

    return features


subject_body_transformer = FunctionTransformer(subject_body_extractor)

In [5]:
def text_stats(posts):
    return [{'length': len(text),
             'num_sentences': text.count('.')}
            for text in posts]


text_stats_transformer = FunctionTransformer(text_stats)

In [6]:
pipeline = Pipeline([
    # 提取标题和文字内容主体
    ('subjectbody', subject_body_transformer),
    # 使用ColumnTransformer组合标题和主体特征
    ('union', ColumnTransformer(
        [
            # 标题词袋（col 0）
            ('subject', TfidfVectorizer(min_df=50), 0),
            # 文章主体分解的词袋（col 1）
            ('body_bow', Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=50)),
            ]), 1),
            # 从帖子的正文中提取文本统计信息的管道
            ('body_stats', Pipeline([
                ('stats', text_stats_transformer),  # 返回字典列表
                ('vect', DictVectorizer()),  # 字典列表->特征矩阵
            ]), 1),
        ],
        # ColumnTransformer功能上的权重
        transformer_weights={
            'subject': 0.8,
            'body_bow': 0.5,
            'body_stats': 1.0,
        }
    )),
    # 在组合功能上使用SVC分类器
    ('svc', LinearSVC(dual=False)),
], verbose=True)

In [8]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print('Classification report:\n\n{}'.format(
    classification_report(y_test, y_pred))
)

[Pipeline] ....... (step 1 of 3) Processing subjectbody, total=   0.0s
[Pipeline] ............. (step 2 of 3) Processing union, total=   2.1s
[Pipeline] ............... (step 3 of 3) Processing svc, total=   0.0s
Classification report:

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       396
           1       0.87      0.84      0.85       394

    accuracy                           0.86       790
   macro avg       0.86      0.86      0.86       790
weighted avg       0.86      0.86      0.86       790


In [0]:
import pandas as pd
# 获取subjectbody的输出
X_subjectbody = pipeline.named_steps['subjectbody'].transform(X_train)

# 获取union的输出
X_union = pipeline.named_steps['union'].transform(X_subjectbody)

In [22]:
pipeline.named_steps['subjectbody'].get_feature_names_out()

AttributeError: This 'FunctionTransformer' has no attribute 'get_feature_names_out'

In [21]:
column_names = pipeline.named_steps['union'].get_feature_names_out()
df_result = pd.DataFrame(X_union, columns=column_names)


AttributeError: This 'FunctionTransformer' has no attribute 'get_feature_names_out'

In [20]:
X_union

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.53636680e-02,  5.07000000e+02,  4.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         2.88014140e-02,  1.22200000e+03,  4.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  7.58226317e-01, ...,
         1.68790836e-02,  1.74000000e+02,  2.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -1.40826685e-03,  4.87000000e+02,  4.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -1.68396701e-02,  3.24000000e+02,  4.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.34815789e-02,  1.10700000e+03,  2.50000000e+01]])