### Обучение пайплайна

1. Загрузим данные для задачи определения токсичности комментариев
2. Соберем пайплайн с простейшим препроцессингом (tfidf) на текстовых данных
3. Обучим логистическую регрессию и сохраним обученный пайплайн на диск

In [1]:
import numpy as np
import pandas as pd
import dill

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

В данных шесть разных меток токсичности, но мы сведем все к бинарной классификации — toxic/nontoxic

In [2]:
# Names of the six label columns used later to build the binary target.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read the raw training data and preview its first three rows.
df = pd.read_csv('train.csv')
df.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [3]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


Сформируем бинарную целевую переменную y: 1, если у комментария есть хотя бы одна из меток токсичности, иначе 0

In [4]:
# Collapse the label columns into one target: the row-wise maximum over
# `class_names`, stored as a plain array in the new column 'y'.
row_max = df[class_names].max(axis=1)
df['y'] = row_max.values

# Show how many rows fall into each target value.
df['y'].value_counts()

0    143346
1     16225
Name: y, dtype: int64

In [5]:
# Preview the first five rows, now including the new 'y' column.
df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,y
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0


Разделим данные на train/test и сохраним обе выборки на диск (тестовой выборки здесь мы касаться уже не будем)

In [6]:
# Hold out 33% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
# `columns='y'` replaces the positional axis argument of `df.drop('y', 1)`,
# which pandas deprecated in 1.3 and rejects since 2.0.
# NOTE(review): the feature frames still carry the six original label columns;
# only 'y' is dropped here.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='y'),
    df['y'],
    test_size=0.33,
    random_state=42,
)

# Save the test split (index=False: do not write the row index).
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

# Save the train split.
X_train.to_csv("X_train.csv", index=False)
y_train.to_csv("y_train.csv", index=False)

In [7]:
# Share of each target value in the training split.
y_train.value_counts(normalize=True)

0    0.898711
1    0.101289
Name: y, dtype: float64

In [8]:
# Share of each target value in the test split.
y_test.value_counts(normalize=True)

0    0.897529
1    0.102471
Name: y, dtype: float64

In [9]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that extracts one column from its input.

    The step is stateless: `fit` learns nothing and `transform` simply
    indexes the input with `key`, so later steps receive only that column.
    """

    def __init__(self, key):
        # Label used to index the input in `transform`.
        self.key = key

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return `X[key]`."""
        selected = X[self.key]
        return selected
    
class TextImputer(BaseEstimator, TransformerMixin):
    """
    Pipeline step that fills missing values in one column with a constant.

    Parameters
    ----------
    key : label of the column to impute.
    value : replacement written wherever that column is missing.
    """

    def __init__(self, key, value):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of `X` whose `key` column has missing values filled."""
        # Work on a copy: assigning into `X` directly would silently modify
        # the caller's frame every time the pipeline is fitted or applied.
        X = X.copy()
        X[self.key] = X[self.key].fillna(self.value)
        return X

In [10]:
# Name of the input text column and of the binary target column.
# NOTE(review): neither variable is referenced in the cells visible here.
features = ['comment_text']
target = 'y'

Соберем кусок, ответственный за feature engineering

In [11]:
# Text branch: fill missing comments with a space, pick the text column,
# then turn it into TF-IDF features.
text_steps = [
    ('imputer', TextImputer('comment_text', ' ')),
    ('selector', ColumnSelector(key='comment_text')),
    (
        'tfidf',
        TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            token_pattern=r'\w{1,}',
            stop_words='english',
            ngram_range=(1, 1),
            max_features=10000,
        ),
    ),
]
comment_text = Pipeline(text_steps)

# Union of feature branches; currently it holds only the text branch.
feats = FeatureUnion([('comment_text', comment_text)])

Добавим простейший классификатор

In [12]:
%%time

# Full model: the feature union followed by a logistic regression classifier.
pipeline = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression(C=0.1, solver='sag')),
])

# Fit the feature steps and the classifier on the training split.
pipeline.fit(X_train, y_train)

Wall time: 7.04 s


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('comment_text',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='comment_text',
                                                                              value=' ')),
                                                                 ('selector',
                                                                  ColumnSelector(key='comment_text')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(max_features=10000,
                                                                                  stop_words='english',
                                                                                  strip_accents='unicode',
                                                                       

Посмотрим, как выглядит наш pipeline

In [13]:
# List the (name, estimator) pairs that make up the fitted pipeline.
pipeline.steps

[('features',
  FeatureUnion(transformer_list=[('comment_text',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='comment_text',
                                                               value=' ')),
                                                  ('selector',
                                                   ColumnSelector(key='comment_text')),
                                                  ('tfidf',
                                                   TfidfVectorizer(max_features=10000,
                                                                   stop_words='english',
                                                                   strip_accents='unicode',
                                                                   sublinear_tf=True,
                                                                   token_pattern='\\w{1,}'))]))])),
 ('classifier', LogisticRegression(C=0.1, solver='

Сохраним модель (пайплайн)

In [14]:
# Serialize the fitted pipeline to disk with dill.
with open("logreg_pipeline.dill", "wb") as model_file:
    dill.dump(pipeline, model_file)