# pipelines.py — feature-engineering pipelines (126 lines, 3.29 KB)
import numpy as np
import pandas as pd
from sklego.preprocessing import ColumnSelector
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler, QuantileTransformer, Normalizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn_transformers import (
FTLangdetectTransformer,
PooledDeBertaTransformer
)
from config import FASTTEXT_MODEL_PATH, MSFTDeBertaV3Config
from english_utils import (
number_of_unigrams,
number_of_line_breaks,
get_punctuation_error_fraction
)
def to_series(df: pd.DataFrame) -> pd.Series:
    """Squeeze a single-column DataFrame into its underlying Series.

    Used as the last step of the column-picker pipeline so downstream text
    transformers receive a 1-D Series rather than a 2-D frame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain exactly one column.

    Returns
    -------
    pd.Series
        The lone column of ``df``.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly one column.  An explicit raise is
        used instead of ``assert``, which is silently stripped under
        ``python -O`` and would let malformed input through.
    """
    if df.shape[1] != 1:
        raise ValueError(f"expected exactly one column, got {df.shape[1]}")
    return df.iloc[:, 0]
# Name of the raw-text column consumed by every feature pipeline below.
FEATURE_COLUMNS = ["full_text"]

# Shared front-end: select the text column and squeeze it to a 1-D Series,
# which the downstream text transformers expect.
_picker_steps = [
    ("pick_full_text_column", ColumnSelector(FEATURE_COLUMNS)),
    ("to_series", FunctionTransformer(to_series)),
]
feature_column_picker_pipe = Pipeline(steps=_picker_steps)
# Feature: per-document unigram (word) count, standardized to zero mean /
# unit variance.
number_of_unigrams_pipe = Pipeline(steps=[
    ("feature_column_picker", feature_column_picker_pipe),
    ("count_unigrams", FunctionTransformer(number_of_unigrams)),
    ("scale", StandardScaler()),
])
# Feature: per-document line-break count, standardized to zero mean /
# unit variance.
number_of_line_breaks_pipe = Pipeline(steps=[
    ("feature_column_picker", feature_column_picker_pipe),
    ("count_line_breaks", FunctionTransformer(number_of_line_breaks)),
    ("scale", StandardScaler()),
])
def _sum_columns(x):
    """Row-wise sum of the token-count matrix.

    Defined at module level (rather than as a lambda inside the pipeline)
    so the fitted pipeline remains picklable — ``FunctionTransformer`` holds
    a reference to this callable, and lambdas cannot be pickled, which
    breaks joblib/pickle model persistence.
    """
    return np.sum(x, axis=1)


# Feature: relative usage of lowercase "i" vs uppercase "I", plus a
# standardized total count of both tokens.
i_pipe = Pipeline(
    steps=[
        # pick the column
        ("feature_column_picker", feature_column_picker_pipe),
        # count single-character word tokens "i" and "I" (case-sensitive)
        ("i_I_counter", CountVectorizer(vocabulary=["i", "I"], lowercase=False, token_pattern=r"(?u)\b\w\b")),
        ("union", FeatureUnion(
            [
                # proportions: per-row i/I shares summing to 1 (L1 norm)
                ("l1normalizer", Normalizer(norm='l1')),
                # magnitude: standardized total count of both tokens
                ("scaled_total_count", Pipeline(
                    steps=[
                        ("sum_columns", FunctionTransformer(_sum_columns)),
                        ("std_scaler", StandardScaler()),
                    ]
                )),
            ]
        )),
    ]
)
# Feature: fraction of punctuation errors detected in the raw text.
bad_punctuation_pipe = Pipeline(steps=[
    ("feature_column_picker", feature_column_picker_pipe),
    ("bad_punctuation_frac", FunctionTransformer(get_punctuation_error_fraction)),
])
# Feature: TF-IDF over the raw text; sublinear term-frequency scaling,
# vocabulary pruned to terms appearing in 1%–99% of documents.
_tfidf_vectorizer = TfidfVectorizer(lowercase=True, sublinear_tf=True, min_df=0.01, max_df=0.99)
tf_idf_pipe = Pipeline(steps=[
    ("feature_column_picker", feature_column_picker_pipe),
    ("tf-idf", _tfidf_vectorizer),
])
def make_english_score_pipe(model_path=FASTTEXT_MODEL_PATH):
    """Build a pipeline scoring how English-like each text is.

    The fastText language-detection score is standardized so it is on the
    same scale as the other numeric features.

    Parameters
    ----------
    model_path : str, default FASTTEXT_MODEL_PATH
        Path to the fastText language-identification model.
    """
    steps = [
        ("feature_column_picker", feature_column_picker_pipe),
        ("english_scorer", FTLangdetectTransformer(model_path=model_path)),
        ("scale", StandardScaler()),
    ]
    return Pipeline(steps=steps)
def make_deberta_pipe(deberta_config):
    """Build a pipeline producing pooled DeBERTa embeddings of the text.

    Parameters
    ----------
    deberta_config : MSFTDeBertaV3Config
        Configuration object passed through to ``PooledDeBertaTransformer``.
    """
    # HACK: resetting the index first is, per the original author, the only
    # way this pipeline works with the DeBERTa transformer downstream.
    return Pipeline(
        steps=[
            ("index_resetter", FunctionTransformer(lambda _df: _df.reset_index())),
            ("feature_column_picker", feature_column_picker_pipe),
            ("deberta_embedding", PooledDeBertaTransformer(deberta_config)),
        ]
    )
def make_features_pipeline(
    fastext_model_path,
    deberta_config: MSFTDeBertaV3Config
):
    """Assemble the full feature extractor as a union of all sub-pipelines.

    Parameters
    ----------
    fastext_model_path : str
        Path to the fastText model used by the English-score pipeline.
        (Name kept as-is — note the existing "fastext" spelling — for
        backward compatibility with callers.)
    deberta_config : MSFTDeBertaV3Config
        Configuration for the pooled DeBERTa embedding pipeline.

    Returns
    -------
    FeatureUnion
        Horizontal concatenation of all hand-crafted and embedding features.
    """
    named_pipes = [
        ("unigrams_count", number_of_unigrams_pipe),
        ("line_breaks_count", number_of_line_breaks_pipe),
        ("english_score", make_english_score_pipe(fastext_model_path)),
        ("i_vs_I", i_pipe),
        ("bad_punctuation", bad_punctuation_pipe),
        ("tf-idf", tf_idf_pipe),
        ("deberta_pipe", make_deberta_pipe(deberta_config)),
    ]
    return FeatureUnion(named_pipes)