In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from catboost import CatBoostRegressor, MultiRegressionCustomMetric
from pycbrf import ExchangeRates
import torch
import itertools

In [2]:
import nltk
import string
from nltk import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from navec import Navec
import holidays
import re
nltk.download('stopwords')
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Seed passed as random_state to the KMeans and CatBoost models defined in this notebook.
RANDOM_STATE = 42

## Загрузка и предобработка данных

In [None]:
# Read the training and test tables from CSV files located in the working directory.
data = pd.read_csv('train_dataset_train.csv')
test_data = pd.read_csv('test_dataset_test.csv')

In [4]:
def save_solution(pred, filename):
    """Write a submission CSV with document ids and the three predicted targets.

    Parameters
    ----------
    pred : array-like
        Predictions assigned to the 'views', 'depth' and 'full_reads_percent'
        columns, in that order.
    filename : str
        Path of the CSV file to create.

    Notes
    -----
    Document ids are taken from the module-level ``test_data`` frame.
    """
    target_columns = ['views', 'depth', 'full_reads_percent']
    submission = pd.DataFrame(test_data['document_id'])
    submission[target_columns] = pred
    submission.to_csv(filename, index=False)

def parse_title(title):
    """Collapse a raw multi-line title into a single line.

    A title made of exactly two lines becomes ``"<first>. <second stripped>"``;
    for any other number of lines only the first line is returned.
    """
    parts = title.split('\n')
    if len(parts) != 2:
        return parts[0]
    headline, subtitle = parts
    return headline + '. ' + subtitle.strip()

def get_USD_rate(date):
    """Return the change of the USD rate between ``date`` and the previous day.

    Rates are looked up through ``ExchangeRates(day)['USD'].value`` and memoised
    in the module-level ``rates`` dict, so each day is requested at most once.
    """
    def _rate_for(day):
        # Fetch and cache the rate the first time a day is seen.
        if day not in rates:
            rates[day] = ExchangeRates(day)['USD'].value
        return rates[day]

    current_rate = _rate_for(date)
    prev_rate = _rate_for(date - pd.Timedelta(days=1))
    return current_rate - prev_rate

In [6]:
def extract_features(data):
    """Add hand-crafted feature columns to ``data`` in place.

    Columns written:
      * ``clean_title``      - title normalised by ``parse_title``;
      * ``num_words``        - number of whitespace-separated words in the raw title;
      * ``num_authors``      - length of the list stored as text in ``authors``;
      * ``num_tags``         - length of the list stored as text in ``tags``;
      * ``USD_difference``   - value returned by ``get_USD_rate`` for the publication
                               date (a day-over-day rate difference, not the rate itself);
      * ``is_group_authors`` - 1 when there is more than one author, else 0.

    The frame is modified in place and nothing is returned.
    """
    import ast  # local import so this cell does not depend on an edit to the import cell

    data['clean_title'] = data['title'].apply(parse_title)
    data['num_words'] = data['title'].apply(lambda text: len(text.split()))

    # The list columns arrive as text from a CSV file. ast.literal_eval parses
    # list literals exactly like eval does, but cannot execute arbitrary code.
    data['num_authors'] = data['authors'].apply(ast.literal_eval).apply(len)
    data['num_tags'] = data['tags'].apply(ast.literal_eval).apply(len)

    publish_days = pd.to_datetime(data['publish_date']).dt.date
    data['USD_difference'] = publish_days.apply(get_USD_rate)

    data['is_group_authors'] = (data['num_authors'] > 1).astype(int)

In [7]:
# Drop hand-picked rows by index label.
# NOTE(review): label-based drops with inplace=True make this cell non-idempotent -
# re-running it without reloading `data` raises KeyError.
data.drop([214, 948, 2424, 2438, 3603, 3756, 4183, 5086, 5337, 5634, 5923, 5951, 6198, 6359], inplace=True) # Articles in Chinese and English, outliers
# NOTE(review): value_counts() sorts descending, so [-3:] selects the THREE least
# frequent categories, while the comment mentions a single category - confirm intent.
data.drop(data[data.category.isin(data.category.value_counts()[-3:].index)].index, inplace=True) # Drop articles of the "discussion club" category (8 items, absent from the test set)

# PREPROCESSING
# `rates` is the cache dict that get_USD_rate reads and fills.
rates = {}
extract_features(data)
extract_features(test_data)

# ADD TEXT TAGS TO THE DATASET
tags_df = pd.read_csv('parsed_tags.csv')
# Each tags_text cell is parsed with eval and its items are joined with spaces.
# NOTE(review): eval executes whatever the CSV contains; ast.literal_eval would be safer.
tags_df.tags_text = tags_df.tags_text.apply(lambda x: ' '.join(eval(x)))
data = data.merge(tags_df, how='left', on='document_id')
test_data = test_data.merge(tags_df, how='left', on='document_id')

# REMOVE DUPLICATED ARTICLES
# Every row whose clean_title occurs more than once is removed (no copy is kept).
nonunique_titles = data.clean_title.value_counts()
nonunique_titles = nonunique_titles[nonunique_titles>1].index.to_list()
multiple_titles = data[data.clean_title.isin(nonunique_titles)].index.to_list()
data.drop(multiple_titles, inplace=True)
# reset_index() without drop=True keeps the old index as an extra 'index' column.
data = data.reset_index()
# The 'views' target is replaced by its natural logarithm from here on.
data.views = data.views.apply(np.log)

# Split the frame into the three regression targets and the remaining columns.
targets = ['views', 'depth', 'full_reads_percent']
y = data[targets]
X = data.drop(targets, axis=1)

## Энкодеры

In [61]:
class DatetimeEncoder(BaseEstimator, TransformerMixin):
    """Turn a publication-timestamp column into calendar features.

    Parameters
    ----------
    day_split : int, default=12
        Number of time-of-day buckets; the bucket width is ``int(24 / day_split)`` hours.

    Output columns: ``time_of_day``, ``day``, ``month``, ``is_dayoff``, ``time_difference``.
    """

    def __init__(self, day_split=12):
        self.feature_names = []
        self.day_split = day_split
        # Russian public holidays for the listed years.
        self.hol = holidays.Russia(years=[2017, 2018, 2019, 2020, 2021, 2022])

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Build the calendar feature frame for the timestamps in ``X``."""
        # Timestamps are shifted by three hours before any feature is derived
        # (presumably UTC -> Moscow time - TODO confirm).
        moments = pd.to_datetime(X) + pd.Timedelta('03:00:00')

        bucket_width = int(24 / self.day_split)
        hour_edges = list(range(0, 25, bucket_width))
        hours = moments.dt.strftime("%H").astype(int)

        # %w numbers Sunday as 0 and Saturday as 6.
        is_weekend = moments.dt.strftime("%w").astype(int).isin([0, 6]).astype(int)
        is_holiday = moments.dt.date.isin(self.hol)

        features = pd.DataFrame()
        # Left-closed hour buckets labelled 0 .. day_split - 1.
        features['time_of_day'] = pd.cut(hours, hour_edges, labels=range(self.day_split), right=False)
        features['day'] = moments.dt.strftime('%d').astype(int)
        # Month and two-digit year as a string.
        features['month'] = moments.dt.strftime('%m-%y')
        # Set when the date falls on a weekend or on a holiday.
        features['is_dayoff'] = is_weekend | is_holiday
        # Whole days between the shifted timestamp and the fixed reference date.
        features['time_difference'] = (pd.to_datetime('2022-05-29') - moments).dt.days

        self.feature_names = np.array(features.columns)
        return features

    def get_feature_names_out(self, *args):
        """Return the column names produced by the most recent ``transform`` call."""
        return self.feature_names

In [8]:
class TitleEncoderDP(BaseEstimator, TransformerMixin):
    """Replace each text with the id of its agglomerative cluster of sentence embeddings.

    Parameters
    ----------
    distance_threshold : float, default=0.4
        Linkage distance above which clusters are not merged
        (forwarded to ``AgglomerativeClustering``).

    Notes
    -----
    NOTE(review): clustering is recomputed on every ``transform`` call, so cluster
    ids obtained for two different inputs (e.g. train and test) are unrelated.
    The embedding model is also re-loaded on every call.
    """

    def __init__(self,  distance_threshold=0.4):
        self.feature_names = ['clustering']
        self.distance_threshold = distance_threshold

    def fit(self, title, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, title, y=None):
        """Embed the texts, cluster them and return a one-column frame of cluster labels.

        ``title`` must be a pandas Series; its index is reused for the result.
        """
        embedder = SentenceTransformer('DeepPavlov/rubert-base-cased-sentence')

        # Pass a plain list so that positional access inside the encoder cannot be
        # confused by a Series with a non-default index. The former debug print of
        # the whole embedding matrix was removed.
        embeddings = embedder.encode(list(title))
        # L2-normalise every embedding row.
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

        clustering_model = AgglomerativeClustering(n_clusters=None, affinity='cosine', linkage='average', distance_threshold=self.distance_threshold)
        clustering_model.fit(embeddings)
        return pd.DataFrame(clustering_model.labels_, index=title.index)

    def get_feature_names_out(self, *args):
        """Return the fixed output column name list."""
        return self.feature_names

In [31]:
class TitleEncoderRUBERT(BaseEstimator, TransformerMixin):
    """Cluster texts with KMeans over "cointegrated/rubert-tiny2" sentence vectors.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of KMeans clusters.

    Notes
    -----
    NOTE(review): ``AutoTokenizer`` and ``AutoModel`` are not imported in the visible
    cells (presumably they come from ``transformers`` - TODO confirm and import).
    KMeans is re-fitted inside every ``transform`` call.
    """

    def __init__(self,  n_clusters=10):
        self.feature_names = []
        self.n_clusters = n_clusters

    def fit(self, title, y=None):
        """Load the pretrained tokenizer and model; returns ``self``."""
        self.tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
        self.model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
        return self

    def transform(self, title, y=None):
        """Vectorise every text, fit KMeans on the vectors and return the labels."""
        frame = pd.DataFrame({'title': title})
        frame['vectors'] = frame['title'].apply(self.get_sentence_vec)
        # Spread each vector over its own numeric columns, aligned on the index.
        expanded = pd.DataFrame(frame['vectors'].tolist(), index=frame.index)
        embedding_matrix = frame.join(expanded).drop(['title', 'vectors'], axis=1)

        kmeans = KMeans(n_clusters=self.n_clusters, max_iter=10000, n_init=100, random_state=RANDOM_STATE)
        kmeans.fit(embedding_matrix)

        labels = pd.DataFrame(kmeans.labels_, index=embedding_matrix.index, columns=['clustering'])
        self.feature_names = labels.columns
        return labels

    def get_feature_names_out(self, *args):
        """Return the column names produced by the most recent ``transform`` call."""
        return self.feature_names

    def get_sentence_vec(self, sentence):
        """Return the normalised first-token hidden state of ``sentence`` as a numpy array."""
        encoded = self.tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
        # Move every tokenizer output to the device the model lives on.
        model_inputs = {key: tensor.to(self.model.device) for key, tensor in encoded.items()}
        with torch.no_grad():
            model_output = self.model(**model_inputs)
        first_token = model_output.last_hidden_state[:, 0, :]
        normalised = torch.nn.functional.normalize(first_token)
        return normalised[0].cpu().numpy()

In [10]:
class TitleEncoder(BaseEstimator, TransformerMixin):
    """Cluster titles with KMeans over averaged Navec word vectors.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of KMeans clusters.

    Attributes
    ----------
    missing_counter : list
        Stems for which no vocabulary word was found.
    abnormal : list
        Token lists of titles for which no word vector could be found.
    """

    def __init__(self, n_clusters=10):
        self.feature_names = []
        self.navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
        self.navec_voc = self.navec.vocab.words
        self.missing_counter = []
        self.abnormal = []
        self.n_clusters = n_clusters

    def fit(self, title, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, title, y=None):
        """Vectorise every title, fit KMeans on the vectors and return the labels."""
        frame = pd.DataFrame({'title': title})
        frame['vectors'] = frame['title'].apply(self.get_sentence_vec)
        # Spread each vector over its own numeric columns, aligned on the index.
        expanded = pd.DataFrame(frame['vectors'].tolist(), index=frame.index)
        embedding_matrix = frame.join(expanded).drop(['title', 'vectors'], axis=1)

        kmeans = KMeans(n_clusters=self.n_clusters, max_iter=10000, n_init=100, random_state=RANDOM_STATE)
        kmeans.fit(embedding_matrix)

        labels = pd.DataFrame(kmeans.labels_, index=embedding_matrix.index, columns=['clustering'])
        self.feature_names = labels.columns
        return labels

    def get_feature_names_out(self, *args):
        """Return the column names produced by the most recent ``transform`` call."""
        return self.feature_names

    def tokenize_sentence(self, sentence):
        """Return the lower-cased purely alphabetic tokens of ``sentence``."""
        raw_tokens = word_tokenize(sentence, language='russian')
        return [token.lower() for token in raw_tokens if token.isalpha()]

    def find_missing(self, missing):
        """Look up vocabulary replacements for words that have no vector of their own.

        Each word is stemmed, and the first vocabulary entry that starts with the
        stem and is present in ``self.navec`` is returned for it. Stems with no
        matching vocabulary entry are appended to ``self.missing_counter``.
        """
        stemmer = SnowballStemmer(language='russian')
        replacements = []
        for stem in [stemmer.stem(word) for word in missing]:
            if not stem:
                continue
            pattern = re.compile(r'\b' + stem + r'\w*\b')
            candidates = [entry for entry in self.navec_voc if pattern.match(entry)]
            if not candidates:
                self.missing_counter.append(stem)
                continue
            for entry in candidates:
                if entry in self.navec:
                    replacements.append(entry)
                    break
        return replacements

    def get_sentence_vec(self, title):
        """Return the averaged word vector of ``title`` (zeros when no word is known)."""
        stop_words = stopwords.words('russian')
        tokens = [token for token in self.tokenize_sentence(title) if token not in stop_words]
        known = [token for token in tokens if token in self.navec]
        unknown = [token for token in tokens if token not in known]
        known += self.find_missing(unknown)

        if len(known) == 0:
            # No usable word: remember the token list and fall back to a zero vector.
            self.abnormal.append(tokens)
            return np.zeros((300,))
        # The denominator is the word count plus 0.001, as in the original formula.
        return sum([self.navec.get(word) for word in known])/(len(known)+0.001)
    

In [17]:
class OHE_for_list(BaseEstimator,TransformerMixin):
    """One-hot encode columns whose cells hold a list literal stored as text.

    Parameters
    ----------
    prefix : bool, default=True
        When true, output columns are named ``"<column>_<value>"``; otherwise the
        bare value is used as the column name.
    unique_values : dict or None, default=None
        Maps a column name to the list of values that get an indicator column.
        Columns that are missing (or mapped to an empty list) have their values
        collected from the data on the first ``transform`` call and stored back
        into this dict.
    """

    def __init__(self, prefix=True, unique_values=None):
        self.feature_names = []
        self.prefix = prefix
        # A fresh dict per instance: the former ``{}`` default was one shared object,
        # so values learned by one encoder leaked into every later encoder.
        self.unique_values = {} if unique_values is None else unique_values

    def fit(self, X, y = None):
        """Nothing is learned here; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return an indicator matrix with one row per object and one column per value.

        A cell is 1 when the value is an element of the object's list. Membership
        is exact; values are no longer treated as regular expressions or matched
        as substrings of the raw text.
        """
        import ast  # local import so this cell does not depend on an edit to the import cell

        X = pd.DataFrame(X)  # a single column may be passed as a Series
        encoded_parts = [pd.DataFrame(index=X.index)]
        for col in X.columns:
            # The cells come from a CSV file: literal_eval parses list literals
            # without being able to execute code, unlike eval.
            parsed = X[col].apply(ast.literal_eval)

            if not self.unique_values.get(col):
                self.unique_values[col] = list({item for items in parsed for item in items})

            # Drop repeated values while keeping their order (one column per value).
            values = list(dict.fromkeys(self.unique_values[col]))
            position = {value: idx for idx, value in enumerate(values)}

            indicators = np.zeros((len(X), len(values)), dtype=int)
            for row, items in enumerate(parsed):
                for item in items:
                    idx = position.get(item)
                    if idx is not None:
                        indicators[row, idx] = 1

            names = [f'{col}_{value}' for value in values] if self.prefix else values
            encoded_parts.append(pd.DataFrame(indicators, index=X.index, columns=names))

        result_df = pd.concat(encoded_parts, axis=1)
        self.feature_names = np.array(result_df.columns)
        return result_df

    def get_feature_names_out(self, *args):
        """Return the column names produced by the most recent ``transform`` call."""
        return self.feature_names

In [101]:
class Custom_R2_Metric(MultiRegressionCustomMetric):
    """CatBoost custom metric: weighted sum of per-target R2 scores (0.4 / 0.3 / 0.3)."""

    def get_final_error(self, error, weight):
        """Return the accumulated score unchanged; ``weight`` is ignored."""
        return error

    def is_max_optimal(self):
        """Larger metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(score, 0)`` for three prediction/target sequences.

        ``approxes[i]`` and ``target[i]`` are compared with ``r2_score`` for
        i = 0, 1, 2; ``weight`` is not used.
        """
        assert len(approxes) == 3
        assert len(target[0]) == len(approxes[0])
        views_r2 = r2_score(target[0], approxes[0])
        depth_r2 = r2_score(target[1], approxes[1])
        frp_r2 = r2_score(target[2], approxes[2])
        weighted = 0.4 * views_r2 + 0.3 * depth_r2 + 0.3 * frp_r2
        return weighted, 0

In [18]:
def get_top_features(df, target, importance=0.2):
    """Select, per column, the list values whose CatBoost importance exceeds a threshold.

    Each column of ``df`` is one-hot encoded with ``OHE_for_list(prefix=False)``, a
    CatBoost regressor is fitted on the encoded matrix against ``target``, and the
    values with ``feature_importances_`` strictly above ``importance`` are kept.

    Returns
    -------
    dict
        Column name -> list of selected values.
    """
    catboost_params = {'learning_rate': 0.1,
                       'depth': 6,
                       'l2_leaf_reg': 3,
                       'loss_function': 'MultiRMSE',
                       'eval_metric': 'MultiRMSE',
                       'task_type': 'CPU',
                       'iterations': 400,
                       'od_type': 'Iter',
                       'boosting_type': 'Plain',
                       'bootstrap_type': 'Bernoulli',
                       'allow_const_label': True,
                       'logging_level': 'Silent',
                       'random_state': RANDOM_STATE
                       }
    selected = {}
    for column in df:
        model = CatBoostRegressor(**catboost_params)
        encoder = OHE_for_list(prefix=False)
        model.fit(encoder.fit_transform(df[column]), target)
        scores = pd.DataFrame(model.feature_importances_,
                              index=encoder.get_feature_names_out(),
                              columns=['importance'])
        selected[column] = scores.loc[scores['importance'] > importance].index.to_list()
    return selected

In [19]:
# For the 'authors' and 'tags' columns, keep the values whose importance exceeds 0.1;
# the resulting dict is later passed to OHE_for_list as `unique_values`.
unique = get_top_features(X[['authors', 'tags']], y, 0.1)
# Total number of selected author and tag values (shown as the cell output).
len(unique['authors']) + len(unique['tags'])

231

In [20]:
class Do_nothing(BaseEstimator, TransformerMixin):
    """Pass-through transformer that returns its input and records the column names."""

    def __init__(self):
        self.feature_names = []

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` untouched after remembering ``X.columns``."""
        column_names = np.array(X.columns)
        self.feature_names = column_names
        return X

    def get_feature_names_out(self, *args):
        """Return the column names seen by the most recent ``transform`` call."""
        return self.feature_names

## Обработка датасета в основном пайплайне

In [247]:
# COLUMN PROCESSING PIPELINE
# Median imputation (0.0 is treated as the missing marker) followed by standard scaling.
# NOTE(review): `impute` is not referenced by the column transformer below.
impute = Pipeline(steps=[('Imputer', SimpleImputer(missing_values=0.0, strategy='median')),
                        ('StdScaler', StandardScaler())])

# One transformer per feature group; the output columns follow the order listed here:
#   Do_nothing           - numeric/categorical columns passed through unchanged
#   DatetimeEncoder      - calendar features derived from 'publish_date'
#   Imputer              - 'ctr' with 0.0 replaced by the median
#   TitleTransformer     - cluster id of 'clean_title'
#   TitleTransformerTags - cluster id of 'tags_text'
#   OHE                  - indicators for the selected 'authors' and 'tags' values
col_transformer = ColumnTransformer([('Do_nothing', Do_nothing(), ['num_authors', 'num_tags', 'num_words', 'USD_difference', 'is_group_authors', 'category']),
                                     ('DatetimeEncoder', DatetimeEncoder(), 'publish_date'),
                                     ('Imputer', SimpleImputer(missing_values=0.0, strategy='median'), ['ctr']),
                                     ('TitleTransformer', TitleEncoderDP(distance_threshold=1.3), 'clean_title'),
                                     ('TitleTransformerTags', TitleEncoderDP(distance_threshold=1.3), 'tags_text'),
                                     ('OHE', OHE_for_list(unique_values=unique), ['authors', 'tags'])
                                    ])

In [249]:
# DATASET PROCESSING
# Fit the column transformer on the training features and wrap the result in a frame
# whose columns are the transformer's output feature names.
X_train = pd.DataFrame(col_transformer.fit_transform(X), columns=col_transformer.get_feature_names_out())
# Apply the fitted transformer to the raw test table. The previous version read
# X_test_final on the right-hand side before it was ever assigned, which raises
# NameError on a fresh kernel; `test_data` is the frame that save_solution pairs
# the predictions with.
X_test_final = pd.DataFrame(col_transformer.transform(test_data), columns=col_transformer.get_feature_names_out())

In [250]:
# SPLIT THE TRAINING DATA INTO TRAINING AND VALIDATION PARTS
# 80/20 split. NOTE(review): random_state is the literal 4 rather than RANDOM_STATE, and
# X_train is rebound to the training part, so re-running this cell alone fails on
# mismatched lengths of X_train and y.
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.2, random_state=4)

## Обучение и валидация модели

In [98]:
def get_score(y_test, pred):
    """Return the weighted R2 score: 0.4 * views + 0.3 * depth + 0.3 * full_reads_percent.

    Parameters
    ----------
    y_test : mapping of column name -> true values
        Must provide the keys "views", "depth" and "full_reads_percent".
    pred : 2-D array
        Predictions; columns 0, 1, 2 correspond to the three targets in that order.
    """
    target_names = ("views", "depth", "full_reads_percent")
    views_r2, depth_r2, frp_r2 = [
        r2_score(y_test[name], pred[:, column])
        for column, name in enumerate(target_names)
    ]
    return 0.4 * views_r2 + 0.3 * depth_r2 + 0.3 * frp_r2

In [105]:
# CatBoost settings: the model optimises MultiRMSE over the three targets jointly and is
# evaluated with the custom weighted-R2 metric.
# NOTE(review): the recorded output of this cell shows a numba failure on the custom
# metric ("Untyped global name 'r2_score'"); training still completed.
params = {'learning_rate': 0.1, 
          'depth': 6, 
          'l2_leaf_reg': 3, 
          'loss_function': 'MultiRMSE', 
          'eval_metric': Custom_R2_Metric(), 
          'task_type': 'CPU', 
          'iterations': 800,
          'od_type': 'Iter', 
          'boosting_type': 'Plain', 
          'bootstrap_type': 'Bernoulli', 
          'allow_const_label': True, 
          'logging_level': 'Silent',
          'random_state': RANDOM_STATE
         }
# cat_features are positional column indices of X_train.
# NOTE(review): presumably these correspond to is_group_authors, category, time_of_day,
# day, month, is_dayoff and the title cluster id given the transformer order - verify;
# the tags cluster id (next index) is not listed.
cb = CatBoostRegressor(cat_features=[4,5,6,7,8,9,12], **params)
cb.fit(X_train, y_train)

Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'r2_score':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "..\..\..\AppData\Local\Temp\ipykernel_3976\784866544.py", line 11:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


<catboost.core.CatBoostRegressor at 0x2c118490>

In [106]:
# Predictions for the training and validation parts.
train_pred = cb.predict(X_train)
test_pred = cb.predict(X_test)

# Weighted score on both parts, printed on one line.
train_score = get_score(y_train, train_pred)
test_score = get_score(y_test, test_pred)
print(train_score, test_score)

# Per-target R2 (target columns are addressed by position 0..2).
train_r2 = [r2_score(y_train.iloc[:, idx], train_pred[:, idx]) for idx in range(3)]
test_r2 = [r2_score(y_test.iloc[:, idx], test_pred[:, idx]) for idx in range(3)]
print('r2 scores for views, depth and full_read_percent TRAIN:', train_r2)
print('r2 scores for views, depth and full_read_percent TEST:', test_r2)

0.7291965606977935 0.6471294859055642
r2 scores for views, depth and full_read_percent TRAIN: [0.7583251580245085, 0.8063808110952827, 0.6131741805313506]
r2 scores for views, depth and full_read_percent TEST: [0.7023389767235608, 0.7501619387222755, 0.47048437866485715]


In [44]:
# Importances of the fitted model, labelled with the columns it was trained on.
# X_train never contains the 'views' target, so the former
# X_train.drop('views', axis=1) raised KeyError; use the columns directly.
feature_importances = pd.Series(cb.feature_importances_, index=X_train.columns)
feature_importances.sort_values(ascending=False).to_csv('feat.csv')

## Сохранение сабмита

In [94]:
# Predict the three targets for the transformed test set (copied into a float array
# so that it can be modified safely).
pred = np.array(cb.predict(X_test_final), dtype=float)
# The model was trained on log(views) (the 'views' target is log-transformed during
# preprocessing), so column 0 must be mapped back to raw views before it is saved.
pred[:, 0] = np.exp(pred[:, 0])
save_solution(pred, 'solution_9.csv')