## Библиотеки и глобальные переменные

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import nltk
import pymorphy2
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from collections import Counter
import lightgbm

from transformers import AutoTokenizer, AutoModel
from umap.umap_ import UMAP
import torch
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('russian') + stopwords.words('english')
nltk.download('punkt')
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Roman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
TOKENIZER = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
BERT = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
MORPH = pymorphy2.MorphAnalyzer()

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Загрузка данных

In [553]:
df_issues_train = pd.read_csv("data/train_issues.csv")
df_comment_train = pd.read_csv("data/train_comments.csv")

df_issues_test = pd.read_csv("data/test_issues.csv")
df_comment_test = pd.read_csv("data/test_comments.csv")

df_emp = pd.read_csv("data/employees.csv")

In [785]:
df_issues_test['overall_worklogs'] = -1
df_all = pd.concat([df_issues_train, df_issues_test]).reset_index(drop=True)
df_all_com = pd.concat([df_comment_train, df_comment_test]).reset_index(drop=True)
df_all_com.text = df_all_com.text.apply(lambda x: x.split(':')[-1] if x.__contains__('mentioned this issue in') else x) # Чистим комментарии от автотекста

## Feature engineering

In [786]:
def unique_words(lower=0, upper=df_issues_train.overall_worklogs.max()):
    X = df_issues_train[df_issues_train.overall_worklogs.between(lower, upper)].copy()
    X['clean_text'] = X.summary.apply(preprocess)
    vocab = get_vocab(X.clean_text)
    vocab = pd.DataFrame({'word': vocab.keys(), 'freq': vocab.values()})
    
    all_txt = []
    texts = [all_txt + i for i in X.clean_text]
    freqs=[]
    for lst in texts:
        freqs +=lst
    freqs = Counter(freqs)
    freqs = pd.DataFrame({'word': freqs.keys(), 'freq': freqs.values()})
    freqs = freqs[~freqs.word.isin(stop_words)]
    return freqs.copy()

def check_unique_words(lst):
    for word in lst:
        if word in merged.word.values:
            return 1
    return 0

In [787]:
# УНИКАЛЬНЫЕ СЛОВА В ЗАДАНИЯХ С ВЫСОКИМ ТАРГЕТОМ
high_targ = unique_words(lower=100000)
rest = unique_words(upper=100000)
merged = pd.merge(high_targ, rest, left_on='word', right_on='word', how='left').fillna(0.001)
merged = merged[merged.freq_x / merged.freq_y >= 1.5].sort_values('freq_x', ascending=False)

df_all['clean_text'] = df_all.summary.apply(preprocess)
df_all['one_more_feature'] = df_all.clean_text.apply(check_unique_words)
df_all.drop('clean_text', axis=1, inplace=True)

In [788]:
# ОБЪЕДИНЕНИЕ ВСЕХ ТЕКСТОВЫХ ДАННЫХ
def concat_all_text(X):    
    com_df = pd.merge(X, df_all_com, left_on="id", right_on="issue_id", how='left')
    com_df.text = com_df.text.fillna('')
    all_text = {}
    for val in X.id:
        all_text[val] = com_df[com_df.id == val].summary.iloc[0]
        all_text[val] += ' '.join([text for text in com_df[com_df.id == val].text.values])
    X['all_text'] = all_text.values()
    
# ДАТА И ВРЕМЯ
def encode_date(X):
    X.created = pd.to_datetime(X.created)
    X['year'] = X.created.dt.year
    X['day'] = X.created.dt.day
    X['month'] = X.created.dt.month
    X['hour'] = X.created.dt.hour
    X['day_of_week'] = X.created.dt.strftime("%w").astype(int)
    X.drop('created', axis=1, inplace=True)

# КОЛИЧЕСТВО И ДЛИНА КОММЕНТАРИЕВ
def process_coments(X):
    com_df = pd.DataFrame(pd.merge(X, df_all_com, left_on="id", right_on="issue_id", how='left'))
    com_df.text = com_df.text.astype('str')
    com_df.text = com_df.text.apply(len).astype('int')
    counts = com_df.groupby('id').comment_id.count()
    lens = com_df.groupby('id').text.sum()
    X['comments_count'] = pd.merge(X, counts, left_on='id', right_index=True).comment_id
    X['comments_len'] = pd.merge(X, lens, left_on='id', right_index=True).text
    
# ИНФОРМАЦИЯ ОБ ИСПОЛНИТЕЛЕ
def get_assignee_info(X):
    df = pd.merge(X, df_emp, left_on='assignee_id', right_on='id', how='left')
    df.fillna('unknown', inplace=True)
    col_list = ['position', 'hiring_type', 'payment_type']
    X[col_list] = df[col_list]

# СТАТИСТИКИ СОТРУДНИКОВ
def get_stats(X, field):
    train_df = X[X.overall_worklogs != -1]
    stats_df = train_df.groupby(field).agg({'overall_worklogs': [min, max, np.mean, np.median]})
    stats_df.columns = [f'{field.split("_")[0]}_{i}_time' for i in stats_df.columns.droplevel(0)]
    X[stats_df.columns] = pd.merge(X, stats_df, left_on=field, right_index=True, how='left')[stats_df.columns]
    
# TF-IDF UMAP
def preprocess(text: str) -> list:
    tokenized = nltk.word_tokenize(text.lower())
    normalized = [MORPH.parse(word)[0].normal_form for word in tokenized
                  if len(word) > 1 and word.isalpha()]
    return normalized

def tf_idf(freqs):
    return (freqs / (freqs.sum(axis=1) + 0.0001)[:, np.newaxis] ) * np.log(freqs.shape[0] / ((freqs >= 1).sum(axis=0) + 1))

def mess_to_vec(vocab: dict, text):
    vec = np.zeros(len(vocab) + 1)
    for token in text:
        if token in vocab:
            vec[vocab[token]] += 1
        else:
            vec[0] += 1
    return vec

def get_vocab(messages):
    vocab = {}
    pos = 1
    cc = Counter()
    for message in messages:
        n_cc = Counter(message)
        for key in n_cc:
            n_cc[key] = 1
        cc += n_cc
    for key, val in cc.items():
        if val > 1:
            vocab[key] = pos
            pos += 1
    return vocab

def get_tf_idf_enc(vals, new_df, col):
    vocab = get_vocab(vals)
    all_freq = []
    for val in vals:
        all_freq.append(mess_to_vec(vocab, val))
    all_freq = np.array(all_freq)
    
    all_freq = tf_idf(all_freq)
    reduct = UMAP(densmap=True,
                  dens_lambda=1.,
                  n_neighbors=60,
                  min_dist=0.1,
                  n_components=3,
                  random_state=42,
                  low_memory=False,
                  metric='euclidean',
                  output_metric='euclidean'
                  )

    umap_res = reduct.fit_transform(all_freq).T
    for i in range(len(umap_res)):
        new_df[col + f'_umap_{i}'] = umap_res[i]

# BERT UMAP
def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

def get_bert_enc(X):   
    embeddings = []
    print('Making BERT embeddings...')
    for sentence in tqdm(X['all_text']):
        embeddings.append(embed_bert_cls(sentence, BERT, TOKENIZER))
    embeddings = np.array(embeddings)

    reduct = UMAP(densmap=True,
                  dens_lambda=1.,
                  n_neighbors=60,
                  min_dist=0.1,
                  n_components=3,
                  random_state=42,
                  low_memory=False,
                  metric='euclidean',
                  output_metric='euclidean'
                  )

    umap_res = reduct.fit_transform(embeddings).T
    for i in range(len(umap_res)):
        X[f'bert_umap_{i}'] = umap_res[i]

# КАТЕГОРИАЛЬНЫЕ ПРИЗНАКИ
def make_categorical(df, col):
    df[col] = pd.Categorical(df[col])
    df[col] = df[col].astype('category').cat.codes

# ПОДСЧЕТ КОЛИЧЕСТВА ТЕКУЩИХ ЗАЯВОК У ИСПОЛНИТЕЛЯ
def get_current_tasks_count(X):    
    res = []
    X.created = pd.to_datetime(X.created)
    for df in tqdm(X.expanding()):
        curr = df.iloc[-1]
        tasks_count = 0
        for task in df[df.assignee_id == curr.assignee_id].itertuples():
            if pd.to_datetime(task.created) + pd.Timedelta(task.overall_worklogs, 'sec') > curr.created:
                tasks_count += 1
        res.append(tasks_count)
    X['curr_tasks_count'] = res
    
# ВСЕ ПРЕОБРАЗОВАНИЯ
def apply_technicals(X):
    concat_all_text(X)
    get_current_tasks_count(X)
    encode_date(X)
    process_coments(X)
    get_assignee_info(X)
    
    cat_cols = ['position', 'hiring_type', 'payment_type']
    for column in cat_cols:
        make_categorical(X, column)
    
    vals = [preprocess(word) for word in X.summary]
    get_tf_idf_enc(vals, X, 'summary')
    
    for field in ['assignee_id', 'creator_id', 'project_id']:
        get_stats(df_all, field)
    
    X['is_self_assigned'] = np.where(X.assignee_id == X.creator_id, 1, 0)
    X['len_summary'] = X['summary'].apply(len)
    get_bert_enc(X)
    X.drop(['key', 'id', 'summary', 'all_text'], axis=1, inplace=True)

In [789]:
apply_technicals(df_all)

10659it [03:41, 48.07it/s]


Making BERT embeddings...


100%|████████████████████████████████████████████████████████████████████████████| 10659/10659 [09:54<00:00, 17.93it/s]


## Разбивка датасета

In [790]:
df_test = df_all[df_all.overall_worklogs == -1].copy()
df_test.drop('overall_worklogs', axis=1, inplace=True)

In [801]:
df_train = df_all[(df_all.overall_worklogs != -1) & (df_all.overall_worklogs < 2000000)].copy()

y = df_train.overall_worklogs
y = np.log(y)
X = df_train.drop('overall_worklogs', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Обучение модели

In [792]:
def lgbm_r2(y_true, y_pred):
    return 'r2', r2_score(y_true, y_pred), True 

In [818]:
params = {'num_leaves': 30, 'n_estimators': 300, 'learning_rate': 0.05}
lgbm = lightgbm.LGBMRegressor(**params)
lgbm = lgbm.fit(X_train, y_train, 
                eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_names=['train', 'val'], eval_metric=lgbm_r2, 
                callbacks=[lightgbm.log_evaluation(100)])
res_val_df = pd.DataFrame(np.exp(y_test))
res_val_df['pred'] = np.exp(lgbm.predict(X_test))

print(f'R2 score: {r2_score(res_val_df.overall_worklogs, res_val_df.pred)}')

[100]	train's l2: 0.839416	train's r2: 0.444954	val's l2: 1.07405	val's r2: 0.300821
[200]	train's l2: 0.670258	train's r2: 0.556806	val's l2: 1.07024	val's r2: 0.303304
[300]	train's l2: 0.550616	train's r2: 0.635917	val's l2: 1.06931	val's r2: 0.303905
R2 score: 0.2905315976172511


In [817]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, random_state=40, shuffle=True)
CV_res = []
splits = {}
counter = 0
for trn_idx, tst_idx in kf.split(df_train):
    X_train1, X_test1, y_train1, y_test1 = X.iloc[trn_idx], X.iloc[tst_idx], y.iloc[trn_idx], y.iloc[tst_idx]
    lgbm = lightgbm.LGBMRegressor(**params)
    lgbm = lgbm.fit(X_train1, y_train1)
    CV_res.append(r2_score(np.exp(y_test1), np.exp(lgbm.predict(X_test1))))
    splits[counter] = [trn_idx, tst_idx]
    counter += 1
print('R2 scores:')
for i in CV_res: print(i)
print(f'Mean R2: {np.mean(CV_res)}')

R2 scores:
0.17253208494927108
0.10776343357248486
0.35741388932455975
0.07876192839302121
0.09405300114607851
0.22810330156247405
0.10397750976820408
0.32624141568296394
0.23811946567277809
0.11254193917136945
Mean R2: 0.1819507969243205


In [805]:
res_val_df.describe()

Unnamed: 0,overall_worklogs,pred
count,2397.0,2397.0
mean,14604.405507,9823.580404
std,33718.880108,18295.597442
min,60.0,588.415102
25%,3000.0,4650.172485
50%,7200.0,6614.388602
75%,14400.0,9644.453471
max,594600.0,582549.157247


In [806]:
lgbm_feat = pd.Series(lgbm.feature_importances_, index=X_train.columns)
lgbm_feat.sort_values(ascending=False)

summary_umap_0          957
summary_umap_2          953
summary_umap_1          936
bert_umap_1             907
bert_umap_2             904
bert_umap_0             872
day                     711
len_summary             707
hour                    519
month                   463
comments_len            443
curr_tasks_count        432
creator_max_time        281
day_of_week             277
assignee_mean_time      235
creator_id              230
assignee_max_time       214
assignee_id             203
comments_count          188
creator_mean_time       178
creator_median_time     171
assignee_median_time    168
assignee_min_time       110
is_self_assigned         93
creator_min_time         90
position                 80
year                     70
one_more_feature         57
payment_type             43
project_max_time         36
project_id               26
project_mean_time        24
hiring_type              14
project_median_time       8
project_min_time          0
dtype: int32

## Solution

In [807]:
pred = lgbm.predict(df_test)
pred = np.exp(pred)
solution = pd.DataFrame({'overall_worklogs': pred}, index=df_issues_test.id)
solution.to_csv('data/solution_43.csv')