## Библиотеки и глобальные переменные

In [908]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import nltk
import pymorphy2
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from collections import Counter
import lightgbm

from transformers import AutoTokenizer, AutoModel
from umap.umap_ import UMAP
import torch
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('russian') + stopwords.words('english')
nltk.download('punkt')
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Roman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Roman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
TOKENIZER = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
BERT = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
MORPH = pymorphy2.MorphAnalyzer()

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Загрузка данных

In [920]:
df_issues_train = pd.read_csv("data/train_issues.csv")
df_comment_train = pd.read_csv("data/train_comments.csv")

df_issues_test = pd.read_csv("data/test_issues.csv")
df_comment_test = pd.read_csv("data/test_comments.csv")

df_emp = pd.read_csv("data/employees.csv")

In [921]:
df_issues_test['overall_worklogs'] = -1
df_all = pd.concat([df_issues_train, df_issues_test]).reset_index(drop=True)
df_all_com = pd.concat([df_comment_train, df_comment_test]).reset_index(drop=True)
df_all_com.text = df_all_com.text.apply(lambda x: x.split(':')[-1] if x.__contains__('mentioned this issue in') else x) # Чистим комментарии от автотекста

## Feature engineering

In [922]:
# ОБЪЕДИНЕНИЕ ВСЕХ ТЕКСТОВЫХ ДАННЫХ
def concat_all_text(X):    
    com_df = pd.merge(X, df_all_com, left_on="id", right_on="issue_id", how='left')
    com_df.text = com_df.text.fillna('')
    all_text = {}
    for val in X.id:
        all_text[val] = com_df[com_df.id == val].summary.iloc[0]
        all_text[val] += ' '.join([text for text in com_df[com_df.id == val].text.values])
    X['all_text'] = all_text.values()
    
# ДАТА И ВРЕМЯ
def encode_date(X):
    print('processisng date...')
    X.created = pd.to_datetime(X.created)
    X['year'] = X.created.dt.year
    X['day'] = X.created.dt.day
    X['month'] = X.created.dt.month
    X['hour'] = X.created.dt.hour
    X['day_of_week'] = X.created.dt.strftime("%w").astype(int)
    X.drop('created', axis=1, inplace=True)

# КОЛИЧЕСТВО И ДЛИНА КОММЕНТАРИЕВ
def process_coments(X):
    print('processisng comments...')
    com_df = pd.DataFrame(pd.merge(X, df_all_com, left_on="id", right_on="issue_id", how='left'))
    com_df.text = com_df.text.astype('str')
    com_df.text = com_df.text.apply(len).astype('int')
    counts = com_df.groupby('id').comment_id.count()
    lens = com_df.groupby('id').text.sum()
    X['comments_count'] = pd.merge(X, counts, left_on='id', right_index=True).comment_id
    X['comments_len'] = pd.merge(X, lens, left_on='id', right_index=True).text
    
# ИНФОРМАЦИЯ ОБ ИСПОЛНИТЕЛЕ
def get_assignee_info(X):
    print('processisng assignee info...')
    df = pd.merge(X, df_emp, left_on='assignee_id', right_on='id', how='left')
    df.fillna('unknown', inplace=True)
    col_list = ['position', 'hiring_type', 'payment_type', 'passport', 'is_labor_contract_signed', 'is_added_one_to_one']
    X[col_list] = df[col_list]

# ПОЛУЧЕНИЕ СТАТИСТИК
def get_stats(X, field):
    print(f'getting {field.split("_")[0]} statistics...')
    train_df = X[X.overall_worklogs != -1]
    stats_df = train_df.groupby(field).agg({'overall_worklogs': [min, max, np.mean, np.median]})
    stats_df.columns = [f'{field.split("_")[0]}_{i}_time' for i in stats_df.columns.droplevel(0)]
    X[stats_df.columns] = pd.merge(X, stats_df, left_on=field, right_index=True, how='left')[stats_df.columns]
    
# TF-IDF UMAP
def tf_idf(freqs):
    return (freqs / (freqs.sum(axis=1) + 0.0001)[:, np.newaxis] ) * np.log(freqs.shape[0] / ((freqs >= 1).sum(axis=0) + 1))

def preprocess(text: str) -> list:
    tokenized = nltk.word_tokenize(text.lower())
    normalized = [MORPH.parse(word)[0].normal_form for word in tokenized
                  if len(word) > 1 and word.isalpha()]
    return normalized

def to_vec(vocab: dict, text):
    vec = np.zeros(len(vocab) + 1)
    for token in text:
        if token in vocab:
            vec[vocab[token]] += 1
        else:
            vec[0] += 1
    return vec

def get_vocab(vals):
    vocab = {}
    pos = 1
    cc = Counter()
    for text in vals:
        n_cc = Counter(text)
        for key in n_cc:
            n_cc[key] = 1
        cc += n_cc
    for key, val in cc.items():
        if val > 1:
            vocab[key] = pos
            pos += 1
    return vocab

def get_tf_idf_enc(vals, X, col):
    print('TF-IDF encoding...')
    vocab = get_vocab(vals)
    all_freq = []
    for val in vals:
        all_freq.append(to_vec(vocab, val))
    all_freq = np.array(all_freq)
    
    all_freq = tf_idf(all_freq)
    reduct = UMAP(densmap=True,
                  dens_lambda=1.,
                  n_neighbors=10,
                  min_dist=0.0,
                  n_components=5,
                  random_state=42,
                  low_memory=False,
                  metric='euclidean',
                  output_metric='euclidean'
                  )

    umap_res = reduct.fit_transform(all_freq).T
    for i in range(len(umap_res)):
        X[col + f'_umap_{i}'] = umap_res[i]

# BERT UMAP
def bert(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

def get_bert_enc(X):   
    embeddings = []
    print('making BERT embeddings...')
    for sentence in tqdm(X['all_text']):
        embeddings.append(bert(sentence, BERT, TOKENIZER))
    embeddings = np.array(embeddings)

    reduct = UMAP(densmap=True,
                  dens_lambda=1.,
                  n_neighbors=10,
                  min_dist=0.0,
                  n_components=5,
                  random_state=42,
                  low_memory=False,
                  metric='euclidean',
                  output_metric='euclidean'
                  )

    umap_res = reduct.fit_transform(embeddings).T
    for i in range(len(umap_res)):
        X[f'bert_umap_{i}'] = umap_res[i]

# КАТЕГОРИАЛЬНЫЕ ПРИЗНАКИ
def make_categorical(df, col):
    df[col] = pd.Categorical(df[col])
    df[col] = df[col].astype('category').cat.codes

# УНИКАЛЬНЫЕ СЛОВА В ЗАДАНИЯХ С ВЫСОКИМ ТАРГЕТОМ  
def unique_words(lower=0, upper=df_issues_train.overall_worklogs.max()):
    X = df_issues_train[df_issues_train.overall_worklogs.between(lower, upper)].copy()
    X['clean_text'] = X.summary.apply(preprocess)
    vocab = get_vocab(X.clean_text)
    vocab = pd.DataFrame({'word': vocab.keys(), 'freq': vocab.values()})
    
    all_txt = []
    texts = [all_txt + i for i in X.clean_text]
    freqs=[]
    for lst in texts:
        freqs +=lst
    freqs = Counter(freqs)
    freqs = pd.DataFrame({'word': freqs.keys(), 'freq': freqs.values()})
    freqs = freqs[~freqs.word.isin(stop_words)]
    return freqs

def check_unique_words(lst):
    for word in lst:
        if word in merged.word.values:
            return 1
    return 0

# РАЗНИЦА ВО ВРЕМЕНИ МЕЖДУ ДВУМЯ БЛИЖАЙШИМИ ЗАДАНИЯМИ У КАЖДОГО ИСПОЛНИТЕЛЯ
def get_time_difference(X):
    print('getting time differences since last task...')
    res = pd.DataFrame(columns=['id', 'time_diff'])
    X.created = pd.to_datetime(X.created)
    for df in X.sort_values('created', ascending=False).expanding():
        curr = df.iloc[-1]
        try: 
            prev = df[df.assignee_id == curr.assignee_id].iloc[-2]
            time_diff = pd.Timedelta(curr.created-prev.created).seconds
        except IndexError:
            time_diff = 0
        res.loc[len(res)] = [curr.id, time_diff]
    X['time_diff'] = pd.merge(X, res, how = 'left')['time_diff']
    
    
# ВСЕ ПРЕОБРАЗОВАНИЯ
def apply_technicals(X):
    get_time_difference(X)
    concat_all_text(X)
    encode_date(X)
    process_coments(X)
    get_assignee_info(X)
    
    cat_cols = ['position', 'hiring_type', 'payment_type', 'project_id']
    for column in cat_cols:
        make_categorical(X, column)
    
    #vals = [preprocess(word) for word in X.all_text]
    #get_tf_idf_enc(vals, X, 'summary')
    
    for field in ['assignee_id', 'creator_id']:
        get_stats(df_all, field)
        
    X['key'] = X.key.apply(lambda x: x.split('-')[1]).astype('int')        
    X['is_self_assigned'] = np.where(X.assignee_id == X.creator_id, 1, 0)
    X['len_summary'] = X['summary'].apply(len)
    #get_bert_enc(X)

In [923]:
apply_technicals(df_all)

getting time differences since last task...
processisng date...
processisng comments...
processisng assignee info...
getting assignee statistics...
getting creator statistics...


In [924]:
high_targ = unique_words(lower=60000)
rest = unique_words(upper=60000)
merged = pd.merge(high_targ, rest, left_on='word', right_on='word', how='left').fillna(0.001)
merged = merged[merged.freq_x / merged.freq_y >= 2].sort_values('freq_x', ascending=False)

df_all['clean_text'] = df_all.summary.apply(preprocess)
df_all['one_more_feature'] = df_all.clean_text.apply(check_unique_words)

In [925]:
merged.shape

(122, 3)

In [926]:
embeddings_df = pd.read_csv('data/embeddings-1.csv', index_col=0)
embeddings_df.columns

Index(['bert_umap_0', 'bert_umap_1', 'bert_umap_2', 'bert_umap_3',
       'bert_umap_4', 'summary_umap_0', 'summary_umap_1', 'summary_umap_2',
       'summary_umap_3', 'summary_umap_4'],
      dtype='object')

In [927]:
df_all = pd.concat([df_all, embeddings_df], axis=1)

In [928]:
df_all.columns

Index(['id', 'key', 'summary', 'project_id', 'assignee_id', 'creator_id',
       'overall_worklogs', 'time_diff', 'all_text', 'year', 'day', 'month',
       'hour', 'day_of_week', 'comments_count', 'comments_len', 'position',
       'hiring_type', 'payment_type', 'passport', 'is_labor_contract_signed',
       'is_added_one_to_one', 'assignee_min_time', 'assignee_max_time',
       'assignee_mean_time', 'assignee_median_time', 'creator_min_time',
       'creator_max_time', 'creator_mean_time', 'creator_median_time',
       'is_self_assigned', 'len_summary', 'clean_text', 'one_more_feature',
       'bert_umap_0', 'bert_umap_1', 'bert_umap_2', 'bert_umap_3',
       'bert_umap_4', 'summary_umap_0', 'summary_umap_1', 'summary_umap_2',
       'summary_umap_3', 'summary_umap_4'],
      dtype='object')

## Разбивка датасета

In [929]:
features_todrop = ['overall_worklogs', 'id', 'summary', 'clean_text', 'all_text'] 

In [930]:
df_test = df_all[df_all.overall_worklogs == -1].copy()
df_test.drop(features_todrop, axis=1, inplace=True)

In [931]:
df_train = df_all[(df_all.overall_worklogs != -1) & (df_all.overall_worklogs < 2000000)].copy()
y = df_train.overall_worklogs
y = np.log(y+0.001)
df_train.drop(features_todrop, axis=1, inplace=True)
X = df_train

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Обучение модели

In [839]:
def lgbm_r2(y_true, y_pred):
    return 'r2', r2_score(y_true, y_pred), True 

In [932]:
params = {'num_leaves': 30, 'n_estimators': 1000, 'learning_rate': 0.1} #0.12 0.21
lgbm = lightgbm.LGBMRegressor(**params)
lgbm = lgbm.fit(X_train, y_train, 
                eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_names=['train', 'val'], eval_metric=lgbm_r2, 
                callbacks=[lightgbm.log_evaluation(200)])
res_val_df = pd.DataFrame(np.exp(y_test))
res_val_df['pred'] = np.exp(lgbm.predict(X_test))

#res_val_df = pd.DataFrame(y_test)
#res_val_df['pred'] = lgbm.predict(X_test)
#res_val_df['pred'] = np.where(res_val_df.pred<0, 5000, res_val_df.pred)
print(f'R2 score: {r2_score(res_val_df.overall_worklogs, res_val_df.pred)}')

[200]	train's l2: 0.405251	train's r2: 0.732036	val's l2: 1.04342	val's r2: 0.320761
[400]	train's l2: 0.190179	train's r2: 0.874248	val's l2: 1.06862	val's r2: 0.304357
[600]	train's l2: 0.0969792	train's r2: 0.935875	val's l2: 1.08957	val's r2: 0.290718
[800]	train's l2: 0.0512606	train's r2: 0.966105	val's l2: 1.10265	val's r2: 0.282204
[1000]	train's l2: 0.0274517	train's r2: 0.981848	val's l2: 1.10904	val's r2: 0.278044
R2 score: 0.25354419506711345


In [901]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, random_state=RANDOM_STATE, shuffle=True)
CV_res = []
splits = {}
counter = 0
for trn_idx, tst_idx in kf.split(df_train):
    X_train1, X_test1, y_train1, y_test1 = X.iloc[trn_idx], X.iloc[tst_idx], y.iloc[trn_idx], y.iloc[tst_idx]
    lgbm = lightgbm.LGBMRegressor(**params)
    lgbm = lgbm.fit(X_train1, y_train1)
    CV_res.append(r2_score(np.exp(y_test1), np.exp(lgbm.predict(X_test1))))
    #CV_res.append(r2_score(y_test1, lgbm.predict(X_test1)))
    splits[counter] = [trn_idx, tst_idx]
    counter += 1
print('R2 scores:')
for i in CV_res: print(i)
print(f'Mean R2: {np.mean(CV_res)}')

R2 scores:
0.37686067262510803
0.20522227393368486
0.2202491627480354
0.3304453168914895
0.3012850398050845
0.10015086831432185
0.13667726713878936
0.21912900050185846
0.03167001743579778
0.047126403913792525
Mean R2: 0.19688160233079627


In [905]:
res_val_df.describe()

Unnamed: 0,overall_worklogs,pred
count,2397.0,2397.0
mean,14604.406507,10541.34698
std,33718.880108,17619.555819
min,60.001,403.063441
25%,3000.001,4243.209513
50%,7200.001,6613.470818
75%,14400.001,10375.002708
max,594600.001,273231.799242


In [906]:
lgbm_feat = pd.Series(lgbm.feature_importances_, index=X_train.columns)
lgbm_feat.sort_values(ascending=False)

time_diff                   2145
summary_umap_0              1680
bert_umap_4                 1676
bert_umap_3                 1673
summary_umap_1              1646
summary_umap_4              1597
summary_umap_3              1572
len_summary                 1553
summary_umap_2              1495
bert_umap_1                 1494
key                         1433
bert_umap_0                 1400
day                         1294
bert_umap_2                 1237
hour                         952
comments_len                 729
day_of_week                  552
month                        533
assignee_max_time            506
creator_max_time             504
creator_id                   473
assignee_id                  453
creator_mean_time            412
assignee_mean_time           405
creator_median_time          284
assignee_median_time         232
comments_count               223
creator_min_time             135
position                     134
is_self_assigned             114
assignee_m

## Solution

In [904]:
pred = lgbm.predict(df_test)
pred = np.exp(pred)
pred = np.where(pred<0, 600, pred)
solution = pd.DataFrame({'overall_worklogs': pred}, index=df_issues_test.id)
solution.to_csv('data/solution_58.csv')

In [907]:
for i in X_train.columns: print(i)

key
project_id
assignee_id
creator_id
time_diff
year
day
month
hour
day_of_week
comments_count
comments_len
position
hiring_type
payment_type
passport
is_labor_contract_signed
is_added_one_to_one
assignee_min_time
assignee_max_time
assignee_mean_time
assignee_median_time
creator_min_time
creator_max_time
creator_mean_time
creator_median_time
is_self_assigned
len_summary
one_more_feature
bert_umap_0
bert_umap_1
bert_umap_2
bert_umap_3
bert_umap_4
summary_umap_0
summary_umap_1
summary_umap_2
summary_umap_3
summary_umap_4
