<a href="https://colab.research.google.com/github/ko74dev/collector/blob/main/Collector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Загрузим нужные библиотеки

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

### Считаем тренировочный набор

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
train_issues = pd.read_csv("/content/drive/MyDrive/train_issues.csv")
train_comments = pd.read_csv("/content/drive/MyDrive/train_comments.csv")
emp = pd.read_csv("/content/drive/MyDrive/employees.csv")

train_comments = train_comments.groupby(['issue_id'], 
                                        as_index=False).agg({'text': '\n!\n'.join})

train = pd.merge(train_issues, emp, left_on="assignee_id", right_on="id", how='inner')


train = pd.merge(train, emp, left_on="creator_id", right_on="id", how='inner')
train = pd.merge(train, train_comments, left_on="id_x", right_on="issue_id", how='left')

train['w_summary'] = train.summary.str.count(' ')+1
train['s_text'] = train.text.fillna('').str.count('\n!\n')+1

## Рассмотрим датасет по ближе

## Обьединим наборы
Обьединим наборы данных задач и работников по id исполнителя

Обьединим набор данных с комментариями по issue_id

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9589 entries, 0 to 9588
Data columns (total 38 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id_x                          9589 non-null   int64  
 1   created                       9589 non-null   object 
 2   key                           9589 non-null   object 
 3   summary                       9589 non-null   object 
 4   project_id                    9589 non-null   int64  
 5   assignee_id                   9589 non-null   int64  
 6   creator_id                    9589 non-null   int64  
 7   overall_worklogs              9589 non-null   int64  
 8   id_y                          9589 non-null   int64  
 9   active_x                      9589 non-null   int64  
 10  full_name_x                   9589 non-null   object 
 11  position_x                    7354 non-null   object 
 12  hiring_type_x                 7630 non-null   object 
 13  pay

### Преобразование столбцов

In [6]:
def pred(x):
    X = x[['active_x', 'passport_x', 'is_nda_signed_x', 'is_labor_contract_signed_x',
          'is_added_to_internal_chats_x', 'is_added_one_to_one_x',
          'active_y', 'passport_y', 'is_nda_signed_y', 'is_labor_contract_signed_y',
          'is_added_to_internal_chats_y', 'is_added_one_to_one_y'
          ]].astype(np.uint8).copy()
    X['created'] = pd.to_datetime(x.created)
    X = pd.concat([X, 
                    pd.get_dummies(X.created.dt.year, prefix='Y'),
                    pd.get_dummies(X.created.dt.month, prefix='M'),
                    pd.get_dummies(X.created.dt.day, prefix='D'),
                    pd.get_dummies(X.created.dt.weekday, prefix='W'),
                    pd.get_dummies(X.created.dt.hour, prefix='H')
                   ], axis=1)
    X['created'] = (X.created - X.created.min()).astype('timedelta64[M]')
    X = pd.get_dummies(X, columns=['created'], prefix='crtd')
    X = pd.concat([X,
                pd.get_dummies(x.w_summary.map(lambda x: int(np.log(x))), prefix='w_s'),
                pd.get_dummies(x.s_text.map(lambda x: int(np.log(x))), prefix='s_t'),
                   
                pd.get_dummies(x.key.map(lambda x: x.split('-')[0]), prefix='k0'),
                pd.get_dummies(x.key.map(lambda x:
                                      int(np.log2(float(x.split('-')[1])))), prefix='k1'),   
                pd.get_dummies(x.assignee_id, prefix='ass'),
                pd.get_dummies(x.creator_id, prefix='crea'),

                pd.get_dummies(x.position_x.fillna('X'), prefix='posx'),
                pd.get_dummies(x.hiring_type_x.fillna('X'), prefix='HTx'),
                pd.get_dummies(x.payment_type_x.fillna('X'), prefix='PTx'),
                pd.get_dummies(x.salary_calculation_type_x.fillna('X'), prefix='SCTx'),
                pd.get_dummies(x.english_level_x.fillna('X'), prefix='EnLx'),
                   
                pd.get_dummies(x.position_y.fillna('X'), prefix='posy'),
                pd.get_dummies(x.hiring_type_y.fillna('X'), prefix='HTy'),
                pd.get_dummies(x.payment_type_y.fillna('X'), prefix='PTy'),
                pd.get_dummies(x.salary_calculation_type_y.fillna('X'), prefix='SCTy'),
                pd.get_dummies(x.english_level_y.fillna('X'), prefix='EnLy'),   
                  ], axis=1)
    return X
    
X = pred(train.copy())

In [7]:
X.shape

(9589, 314)

## Выделим выборки

In [8]:
y = train_issues['overall_worklogs']

In [9]:
import random
from tensorflow.random import set_seed
import tensorflow as tf

seed = 42
random.seed(seed)
np.random.seed(seed)
set_seed(seed)

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.15, random_state=seed)

## Обучение модели

In [10]:
from keras.models import Sequential
from keras.layers import (Dense, Dropout, Normalization,
                          Lambda, Conv2D,GlobalAveragePooling2D)
from tensorflow.keras.optimizers import Adam, schedules 
from keras import layers, Input

In [62]:
nn = Sequential([
    Input(shape=314),
    Dense(314, activation='relu'), Dropout(0.1), Normalization(),
    Lambda(lambda x: tf.transpose(
        tf.expand_dims(tf.stack([tf.roll(x, i, 0) for i in range(0,314,5)]), 2), 
        perm=[1,0,3,2])),
    Conv2D(314, 5, (3,5), activation='relu'), Dropout(0.1), Normalization(),
    Conv2D(157, 5, (3,5), activation='relu'), Dropout(0.1),  Normalization(),
    Conv2D(157, 5, (3,5), activation='relu'), Dropout(0.1),  Normalization(),
    GlobalAveragePooling2D(),
    Dense(157, activation='relu'),
    Dense(1) ])

lr_schedule = schedules.ExponentialDecay(
    initial_learning_rate=.0005,
    decay_steps=320,
    decay_rate=0.8)
optimizer1 = Adam(learning_rate=lr_schedule, epsilon=5e-05)

nn.compile(optimizer=optimizer1, loss='mean_squared_logarithmic_error')
nn.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_54 (Dense)            (None, 314)               98910     
                                                                 
 dropout_74 (Dropout)        (None, 314)               0         
                                                                 
 normalization_74 (Normaliza  (None, 314)              629       
 tion)                                                           
                                                                 
 lambda_18 (Lambda)          (None, 63, 314, 1)        0         
                                                                 
 conv2d_56 (Conv2D)          (None, 20, 62, 314)       8164      
                                                                 
 dropout_75 (Dropout)        (None, 20, 62, 314)       0         
                                                     

In [63]:
nn.fit(X_train, y_train, batch_size=128, epochs=4, 
          validation_data=(X_test, y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f5c26040190>

## Оценка точности

In [64]:
r2_score(y_test, nn.predict(X_test, batch_size=128))

-0.038925663473579064

##  Предсказание на тестовом наборе

In [65]:
test_issues = pd.read_csv("/content/drive/MyDrive/test_issues.csv")
test_comments = pd.read_csv("/content/drive/MyDrive/test_comments.csv")

test_comments = test_comments.groupby(['issue_id'], 
                                        as_index=False).agg({'text': '\n!\n'.join})

test = pd.merge(test_issues, emp, left_on="assignee_id", right_on="id", how='inner')


test = pd.merge(test, emp, left_on="creator_id", right_on="id", how='inner')
test = pd.merge(test, test_comments, left_on="id_x", right_on="issue_id", how='left')

test['w_summary'] = test.summary.str.count(' ')+1
test['s_text'] = test.text.fillna('').str.count('\n!\n')+1


In [66]:
test_x = pd.DataFrame(columns=X.columns)

In [67]:
pred_test = pred(test.copy())
for c in set(pred_test.columns) & set(X.columns):
    test_x[c] = pred_test[c]
test_x.shape

(1070, 314)

In [68]:
test_x.fillna(1, inplace=True)

In [69]:
sub = pd.read_csv("/content/drive/MyDrive/sample_solution.csv")
sub['overall_worklogs'] = nn.predict(test_x)
sub.to_csv('sub.csv', index=False)