In [4]:
import numpy as np
import pandas as pd
import warnings
import json
import operator

from tqdm import tqdm

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

from sklearn.naive_bayes import MultinomialNB

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by pandas
# and scikit-learn — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Print NumPy arrays with three digits of precision.
np.set_printoptions(precision=3)

## Предсказание приоритета задачи (issue) для JetBrains
https://youtrack.jetbrains.com/issues/IDEA

Описание процесса:

1. Внешние пользователи могут создать новую задачу в трекере и указать аннотацию и описание проблемы, о которой репортируют. Идентификатор автора и дата создания задачи добавляются автоматически. Для простоты будем считать, что аннотация и описание задачи не могут быть изменены.
1. В какой-то момент задача из трекера будет решена. Команде разработчиков необходимо определить приоритет каждой новой задачи.

**Цель**: обучить модель для автоматического предсказания приоритета задачи (высокий / не высокий) в момент ее создания.


**Обучающая выборка**: данные обо всех решенных задачах для IDE IDEA, созданных внешними пользователями.


In [5]:
# Load the training data, using the `id` column as the row index.
train_df = pd.read_csv('train.csv', index_col='id')
# Preview the first rows.
train_df.head()

Unnamed: 0_level_0,summary,description,reporter,created,customFields,links,is_high_priority
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
25-60681,Don't sync font size within IDEA server plugin,At home I have 1900x1200 resolution and at my ...,"{""login"": ""machak"", ""$type"": ""User""}",1231150644000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-60682,Cancelling subversion update,When cancelling an update from ie. subversion ...,"{""login"": ""sprice"", ""$type"": ""User""}",1231150705000,"[{""value"": {""name"": ""Usability Problem"", ""$typ...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-60686,Suspended Breakpoint in JVM not Recognized in ...,In some breakpoint cases for a app launched fr...,"{""login"": ""brigham"", ""$type"": ""User""}",1231183948000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-60691,Good code red: IDEA incorrectly resolves neste...,The relevant snippet if part of the UIDebug cl...,"{""login"": ""xduke"", ""$type"": ""User""}",1231241109000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False
25-60679,Module WIll Not Load Jar Dependencies on Intre...,I have IntelliJ 7.0.5 running on both WIndows ...,"{""login"": ""stonemack"", ""$type"": ""User""}",1231133633000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False


In [6]:
# Column dtypes and non-null counts of the raw training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105365 entries, 25-60681 to 25-2506947
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   summary           105364 non-null  object
 1   description       103548 non-null  object
 2   reporter          105365 non-null  object
 3   created           105365 non-null  int64 
 4   customFields      105365 non-null  object
 5   links             105365 non-null  object
 6   is_high_priority  105365 non-null  bool  
dtypes: bool(1), int64(1), object(5)
memory usage: 5.7+ MB


In [7]:
# Replace missing text with empty strings in both text columns.
# The result is assigned back instead of calling `fillna(..., inplace=True)`
# on `train_df.summary` / `train_df.description`: that form modifies an
# intermediate Series and stops updating the parent frame once pandas
# Copy-on-Write is enabled.
train_df = train_df.fillna({'summary': '', 'description': ''})

In [8]:
# Text-only setup: the issue summary is the single feature, the
# `is_high_priority` flag is the target.
X_text, y = train_df['summary'], train_df['is_high_priority']

In [9]:
# Display the summary Series.
X_text

id
25-60681         Don't sync font size within IDEA server plugin
25-60682                           Cancelling subversion update
25-60686      Suspended Breakpoint in JVM not Recognized in ...
25-60691      Good code red: IDEA incorrectly resolves neste...
25-60679      Module WIll Not Load Jar Dependencies on Intre...
                                    ...                        
25-2507313    Version Control Incoming tab missing search fi...
25-2507159    Groovy compiler can't find files with names co...
25-2507096          auto close stepped in files on debug resume
25-2506900    Keymaps don't resync after disabled settings s...
25-2506947    OK , cancel button do not appear when i do imp...
Name: summary, Length: 105365, dtype: object

In [10]:
# Number of rows per target class.
y.value_counts()

False    100717
True       4648
Name: is_high_priority, dtype: int64

In [11]:
# Hold out a quarter of the rows for testing; the split is stratified on the
# target and seeded for repeatability.
(X_text_train, X_text_test,
 y_train, y_test) = train_test_split(
    X_text, y,
    test_size=0.25,
    random_state=3,
    stratify=y,
)

### Модель, которая умеет читать

In [12]:
# Bag-of-words over the training summaries: tokens are runs of at least two
# ASCII letters, English stop words are removed, and terms are filtered by
# document frequency (min_df / max_df).
count_vec = CountVectorizer(
    token_pattern=r'[A-Za-z]{2,}',
    stop_words='english',
    min_df=.01,
    max_df=.5,
).fit(X_text_train)

# Vectorize five random training summaries and label the columns with the
# learned vocabulary.
X_text_vect_sample = pd.DataFrame(
    data=count_vec.transform(X_text_train.sample(5)).todense(),
    columns=count_vec.get_feature_names_out(),
)

X_text_vect_sample

Unnamed: 0,action,add,allow,android,broken,build,button,change,changes,class,...,using,variable,version,view,window,windows,work,working,wrong,xml
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [13]:
#from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
#from nltk.corpus import stopwords
#from spacy.lang.en.stop_words import STOP_WORDS

In [14]:
# Per-term maximum count within a single summary, taken over 1000 randomly
# sampled training summaries.
count_vec.transform(X_text_train.sample(1000)).todense().max(axis=0)

matrix([[2, 2, 1, 2, 1, 1, 1, 3, 2, 3, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1,
         1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 1, 1, 1, 2, 3, 2, 1, 2, 1, 1, 2,
         1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2, 3, 1,
         2, 2, 1, 1, 3, 1, 2, 1, 2, 1, 1, 2, 2, 3, 2, 2, 1, 1, 1, 1, 1,
         2, 1, 1, 2, 1, 1, 1, 1, 1]], dtype=int64)

In [15]:
# TF-IDF weighting with the same tokenization and stop-word handling as the
# count vectorizer, but with its own document-frequency cut-offs.
tfidf_vec = TfidfVectorizer(
    token_pattern=r'[A-Za-z]{2,}',
    stop_words='english',
    min_df=.015,
    max_df=0.95,
).fit(X_text_train)

# Vectorize five random training summaries and label the columns with the
# learned vocabulary.
X_text_vect_sample = pd.DataFrame(
    data=tfidf_vec.transform(X_text_train.sample(5)).todense(),
    columns=tfidf_vec.get_feature_names_out(),
)

X_text_vect_sample

Unnamed: 0,add,allow,android,build,changes,class,code,commit,configuration,create,...,type,update,using,view,window,windows,work,working,wrong,xml
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.739458,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.669755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.742583,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.391926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Per-term maximum TF-IDF weight within a single summary, taken over 1000
# randomly sampled training summaries.
tfidf_vec.transform(X_text_train.sample(1000)).todense().max(axis=0)

matrix([[1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   ,
         1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   ,
         1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   ,
         1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   ,
         1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   ,
         1.   , 1.   , 1.   , 0.776, 1.   , 1.   , 1.   , 1.   ]])

In [17]:
class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse feature matrix into a dense array."""

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used rather than ``todense()`` because the latter
        yields a ``numpy.matrix``, which recent scikit-learn estimators refuse
        to accept. Input that has no ``toarray`` method (i.e. is already
        dense) is passed through ``np.asarray`` instead of failing.
        """
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)

In [18]:
# Reference: https://scikit-learn.org/stable/modules/naive_bayes.html#multinomial-naive-bayes

# TF-IDF features -> densify -> multinomial naive Bayes, fitted on the
# training summaries.
nb_pipeline = Pipeline([
    ('tfidf', tfidf_vec),
    ('to_dense', DenseTransformer()),
    ('clf', MultinomialNB()),
]).fit(X_text_train, y_train)

# F1 on the data the model was fitted on, then on the held-out split.
print(f'Train f1 = {f1_score(y_train, nb_pipeline.predict(X_text_train))}')
print(f'Test f1 = {f1_score(y_test, nb_pipeline.predict(X_text_test))}')

Train f1 = 0.0
Test f1 = 0.0


## Подбор параметров

In [19]:
# Candidate document-frequency cut-offs for the TF-IDF vocabulary.
param_grid = dict(min_df=[.01, .05], max_df=[.5, .6, .7])

# One record per grid point: fitted pipeline, its parameters and both F1 scores.
results = []

for params in tqdm(ParameterGrid(param_grid)):
    pipe = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(min_df=params['min_df'],
                                  max_df=params['max_df'],
                                  token_pattern=r'[A-Za-z]{2,}',
                                  stop_words='english')),
        ('to_dense', DenseTransformer()),
        ('clf', MultinomialNB()),
    ])
    pipe.fit(X_text_train, y_train)

    # Predict each split exactly once and reuse the result for scoring
    # (the training split used to be predicted twice per grid point).
    train_preds = pipe.predict(X_text_train)
    test_preds = pipe.predict(X_text_test)

    # NOTE(review): grid points are compared on the test split, so `test_f1`
    # is not an unbiased estimate for whichever configuration gets picked.
    results.append(dict(
        estimator=pipe,
        parameters=params,
        train_f1=f1_score(y_true=y_train, y_pred=train_preds),
        test_f1=f1_score(y_true=y_test, y_pred=test_preds),
    ))

100%|██████████| 6/6 [00:18<00:00,  3.04s/it]


In [20]:
# Tabulate the grid results without the fitted estimators, sorted by test F1,
# with in-cell bars drawn on a fixed 0..1 scale.
pd.DataFrame(results).drop(columns='estimator').sort_values('test_f1').style.bar(vmin=0, vmax=1)

Unnamed: 0,parameters,train_f1,test_f1
0,"{'max_df': 0.5, 'min_df': 0.01}",0.0,0.0
1,"{'max_df': 0.5, 'min_df': 0.05}",0.0,0.0
2,"{'max_df': 0.6, 'min_df': 0.01}",0.0,0.0
3,"{'max_df': 0.6, 'min_df': 0.05}",0.0,0.0
4,"{'max_df': 0.7, 'min_df': 0.01}",0.0,0.0
5,"{'max_df': 0.7, 'min_df': 0.05}",0.0,0.0


## Другие признаки

In [21]:
# Each `customFields` cell is a JSON list of {"name": ..., "value": ...}
# entries. Turn every row into a {name: value} dict, then flatten nested
# values into dotted columns.
json_fileds = pd.json_normalize([
    {field['name']: field['value'] for field in json.loads(raw_fields)}
    for raw_fields in train_df.customFields
])
# Re-use the issue ids of train_df as the row index.
json_fileds.index = train_df.index
json_fileds.head()

Unnamed: 0_level_0,Affected versions,Included in builds,Type.name,Type.$type,State.name,State.$type,Assignee.login,Assignee.name,Assignee.$type,Subsystem.name,Subsystem.$type,Available in,Subsystem,Assignee
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
25-60681,[],[],Bug,EnumBundleElement,Obsolete,StateBundleElement,lesya,Olesya Smirnova,User,Core. Project Settings,OwnedBundleElement,,,
25-60682,[],[],Usability Problem,EnumBundleElement,Obsolete,StateBundleElement,Kirill.Likhodedov,Kirill Likhodedov,User,Version Control. Subversion,OwnedBundleElement,,,
25-60686,[],[],Bug,EnumBundleElement,Duplicate,StateBundleElement,jeka,Eugene Zhuravlev,User,Java. Debugger,OwnedBundleElement,,,
25-60691,[],[],Bug,EnumBundleElement,Obsolete,StateBundleElement,cdr,Alexey Kudravtsev,User,Code Analysis. Inspection,OwnedBundleElement,,,
25-60679,[],[],Bug,EnumBundleElement,Fixed,StateBundleElement,sashache,Alexander Chernikov,User,Core. Project Settings,OwnedBundleElement,[],,


In [22]:
# Column dtypes and non-null counts of the flattened custom fields.
json_fileds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105365 entries, 25-60681 to 25-2506947
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Affected versions   105365 non-null  object 
 1   Included in builds  105365 non-null  object 
 2   Type.name           105365 non-null  object 
 3   Type.$type          105365 non-null  object 
 4   State.name          105365 non-null  object 
 5   State.$type         105365 non-null  object 
 6   Assignee.login      96428 non-null   object 
 7   Assignee.name       96428 non-null   object 
 8   Assignee.$type      96428 non-null   object 
 9   Subsystem.name      82778 non-null   object 
 10  Subsystem.$type     82778 non-null   object 
 11  Available in        31350 non-null   object 
 12  Subsystem           0 non-null       float64
 13  Assignee            0 non-null       float64
dtypes: float64(2), object(12)
memory usage: 12.1+ MB


In [23]:
## Most of the custom fields cannot be used as features because they are still empty when an issue is created
json_columns = ['Type.name' , 'Subsystem.name'] 

In [24]:
# Attach the selected custom-field columns. Any copies left over from a
# previous run of this cell are dropped first, so the cell can be re-executed:
# a second plain `join` would fail on the overlapping column names.
train_df = (train_df
            .drop(columns=json_columns, errors='ignore')
            .join(json_fileds[json_columns]))

# The reporter column holds a JSON object; keep only its `login` value.
train_df['reporter_name'] = train_df.reporter.map(json.loads).map(operator.itemgetter('login'))

# `created` is parsed as milliseconds since the epoch. Convert the whole column
# once instead of calling pd.to_datetime on every row for each date part.
created_at = pd.to_datetime(train_df.created, unit='ms')
train_df['Year'] = created_at.dt.year
train_df['Month'] = created_at.dt.month
train_df['Day'] = created_at.dt.day
train_df['Hour'] = created_at.dt.hour

In [25]:
# Display the frame with the engineered columns added.
train_df

Unnamed: 0_level_0,summary,description,reporter,created,customFields,links,is_high_priority,Type.name,Subsystem.name,reporter_name,Year,Month,Day,Hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
25-60681,Don't sync font size within IDEA server plugin,At home I have 1900x1200 resolution and at my ...,"{""login"": ""machak"", ""$type"": ""User""}",1231150644000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False,Bug,Core. Project Settings,machak,2009,1,5,10
25-60682,Cancelling subversion update,When cancelling an update from ie. subversion ...,"{""login"": ""sprice"", ""$type"": ""User""}",1231150705000,"[{""value"": {""name"": ""Usability Problem"", ""$typ...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False,Usability Problem,Version Control. Subversion,sprice,2009,1,5,10
25-60686,Suspended Breakpoint in JVM not Recognized in ...,In some breakpoint cases for a app launched fr...,"{""login"": ""brigham"", ""$type"": ""User""}",1231183948000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False,Bug,Java. Debugger,brigham,2009,1,5,19
25-60691,Good code red: IDEA incorrectly resolves neste...,The relevant snippet if part of the UIDebug cl...,"{""login"": ""xduke"", ""$type"": ""User""}",1231241109000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False,Bug,Code Analysis. Inspection,xduke,2009,1,6,11
25-60679,Module WIll Not Load Jar Dependencies on Intre...,I have IntelliJ 7.0.5 running on both WIndows ...,"{""login"": ""stonemack"", ""$type"": ""User""}",1231133633000,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False,Bug,Core. Project Settings,stonemack,2009,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25-2507313,Version Control Incoming tab missing search fi...,"The Version Control tab for ""Incoming"" doesn't...","{""login"": ""markhodgson"", ""$type"": ""User""}",1569492791894,"[{""value"": {""name"": ""Usability Problem"", ""$typ...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False,Usability Problem,Version Control. Subversion,markhodgson,2019,9,26,10
25-2507159,Groovy compiler can't find files with names co...,1. Create a new Project with groovy support\n2...,"{""login"": ""ted.lundqvist"", ""$type"": ""User""}",1569488444807,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False,Bug,,ted.lundqvist,2019,9,26,9
25-2507096,auto close stepped in files on debug resume,"Hi,\n\none coworker switched from eclipse to I...","{""login"": ""alain57"", ""$type"": ""User""}",1569486640163,"[{""value"": {""name"": ""Feature"", ""$type"": ""EnumB...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False,Feature,Java. Debugger,alain57,2019,9,26,8
25-2506900,Keymaps don't resync after disabled settings s...,I had to disable setting sync so I could expor...,"{""login"": ""Dmi3se"", ""$type"": ""User""}",1569452238504,"[{""value"": {""name"": ""Bug"", ""$type"": ""EnumBundl...","[{""direction"": ""BOTH"", ""linkType"": {""directed""...",False,Bug,Core. IDE Settings. Sharing,Dmi3se,2019,9,25,22


In [26]:
# Model inputs: the two text columns, the reporter login, the date parts and
# the selected custom fields.
columns = ['summary', 'description', 'reporter_name',
           'Year', 'Month', 'Day', 'Hour',
           *json_columns]

X, y = train_df[columns], train_df['is_high_priority']

X.head()

Unnamed: 0_level_0,summary,description,reporter_name,Year,Month,Day,Hour,Type.name,Subsystem.name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
25-60681,Don't sync font size within IDEA server plugin,At home I have 1900x1200 resolution and at my ...,machak,2009,1,5,10,Bug,Core. Project Settings
25-60682,Cancelling subversion update,When cancelling an update from ie. subversion ...,sprice,2009,1,5,10,Usability Problem,Version Control. Subversion
25-60686,Suspended Breakpoint in JVM not Recognized in ...,In some breakpoint cases for a app launched fr...,brigham,2009,1,5,19,Bug,Java. Debugger
25-60691,Good code red: IDEA incorrectly resolves neste...,The relevant snippet if part of the UIDebug cl...,xduke,2009,1,6,11,Bug,Code Analysis. Inspection
25-60679,Module WIll Not Load Jar Dependencies on Intre...,I have IntelliJ 7.0.5 running on both WIndows ...,stonemack,2009,1,5,5,Bug,Core. Project Settings


### DataFrameMapper

In [27]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from sklearn_pandas import DataFrameMapper

# Column-wise preprocessing: each tuple pairs a column selector with the
# transformer applied to it (`None` means no transformer is applied).
# The commented-out entries are placeholders for features not yet mapped.
# NOTE(review): `sparse=` looks like the older OneHotEncoder keyword; newer
# scikit-learn releases call it `sparse_output` — confirm against the
# installed version.
preprocessor = DataFrameMapper([('summary', TfidfVectorizer()),
                                (['Type.name'], OneHotEncoder(sparse=False, handle_unknown='ignore')),
                                #(['reporter_name'], ...),
                                #(['Subsystem.name'], ...),
                                ('Year', None),
                                #('Month', ...),
                                #('Day', ...),
                                #('Hour', ...),
                                ], 
                               input_df=True, 
                               df_out=True)

# Fit and apply the mapper to five random rows as a quick demonstration.
preprocessor.fit_transform(X.sample(5))

Unnamed: 0_level_0,summary_are,summary_block,summary_clinet,summary_collections,summary_command,summary_commit,summary_configurations,summary_debug,summary_detect,summary_dialog,...,summary_unmodifiable,summary_updated,summary_values,summary_was,summary_way,summary_with,summary_wrapping,Type.name_x0_Bug,Type.name_x0_Feature,Year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25-476198,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2012
25-2171988,0.0,0.0,0.0,0.339992,0.0,0.0,0.0,0.0,0.339992,0.0,...,0.679984,0.0,0.339992,0.0,0.0,0.0,0.339992,0.0,1.0,2018
25-1310267,0.316228,0.0,0.0,0.0,0.0,0.0,0.316228,0.316228,0.0,0.0,...,0.0,0.316228,0.0,0.316228,0.0,0.0,0.0,1.0,0.0,2016
25-515040,0.0,0.0,0.321896,0.0,0.321896,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.321896,0.0,1.0,0.0,2013
25-375359,0.0,0.0,0.0,0.0,0.0,0.67082,0.0,0.0,0.0,0.223607,...,0.0,0.0,0.0,0.0,0.223607,0.0,0.0,1.0,0.0,2011


### Подбор порога

In [None]:
# Chain the column preprocessor with a multinomial naive Bayes classifier.
# NOTE(review): `pipe` is also the loop variable of the parameter search
# earlier in the notebook, so this rebinds that name.
pipe = make_pipeline(preprocessor, MultinomialNB ())

# NOTE(review): fitted on all of X / y — no rows are held out in this cell.
pipe.fit(X, y)

In [None]:
# Second column of predict_proba for every row of X
# (presumably the probability of the True / high-priority class — confirm via pipe.classes_).
probabilities = pipe.predict_proba(X)[:, 1]

In [None]:
# Inspect the predicted probabilities.
probabilities

In [None]:
# Probability cut-off above which a row is labelled as positive.
threshold = 0.1

In [None]:
# F1 when a row is labelled positive if its probability exceeds `threshold`.
# NOTE(review): scored on the same X / y the pipeline was fitted on, so this
# is a training-set figure.
f1_score(y_true=y, y_pred=probabilities > threshold)