In [27]:
import pandas as pd
import numpy as np

from datetime import timedelta

from functools import reduce

from feature_selector import FeatureSelector

%matplotlib inline

In [28]:
# Read the raw task export and discard fully duplicated rows in one step.
tasks = pd.read_csv('../data/raw/Tasks.csv', low_memory=False).drop_duplicates()
tasks.head()

Unnamed: 0,Статус по просрочке,Срок плановый,"Просрочено, дней",ДлительностьПросрочки,ID задачи,Вид документа,Дата старта задания,Дата завершения задания плановая,Дата завершения задания фактическая,Состояние задания,id
0,Без нарушения срока,,0,без нарушения срока,E1DE844D-EE2D-4C41-AEDF-93F246749F0E,Служебная записка,"2021-12-10 00:00:00,000",,"2021-12-10 00:00:00,000",Завершено,ОРГ1-02588
1,Без нарушения срока,,0,без нарушения срока,7A92343C-8C9A-46E7-AC81-8F50F95009D0,Служебная записка,"2021-12-10 00:00:00,000",,"2021-12-10 00:00:00,000",Завершено,ОРГ1-02588
2,Без нарушения срока,,0,без нарушения срока,5CE64E52-D2D1-4DCC-B2C8-34734AA39AC0,Служебная записка,"2021-12-10 00:00:00,000",,"2021-12-10 00:00:00,000",Завершено,ОРГ1-02588
3,Без нарушения срока,,0,без нарушения срока,7A28F3DD-983F-4127-AB7F-6EDB85A69F1C,Служебная записка,"2021-12-10 00:00:00,000",,"2021-12-10 00:00:00,000",Завершено,ОРГ1-02588
4,Без нарушения срока,,0,без нарушения срока,ABFFAF61-12D8-44C2-B1F6-8402D174889E,Служебная записка,"2021-12-10 00:00:00,000",,"2021-12-10 00:00:00,000",Завершено,ОРГ1-02588


In [29]:
# Parsed date column -> raw timestamp column it is derived from.
# Only the leading 'YYYY-MM-DD' part of each raw string is kept, so the
# time-of-day part of the raw value is discarded.
date_column_sources = {
    'date_start_task': 'Дата старта задания',
    'date_final_task_plan': 'Дата завершения задания плановая',
    'date_final_task_fact': 'Дата завершения задания фактическая',
}

for parsed_name, raw_name in date_column_sources.items():
    # Missing values are dropped before slicing; index alignment on assignment
    # leaves those rows as NaT in the new column.
    date_part = tasks[raw_name].dropna().map(lambda stamp: stamp[:10])
    tasks[parsed_name] = pd.to_datetime(date_part, format='%Y-%m-%d')

# The raw string columns are no longer needed once the parsed ones exist.
tasks = tasks.drop(columns=list(date_column_sources.values()))

In [30]:
# Whole days between a task's start and its actual completion.
elapsed = tasks['date_final_task_fact'] - tasks['date_start_task']
days_to_complete = elapsed.dt.days

# Rows where the duration could not be computed get the median duration.
tasks['days_to_complete'] = days_to_complete.fillna(days_to_complete.median())

# Most recent actual completion date present in the data.
max_date = tasks['date_final_task_fact'].max()

In [31]:
# Show the latest actual completion date found in the data.
max_date

Timestamp('2022-06-20 00:00:00')

In [32]:
# List the distinct raw task-state labels present in the data.
tasks['Состояние задания'].unique()

array(['Завершено', 'На приёмке', 'Не начато', 'Возврат с делегирования',
       'Отложено', 'Делегировано', 'В работе'], dtype=object)

In [33]:
# Translate the raw task-state labels into short English codes.
status_column = 'Состояние задания'
tasks_status_map = dict([
    ('Завершено', 'completed'),
    ('На приёмке', 'on_check'),
    ('Не начато', 'not_started'),
    ('Возврат с делегирования', 'back_from_delegation'),
    ('Отложено', 'postponed'),
    ('Делегировано', 'delegated'),
    ('В работе', 'in_work'),
])
# replace() leaves any label that is missing from the map unchanged.
tasks[status_column] = tasks[status_column].replace(to_replace=tasks_status_map)

print(tasks[status_column].unique())

['completed' 'on_check' 'not_started' 'back_from_delegation' 'postponed'
 'delegated' 'in_work']


In [34]:
# Binary flags derived from the overdue-status label.
# NOTE: each flag is 1 for every row whose label differs from the compared
# string, so a row with any other (or missing) label gets 1 in both columns.
overdue_status = tasks['Статус по просрочке']
tasks['is_expired_task'] = overdue_status.ne('Без нарушения срока').astype(int)
tasks['is_intime_task'] = overdue_status.ne('С нарушением срока').astype(int)

In [35]:
# List the distinct overdue-duration buckets present in the data.
tasks['ДлительностьПросрочки'].unique()

array(['без нарушения срока', 'до 7 дней', 'более 30 дней', 'до 30 дней'],
      dtype=object)

In [36]:
# Ordinal encoding of the overdue-duration bucket: the buckets are listed
# from "no delay" to the longest delay and numbered 0..3 in that order.
overdue_buckets = ['без нарушения срока', 'до 7 дней', 'до 30 дней', 'более 30 дней']
expired_on_map = {label: level for level, label in enumerate(overdue_buckets)}

# replace() leaves any bucket label that is missing from the map unchanged.
tasks['expired_level'] = tasks['ДлительностьПросрочки'].replace(to_replace=expired_on_map)

In [37]:
# Display the task frame to inspect the columns derived so far.
tasks

Unnamed: 0,Статус по просрочке,Срок плановый,"Просрочено, дней",ДлительностьПросрочки,ID задачи,Вид документа,Состояние задания,id,date_start_task,date_final_task_plan,date_final_task_fact,days_to_complete,is_expired_task,is_intime_task,expired_level
0,Без нарушения срока,,0,без нарушения срока,E1DE844D-EE2D-4C41-AEDF-93F246749F0E,Служебная записка,completed,ОРГ1-02588,2021-12-10,NaT,2021-12-10,0.0,0,1,0
1,Без нарушения срока,,0,без нарушения срока,7A92343C-8C9A-46E7-AC81-8F50F95009D0,Служебная записка,completed,ОРГ1-02588,2021-12-10,NaT,2021-12-10,0.0,0,1,0
2,Без нарушения срока,,0,без нарушения срока,5CE64E52-D2D1-4DCC-B2C8-34734AA39AC0,Служебная записка,completed,ОРГ1-02588,2021-12-10,NaT,2021-12-10,0.0,0,1,0
3,Без нарушения срока,,0,без нарушения срока,7A28F3DD-983F-4127-AB7F-6EDB85A69F1C,Служебная записка,completed,ОРГ1-02588,2021-12-10,NaT,2021-12-10,0.0,0,1,0
4,Без нарушения срока,,0,без нарушения срока,ABFFAF61-12D8-44C2-B1F6-8402D174889E,Служебная записка,completed,ОРГ1-02588,2021-12-10,NaT,2021-12-10,0.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536655,Без нарушения срока,,0,без нарушения срока,9FE7B014-3015-4855-BD08-D3C5260FB122,Служебная записка,completed,ОРГ2-02838,2021-10-31,2021-11-01,2021-10-31,0.0,0,1,0
536656,Без нарушения срока,,0,без нарушения срока,B4A7AE02-2D7E-4D56-A337-0F14C27A85CC,Служебная записка,completed,ОРГ2-02838,2021-10-31,2021-11-01,2021-10-31,0.0,0,1,0
536657,Без нарушения срока,,0,без нарушения срока,698F3F66-37BA-4042-BDE9-5F8C8BEA7FA6,Служебная записка,completed,ОРГ2-02838,2021-10-31,2021-11-01,2021-10-31,0.0,0,1,0
536658,Без нарушения срока,,0,без нарушения срока,A7D0F2FA-9C2A-4F4D-A990-ABE463A5F0D5,Служебная записка,completed,ОРГ2-02838,2021-10-30,2021-11-01,2021-10-31,1.0,0,1,0


In [38]:
# One-hot encode the overdue level and the task state, then append the
# indicator columns to `tasks`. The prefix 'expired_level_' combined with the
# default '_' separator yields column names such as 'expired_level__0'.
# Re-running this cell appends the indicator columns a second time.
expired_level_dummies = pd.get_dummies(tasks['expired_level'], prefix='expired_level_')
task_status_dummies = pd.get_dummies(tasks['Состояние задания'], prefix='task_status')
tasks = pd.concat([tasks, expired_level_dummies, task_status_dummies], axis=1)

In [39]:
# List the column names after the one-hot indicator columns were appended.
tasks.columns

Index(['Статус по просрочке', 'Срок плановый', 'Просрочено, дней',
       'ДлительностьПросрочки', 'ID задачи', 'Вид документа',
       'Состояние задания', 'id', 'date_start_task', 'date_final_task_plan',
       'date_final_task_fact', 'days_to_complete', 'is_expired_task',
       'is_intime_task', 'expired_level', 'expired_level__0',
       'expired_level__1', 'expired_level__2', 'expired_level__3',
       'task_status_back_from_delegation', 'task_status_completed',
       'task_status_delegated', 'task_status_in_work',
       'task_status_not_started', 'task_status_on_check',
       'task_status_postponed'],
      dtype='object')

In [40]:
# Display the task frame including the appended indicator columns.
tasks

Unnamed: 0,Статус по просрочке,Срок плановый,"Просрочено, дней",ДлительностьПросрочки,ID задачи,Вид документа,Состояние задания,id,date_start_task,date_final_task_plan,...,expired_level__1,expired_level__2,expired_level__3,task_status_back_from_delegation,task_status_completed,task_status_delegated,task_status_in_work,task_status_not_started,task_status_on_check,task_status_postponed
0,Без нарушения срока,,0,без нарушения срока,E1DE844D-EE2D-4C41-AEDF-93F246749F0E,Служебная записка,completed,ОРГ1-02588,2021-12-10,NaT,...,0,0,0,0,1,0,0,0,0,0
1,Без нарушения срока,,0,без нарушения срока,7A92343C-8C9A-46E7-AC81-8F50F95009D0,Служебная записка,completed,ОРГ1-02588,2021-12-10,NaT,...,0,0,0,0,1,0,0,0,0,0
2,Без нарушения срока,,0,без нарушения срока,5CE64E52-D2D1-4DCC-B2C8-34734AA39AC0,Служебная записка,completed,ОРГ1-02588,2021-12-10,NaT,...,0,0,0,0,1,0,0,0,0,0
3,Без нарушения срока,,0,без нарушения срока,7A28F3DD-983F-4127-AB7F-6EDB85A69F1C,Служебная записка,completed,ОРГ1-02588,2021-12-10,NaT,...,0,0,0,0,1,0,0,0,0,0
4,Без нарушения срока,,0,без нарушения срока,ABFFAF61-12D8-44C2-B1F6-8402D174889E,Служебная записка,completed,ОРГ1-02588,2021-12-10,NaT,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536655,Без нарушения срока,,0,без нарушения срока,9FE7B014-3015-4855-BD08-D3C5260FB122,Служебная записка,completed,ОРГ2-02838,2021-10-31,2021-11-01,...,0,0,0,0,1,0,0,0,0,0
536656,Без нарушения срока,,0,без нарушения срока,B4A7AE02-2D7E-4D56-A337-0F14C27A85CC,Служебная записка,completed,ОРГ2-02838,2021-10-31,2021-11-01,...,0,0,0,0,1,0,0,0,0,0
536657,Без нарушения срока,,0,без нарушения срока,698F3F66-37BA-4042-BDE9-5F8C8BEA7FA6,Служебная записка,completed,ОРГ2-02838,2021-10-31,2021-11-01,...,0,0,0,0,1,0,0,0,0,0
536658,Без нарушения срока,,0,без нарушения срока,A7D0F2FA-9C2A-4F4D-A990-ABE463A5F0D5,Служебная записка,completed,ОРГ2-02838,2021-10-30,2021-11-01,...,0,0,0,0,1,0,0,0,0,0


In [41]:
def work_period_by_tasks(dates):
    """Return the number of whole days between the earliest and latest date.

    Parameters
    ----------
    dates : pandas.Series of datetimes
        Dates belonging to one group.

    Returns
    -------
    The ``.days`` component of ``dates.max() - dates.min()``.
    """
    first_date, last_date = dates.min(), dates.max()
    span = last_date - first_date
    return span.days

In [42]:
# agg function: days elapsed between the reference date and a group's latest date
def days_since_last_task(dates, max_date=max_date):
    """Return the whole days between ``max_date`` and the latest date in ``dates``.

    Parameters
    ----------
    dates : pandas.Series of datetimes
        Dates belonging to one group.
    max_date : reference date
        Defaults to the module-level ``max_date``. The default is bound when
        this ``def`` is executed, so re-run this cell after ``max_date`` changes.

    Returns
    -------
    The ``.days`` component of ``max_date - dates.max()``.
    """
    latest = dates.max()
    return (max_date - latest).days

In [43]:
def aggregation(tasks):
    """Aggregate task-level rows into one feature row per ``id``.

    Parameters
    ----------
    tasks : pandas.DataFrame
        Task rows containing ``id``, the date columns, ``days_to_complete``,
        the overdue flags/level and the one-hot indicator columns named below.

    Returns
    -------
    pandas.DataFrame
        One row per ``id`` with flat column names of the form
        ``<source column>_<aggregate>`` (the ``id`` column keeps its name).
    """
    # Indicator (0/1) columns: the mean is the share of tasks, the sum is the count.
    indicator_columns = [
        'expired_level__0',
        'expired_level__1',
        'expired_level__2',
        'expired_level__3',
        'task_status_back_from_delegation',
        'task_status_completed',
        'task_status_delegated',
        'task_status_in_work',
        'task_status_not_started',
        'task_status_on_check',
        'task_status_postponed',
    ]

    # The insertion order of this dict fixes the order of the output columns.
    agg_spec = {
        'date_start_task': [work_period_by_tasks],
        'date_final_task_fact': [days_since_last_task],
        'days_to_complete': ['min', 'max', 'median', 'mean', 'std', 'sum'],
        'is_expired_task': ['mean', 'sum'],
        'is_intime_task': ['mean', 'sum'],
        'expired_level': ['mean'],
    }
    for column in indicator_columns:
        agg_spec[column] = ['mean', 'sum']

    per_id = tasks.groupby('id', as_index=False).agg(agg_spec)

    # Flatten the two-level column index: ('days_to_complete', 'min') becomes
    # 'days_to_complete_min', and ('id', '') becomes 'id' after stripping '_'.
    per_id.columns = ['_'.join(levels).strip('_') for levels in per_id.columns]
    return per_id

In [44]:
# Sub-frames of tasks started within the last 150 / 100 / 50 days before max_date.
# The comparison is strict: a task started exactly on the cut-off date is excluded.
start_dates = tasks['date_start_task']
tasks_last_150 = tasks.loc[start_dates.gt(max_date - timedelta(days=150))]
tasks_last_100 = tasks.loc[start_dates.gt(max_date - timedelta(days=100))]
tasks_last_50 = tasks.loc[start_dates.gt(max_date - timedelta(days=50))]

In [45]:
def _aggregate_with_suffix(frame, suffix):
    """Aggregate `frame` per id and append `suffix` to every column name except 'id'."""
    per_id = aggregation(frame)
    per_id.columns = [name if name == 'id' else name + suffix for name in per_id.columns]
    return per_id


# Per-id features over the whole history and over each recent window.
grouped_tasks = _aggregate_with_suffix(tasks, '_total')
grouped_tasks_last_50 = _aggregate_with_suffix(tasks_last_50, '_last_50')
grouped_tasks_last_100 = _aggregate_with_suffix(tasks_last_100, '_last_100')
grouped_tasks_last_150 = _aggregate_with_suffix(tasks_last_150, '_last_150')

In [46]:
# Combine the per-id feature tables into a single frame.
tasks_dfs_for_merge = [grouped_tasks, grouped_tasks_last_50,
                       grouped_tasks_last_100, grouped_tasks_last_150]

# Left-join every later table onto the first one, so the result keeps exactly
# the ids of the first (whole-history) table.
grouped_data = tasks_dfs_for_merge[0]
for window_features in tasks_dfs_for_merge[1:]:
    grouped_data = pd.merge(grouped_data, window_features, on=['id'], how='left')

In [47]:
# Replace every remaining missing value with 0 (e.g. ids without rows in a window).
# NOTE(review): this also turns a missing "days since last task" value into 0,
# which reads the same as a task finished on the reference date; confirm this is intended.
grouped_data = grouped_data.fillna(value=0)

In [48]:
# Constant flag marking every id in this table as present in the tasks data.
grouped_data = grouped_data.assign(exists_in_tasks=1)

In [49]:
# Persist the full per-id feature table (the row index is not written).
grouped_data.to_csv(path_or_buf='../data/prepared/grouped_tasks.csv', index=False)

In [51]:
# Load the training dataset (merged with the features on 'id' below).
train = pd.read_csv(filepath_or_buffer='../data/train_dataset_train.csv')
# function of feature selection from grouped data


def feature_selection(grouped_data, train):
    """Drop collinear features from the grouped data.

    The collinear features are identified only on the ids that also occur in
    ``train``, but the returned frame contains every row of ``grouped_data``.

    Parameters
    ----------
    grouped_data : pandas.DataFrame
        Per-id feature table; must contain an ``id`` column.
    train : pandas.DataFrame
        Labelled data; must contain ``id`` and ``type`` columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy of ``grouped_data`` restricted to the retained
        feature columns followed by ``id``.
    """
    # Only ids present in both frames carry a label.
    marked_data = pd.merge(grouped_data, train, how='inner', on='id')
    # Features and labels.
    X = marked_data.drop(['id', 'type'], axis=1)
    y = marked_data['type']
    # Identify features whose correlation magnitude exceeds the threshold.
    fs = FeatureSelector(data=X, labels=y)
    fs.identify_collinear(correlation_threshold=0.80)
    # Remove the identified collinear features.
    data_cleaned = fs.remove(methods=['collinear'])
    # Names of the retained features, plus the id column that was dropped above.
    feature_cleaned_list = list(data_cleaned)
    feature_cleaned_list.append('id')
    # Return a copy rather than a selection of `grouped_data`: callers add
    # columns to the result, and doing that on a selection raises pandas'
    # SettingWithCopyWarning.
    return grouped_data[feature_cleaned_list].copy()


# Run the collinearity filter on the aggregated task features.
cleaned_tasks = feature_selection(grouped_data=grouped_data, train=train)

# Mark every id in this table as present in the tasks data.
cleaned_tasks['exists_in_tasks'] = 1

print(cleaned_tasks.shape)

# Save the cleaned task features.
cleaned_tasks.to_csv(path_or_buf='../data/prepared/grouped_tasks_cleaned.csv', index=False)

10 features with a correlation magnitude greater than 0.80.

Removed 10 features.
(1272, 132)


  record_collinear = record_collinear.append(temp_df, ignore_index = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_tasks['exists_in_tasks'] = 1


In [52]:
# Display the feature table that remains after the collinearity filter.
cleaned_tasks

Unnamed: 0,date_start_task_work_period_by_tasks_total,date_final_task_fact_days_since_last_task_total,days_to_complete_min_total,days_to_complete_max_total,days_to_complete_median_total,days_to_complete_std_total,days_to_complete_sum_total,is_expired_task_mean_total,is_expired_task_sum_total,is_intime_task_sum_total,...,task_status_in_work_mean_last_150,task_status_in_work_sum_last_150,task_status_not_started_mean_last_150,task_status_not_started_sum_last_150,task_status_on_check_mean_last_150,task_status_on_check_sum_last_150,task_status_postponed_mean_last_150,task_status_postponed_sum_last_150,exists_in_tasks,id
0,374,152.0,0.0,176.0,0.0,6.883021,2316.0,0.103839,119,1027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,ОРГ1-00004
1,702,109.0,0.0,661.0,1.0,17.079169,7388.0,0.025237,56,2163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,ОРГ1-00005
2,479,172.0,0.0,198.0,0.0,12.750526,291.0,0.036885,9,235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,ОРГ1-00028
3,342,173.0,0.0,12.0,0.0,2.550288,47.0,0.138889,5,31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,ОРГ1-00030
4,538,109.0,-4.0,567.0,1.0,50.061767,4183.0,0.286624,90,224,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,ОРГ1-00044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,336,115.0,0.0,94.0,0.0,7.460045,949.0,0.116456,46,349,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,ОРГ2-06006
1268,314,180.0,0.0,137.0,0.0,14.129672,344.0,0.203704,22,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,ОРГ2-06061
1269,275,186.0,0.0,81.0,1.0,16.286904,201.0,0.285714,8,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,ОРГ2-06084
1270,244,172.0,0.0,1.0,0.0,0.408248,1.0,0.166667,1,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,ОРГ2-06115


In [37]:
# Flag column marking ids present in the tasks data. The name is spelled
# 'exists_in_tasks', matching the flag column used by the other exports in
# this notebook (the previous spelling 'exists_in_taskas' was a typo that
# created a second, misnamed column).
grouped_data['exists_in_tasks'] = 1

In [38]:
# Persist the per-id feature table (the row index is not written).
# NOTE(review): this path is relative to the working directory, whereas the
# earlier export of the same frame writes to '../data/prepared/'; confirm
# which location is intended.
grouped_data.to_csv(path_or_buf='data/prepared/grouped_tasks.csv', index=False)

In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

# Load the labelled ids.
train = pd.read_csv(filepath_or_buffer='data/train_dataset_train.csv')

# Keep only ids present in both frames; the id itself is not used as a feature.
marked_calls = pd.merge(grouped_tasks, train, how='inner', on='id').drop(columns='id')

# Fill missing values column by column with that column's mean.
marked_calls = marked_calls.apply(lambda column: column.fillna(column.mean()))

# Standardise all columns. The scaled 'type' column is dropped from the
# feature matrix below, and the labels are taken from the unscaled frame.
scaler = StandardScaler()
marked_calls_st = scaler.fit_transform(marked_calls)
marked_calls_st_df = pd.DataFrame(data=marked_calls_st, columns=marked_calls.columns)

X_train = marked_calls_st_df.drop(columns='type')
y_train = marked_calls['type']

# Recursive feature elimination driven by a linear-kernel SVM.
svc_lin = SVC(kernel='linear')
svm_rfe_model = RFE(estimator=svc_lin)
svm_rfe_model_fit = svm_rfe_model.fit(X_train, y_train)

# Features with rank 1 are the ones treated as significant.
feat_index = pd.Series(data=svm_rfe_model_fit.ranking_, index=X_train.columns)
signi_feat_rfe = feat_index.loc[feat_index.eq(1)].index

print('Significant features from RFE',signi_feat_rfe)

# Significant features plus the id column needed for later joins.
sig_cols = [*signi_feat_rfe, 'id']

# Copy before adding the flag so that `grouped_tasks` itself is left untouched.
grouped_tasks_sig = grouped_tasks[sig_cols].copy()
grouped_tasks_sig['exists_in_tasks'] = 1

grouped_tasks_sig.to_csv(path_or_buf='data/prepared/grouped_tasks_sig.csv', index=False)

Significant features from RFE Index(['date_final_task_fact_days_since_last_task', 'days_to_complete_min',
       'days_to_complete_max', 'days_to_complete_mean', 'days_to_complete_std',
       'days_to_complete_sum', 'is_expired_task_mean', 'is_intime_task_mean',
       'expired_level_mean', 'expired_level__0_mean', 'expired_level__1_mean',
       'expired_level__1_sum', 'expired_level__3_mean', 'expired_level__3_sum',
       'task_status_completed_mean', 'task_status_in_work_mean',
       'task_status_not_started_mean', 'task_status_not_started_sum'],
      dtype='object')
