# BackSlices Algorithm
## Data preprocessing and feature generation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from monthdelta import monthdelta, monthmod

from dateutil import relativedelta
# Let pandas show up to 150 columns when a wide frame is displayed.
pd.set_option('display.max_columns', 150)

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / SettingWithCopy warnings
# raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

#### Transformation Parameters

In [3]:
## Base name of the output CSV: the result is written to 'data/<suffix>.csv'
suffix = 'Transform_data'

In [4]:
## prediction window width, months
## (only used as the start offset of the target-month loop: range(window-1, ...))
window = 1
## prediction step, months: stride between consecutive target months
step = 1
## buffer period, months: extra offset between the target month and the
## most recent history slice
buffer_period = 1
## number of back slices (monthly history snapshots) attached to every sample
n_slices = 10
## minimum history length, months
## (currently referenced only by a commented-out filter)
history_min = 10

### Load the data

In [5]:
# Load the prepared long-format table (semicolon-separated, '.' as decimal
# mark) and parse the two date columns straight into datetimes.
data = pd.read_csv(
    'data/data_long_prep_add.csv',
    sep=';',
    decimal='.',
    parse_dates=['Timeline', 'DateofTermination'],
)

In [6]:
# (rows, columns) of the freshly loaded frame.
data.shape

(308303, 98)

In [7]:
# DOT = data.DateofTermination
# print(np.min(DOT))
# print(np.max(DOT))

### Data preparation

In [8]:
# Expose '#ofProjects' under the attribute-friendly name 'Projects'
# (new column is appended at the end, the original one is dropped).
data = data.assign(Projects=data['#ofProjects']).drop(columns='#ofProjects')

In [9]:
#for col in data.columns: print(col)

In [10]:
## Numeric feature columns that are unrolled into per-month back slices.
## The list is extended by later cells (encoded labels, pivot keys).
number_cols = [
    'Number of courses', 'Training days',
    'English training', 'English training days',
    'Projects',
    'SickLeave', 'DurationOfSickLeaves',
    'Vacation', 'DurationOfVacations',
    'pages_loaded', 'likes', 'comments', 'posts',
]

In [11]:
# Check the dtype of every numeric feature column.
data[number_cols].dtypes

Number of courses        float64
Training days            float64
English training         float64
English training days    float64
Projects                 float64
SickLeave                float64
DurationOfSickLeaves     float64
Vacation                 float64
DurationOfVacations      float64
pages_loaded             float64
likes                    float64
comments                 float64
posts                    float64
dtype: object

In [12]:
# Frequency of each Risk label (value_counts skips missing values by default).
data.Risk.value_counts()

No risk    87864
Low        61973
High       42794
Medium     33270
Avg        23992
Unknown     1471
Name: Risk, dtype: int64

In [13]:
# Ordinal encoding of the Risk label: Low -> 1, Medium/Avg -> 2, High -> 3.
# Every other value (e.g. 'No risk', 'Unknown', missing) keeps the default 0.
data['dRisk'] = 0
for level, labels in ((1, ['Low']), (2, ['Medium', 'Avg']), (3, ['High'])):
    data.loc[data.Risk.isin(labels), 'dRisk'] = level
data.dRisk.value_counts()

0    146274
1     61973
2     57262
3     42794
Name: dRisk, dtype: int64

In [14]:
# Frequency of each Value label (value_counts skips missing values by default).
data.Value.value_counts()

High                         99032
Medium                       63889
Avg                          47340
Low                          37798
Candidate for termination     1888
Name: Value, dtype: int64

In [15]:
# Ordinal encoding of the Value label: Low -> 1, Medium/Avg -> 2, High -> 3.
# Every other value (e.g. 'Candidate for termination', missing) keeps 0.
data['dValue'] = 0
for level, labels in ((1, ['Low']), (2, ['Medium', 'Avg']), (3, ['High'])):
    data.loc[data.Value.isin(labels), 'dValue'] = level
data.dValue.value_counts()

2    111229
3     99032
0     60244
1     37798
Name: dValue, dtype: int64

In [16]:
# The two encoded label columns are treated as numeric features as well.
number_cols.extend(['dRisk', 'dValue'])

### Filters

In [17]:
print(data.shape)
# (disabled) minimum-history filter:
#data = data[data.Experience >= history_min]
# Keep only rows where both the Risk and the Value label are present.
data = data.dropna(subset=['Risk', 'Value'])
print(data.shape)

(308303, 100)
(205823, 100)


### New structure formation

In [18]:
# NOTE(review): this `tmp` is never read — it is reassigned in the
# "Data transformation" step before any use; looks like leftover exploration.
tmp = data[['TL_number', 'Timeline', 'TerminatedOrResigned']].copy()
#tmp.groupby(['TL_number', 'Timeline']).count()

#### Remove a month with incomplete data

In [19]:
print(data.shape)
# Keep months up to and including 2018-06-01; later rows are discarded
# (the section heading describes them as a month with incomplete data).
is_complete_month = data.Timeline <= '2018-06-01'
data = data[is_complete_month]
print(data.shape)

(205823, 100)
(198384, 100)


In [20]:
# Distinct months present in the data, newest first, and how many there are.
Timelines = sorted(data.Timeline.drop_duplicates(), reverse=True)
TL = len(Timelines)
TL

36

In [21]:
# Most recent month (the list is sorted in descending order).
Timelines[0]

Timestamp('2018-06-01 00:00:00')

In [22]:
# Oldest month; 35 is the last index because TL == 36 in this run.
Timelines[35]

Timestamp('2015-07-01 00:00:00')

#### Missing values processing

In [23]:
# Print, as ready-to-paste list entries, the numeric columns that still
# contain at least one missing value.
for col in number_cols:
    if data[col].isnull().any():
        print("'{}',".format(col))

'Number of courses',
'Training days',
'English training',
'English training days',
'pages_loaded',
'likes',
'comments',
'posts',


In [24]:
# Columns whose missing values are replaced with 0.
nul_cols = [
    'Number of courses', 'Training days',
    'English training', 'English training days',
]

In [25]:
# Columns whose missing values are replaced with the sentinel -1.
minus_one_cols = ['pages_loaded', 'likes', 'comments', 'posts']

In [26]:
# Impute missing values column by column: 0 for the columns in nul_cols,
# -1 for the columns in minus_one_cols.
fill_values = dict.fromkeys(nul_cols, 0)
fill_values.update(dict.fromkeys(minus_one_cols, -1))
for col, fill in fill_values.items():
    data[col] = data[col].fillna(fill)

In [27]:
# Shape check after imputation.
data.shape

(198384, 100)

### Data transformation

In [28]:
# Temporarily add the two key columns so they travel with the features into
# the melt/pivot step; they are removed from the list again further down.
number_cols += ['Timeline', 'PIN']
number_cols

['Number of courses',
 'Training days',
 'English training',
 'English training days',
 'Projects',
 'SickLeave',
 'DurationOfSickLeaves',
 'Vacation',
 'DurationOfVacations',
 'pages_loaded',
 'likes',
 'comments',
 'posts',
 'dRisk',
 'dValue',
 'Timeline',
 'PIN']

In [29]:
# Independent copy holding only the feature columns plus the PIN/Timeline keys.
tmp = data.loc[:, number_cols].copy()

In [30]:
%%time
# Long format: one row per (PIN, Timeline, feature) with the reading in 'value'.
melt_data = tmp.melt(id_vars=['PIN', 'Timeline'], value_name='value')
# Column label used by the pivot below: '<feature>_<YYYY-MM-DD>'.
month_label = melt_data['Timeline'].astype(str)
melt_data['variables'] = melt_data['variable'] + '_' + month_label
print(melt_data.shape)

(2975760, 5)
Wall time: 28.9 s


In [31]:
# Preview the first rows of the long-format table.
melt_data.head()

Unnamed: 0,PIN,Timeline,variable,value,variables
0,25,2015-07-01,Number of courses,0.0,Number of courses_2015-07-01
1,137,2015-07-01,Number of courses,0.0,Number of courses_2015-07-01
2,213,2015-07-01,Number of courses,0.0,Number of courses_2015-07-01
3,233,2015-07-01,Number of courses,0.0,Number of courses_2015-07-01
4,343,2015-07-01,Number of courses,0.0,Number of courses_2015-07-01


In [32]:
%%time
# Wide format: one row per PIN, one column per '<feature>_<month>' label;
# PIN is moved from the index back into an ordinary column.
cast_data = (
    melt_data
    .pivot(index='PIN', columns='variables', values='value')
    .reset_index()
)
print(cast_data.shape)

(10993, 541)
Wall time: 2.13 s


In [33]:
# Preview the first four columns of the wide table.
cast_data.iloc[:,:4].head()

variables,PIN,DurationOfSickLeaves_2015-07-01,DurationOfSickLeaves_2015-08-01,DurationOfSickLeaves_2015-09-01
0,10,0.0,0.0,0.0
1,13,0.0,0.0,0.0
2,21,,,
3,22,0.0,0.0,0.0
4,23,0.0,0.0,0.0


In [34]:
# Rebuild the month list as date strings, newest first, so that entries can
# be matched against the '<feature>_<month>' column labels of cast_data.
Timelines = sorted(data.Timeline.astype(str).unique(), reverse=True)
Timelines[:5]

['2018-06-01', '2018-05-01', '2018-04-01', '2018-03-01', '2018-02-01']

In [35]:
#range(window-1, TL-(n_slices + buffer_period), step)

In [36]:
%%time
# Assemble the training table: for every target month tl0 = Timelines[i]
# take n_slices earlier monthly snapshots of all features, rename their
# columns from '<feature>_<month>' to '<feature>_<k>' and put them side by
# side, one row per PIN. Timelines is ordered newest first, so a larger
# index means an older month.
#
# Frames are collected in lists and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies all
# accumulated rows on every iteration.
slice_frames = []
for i in range(window-1, TL-(n_slices + buffer_period), step):

    tl0 = Timelines[i]
    print(tl0)

    pieces = []
    for j in range(n_slices, 0, -1):  # oldest slice first: the RNN needs chronological order
        tl = '_' + Timelines[i + j + buffer_period]
        suffix_ = '_' + str(j-1)  # slice index: n_slices-1 = oldest ... 0 = most recent
        names = [c for c in cast_data.columns if c.endswith(tl)]
        # Explicit copy so that renaming never touches cast_data itself.
        cast = cast_data[names].copy()
        # Swap only the trailing month label for the slice index.
        cast.columns = [c[:-len(tl)] + suffix_ for c in names]
        pieces.append(cast)

    buffer_df = pd.concat(pieces, axis=1)
    buffer_df['Timeline'] = tl0
    buffer_df['PIN'] = cast_data.PIN
    # A PIN with no record in a given month has NaN there after the pivot.
    # NOTE(review): these gaps become 0 for every feature, including the
    # columns whose original missing values were imputed with -1.
    buffer_df = buffer_df.fillna(0)

    slice_frames.append(buffer_df)

# An empty loop range yields an empty frame, as the original code did.
df = pd.concat(slice_frames, ignore_index=True) if slice_frames else pd.DataFrame()

2018-06-01
2018-05-01
2018-04-01
2018-03-01
2018-02-01
2018-01-01
2017-12-01
2017-11-01
2017-10-01
2017-09-01
2017-08-01
2017-07-01
2017-06-01
2017-05-01
2017-04-01
2017-03-01
2017-02-01
2017-01-01
2016-12-01
2016-11-01
2016-10-01
2016-09-01
2016-08-01
2016-07-01
2016-06-01
Wall time: 8 s


In [37]:
# Take the key columns back out of the feature list, then drop the remaining
# (already unrolled) feature columns from the source frame.
for key in ('PIN', 'Timeline'):
    number_cols.remove(key)
tmp = data.drop(columns=number_cols)

In [38]:
# Non-sliced columns carried into the final table.
# Currently disabled candidates: 'City', 'Specialization', 'Sex',
# 'MonthsInCurrentPosition'.
selector = [
    'PIN', 'Timeline',
    'DateofTermination',
    'Country',
    'TerminatedOrResigned',
    'TL_number',
]
tmp = tmp[selector]

#### Target creation

In [39]:
# target is 1 exactly where TerminatedOrResigned equals 0 and 0 everywhere
# else (missing values compare unequal and therefore map to 0).
tmp['target'] = (tmp.TerminatedOrResigned == 0).astype('int64')
tmp.target.value_counts()

0    195650
1      2734
Name: target, dtype: int64

In [40]:
# The slice table stores Timeline as strings; convert it to datetimes so the
# merge keys have the same dtype as in tmp.
df['Timeline'] = pd.to_datetime(df['Timeline'])

In [41]:
print(df.shape)
# Inner join: keep only (PIN, Timeline) pairs present on both sides.
df = tmp.merge(df, how='inner', on=['PIN', 'Timeline'])
print(df.shape)

(274825, 152)
(156243, 157)


In [42]:
# Newest month first; within a month, highest PIN first.
df = df.sort_values(by=['Timeline', 'PIN'], ascending=[False, False])

#### Save to csv

In [43]:
# Class balance of the target in the final table.
df.target.value_counts()

0    154078
1      2165
Name: target, dtype: int64

In [44]:
# List the columns of the final table.
df.columns

Index(['PIN', 'Timeline', 'DateofTermination', 'Country',
       'TerminatedOrResigned', 'TL_number', 'target', 'DurationOfSickLeaves_9',
       'DurationOfVacations_9', 'English training days_9',
       ...
       'Projects_0', 'SickLeave_0', 'Training days_0', 'Vacation_0',
       'comments_0', 'dRisk_0', 'dValue_0', 'likes_0', 'pages_loaded_0',
       'posts_0'],
      dtype='object', length=157)

In [45]:
## null check
for col in df.columns:
    if np.sum(df[col].isnull())>0: print(col, np.sum(df[col].isnull()))

DateofTermination 154078
TerminatedOrResigned 154078


In [46]:
# Write the final table as a semicolon-separated CSV without the row index.
path = 'data/{}.csv'.format(suffix)
print(path)
df.to_csv(path, sep=';', index=False)

data/Transform_data.csv
