In [19]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

from functools import reduce

from feature_selector import FeatureSelector

%matplotlib inline

In [20]:
# Load the raw network-monitoring log and preview the first rows.
network = pd.read_csv('../data/raw/TimenNetwork.csv')
# De-duplication is currently disabled.
# NOTE(review): any exact duplicate rows would be added into monitor_Time by the
# later per-(id, date) sum — confirm that is intended.
#network.drop_duplicates(inplace=True)
network.head()

Unnamed: 0,Вых/Будни,monitor_Time,startTime,id
0,Будни,300,"2021-08-16 00:00:00,000",ОРГ1-01402
1,Будни,300,"2021-08-18 00:00:00,000",ОРГ1-01402
2,Будни,300,"2021-08-19 00:00:00,000",ОРГ1-01402
3,Будни,300,"2021-08-23 00:00:00,000",ОРГ1-01402
4,Будни,300,"2021-08-26 00:00:00,000",ОРГ1-01402


In [21]:
# Inspect column dtypes and non-null counts before any transformation.
network.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216110 entries, 0 to 216109
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Вых/Будни     216110 non-null  object
 1   monitor_Time  216110 non-null  int64 
 2   startTime     216110 non-null  object
 3   id            216110 non-null  object
dtypes: int64(1), object(3)
memory usage: 6.6+ MB


In [22]:
# Keep only the first ten characters ("YYYY-MM-DD") of each `startTime` string
# and parse them as dates. `.str[:10]` slices the whole column at once instead
# of calling a Python lambda per row, and it propagates missing values rather
# than raising on them.
network['date_network'] = pd.to_datetime(network['startTime'].str[:10], format='%Y-%m-%d')

# Latest date present in the log; later cells use it as the reference point for
# the look-back windows and for the "days since last appearance" feature.
max_date = network['date_network'].max()

print('Start period:', network['date_network'].min())
print('End period:', max_date)

Start period: 2021-06-20 00:00:00
End period: 2021-12-29 00:00:00


In [23]:
# 0/1 indicators derived from the day-type label: one is set when the label
# equals 'Будни', the other when it equals 'Выходные дни'.
day_type = network['Вых/Будни']
network['is_workday_connection'] = (day_type == 'Будни').astype(int)
network['is_weekend_connection'] = (day_type == 'Выходные дни').astype(int)

In [24]:
def work_period_by_network(dates):
    """Return the number of days spanned by `dates`, reported as 1 when it is 0.

    The span is the `.days` component of (latest - earliest); when every
    entry falls on the same date the span is 0 and 1 is returned instead.
    """
    span_days = (dates.max() - dates.min()).days
    return span_days if span_days != 0 else 1

In [25]:
def days_since_last_network_appearence(dates, max_date=max_date):
    """Return the whole days between the latest entry in `dates` and `max_date`.

    `max_date` defaults to whatever the global `max_date` held when this
    definition was executed (default values are bound once, at definition
    time), so re-run this cell if the global changes.
    """
    latest_seen = dates.max()
    gap = max_date - latest_seen
    return gap.days

In [26]:
# Display only: the sorted frame is not assigned, so `network` itself keeps its
# current row order.
network.sort_values(by=['id','date_network'])

Unnamed: 0,Вых/Будни,monitor_Time,startTime,id,date_network,is_workday_connection,is_weekend_connection
59987,Будни,6603,"2021-06-21 00:00:00,000",ОРГ1-00004,2021-06-21,1,0
60412,Будни,24605,"2021-06-21 00:00:00,000",ОРГ1-00004,2021-06-21,1,0
6966,Будни,600,"2021-06-22 00:00:00,000",ОРГ1-00004,2021-06-22,1,0
67276,Будни,4505,"2021-06-22 00:00:00,000",ОРГ1-00004,2021-06-22,1,0
67702,Будни,17102,"2021-06-22 00:00:00,000",ОРГ1-00004,2021-06-22,1,0
...,...,...,...,...,...,...,...
182088,Будни,30607,"2021-12-24 00:00:00,000",ОРГ2-08387,2021-12-24,1,0
22134,Будни,30008,"2021-12-25 00:00:00,000",ОРГ2-08387,2021-12-25,1,0
203296,Выходные дни,10205,"2021-12-26 00:00:00,000",ОРГ2-08387,2021-12-26,0,1
138855,Будни,31523,"2021-12-27 00:00:00,000",ОРГ2-08387,2021-12-27,1,0


In [27]:
# Collapse the log to one row per (id, date): monitor_Time is summed, the two
# day-type indicators are averaged.
daily_aggregations = {
    'monitor_Time': 'sum',
    'is_workday_connection': 'mean',
    'is_weekend_connection': 'mean',
}
network = (
    network
    .groupby(['id', 'date_network'], as_index=False)
    .agg(daily_aggregations)
)

In [28]:
def aggregation(network):
    """Build one row of summary features per `id`.

    Aggregates `date_network` (count, work period, days since last
    appearance), `monitor_Time` (mean, median, sum) and both day-type
    indicators (mean, sum), then flattens the resulting column labels into
    single strings joined with "_".
    """
    per_id_spec = {
        'date_network': ['count', work_period_by_network, days_since_last_network_appearence],
        'monitor_Time': ['mean', 'median', 'sum'],
        'is_workday_connection': ['mean', 'sum'],
        'is_weekend_connection': ['mean', 'sum'],
    }
    summary = network.groupby('id', as_index=False).agg(per_id_spec)

    # Join the label parts with "_" and strip any leading/trailing underscore
    # (left behind when one of the parts is empty).
    summary.columns = ['_'.join(levels).strip('_') for levels in summary.columns]
    return summary

In [29]:
# Look-back subsets: rows dated strictly after `max_date` minus 150/100/50 days.
cutoff_150 = max_date - timedelta(days=150)
cutoff_100 = max_date - timedelta(days=100)
cutoff_50 = max_date - timedelta(days=50)

network_last_150 = network.loc[network['date_network'] > cutoff_150]
network_last_100 = network.loc[network['date_network'] > cutoff_100]
network_last_50 = network.loc[network['date_network'] > cutoff_50]

In [30]:
def _tag_columns(frame, suffix):
    """Append `suffix` to every column label of `frame` except `id`; return `frame`."""
    frame.columns = [label if label == 'id' else label + suffix for label in frame.columns]
    return frame


# Whole-period features, plus total monitor_Time divided by the work period.
grouped_network = aggregation(network)
grouped_network['network_mean_by_work_period'] = (
    grouped_network['monitor_Time_sum']
    / grouped_network['date_network_work_period_by_network']
)
grouped_network = _tag_columns(grouped_network, '_total')

# The same aggregation for each look-back window, tagged with the window length.
grouped_network_last_50 = _tag_columns(aggregation(network_last_50), '_last_50')
grouped_network_last_100 = _tag_columns(aggregation(network_last_100), '_last_100')
grouped_network_last_150 = _tag_columns(aggregation(network_last_150), '_last_150')

In [31]:
# Left-join each windowed feature table onto the whole-period table by `id`,
# so every id of the whole-period table is kept.
connection_dfs_for_merge = [grouped_network, grouped_network_last_50,
                            grouped_network_last_100, grouped_network_last_150]

grouped_data = connection_dfs_for_merge[0]
for window_features in connection_dfs_for_merge[1:]:
    grouped_data = pd.merge(grouped_data, window_features, on=['id'], how='left')

In [32]:
# Constant marker set to 1 on every row of the network feature table.
grouped_data['exists_in_network'] = 1

In [33]:
# Save the full merged feature table (index column omitted).
grouped_data.to_csv('../data/prepared/grouped_network.csv', index=False)

In [37]:
# Load the labelled training set.
# NOTE(review): feature_selection below joins on `id` and reads a `type` column
# as the label — presumably both come from this file; confirm.
train = pd.read_csv('../data/train_dataset_train.csv')
def feature_selection(grouped_data, train):
    """Return `grouped_data` restricted to the non-collinear features plus `id`.

    Parameters
    ----------
    grouped_data : pandas.DataFrame
        Per-id feature table; must contain an `id` column.
    train : pandas.DataFrame
        Labelled rows; merged on `id`, with the label read from `type`.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the surviving feature columns followed by
        `id`, for every row of `grouped_data` (not only the labelled ones).
    """
    # Only ids present in both tables take part in the correlation analysis.
    marked_data = pd.merge(grouped_data, train, how='inner', on='id')

    # Features are everything except the key and the label.
    X = marked_data.drop(['id', 'type'], axis=1)
    y = marked_data['type']

    # Let FeatureSelector identify features with a correlation magnitude above
    # 0.80 and remove them.
    fs = FeatureSelector(data=X, labels=y)
    fs.identify_collinear(correlation_threshold=0.80)
    data_cleaned = fs.remove(methods=['collinear'])

    # Names of the surviving features, with the key column added back.
    feature_cleaned_list = list(data_cleaned)
    feature_cleaned_list.append('id')

    # `.copy()` hands the caller its own frame, so assigning a column on the
    # result no longer triggers pandas' SettingWithCopyWarning.
    cleaned_data = grouped_data[feature_cleaned_list].copy()
    return cleaned_data


# Drop collinear features from the merged network feature table.
cleaned_network = feature_selection(grouped_data, train)

# Set the constant marker on the reduced table: every id here comes from the
# network data.
cleaned_network['exists_in_network'] = 1

print(cleaned_network.shape)

# Save the reduced network feature table.
cleaned_network.to_csv('../data/prepared/grouped_network_cleaned.csv', index=False)

33 features with a correlation magnitude greater than 0.80.

Removed 33 features.
(1491, 10)


  record_collinear = record_collinear.append(temp_df, ignore_index = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_network['exists_in_network'] = 1


In [38]:
# Display the reduced feature table.
cleaned_network

Unnamed: 0,date_network_count_total,date_network_work_period_by_network_total,date_network_days_since_last_network_appearence_total,monitor_Time_mean_total,is_workday_connection_mean_total,date_network_count_last_50,date_network_work_period_by_network_last_50,date_network_work_period_by_network_last_100,exists_in_network,id
0,165,191,0,22800.187879,0.812121,45.0,49.0,99.0,1,ОРГ1-00004
1,130,191,0,44418.169231,0.992308,37.0,49.0,99.0,1,ОРГ1-00028
2,126,191,0,47420.746032,0.976190,37.0,49.0,96.0,1,ОРГ1-00030
3,157,191,0,21542.796178,0.853503,48.0,49.0,99.0,1,ОРГ1-00044
4,173,192,0,16519.358382,0.699422,49.0,49.0,99.0,1,ОРГ1-00046
...,...,...,...,...,...,...,...,...,...,...
1486,146,189,2,15135.575342,0.972603,42.0,47.0,97.0,1,ОРГ2-06144
1487,175,191,0,17675.897143,0.914286,44.0,49.0,99.0,1,ОРГ2-06145
1488,140,191,0,16157.542857,0.992857,37.0,49.0,99.0,1,ОРГ2-06373
1489,17,23,0,22933.352941,0.941176,17.0,23.0,23.0,1,ОРГ2-08025


In [36]:
# NOTE(review): same statement as an earlier cell of this notebook; this cell's
# execution count is out of sequence, so it looks like a leftover re-run.
grouped_data['exists_in_network'] = 1

In [131]:
# NOTE(review): repeats the earlier export to the same path, and the execution
# count is out of sequence — likely a leftover cell; confirm before removing.
grouped_data.to_csv('../data/prepared/grouped_network.csv', index=False)