In [61]:
# import libraries
import pandas as pd
import numpy as np
# matplotlib
import matplotlib.pyplot as plt
# datetime
from datetime import datetime, timedelta
# functools for reduce
from functools import reduce
# feature selector library
from feature_selector import FeatureSelector

%matplotlib inline

### Preparing data

In [62]:
# Load the raw calls export.
calls = pd.read_csv('../data/raw/Calls.csv')
# De-duplication was left disabled in the original notebook; kept that way.
# calls = calls.drop_duplicates()
# Preview the first rows.
display(calls.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
calls.info()
# Row / column count of the raw table.
print('calls shape: ', calls.shape)

Unnamed: 0,Date,CallTime,NumberOfCalls,Вид учета времени,InOut,id
0,"2021-08-16 00:00:00,000",27777777777777778,1,Будни,ToUser,ОРГ1-01945
1,"2021-09-21 00:00:00,000",27777777777777778,1,Будни,ToUser,ОРГ1-01945
2,"2021-01-11 00:00:00,000",27777777777777778,1,Будни,ToUser,ОРГ1-01945
3,"2021-01-18 00:00:00,000",27777777777777778,1,Будни,ToUser,ОРГ1-01945
4,"2021-01-27 00:00:00,000",27777777777777778,1,Будни,ToUser,ОРГ1-01945


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407867 entries, 0 to 407866
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Date               407867 non-null  object
 1   CallTime           407867 non-null  object
 2   NumberOfCalls      407867 non-null  int64 
 3   Вид учета времени  392655 non-null  object
 4   InOut              407867 non-null  object
 5   id                 407867 non-null  object
dtypes: int64(1), object(5)
memory usage: 18.7+ MB


None

calls shape:  (407867, 6)


In [63]:
# Keep only the leading 'YYYY-MM-DD' part of the raw timestamp string and parse
# it. The vectorised .str accessor replaces the per-row Python lambda and lets
# a missing value become NaT instead of raising TypeError.
calls['date_calls'] = pd.to_datetime(calls['Date'].str[:10], format='%Y-%m-%d')
# The raw text column is no longer needed; rebind rather than mutate in place.
calls = calls.drop(columns='Date')

In [64]:
# Boundaries of the observed period. `max_date` is reused by later cells
# (default of days_since_last_call and the "last N days" windows).
min_date = calls['date_calls'].min()
max_date = calls['date_calls'].max()
print('Data starts at: ', min_date.date())
print('Data ends at : ', max_date.date())
# Number of distinct calendar days that have at least one record.
print('Total days presented: ', calls['date_calls'].nunique())
# Inclusive length of the period between first and last day.
print('Period of data (days): ', (max_date - min_date).days + 1)

Data starts at:  2021-01-01
Data ends at :  2021-12-30
Total days presented:  364
Period of data (days):  364


In [65]:
# CallTime arrives as text with a decimal comma; swap it for a dot and cast.
# Vectorised string ops replace the per-row lambda and pass missing values
# through as NaN instead of failing on them.
calls['CallTime'] = calls['CallTime'].str.replace(',', '.', regex=False).astype(float)

In [66]:
# Rows that carry a day-type mark, ordered by date: the lookup source.
dates = calls[['date_calls', 'Вид учета времени']].dropna().sort_values('date_calls').reset_index(drop=True)
# Most frequent mark per date, used to fill the gaps in the next cell.
# Series.mode() returns every tied value, so aggregating with pd.Series.mode
# directly yields an array (or fails) on a tie; taking the first mode keeps the
# mapping scalar. Groups are never empty because NaN marks were dropped above.
weekeend_days_map = (
    dates.groupby('date_calls')['Вид учета времени']
    .agg(lambda marks: marks.mode().iloc[0])
    .to_dict()
)

In [67]:
# Day type looked up from each row's date (NaN when the date is not in the map).
day_type_by_date = calls['date_calls'].map(weekeend_days_map)
# Existing marks are kept; only missing ones take the looked-up value.
calls['Вид учета времени'] = calls['Вид учета времени'].fillna(day_type_by_date)

In [68]:
# agg function to calculate work period by first and last calls
def work_period_by_calls(dates):
    """Return the whole days between the earliest and latest value in ``dates``.

    Intended as a groupby aggregation over a datetime column.
    """
    first_call, last_call = dates.min(), dates.max()
    span = last_call - first_call
    return span.days

In [69]:
# agg function to find days since last call
def days_since_last_call(dates, max_date=max_date):
    """Return the whole days between the latest value in ``dates`` and ``max_date``.

    The default ``max_date`` is the notebook-level value captured when this
    cell is executed.
    """
    latest_call = dates.max()
    gap = max_date - latest_call
    return gap.days

In [70]:
def call_time_features(calls):
    """Add 0/1 indicator columns describing call length and day type.

    The frame is modified in place and also returned. Length flags are based on
    the mean time per call of each row (CallTime / NumberOfCalls). The
    thresholds are fractions of an hour, following the original note that
    10 seconds is 10 / 3600, rounded to 0.0028.

    Columns added, in this order: less_10_secs_call,
    between_10_and_60_secs_call, more_60_secs_call, more_180_secs_call,
    more_300_secs_call, more_600_secs_call, weekend_work_days_calls,
    workday_work_days_calls.
    """
    # Mean duration of a single call for the row; computed once and reused.
    mean_call_time = calls['CallTime'] / calls['NumberOfCalls']

    # Shorter than 10 seconds on average ("blank" calls).
    calls['less_10_secs_call'] = np.where(mean_call_time < 0.0028, 1, 0)
    # Between 10 and 60 seconds on average (both bounds inclusive).
    calls['between_10_and_60_secs_call'] = np.where(
        mean_call_time.between(0.0028, 0.017), 1, 0)

    # "Longer than" flags: 1, 3, 5 and 10 minutes.
    for flag_name, lower_bound in (
        ('more_60_secs_call', 0.017),
        ('more_180_secs_call', 0.05),
        ('more_300_secs_call', 0.084),
        ('more_600_secs_call', 0.167),
    ):
        calls[flag_name] = np.where(mean_call_time > lower_bound, 1, 0)

    # Weekend / workday marks taken from the day-type column.
    day_type = calls['Вид учета времени']
    calls['weekend_work_days_calls'] = np.where(day_type == 'Выходные дни', 1, 0)
    calls['workday_work_days_calls'] = np.where(day_type == 'Будни', 1, 0)

    return calls

### Aggregations

In [71]:
# Distinct call directions present in the data.
call_types = calls['InOut'].unique()
print('Types of call: ', call_types)
# Explicit to_user / from_user indicator columns are not built; the two
# directions are split into separate frames in the following cell instead.

Types of call:  ['ToUser' 'FromUser']


In [72]:
# One frame per call direction, so each can be aggregated separately.
is_to_user = calls['InOut'] == 'ToUser'
is_from_user = calls['InOut'] == 'FromUser'
calls_to_user = calls[is_to_user]
calls_from_user = calls[is_from_user]

In [73]:
def _recent_calls(frame, days):
    """Rows of ``frame`` dated strictly after ``max_date`` minus ``days`` days."""
    window_start = max_date - timedelta(days=days)
    return frame[frame['date_calls'] > window_start]


# Trailing windows of 150, 100 and 50 days, counted back from the last date.
calls_last_150 = _recent_calls(calls, 150)
calls_last_100 = _recent_calls(calls, 100)
calls_last_50 = _recent_calls(calls, 50)

In [74]:
def _split_by_direction(frame):
    """Return the (ToUser rows, FromUser rows) pair of ``frame``."""
    to_user = frame[frame['InOut'] == 'ToUser']
    from_user = frame[frame['InOut'] == 'FromUser']
    return to_user, from_user


# Each trailing window split by call direction.
calls_last_150_to_user, calls_last_150_from_user = _split_by_direction(calls_last_150)
calls_last_100_to_user, calls_last_100_from_user = _split_by_direction(calls_last_100)
calls_last_50_to_user, calls_last_50_from_user = _split_by_direction(calls_last_50)


In [75]:
# All twelve subsets (total, per direction, per trailing window, and window x direction).
# NOTE(review): this list is rebuilt under the same name after the daily
# groupby below and is not read in between, so this first copy looks redundant.
calls_dfs_for_aggregation = [calls, calls_to_user, calls_from_user, 
                             calls_last_150, calls_last_100, calls_last_50,
                             calls_last_150_to_user, calls_last_150_from_user, 
                             calls_last_100_to_user, calls_last_100_from_user,
                             calls_last_50_to_user, calls_last_50_from_user]

In [76]:
# Keys identifying one (day type, direction, id, date) record.
daily_keys = ['Вид учета времени', 'InOut', 'id', 'date_calls']


def _daily_totals(frame):
    """Sum the remaining columns of ``frame`` within every key combination."""
    return frame.groupby(daily_keys, as_index=False).sum()


# Collapse repeated rows for the same key so each subset has one row per key.
calls = _daily_totals(calls)
calls_to_user = _daily_totals(calls_to_user)
calls_from_user = _daily_totals(calls_from_user)
calls_last_150 = _daily_totals(calls_last_150)
calls_last_100 = _daily_totals(calls_last_100)
calls_last_50 = _daily_totals(calls_last_50)
calls_last_150_to_user = _daily_totals(calls_last_150_to_user)
calls_last_150_from_user = _daily_totals(calls_last_150_from_user)
calls_last_100_to_user = _daily_totals(calls_last_100_to_user)
calls_last_100_from_user = _daily_totals(calls_last_100_from_user)
calls_last_50_to_user = _daily_totals(calls_last_50_to_user)
calls_last_50_from_user = _daily_totals(calls_last_50_from_user)


In [77]:
# Rebuild the list: the groupby cell above rebound every name to a new frame,
# so the list has to be collected again to reference the aggregated versions.
calls_dfs_for_aggregation = [calls, calls_to_user, calls_from_user, 
                             calls_last_150, calls_last_100, calls_last_50,
                             calls_last_150_to_user, calls_last_150_from_user, 
                             calls_last_100_to_user, calls_last_100_from_user,
                             calls_last_50_to_user, calls_last_50_from_user]

In [78]:
# Add the call-length and day-type flags to every subset. call_time_features
# writes its columns into the frame it receives and returns that same frame,
# so the call is made for its side effect; no rebinding is needed.
for df in calls_dfs_for_aggregation:
    call_time_features(df)

In [79]:
# Days between max_date and each id's most recent call, per direction.
# NOTE(review): the variable names are inverted relative to the rows they hold
# (`since_last_calls_to_user` is built from 'FromUser' rows and vice versa);
# the column labels assigned to `last_calls` below do match the data.
from_user_rows = calls[calls['InOut'] == 'FromUser']
since_last_calls_to_user = from_user_rows.groupby('id', as_index=False).agg(
    {'date_calls': days_since_last_call})

to_user_rows = calls[calls['InOut'] == 'ToUser']
since_last_calls_from_user = to_user_rows.groupby('id', as_index=False).agg(
    {'date_calls': days_since_last_call})

# Outer join keeps ids that appear in only one direction.
last_calls = since_last_calls_to_user.merge(
    since_last_calls_from_user, how='outer', on='id')
last_calls.columns = ['id', 'days_since_last_call_from_user', 'days_since_last_call_to_user']

# An id missing on one side has no call in that direction: encode as 364 days.
last_calls = last_calls.fillna(364)

In [80]:
def aggregation_calls(calls):
    """Aggregate a calls frame to one row per ``id``.

    Expects the columns produced by ``call_time_features`` plus 'date_calls',
    'CallTime' and 'NumberOfCalls'. The resulting two-level column labels are
    flattened to ``<column>_<statistic>``, and ``true_calltime_mean`` (total
    call time divided by total number of calls) is appended.
    """
    full_stats = ['mean', 'sum', 'count', 'std', 'min', 'max']
    flag_columns = [
        'less_10_secs_call',
        'between_10_and_60_secs_call',
        'more_60_secs_call',
        'more_180_secs_call',
        'more_300_secs_call',
        'more_600_secs_call',
        'weekend_work_days_calls',
        'workday_work_days_calls',
    ]

    # Insertion order of the spec fixes the order of the output columns.
    spec = {
        'date_calls': ['nunique', work_period_by_calls, days_since_last_call],
        'CallTime': list(full_stats),
        'NumberOfCalls': list(full_stats),
    }
    for flag in flag_columns:
        spec[flag] = ['mean', 'sum']

    per_id = calls.groupby('id', as_index=False).agg(spec)
    # ('CallTime', 'sum') -> 'CallTime_sum'; ('id', '') -> 'id'.
    per_id.columns = per_id.columns.map('_'.join).str.strip('_')
    per_id['true_calltime_mean'] = per_id['CallTime_sum'] / per_id['NumberOfCalls_sum']
    return per_id

In [81]:
# Replace every daily subset by its per-id aggregate, in the same order.
(calls, calls_to_user, calls_from_user,
 calls_last_150, calls_last_100, calls_last_50,
 calls_last_150_to_user, calls_last_150_from_user,
 calls_last_100_to_user, calls_last_100_from_user,
 calls_last_50_to_user, calls_last_50_from_user) = [
    aggregation_calls(daily_frame)
    for daily_frame in (
        calls, calls_to_user, calls_from_user,
        calls_last_150, calls_last_100, calls_last_50,
        calls_last_150_to_user, calls_last_150_from_user,
        calls_last_100_to_user, calls_last_100_from_user,
        calls_last_50_to_user, calls_last_50_from_user,
    )
]

In [82]:
def _tag_columns(frame, suffix):
    """Append ``suffix`` to every column label of ``frame`` except 'id' (in place)."""
    frame.columns = frame.columns.map(
        lambda label: label if label == 'id' else label + suffix)


# Make the feature names unique per subset before the tables are merged on id.
for frame, suffix in (
    (calls, '_total'),
    (calls_to_user, '_to_user'),
    (calls_from_user, '_from_user'),
    (calls_last_150, '_last_150'),
    (calls_last_100, '_last_100'),
    (calls_last_50, '_last_50'),
    (calls_last_150_to_user, '_last_150_to_user'),
    (calls_last_150_from_user, '_last_150_from_user'),
    (calls_last_100_to_user, '_last_100_to_user'),
    (calls_last_100_from_user, '_last_100_from_user'),
    (calls_last_50_to_user, '_last_50_to_user'),
    (calls_last_50_from_user, '_last_50_from_user'),
):
    _tag_columns(frame, suffix)

In [83]:
# Per-id feature tables to join. `calls` (the *_total table) is the left-most
# frame, so with how='left' it defines the set of ids in the result.
calls_dfs_for_merge = [calls, calls_to_user, calls_from_user,
                       calls_last_150, calls_last_100, calls_last_50,
                       calls_last_150_to_user, calls_last_150_from_user,
                       calls_last_100_to_user, calls_last_100_from_user,
                       calls_last_50_to_user, calls_last_50_from_user,
                       # `last_calls` replaces the two raw since_last_calls_*
                       # frames. Both of those expose a column named
                       # 'date_calls', so merging them produced ambiguous
                       # 'date_calls_x' / 'date_calls_y' features and skipped
                       # the 364-day "never called" fill applied to
                       # `last_calls`; the later fillna(0) then encoded
                       # "never called" as "called on the last day".
                       last_calls]

# Left-join everything on id, one table at a time.
grouped_data = reduce(lambda left, right: pd.merge(left, right, on=['id'],
                                            how='left'), calls_dfs_for_merge)

In [84]:
# Ids absent from a subset have no values for its features: treat them as 0.
grouped_data = grouped_data.fillna(0)

In [85]:
# Ratio of calls to the user over calls from the user, for the whole period and
# for each trailing window. The +1 in the denominator avoids division by zero.
for ratio_column, to_user_column, from_user_column in (
    ('to_from_ratio_total',
     'NumberOfCalls_sum_to_user', 'NumberOfCalls_sum_from_user'),
    ('to_from_last_150_ratio_total',
     'NumberOfCalls_sum_last_150_to_user', 'NumberOfCalls_sum_last_150_from_user'),
    ('to_from_last_100_ratio_total',
     'NumberOfCalls_sum_last_100_to_user', 'NumberOfCalls_sum_last_100_from_user'),
    ('to_from_last_50_ratio_total',
     'NumberOfCalls_sum_last_50_to_user', 'NumberOfCalls_sum_last_50_from_user'),
):
    grouped_data[ratio_column] = grouped_data[to_user_column] / \
        (grouped_data[from_user_column] + 1)

In [86]:
# Constant marker: every id in grouped_data originates from the calls table.
grouped_data['exists_in_calls'] = 1

In [87]:
# Persist the full per-id feature table (row index omitted).
grouped_data.to_csv('../data/prepared/grouped_calls.csv', index=False)

In [None]:
# Load the labelled training table; feature_selection below joins it on 'id'
# and reads its 'type' column as the target.
train = pd.read_csv('../data/train_dataset_train.csv')
# function of feature selection from grouped data


def feature_selection(grouped_data, train):
    """Drop collinear features and return the reduced feature table.

    Parameters
    ----------
    grouped_data : pandas.DataFrame
        Per-id feature table; must contain an 'id' column.
    train : pandas.DataFrame
        Labelled table joined on 'id'; must contain the 'type' target column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``grouped_data`` (all rows, not only the labelled ones)
        restricted to the retained feature columns plus 'id'.
    """
    # Only ids present in both tables take part in the selection.
    marked_data = pd.merge(grouped_data, train, how='inner', on='id')
    # Features and target.
    X = marked_data.drop(['id', 'type'], axis=1)
    y = marked_data['type']
    fs = FeatureSelector(data=X, labels=y)
    # Flag features whose pairwise correlation exceeds the threshold.
    fs.identify_collinear(correlation_threshold=0.80)
    # Remove the flagged features (collinearity is the only method used here).
    data_cleaned = fs.remove(methods = ['collinear'])
    # Names of the features that survived, plus the id key.
    feature_cleaned_list = list(data_cleaned)
    feature_cleaned_list.append('id')
    # Return an independent copy: callers add columns to the result, and doing
    # that on a bare column selection triggers pandas' SettingWithCopyWarning.
    cleaned_data = grouped_data[feature_cleaned_list].copy()
    return cleaned_data


# Reduce the feature table and (re)set the marker column for ids seen in calls.
cleaned_calls = feature_selection(grouped_data, train).assign(exists_in_calls=1)

print(cleaned_calls.shape)

# Save the reduced feature table.
cleaned_calls.to_csv('../data/prepared/grouped_calls_cleaned.csv', index=False)

In [39]:
# Show the reduced feature table (pandas elides the middle rows and columns).
cleaned_calls

Unnamed: 0,date_calls_nunique_total,date_calls_work_period_by_calls_total,date_calls_days_since_last_call_total,CallTime_mean_total,CallTime_sum_total,CallTime_std_total,NumberOfCalls_mean_total,less_10_secs_call_mean_total,less_10_secs_call_sum_total,between_10_and_60_secs_call_mean_total,...,more_60_secs_call_mean_last_50_from_user,more_180_secs_call_mean_last_50_from_user,more_300_secs_call_mean_last_50_from_user,more_600_secs_call_mean_last_50_from_user,workday_work_days_calls_mean_last_50_from_user,to_from_ratio_total,to_from_last_100_ratio_total,to_from_last_50_ratio_total,id,exists_in_calls
0,199,352,0,0.409560,162.595278,0.296862,14.244332,0.000000,0,0.249370,...,0.771429,0.142857,0.000000,0.000000,1.000000,0.610020,0.595048,0.684211,ОРГ1-00004,1
1,214,359,0,0.662028,272.755556,0.865917,8.497573,0.043689,18,0.233010,...,0.800000,0.342857,0.228571,0.085714,0.942857,0.664449,0.715789,0.733119,ОРГ1-00005,1
2,215,353,0,0.151212,59.880000,0.271606,5.260101,0.030303,12,0.436869,...,0.515152,0.090909,0.060606,0.000000,1.000000,1.065411,1.016949,0.865801,ОРГ1-00028,1
3,202,353,0,0.162173,62.760833,0.172911,7.759690,0.007752,3,0.501292,...,0.531250,0.156250,0.000000,0.000000,1.000000,0.848615,0.925558,1.022523,ОРГ1-00030,1
4,225,358,0,0.361930,153.458333,0.469940,9.509434,0.009434,4,0.165094,...,0.783784,0.297297,0.081081,0.000000,0.972973,0.961576,1.031457,1.016807,ОРГ1-00044,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,5,74,198,1.225611,6.128056,0.631943,1.600000,0.000000,0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,ОРГ2-05629,1
1067,3,113,238,0.105093,0.315278,0.178179,1.666667,0.666667,2,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,ОРГ2-05862,1
1068,9,79,265,0.663735,5.973611,0.443417,2.555556,0.222222,2,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,ОРГ2-05932,1
1069,1,0,209,0.220278,0.220278,0.000000,1.000000,0.000000,0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,ОРГ2-06006,1


---

In [None]:
# Older, single-table variant of the per-id aggregation.
# NOTE(review): 'to_user_calls', 'from_user_calls', 'blank_calltime_days',
# 'long_calltime_days' and 'holiday_work_days_calls' are not created anywhere
# in the visible notebook, and `calls` has already been replaced by its per-id
# aggregate by this point, so this cell presumably fails on a fresh
# Restart & Run All. Confirm whether this section is still needed.
grouped_calls = calls.groupby('id', as_index=False).agg({
    'date_calls': ['nunique', work_period_by_calls, days_since_last_call],
    'CallTime': ['mean', 'sum', 'count','std'],
    'NumberOfCalls': ['mean', 'sum','std'],
    'to_user_calls': ['sum', 'mean','std', 'median'],
    'from_user_calls': ['sum', 'mean', 'std', 'median'],
    'blank_calltime_days': ['sum', 'mean'],
    'long_calltime_days': ['sum', 'mean'],
    'holiday_work_days_calls': ['sum', 'mean'],
    'workday_work_days_calls': ['sum', 'mean']
    })

In [None]:
# Flatten the two-level column labels to '<column>_<statistic>'; the trailing
# underscore left on single-level labels such as 'id' is stripped.
grouped_calls.columns = grouped_calls.columns.map('_'.join).str.strip('_')

In [None]:
# Mean time per call from the totals (total call time / total number of calls);
# the original note says this is used because days can appear more than once.
grouped_calls['true_calltime_mean'] = grouped_calls['CallTime_sum'] / grouped_calls['NumberOfCalls_sum']

In [None]:
# Ratio of calls to the user over calls from the user; +1 avoids division by zero.
grouped_calls['to_from_ratio'] = grouped_calls['to_user_calls_sum'] / (grouped_calls['from_user_calls_sum'] + 1)

In [None]:
# Attach the per-direction "days since last call" features from `last_calls`.
grouped_calls = pd.merge(grouped_calls, last_calls, how='inner', on='id')

In [None]:
# Column overview of the aggregated table (dtypes and non-null counts).
grouped_calls.info()

In [None]:
# Write the aggregated table to CSV.
# NOTE(review): this path is relative to 'data/', whereas the executed cells
# above use '../data/'; confirm which working directory is intended.
grouped_calls.to_csv('data/prepared/grouped_calls.csv', index=False)

In [None]:
# Load the labelled training table.
# NOTE(review): 'data/...' here versus '../data/...' in the executed cells above.
train = pd.read_csv('data/train_dataset_train.csv')
# function of feature selection from grouped data


def feature_selection(grouped_data, train):
    """Drop collinear and zero-importance features; return the reduced table.

    NOTE(review): this redefines ``feature_selection`` from an earlier cell
    (which uses a 0.80 threshold and the collinear method only); whichever
    definition is executed last wins.

    Parameters
    ----------
    grouped_data : pandas.DataFrame
        Per-id feature table; must contain an 'id' column.
    train : pandas.DataFrame
        Labelled table joined on 'id'; must contain the 'type' target column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``grouped_data`` restricted to the retained feature columns
        plus 'id'.
    """
    # Only ids present in both tables take part in the selection.
    marked_data = pd.merge(grouped_data, train, how='inner', on='id')
    # Features and target. The labels must be dropped as columns: without an
    # axis, drop() looks for 'id' / 'type' in the row index and raises KeyError.
    X = marked_data.drop(columns=['id', 'type'])
    y = marked_data['type']
    fs = FeatureSelector(data=X, labels=y)
    # Flag features whose pairwise correlation exceeds the threshold.
    fs.identify_collinear(correlation_threshold=0.90)
    # Flag features with zero importance.
    fs.identify_zero_importance(
        task='classification', eval_metric='multi_logloss', n_iterations=20, early_stopping=True)
    # Plot the feature importances.
    fs.plot_feature_importances(threshold = 0.99)
    # Remove everything flagged by either method.
    data_cleaned = fs.remove(methods = ['collinear', 'zero_importance'])
    # Names of the features that survived, plus the id key.
    feature_cleaned_list = list(data_cleaned)
    feature_cleaned_list.append('id')
    # Independent copy, so callers can add columns without a
    # SettingWithCopyWarning.
    cleaned_data = grouped_data[feature_cleaned_list].copy()
    return cleaned_data

    

In [None]:
# Apply the feature selection to the older `grouped_calls` table.
grouped_calls_fs = feature_selection(grouped_calls, train)

In [None]:
# Manual, step-by-step version of the selection performed by feature_selection.
# Load the labelled training table (with class marks).
train = pd.read_csv('data/train_dataset_train.csv')
# Keep only ids present in both tables, then drop the id key.
marked_calls = pd.merge(grouped_calls, train, how='inner', on='id').drop('id', axis=1)

In [None]:
# Feature matrix (everything except the target) and the 'type' target.
X = marked_calls.drop('type', axis=1)
y = marked_calls['type']

In [None]:
# Selector over the labelled features.
fs = FeatureSelector(data=X, labels=y)

In [None]:
# Flag features whose pairwise correlation exceeds 0.90.
fs.identify_collinear(correlation_threshold=0.90)

In [None]:
# Flag zero-importance features (40 iterations here; the function above uses 20).
fs.identify_zero_importance(task = 'classification', eval_metric = 'multi_logloss', 
                            n_iterations = 40, early_stopping = True)

In [None]:
# Plot the feature importances computed by the previous step.
fs.plot_feature_importances(threshold = 0.99)


In [None]:
# Remove everything flagged as collinear or zero-importance.
calls_cleaned = fs.remove(methods = ['collinear', 'zero_importance'])

In [None]:
# Iterating the cleaned result yields the retained feature names
# (presumably a DataFrame, so these are its column labels - TODO confirm).
feature_calls_list = list(calls_cleaned)

In [None]:
# Re-add the id key, which was dropped before the selection.
# NOTE(review): append() mutates the list, so re-running this cell adds 'id' again.
feature_calls_list.append('id')

# Restrict the full table to the selected features, add the marker column, save.
grouped_calls_fs = grouped_calls[feature_calls_list].copy()
grouped_calls_fs['exists_in_calls'] = 1
grouped_calls_fs.to_csv('data/prepared/grouped_calls_fs.csv', index=False)

In [None]:
# Show the selected-feature table.
grouped_calls_fs

In [None]:
# Recursive feature elimination with cross-validation on a linear SVM.
# NOTE(review): StandardScaler, RepeatedStratifiedKFold, SVC and RFECV are not
# imported anywhere in the visible notebook; this cell needs those imports
# (presumably from scikit-learn) to run on a fresh kernel.
train = pd.read_csv('data/train_dataset_train.csv')

# Labelled rows only, without the id key.
marked_calls = pd.merge(grouped_calls, train, how='inner', on='id').drop('id', axis=1)

# Replace missing values in every column (including 'type') by the column mean.
for col in list(marked_calls):
    marked_calls[col] = marked_calls[col].fillna(marked_calls[col].mean())

# Standardise all columns; the scaled 'type' column is dropped from X below and
# the target is taken from the unscaled frame.
scaler = StandardScaler()
marked_calls_st = scaler.fit_transform(marked_calls)
marked_calls_st_df = pd.DataFrame(marked_calls_st, columns = marked_calls.columns)

X = marked_calls_st_df.drop('type', axis=1)
y = marked_calls['type']

# stratified KFold
cv = RepeatedStratifiedKFold(n_splits=3, random_state=42)
estimator = SVC(kernel='linear')
# Eliminate one feature per step, scoring each subset by macro recall.
rfecv = RFECV(estimator, step=1, cv=cv, min_features_to_select=1, scoring='recall_macro')
rfecv.fit(X, y)

# Features with ranking 1 are the ones picked out as selected.
feat_index = pd.Series(data = rfecv.ranking_, index = X.columns)
signi_feat_rfecv = feat_index[feat_index==1].index

print('count_features: ', rfecv.n_features_)
print('Significant features from RFE',signi_feat_rfecv)



In [None]:
# Recompute the selected feature names (same two statements as in the cell above).
feat_index = pd.Series(data = rfecv.ranking_, index = X.columns)
signi_feat_rfecv = feat_index[feat_index==1].index

print('Significant features from RFE',signi_feat_rfecv)

# Selected features plus the id key.
sig_cols = signi_feat_rfecv.to_list()
sig_cols.append('id')

# Restrict the full table to the selected features, add the marker column, save.
grouped_calls_sig = grouped_calls[sig_cols].copy()
grouped_calls_sig['exists_in_calls'] = 1
grouped_calls_sig.to_csv('data/prepared/grouped_calls_sig_2.csv', index=False)

In [None]:
# Number of features selected by RFECV.
rfecv.n_features_

In [None]:
# One row per feature position with its RFECV support flag and ranking.
# The table is built in a single call: DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, and appending row by row copies the whole
# frame on every iteration.
df_features = pd.DataFrame(
    {
        'feature': range(X.shape[1]),   # positional index into X.columns
        'support': rfecv.support_,
        'ranking': rfecv.ranking_,
    },
    columns=['feature', 'support', 'ranking'],
)

# Best-ranked features first.
df_features.sort_values(by='ranking').head(10)

In [None]:
# RFECV.grid_scores_ was deprecated in scikit-learn 1.0 and removed in 1.2; the
# mean cross-validation score per feature subset is now in
# cv_results_['mean_test_score']. Fall back to the old attribute on versions
# that do not have cv_results_ yet.
if hasattr(rfecv, 'cv_results_'):
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    mean_cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
# The RFECV above is scored with scoring='recall_macro', not accuracy.
plt.ylabel("Cross validation score (recall_macro)")
# With step=1 and min_features_to_select=1, entry k is the score for k+1 features.
plt.plot(
    range(1, len(mean_cv_scores) + 1),
    mean_cv_scores,
)
plt.show()

In [None]:
# Rows of the ranking table whose support flag is set.
df_features[df_features['support']==True]

In [None]:
# Feature names recorded by the fitted RFECV.
rfecv.feature_names_in_

In [None]:
# NOTE(review): `clf` is not defined anywhere in the visible notebook; this
# looks like a leftover from an earlier experiment. The two prints report the
# same value in two formats.
print("Optimal number of features : %d" % clf.n_features_)
print("Optimal number of features : ", clf.n_features_)


In [None]:
# NOTE(review): `clf` is undefined in the visible notebook, and the x range
# hard-codes 30 feature counts, which must match len(clf.grid_scores_).
plt.plot(range(1, 31), clf.grid_scores_)
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.show()

In [None]:
# Plain RFE (no cross-validation) on a linear SVM.
# NOTE(review): RFE, SVC, X_train and y_train are not imported or defined
# anywhere in the visible notebook.
svc_lin=SVC(kernel='linear')
svm_rfe_model=RFE(estimator=svc_lin)
svm_rfe_model_fit=svm_rfe_model.fit(X_train, y_train)
# Features with ranking 1 are the ones picked out as selected.
feat_index = pd.Series(data = svm_rfe_model_fit.ranking_, index = X_train.columns)
signi_feat_rfe = feat_index[feat_index==1].index

print('Significant features from RFE',signi_feat_rfe)

# Selected features plus the id key.
sig_cols = signi_feat_rfe.to_list()
sig_cols.append('id')

# Restrict the full table to the selected features, add the marker column, save.
grouped_calls_sig = grouped_calls[sig_cols].copy()
grouped_calls_sig['exists_in_calls'] = 1
grouped_calls_sig.to_csv('data/prepared/grouped_calls_sig.csv', index=False)

In [None]:



# NOTE(review): this repeats the tail of the previous cell and writes the same file.
print('Significant features from RFE',signi_feat_rfe)

# Selected features plus the id key.
sig_cols = signi_feat_rfe.to_list()
sig_cols.append('id')

# Restrict the full table to the selected features, add the marker column, save.
grouped_calls_sig = grouped_calls[sig_cols].copy()
grouped_calls_sig['exists_in_calls'] = 1
grouped_calls_sig.to_csv('data/prepared/grouped_calls_sig.csv', index=False)