In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from collections import defaultdict
from ipynb.fs.full.lookup_table import custom_lookup_table
from ipynb.fs.full.common_functions import call_spacy_nlp, build_tokens_list, \
    build_lemmas_list, remove_not_alpha_list, convert_list_to_string, model_eval, import_data
from glob import glob

In [2]:
# Widen the per-cell display width so long free-text comments are shown in rendered tables.
pd.set_option("display.max_colwidth", 300)

Import and unify all datasets:

In [3]:
# Load the actions/comments data through the project-local `import_data` helper,
# which receives the `glob` function and the pandas module as arguments.
tmp_actions_columns = [
    'id',
    'snapnum',
    'meas_action_datetime',
    'meas_action_acct_code_concerned',
    'meas_action_cust_code_concerned',
    'meas_action_comment',
]

TMP_Actions = import_data(
    glob=glob,
    pd=pd,
    low_memory=False,
    file_name_start='data/TMP_Actions.',
    names=tmp_actions_columns,
    lineterminator='\n',
    sep='\t',
)

In [4]:
# Discard actions without a snapshot number, then store the snapshot number as an integer.
TMP_Actions = (
    TMP_Actions
    .dropna(subset=['snapnum'])
    .astype({"snapnum": int})
)

We check if there are duplicate comments made for the same account in the same snapshot (excluding comments such as ######):

In [5]:
# Key identifying "the same comment on the same account in the same snapshot".
duplicate_key = ['meas_action_acct_code_concerned', 'snapnum', 'meas_action_comment']

# Number of rows per key.
group_sizes = TMP_Actions.groupby(duplicate_key).size()
repeated_keys = group_sizes[group_sizes > 1].index

# Rows whose key occurs more than once ...
has_repeated_key = TMP_Actions.set_index(duplicate_key).index.isin(repeated_keys)
# ... excluding comments that contain the '##########' placeholder.
is_placeholder_comment = TMP_Actions['meas_action_comment'].str.contains('##########', na=False)

result = TMP_Actions.loc[has_repeated_key & ~is_placeholder_comment].reset_index(drop=True)

result.sort_values(by=['meas_action_acct_code_concerned', 'snapnum'])


Unnamed: 0,index,id,snapnum,meas_action_datetime,meas_action_acct_code_concerned,meas_action_cust_code_concerned,meas_action_comment
817058,167873,231646664,47,2019-11-27,344984.0,393604.0,εντολη διενεργειασ εαπ
817059,167874,231632140,47,2019-11-27,344984.0,373503.0,εντολη διενεργειασ εαπ
816586,164999,231646665,47,2019-11-27,344986.0,345338.0,εντολη διενεργειασ εαπ
816587,165000,231632141,47,2019-11-27,344986.0,345338.0,εντολη διενεργειασ εαπ
964619,375237,225837321,46,2019-10-04,344991.0,403750.0,### δα
...,...,...,...,...,...,...,...
916425,710988,230782994,47,2019-11-08,1122560.0,681951.0,[xxxx] δ.α. στο #######. το ####### ανενεργο
916426,710989,230782995,47,2019-11-08,1122560.0,681951.0,[xxxx] δ.α. στο #######. το ####### ανενεργο
916427,710990,230782997,47,2019-11-08,1122560.0,683068.0,[xxxx] δ.α.
744261,231363,237195158,48,2019-12-30,1122560.0,681951.0,####ε 30/9/#### [xxxx] (παραδοθηκε σε [xxxx] προσωπο)


It seems that some accounts have the same comment recorded more than once within a single snapshot (month). We should keep this in mind for our later analysis.

We continue with importing and preparing the Customers and Accounts datasets:

In [6]:
# Load the customers data through the project-local `import_data` helper.
# The separator is written as the explicit escape '\t' (as in the TMP_Actions load)
# rather than an invisible literal tab character, which editors can silently
# convert to spaces.
DMCR_Unstruct_Customers = import_data(
    glob=glob,
    pd=pd,
    low_memory=False,
    file_name_start='data/DMCR_UNSTRUCT_CUSTOMERS.',
    names=['id', 'snapnum', 'meas_cusl_min_communication_date_3m'],
    lineterminator='\n',
    sep='\t',
)

In [7]:
# Display the customers frame as loaded.
DMCR_Unstruct_Customers

Unnamed: 0,index,id,snapnum,meas_cusl_min_communication_date_3m
0,0,240628,37,2018-11-06\r
1,1,240629,37,2018-11-14\r
2,2,240716,37,\r
3,3,240630,37,2018-11-14\r
4,4,240631,37,2018-11-01\r
...,...,...,...,...
2368267,197351,396451,38,2018-12-04\r
2368268,197352,396452,38,\r
2368269,197353,396453,38,2018-12-11\r
2368270,197354,396454,38,2019-01-28\r


In [8]:
# Remove the carriage-return characters left in the last column of each row.
communication_date = DMCR_Unstruct_Customers['meas_cusl_min_communication_date_3m']
DMCR_Unstruct_Customers['meas_cusl_min_communication_date_3m'] = communication_date.str.replace("\r", "")


In [9]:
# Display the customers frame again to check the communication-date column.
DMCR_Unstruct_Customers

Unnamed: 0,index,id,snapnum,meas_cusl_min_communication_date_3m
0,0,240628,37,2018-11-06
1,1,240629,37,2018-11-14
2,2,240716,37,
3,3,240630,37,2018-11-14
4,4,240631,37,2018-11-01
...,...,...,...,...
2368267,197351,396451,38,2018-12-04
2368268,197352,396452,38,
2368269,197353,396453,38,2018-12-11
2368270,197354,396454,38,2019-01-28


In [10]:
# Load the accounts data through the project-local `import_data` helper.
# The separator is written as the explicit escape '\t' (as in the TMP_Actions load)
# rather than an invisible literal tab character, which editors can silently
# convert to spaces.
DMCR_Unstruct_Accounts = import_data(
    glob=glob,
    pd=pd,
    low_memory=False,
    file_name_start='data/DMCR_UNSTRUCT_ACCOUNTS.',
    names=[
        'id',
        'snapnum',
        'meas_acch_date_nominal',
        'meas_acct_cust_code',
        'meas_accl_appl_status',
        'meas_accl_application_bucket',
        'meas_accl_application_pending',
        'meas_accl_paid_in_full_cm',
    ],
    lineterminator='\n',
    sep='\t',
)

In [11]:
# Remove the carriage-return characters left in the last column of each row.
paid_in_full = DMCR_Unstruct_Accounts['meas_accl_paid_in_full_cm']
DMCR_Unstruct_Accounts['meas_accl_paid_in_full_cm'] = paid_in_full.str.replace("\r", "")

In [12]:
# Display the accounts frame after stripping carriage returns from the last column.
DMCR_Unstruct_Accounts

Unnamed: 0,index,id,snapnum,meas_acch_date_nominal,meas_acct_cust_code,meas_accl_appl_status,meas_accl_application_bucket,meas_accl_application_pending,meas_accl_paid_in_full_cm
0,0,421635,38,2019-02-01,219463,,,,No
1,1,437283,38,2019-02-01,219465,,,,No
2,2,429674,38,2019-02-01,219467,,,,No
3,3,351213,38,2019-02-01,219468,,,,No
4,4,358497,38,2019-02-01,219470,,,,No
...,...,...,...,...,...,...,...,...,...
4074749,342395,1121760,37,2019-01-01,681743,,,,No
4074750,342396,1121761,37,2019-01-01,682781,,,,No
4074751,342397,1122348,37,2019-01-01,681873,,,,No
4074752,342398,1122349,37,2019-01-01,681370,,,,No


Each account in DMCR_Unstruct_Accounts is related to only one customer:

In [13]:
# Accounts linked to more than one distinct customer code (an empty result means none).
(
    DMCR_Unstruct_Accounts
    .groupby('id')['meas_acct_cust_code']
    .nunique()
    .loc[lambda customer_counts: customer_counts > 1]
)

Series([], Name: meas_acct_cust_code, dtype: int64)

We have 3 datasets: TMP_Actions, DMCR_Unstruct_Customers and DMCR_Unstruct_Accounts

As a first step, we have to combine the data from DMCR_Unstruct_Customers and DMCR_Unstruct_Accounts and build the perimeter, in order to keep only the relevant data points.

First, we are going to join Customers and Accounts dataframes based on customer_id. Also, the Customers snapnum should be the next of that of Accounts (because we need MEAS_CUSL_MIN_COMMUNICATION_DATE_3M of the next snapshot):

In [14]:
# Join key: a customer row of snapshot N is matched to account rows of snapshot N-1.
DMCR_Unstruct_Customers['snapnum_prev'] = DMCR_Unstruct_Customers['snapnum'].sub(1)

In [15]:
# Columns kept after attaching the next snapshot's customer record to each account row.
accounts_customers_columns = [
    'account_id',
    'customer_id',
    'snapnum',
    'meas_acch_date_nominal',
    'meas_accl_appl_status',
    'meas_accl_application_bucket',
    'meas_accl_application_pending',
    'meas_accl_paid_in_full_cm',
    'meas_cusl_min_communication_date_3m_next',
]

Accounts_Customers = (
    DMCR_Unstruct_Accounts
    # Account (customer code, snapshot N) <-> customer (id, snapshot N+1).
    .merge(
        DMCR_Unstruct_Customers,
        how='inner',
        left_on=['meas_acct_cust_code', 'snapnum'],
        right_on=['id', 'snapnum_prev'],
    )
    .rename(columns={
        "id_x": "account_id",
        "id_y": "customer_id",
        "snapnum_x": "snapnum",
        "meas_cusl_min_communication_date_3m": "meas_cusl_min_communication_date_3m_next",
    })
    # Parse both date columns into datetimes.
    .assign(
        meas_acch_date_nominal=lambda frame: pd.to_datetime(frame['meas_acch_date_nominal']),
        meas_cusl_min_communication_date_3m_next=lambda frame: pd.to_datetime(
            frame['meas_cusl_min_communication_date_3m_next']
        ),
    )
    .loc[:, accounts_customers_columns]
)

In [16]:
# Display the joined accounts/customers frame.
Accounts_Customers

Unnamed: 0,account_id,customer_id,snapnum,meas_acch_date_nominal,meas_accl_appl_status,meas_accl_application_bucket,meas_accl_application_pending,meas_accl_paid_in_full_cm,meas_cusl_min_communication_date_3m_next
0,421635,219463,38,2019-02-01,,,,No,2019-01-04
1,437283,219465,38,2019-02-01,,,,No,2019-01-17
2,466056,219465,38,2019-02-01,,,,No,2019-01-17
3,429674,219467,38,2019-02-01,,,,No,NaT
4,528515,219467,38,2019-02-01,,,,No,NaT
...,...,...,...,...,...,...,...,...,...
3737970,1121760,681743,37,2019-01-01,,,,No,NaT
3737971,1121761,682781,37,2019-01-01,,,,No,NaT
3737972,1122348,681873,37,2019-01-01,,,,No,NaT
3737973,1122349,681370,37,2019-01-01,,,,No,NaT


Now we have included **meas_cusl_min_communication_date_3m_next** (next indicating that it belongs to the next snapshot) to the Accounts dataframe. Furthermore, **meas_acch_date_nominal** and **meas_cusl_min_communication_date_3m_next** have been converted to datetimes.

As a next step, we also have to create **meas_acch_date_nominal_next**, the date of the next snapshot. This will be helpful for the monthly difference between **meas_cusl_min_communication_date_3m_next** and **meas_acch_date_nominal_next** which is required by the definition of the perimeter:

In [17]:
# Nominal date of the following snapshot: the current nominal date shifted by one month.
one_month = pd.DateOffset(months=1)
Accounts_Customers['meas_acch_date_nominal_next'] = Accounts_Customers['meas_acch_date_nominal'] + one_month

Next we also need to join the unified Accounts_Customers with Accounts twice, in order to have **meas_accl_paid_in_full_cm** column twice, both for next and for two snapshots after. These will be used to create the target: 

In [18]:
# Join keys used to look up an account's rows one and two snapshots later.
account_snapnum = DMCR_Unstruct_Accounts['snapnum']
DMCR_Unstruct_Accounts['snapnum_prev'] = account_snapnum.sub(1)
DMCR_Unstruct_Accounts['snapnum_pre_prev'] = account_snapnum.sub(2)

In [19]:
# Attach each account's row from the next snapshot (overlapping columns get _x / _y suffixes).
Accounts_Customers = pd.merge(
    Accounts_Customers,
    DMCR_Unstruct_Accounts,
    how='inner',
    left_on=['account_id', 'snapnum'],
    right_on=['id', 'snapnum_prev'],
)


In [20]:
# Keep the current-snapshot columns (_x) plus the next snapshot's paid-in-full flag (_y).
columns_after_next_merge = [
    'account_id',
    'customer_id',
    'snapnum_x',
    'meas_acch_date_nominal_x',
    'meas_accl_appl_status_x',
    'meas_accl_application_bucket_x',
    'meas_accl_application_pending_x',
    'meas_accl_paid_in_full_cm_x',
    'meas_cusl_min_communication_date_3m_next',
    'meas_acch_date_nominal_next',
    'meas_accl_paid_in_full_cm_y',
]
Accounts_Customers = Accounts_Customers.loc[:, columns_after_next_merge]

In [21]:
# Restore the plain column names and label the next snapshot's paid-in-full flag.
Accounts_Customers = Accounts_Customers.rename(
    columns={
        "snapnum_x": "snapnum",
        "meas_acch_date_nominal_x": "meas_acch_date_nominal",
        "meas_accl_appl_status_x": "meas_accl_appl_status",
        "meas_accl_application_bucket_x": "meas_accl_application_bucket",
        "meas_accl_application_pending_x": "meas_accl_application_pending",
        "meas_accl_paid_in_full_cm_x": "meas_accl_paid_in_full_cm",
        "meas_accl_paid_in_full_cm_y": "meas_accl_paid_in_full_cm_next",
    }
)


In [22]:
# Attach each account's row from two snapshots later (overlapping columns get _x / _y suffixes).
Accounts_Customers = pd.merge(
    Accounts_Customers,
    DMCR_Unstruct_Accounts,
    how='inner',
    left_on=['account_id', 'snapnum'],
    right_on=['id', 'snapnum_pre_prev'],
)


In [23]:
# Keep the current-snapshot columns (_x), the next-snapshot flag, and the flag from two snapshots later (_y).
columns_after_second_merge = [
    'account_id',
    'customer_id',
    'snapnum_x',
    'meas_acch_date_nominal_x',
    'meas_accl_appl_status_x',
    'meas_accl_application_bucket_x',
    'meas_accl_application_pending_x',
    'meas_accl_paid_in_full_cm_x',
    'meas_cusl_min_communication_date_3m_next',
    'meas_acch_date_nominal_next',
    'meas_accl_paid_in_full_cm_next',
    'meas_accl_paid_in_full_cm_y',
]
Accounts_Customers = Accounts_Customers.loc[:, columns_after_second_merge]

In [24]:
# Restore the plain column names and label the paid-in-full flag from two snapshots later.
Accounts_Customers = Accounts_Customers.rename(
    columns={
        "snapnum_x": "snapnum",
        "meas_acch_date_nominal_x": "meas_acch_date_nominal",
        "meas_accl_appl_status_x": "meas_accl_appl_status",
        "meas_accl_application_bucket_x": "meas_accl_application_bucket",
        "meas_accl_application_pending_x": "meas_accl_application_pending",
        "meas_accl_paid_in_full_cm_x": "meas_accl_paid_in_full_cm",
        "meas_accl_paid_in_full_cm_y": "meas_accl_paid_in_full_cm_next_2",
    }
)


Calculate the time difference between **meas_acch_date_nominal** and **meas_cusl_min_communication_date_3m** on the next snapshot to use it for the perimeter:

In [25]:
# Whole-month distance between the next snapshot's nominal date and the customer's
# minimum communication date.
#
# The year is included in the calculation: subtracting calendar month numbers alone
# reports 0 for dates that are exactly one or more years apart, and +/-11 for
# adjacent months across a year boundary (December vs. January).
#
# Rows without a communication date (NaT) get the sentinel 99999 so they can never
# satisfy the "difference == 0" perimeter condition.
next_nominal_date = Accounts_Customers['meas_acch_date_nominal_next']
next_communication_date = Accounts_Customers['meas_cusl_min_communication_date_3m_next']

Accounts_Customers['meas_cusl_communication_acch_month_difference'] = (
    (next_nominal_date.dt.year - next_communication_date.dt.year) * 12
    + (next_nominal_date.dt.month - next_communication_date.dt.month)
).fillna(99999).astype(int)


Apply the perimeter:

In [26]:
# Perimeter conditions (all must hold):
#   1. no pending application (flag missing or 'No');
#   2. application status not in the excluded list (rows with a missing status are kept);
#   3. the next snapshot's communication date falls in the same month as its nominal date.
no_pending_application = (
    Accounts_Customers['meas_accl_application_pending'].isna()
    | (Accounts_Customers['meas_accl_application_pending'] == 'No')
)
status_not_excluded = ~Accounts_Customers['meas_accl_appl_status'].isin(
    ['Approved', 'Running', 'Fulfilled', 'Partially Fulfilled', 'Out of Collection']
)
communicated_in_same_month = Accounts_Customers['meas_cusl_communication_acch_month_difference'] == 0

# `.copy()` makes the perimeter an independent frame: later cells add and fill a
# `target` column on it, which raises SettingWithCopyWarning when the frame is
# only a filtered slice of Accounts_Customers.
Accounts_Customers_Perimeter = Accounts_Customers[
    no_pending_application & status_not_excluded & communicated_in_same_month
].copy()

In [27]:
# Display the rows that satisfy the perimeter conditions.
Accounts_Customers_Perimeter

Unnamed: 0,account_id,customer_id,snapnum,meas_acch_date_nominal,meas_accl_appl_status,meas_accl_application_bucket,meas_accl_application_pending,meas_accl_paid_in_full_cm,meas_cusl_min_communication_date_3m_next,meas_acch_date_nominal_next,meas_accl_paid_in_full_cm_next,meas_accl_paid_in_full_cm_next_2,meas_cusl_communication_acch_month_difference
192,387954,219952,38,2019-02-01,,,,No,2019-03-14,2019-03-01,No,No,0
264,345618,220000,38,2019-02-01,,,,No,2019-03-20,2019-03-01,No,No,0
276,404114,220013,38,2019-02-01,,,,No,2019-03-06,2019-03-01,No,No,0
277,407268,220013,38,2019-02-01,,,,No,2019-03-06,2019-03-01,No,No,0
278,615320,220013,38,2019-02-01,,,,No,2019-03-06,2019-03-01,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3389523,1121737,681122,37,2019-01-01,,,,No,2019-02-26,2019-02-01,No,No,0
3389645,1122387,682656,37,2019-01-01,,,,No,2019-02-05,2019-02-01,No,No,0
3389707,1122120,681149,37,2019-01-01,,,,No,2019-02-08,2019-02-01,No,No,0
3389877,1122023,681766,37,2019-01-01,,,,No,2019-02-22,2019-02-01,No,No,0


Create the target based on **meas_accl_paid_in_full_cm** being "Yes" for either one of the next two snapshots:

In [28]:
# Positive class: paid in full in either of the two following snapshots.
Accounts_Customers_Perimeter.loc[
    Accounts_Customers_Perimeter['meas_accl_paid_in_full_cm_next'].eq('Yes')
    | Accounts_Customers_Perimeter['meas_accl_paid_in_full_cm_next_2'].eq('Yes'),
    'target',
] = 1



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Accounts_Customers_Perimeter.loc[(Accounts_Customers_Perimeter.meas_accl_paid_in_full_cm_next == 'Yes') \


In [29]:
# Every row not flagged as positive becomes the negative class.
target_is_missing = Accounts_Customers_Perimeter['target'].isna()
Accounts_Customers_Perimeter.loc[target_is_missing, 'target'] = 0

In [30]:
# Reorder the perimeter columns so that `target` comes last.
perimeter_columns = [
    'account_id',
    'customer_id',
    'snapnum',
    'meas_acch_date_nominal',
    'meas_accl_appl_status',
    'meas_accl_application_bucket',
    'meas_accl_application_pending',
    'meas_accl_paid_in_full_cm',
    'meas_cusl_min_communication_date_3m_next',
    'meas_acch_date_nominal_next',
    'meas_cusl_communication_acch_month_difference',
    'meas_accl_paid_in_full_cm_next',
    'meas_accl_paid_in_full_cm_next_2',
    'target',
]
Accounts_Customers_Perimeter = Accounts_Customers_Perimeter.loc[:, perimeter_columns]

Create a new dataset Accounts_Customers_Final, keeping only the useful columns for the prediction:

In [31]:
# Identifier, feature and target columns carried forward to the modelling dataset.
final_columns = [
    'account_id',
    'customer_id',
    'snapnum',
    'meas_accl_paid_in_full_cm',
    'meas_accl_appl_status',
    'meas_accl_application_bucket',
    'target',
]
Accounts_Customers_Final = Accounts_Customers_Perimeter.loc[:, final_columns]

The distribution of the classes:

In [32]:
# Absolute number of rows per target class.
Accounts_Customers_Final['target'].value_counts()

0.0    253633
1.0       545
Name: target, dtype: int64

In [33]:
# Share of rows per target class, rounded to four decimals.
Accounts_Customers_Final['target'].value_counts(normalize=True).round(4)

0.0    0.9979
1.0    0.0021
Name: target, dtype: float64

We can see that an account/customer can be related to more than one comment in a single snapshot:

In [34]:
# (account, snapshot) pairs that have more than one recorded action.
(
    TMP_Actions
    .groupby(['meas_action_acct_code_concerned', 'snapnum'])
    .size()
    .loc[lambda action_counts: action_counts > 1]
)


meas_action_acct_code_concerned  snapnum
344984.0                         37         5
                                 38         6
                                 39         8
                                 40         4
                                 42         3
                                           ..
1122558.0                        48         2
1122559.0                        47         5
1122560.0                        41         4
                                 47         8
                                 48         2
Length: 1383326, dtype: int64

Let's see an example of an account with more than one comment in the same month:

In [35]:
# All actions recorded for account 344984 in snapshot 38.
TMP_Actions.query("meas_action_acct_code_concerned == 344984 and snapnum == 38")

Unnamed: 0,index,id,snapnum,meas_action_datetime,meas_action_acct_code_concerned,meas_action_cust_code_concerned,meas_action_comment
2084865,11609,151946553,38,2019-02-01,344984.0,373503.0,##########
2172572,99316,152136330,38,2019-02-06,344984.0,373503.0,##########
2258290,185034,152311698,38,2019-02-08,344984.0,373503.0,##########
2422738,349482,152611980,38,2019-02-15,344984.0,373503.0,##########
2529551,456295,152833818,38,2019-02-20,344984.0,373503.0,##########
2621059,547803,152994832,38,2019-02-25,344984.0,373503.0,##########


To create our dataset, we have to join Accounts_Customers_Final (perimeter applied) with TMP_Actions df which contains the comments:

In [135]:
# Left join: perimeter rows without any action in the snapshot are kept, with
# missing values in the action columns (an inner join would drop them).
final_dataset = pd.merge(
    Accounts_Customers_Final,
    TMP_Actions,
    how='left',
    left_on=['account_id', 'snapnum'],
    right_on=['meas_action_acct_code_concerned', 'snapnum'],
)

In [136]:
# (rows, columns) after joining the comments; rows multiply when a pair has several comments.
final_dataset.shape

(286002, 13)

Let's see the accounts with more than one comment per snapshot:

In [137]:
# (account, snapshot) pairs with more than one joined row, largest groups first.
(
    final_dataset
    .groupby(['account_id', 'snapnum'])
    .size()
    .loc[lambda row_counts: row_counts > 1]
    .sort_values(ascending=False)
)

account_id  snapnum
563449      38         36
556750      38         32
623907      38         30
553371      38         30
374256      40         26
                       ..
559968      41          2
443364      40          2
443409      39          2
443419      44          2
510721      38          2
Length: 13814, dtype: int64

And an example:

In [116]:
# Joined rows for the pair with the most comments (account 563449, snapshot 38).
final_dataset.query("account_id == 563449 and snapnum == 38")

Unnamed: 0,account_id,customer_id,snapnum,meas_accl_paid_in_full_cm,meas_accl_appl_status,meas_accl_application_bucket,target,index,id,meas_action_datetime,meas_action_acct_code_concerned,meas_action_cust_code_concerned,meas_action_comment
3702,563449,354938,38,No,,,0.0,484127,151994294,2019-02-04,563449.0,325640.0,"κα [xxxx] ζητησε επνκλ αυριο στισ ####,"
3703,563449,354938,38,No,,,0.0,484128,151994295,2019-02-04,563449.0,325640.0,"κα [xxxx] ζητησε επνκλ αυριο στισ ####,"
3704,563449,354938,38,No,,,0.0,560742,152040398,2019-02-05,563449.0,325640.0,δα
3705,563449,354938,38,No,,,0.0,560743,152040399,2019-02-05,563449.0,325640.0,δα
3706,563449,354938,38,No,,,0.0,565735,152042903,2019-02-05,563449.0,325640.0,δα
3707,563449,354938,38,No,,,0.0,565736,152042904,2019-02-05,563449.0,325640.0,δα
3708,563449,354938,38,No,,,0.0,565896,152042982,2019-02-05,563449.0,355207.0,δεδ
3709,563449,354938,38,No,,,0.0,565897,152042983,2019-02-05,563449.0,355207.0,δεδ
3710,563449,354938,38,No,,,0.0,648572,152117285,2019-02-06,563449.0,325640.0,δα
3711,563449,354938,38,No,,,0.0,648575,152117288,2019-02-06,563449.0,325640.0,δα


In [138]:
# Parse the action timestamp so comments can be ordered by time.
action_datetime = pd.to_datetime(final_dataset['meas_action_datetime'])
final_dataset['meas_action_datetime'] = action_datetime

In [139]:
# String copy of the comment; missing comments become the literal text 'nan'.
comment_text = final_dataset['meas_action_comment'].astype(str)
final_dataset['meas_action_comment_str'] = comment_text

SAMESNAP, XXBOCOMMENT etc separators will be used later with TextCNN. For now, we will just concatenate the comments together. However, it is critical to retain the chronological order. We first concatenate all the comments belonging to the current snapshot:

In [140]:
# Order rows by action time, newest first (ascending=False), then join every comment
# of an (account, snapshot) pair into one space-separated string. `transform` returns
# the joined string on each row of the pair, aligned on the original index.
actions_newest_first = final_dataset.sort_values(['meas_action_datetime'], ascending=False)

final_dataset['meas_action_comment_concat'] = (
    actions_newest_first
    .groupby(['account_id', 'snapnum'])['meas_action_comment_str']
    .transform(lambda comments: ' '.join(comments))
)


In [141]:
# Drop the per-action columns; only the concatenated comment is carried forward.
columns_after_current_concat = [
    'account_id',
    'customer_id',
    'snapnum',
    'meas_accl_paid_in_full_cm',
    'meas_accl_appl_status',
    'meas_accl_application_bucket',
    'meas_action_comment_concat',
    'target',
]
final_dataset = final_dataset.loc[:, columns_after_current_concat]

There are duplicates in the resulting dataframe, because we have not applied an aggregate function, we just simply concatenated comments together. Thus we need to remove the duplicates:

In [142]:
# Collapse the repeated rows to a single row per (account, snapshot) and renumber.
final_dataset = final_dataset.drop_duplicates().reset_index(drop=True)

# Sanity check: an empty result means every (account, snapshot) pair is now unique.
(
    final_dataset
    .groupby(['account_id', 'snapnum'])
    .size()
    .loc[lambda row_counts: row_counts > 1]
)

Series([], dtype: int64)

In [143]:
# Class counts after de-duplication.
final_dataset.target.value_counts()

0.0    253633
1.0       545
Name: target, dtype: int64

In [153]:
# Positive-class rows whose concatenated comment is just the text 'nan'.
final_dataset[
    (final_dataset['meas_action_comment_concat'] == 'nan')
    & (final_dataset['target'] == 1)
]

Unnamed: 0,account_id,customer_id,snapnum,meas_accl_paid_in_full_cm,meas_accl_appl_status,meas_accl_application_bucket,meas_action_comment_concat,target
651,423126,222549,38,No,,,,1.0
652,507914,222549,38,No,,,,1.0
1146,409205,250385,38,No,,,,1.0
2160,415130,265624,38,No,,,,1.0
2161,418920,265624,38,No,,,,1.0
...,...,...,...,...,...,...,...,...
253240,648666,423051,37,No,,,,1.0
253359,610186,360868,37,No,,,,1.0
253656,633048,353134,37,No,,,,1.0
254151,673802,402602,37,No,,,,1.0


In [154]:
# Actions recorded for account 507914 in snapshot 38.
TMP_Actions[
    (TMP_Actions['meas_action_acct_code_concerned'] == 507914)
    & (TMP_Actions['snapnum'] == 38)
]

Unnamed: 0,index,id,snapnum,meas_action_datetime,meas_action_acct_code_concerned,meas_action_cust_code_concerned,meas_action_comment


Likewise, we will include the concatenated comments of the previous and pre-previous snapshots:

In [123]:
# Join key pointing at the previous snapshot.
final_dataset['prev_snapnum'] = final_dataset['snapnum'].sub(1)

In [124]:
# Left join with the previous snapshot's actions; `snapnum` exists on both sides,
# so the result carries `snapnum_x` (dataset) and `snapnum_y` (actions).
final_dataset = pd.merge(
    final_dataset,
    TMP_Actions,
    how='left',
    left_on=['account_id', 'prev_snapnum'],
    right_on=['meas_action_acct_code_concerned', 'snapnum'],
)

In [126]:
# Keep the dataset columns plus the previous snapshot's action details.
columns_after_prev_merge = [
    'account_id',
    'customer_id',
    'snapnum_x',
    'meas_accl_paid_in_full_cm',
    'meas_accl_appl_status',
    'meas_accl_application_bucket',
    'meas_action_comment_concat',
    'target',
    'meas_action_datetime',
    'meas_action_acct_code_concerned',
    'meas_action_cust_code_concerned',
    'meas_action_comment',
]
final_dataset = final_dataset.loc[:, columns_after_prev_merge]

We can see that more than one customer may have left a comment for the same account. This does not affect our prediction task, however, since we want to predict the probability of repayment at the account level:

In [127]:
# Number of distinct commenting customers per account ...
customers_per_account = (
    final_dataset
    .groupby(['account_id'])['meas_action_cust_code_concerned']
    .nunique()
)

# ... and how many accounts have each of those counts.
customers_per_account.groupby(customers_per_account).size()

meas_action_cust_code_concerned
1    12899
2      243
3       46
4        6
Name: meas_action_cust_code_concerned, dtype: int64

In [128]:
# The customer-level columns are no longer needed.
columns_without_customer = [
    'account_id',
    'snapnum_x',
    'meas_accl_paid_in_full_cm',
    'meas_accl_appl_status',
    'meas_accl_application_bucket',
    'meas_action_comment_concat',
    'target',
    'meas_action_datetime',
    'meas_action_comment',
]
final_dataset = final_dataset.loc[:, columns_without_customer]

In [129]:
# Restore `snapnum` and mark the joined action columns as previous-snapshot values.
final_dataset = final_dataset.rename(
    columns={
        'snapnum_x': 'snapnum',
        'meas_action_comment': 'meas_action_comment_prev_snap',
        'meas_action_datetime': 'meas_action_datetime_prev_snap',
    }
)

In [130]:
# Comments as strings (missing ones become the text 'nan') and timestamps as datetimes.
final_dataset = final_dataset.assign(
    meas_action_comment_prev_snap=lambda frame: frame['meas_action_comment_prev_snap'].astype(str),
    meas_action_datetime_prev_snap=lambda frame: pd.to_datetime(frame['meas_action_datetime_prev_snap']),
)

# Newest action first, then one space-separated string per (account, snapshot) pair,
# repeated on every row of the pair.
prev_actions_newest_first = final_dataset.sort_values(
    ['meas_action_datetime_prev_snap'], ascending=False
)
final_dataset['meas_action_comment_prev_concat'] = (
    prev_actions_newest_first
    .groupby(['account_id', 'snapnum'])['meas_action_comment_prev_snap']
    .transform(lambda comments: ' '.join(comments))
)

In [131]:
# The per-action previous-snapshot columns are replaced by the concatenated text.
prev_helper_columns = ['meas_action_comment_prev_snap', 'meas_action_datetime_prev_snap']
final_dataset = final_dataset.drop(prev_helper_columns, axis=1)

Now we again have duplicates that we need to take care of:

In [132]:
# (account, snapshot) pairs that currently occupy more than one row.
(
    final_dataset
    .groupby(['account_id', 'snapnum'])
    .size()
    .loc[lambda row_counts: row_counts > 1]
)

account_id  snapnum
344991      40         2
345023      39         2
345039      41         4
345054      40         3
345097      40         2
                      ..
1122028     40         2
1122100     46         2
1122101     46         2
1122404     44         2
1122541     38         3
Length: 8557, dtype: int64

In [133]:
# Collapse the repeated rows to one row per (account, snapshot).
# `drop=True` discards the old row labels instead of inserting them as an extra
# `index` column. This matches the earlier de-duplication step and keeps a stray
# `index` column from colliding with the `index` column of TMP_Actions in the next merge.
final_dataset = final_dataset.drop_duplicates().reset_index(drop=True)

# Sanity check: an empty result means every (account, snapshot) pair is unique again.
(
    final_dataset
    .groupby(['account_id', 'snapnum'])
    .size()
    .loc[lambda row_counts: row_counts > 1]
)

Series([], dtype: int64)

In [112]:
# Value counts of previous-snapshot texts shorter than five characters.
prev_concat = final_dataset['meas_action_comment_prev_concat']
prev_concat[prev_concat.str.len() < 5].value_counts()

nan     13490
δ.α.        2
Name: meas_action_comment_prev_concat, dtype: int64

In [None]:
# Join key pointing two snapshots back.
final_dataset['pre_prev_snapnum'] = final_dataset['snapnum'].sub(2)

# NOTE(review): this join is `inner`, whereas the current- and previous-snapshot
# comment joins in this notebook use how='left'. An inner join drops every row
# without an action two snapshots back — confirm this is intended.
final_dataset = pd.merge(
    final_dataset,
    TMP_Actions,
    how='inner',
    left_on=['account_id', 'pre_prev_snapnum'],
    right_on=['meas_action_acct_code_concerned', 'snapnum'],
)

In [None]:
# Keep the dataset columns plus the pre-previous snapshot's comment and timestamp.
columns_after_pre_prev_merge = [
    'account_id',
    'snapnum_x',
    'meas_accl_paid_in_full_cm',
    'meas_accl_appl_status',
    'meas_accl_application_bucket',
    'meas_action_comment_concat',
    'target',
    'meas_action_comment_prev_concat',
    'meas_action_comment',
    'meas_action_datetime',
]
final_dataset = final_dataset.loc[:, columns_after_pre_prev_merge]

In [None]:
# Restore `snapnum` and mark the joined action columns as pre-previous-snapshot values.
final_dataset = final_dataset.rename(
    columns={
        'snapnum_x': 'snapnum',
        'meas_action_comment': 'meas_action_comment_pre_prev_snap',
        'meas_action_datetime': 'meas_action_datetime_pre_prev_snap',
    }
)

In [None]:
# Comments as strings (missing ones become the text 'nan') and timestamps as datetimes.
final_dataset = final_dataset.assign(
    meas_action_comment_pre_prev_snap=lambda frame: frame['meas_action_comment_pre_prev_snap'].astype('str'),
    meas_action_datetime_pre_prev_snap=lambda frame: pd.to_datetime(
        frame['meas_action_datetime_pre_prev_snap']
    ),
)

# Newest action first, then one space-separated string per (account, snapshot) pair,
# repeated on every row of the pair.
pre_prev_actions_newest_first = final_dataset.sort_values(
    ['meas_action_datetime_pre_prev_snap'], ascending=False
)
final_dataset['meas_action_comment_pre_prev_concat'] = (
    pre_prev_actions_newest_first
    .groupby(['account_id', 'snapnum'])['meas_action_comment_pre_prev_snap']
    .transform(lambda comments: ' '.join(comments))
)

In [None]:
# Keep the three concatenated comment columns and drop the per-action ones.
columns_after_pre_prev_concat = [
    'account_id',
    'snapnum',
    'meas_accl_paid_in_full_cm',
    'meas_accl_appl_status',
    'meas_accl_application_bucket',
    'meas_action_comment_concat',
    'meas_action_comment_prev_concat',
    'meas_action_comment_pre_prev_concat',
    'target',
]
final_dataset = final_dataset.loc[:, columns_after_pre_prev_concat]

For the same reason as above, since we do not apply a real aggregate function, we need to deal with the duplicate records now:

In [None]:
# Collapse the repeated rows (the existing row labels are kept).
final_dataset = final_dataset.drop_duplicates()

# Sanity check: an empty result means every (account, snapshot) pair is unique.
(
    final_dataset
    .groupby(['account_id', 'snapnum'])
    .size()
    .loc[lambda row_counts: row_counts > 1]
)

We will also include the buckets from previous and pre-previous snapshots:

In [None]:
# Join key: an account row of snapshot N-1 is matched to dataset rows of snapshot N.
DMCR_Unstruct_Accounts['snapnum_next'] = DMCR_Unstruct_Accounts['snapnum'].add(1)

# Attach each account's row from the previous snapshot (overlapping columns get _x / _y suffixes).
final_dataset = pd.merge(
    final_dataset,
    DMCR_Unstruct_Accounts,
    how='inner',
    left_on=['account_id', 'snapnum'],
    right_on=['id', 'snapnum_next'],
)


In [None]:
# Keep the dataset columns (_x) plus the previous snapshot's bucket (_y).
columns_after_prev_bucket = [
    'account_id',
    'snapnum_x',
    'meas_accl_paid_in_full_cm_x',
    'meas_accl_appl_status_x',
    'meas_accl_application_bucket_x',
    'meas_action_comment_concat',
    'meas_action_comment_prev_concat',
    'meas_action_comment_pre_prev_concat',
    'target',
    'meas_accl_application_bucket_y',
]
final_dataset = final_dataset.loc[:, columns_after_prev_bucket]

In [None]:
# Restore the plain column names and label the previous snapshot's bucket.
final_dataset = final_dataset.rename(
    columns={
        'snapnum_x': 'snapnum',
        'meas_accl_paid_in_full_cm_x': 'meas_accl_paid_in_full_cm',
        'meas_accl_appl_status_x': 'meas_accl_appl_status',
        'meas_accl_application_bucket_x': 'meas_accl_application_bucket',
        'meas_accl_application_bucket_y': 'meas_accl_application_bucket_prev',
    }
)

In [None]:
# Join key: an account row of snapshot N-2 is matched to dataset rows of snapshot N.
DMCR_Unstruct_Accounts['snapnum_after_next'] = DMCR_Unstruct_Accounts['snapnum'].add(2)

# Attach each account's row from two snapshots back (overlapping columns get _x / _y suffixes).
final_dataset = pd.merge(
    final_dataset,
    DMCR_Unstruct_Accounts,
    how='inner',
    left_on=['account_id', 'snapnum'],
    right_on=['id', 'snapnum_after_next'],
)

In [None]:
# Keep the dataset columns (_x), the previous bucket, and the bucket from two snapshots back (_y).
columns_after_pre_prev_bucket = [
    'account_id',
    'snapnum_x',
    'meas_accl_paid_in_full_cm_x',
    'meas_accl_appl_status_x',
    'meas_accl_application_bucket_x',
    'meas_accl_application_bucket_prev',
    'meas_action_comment_concat',
    'meas_action_comment_prev_concat',
    'meas_action_comment_pre_prev_concat',
    'target',
    'meas_accl_application_bucket_y',
]
final_dataset = final_dataset.loc[:, columns_after_pre_prev_bucket]

In [None]:
# Restore the plain column names and label the bucket from two snapshots back.
final_dataset = final_dataset.rename(
    columns={
        'snapnum_x': 'snapnum',
        'meas_accl_paid_in_full_cm_x': 'meas_accl_paid_in_full_cm',
        'meas_accl_appl_status_x': 'meas_accl_appl_status',
        'meas_accl_application_bucket_x': 'meas_accl_application_bucket',
        'meas_accl_application_bucket_y': 'meas_accl_application_bucket_pre_prev',
    }
)

In [None]:
# Final column order: identifiers, structured features, comment texts, target.
ordered_columns = [
    'account_id',
    'snapnum',
    'meas_accl_paid_in_full_cm',
    'meas_accl_appl_status',
    'meas_accl_application_bucket',
    'meas_accl_application_bucket_prev',
    'meas_accl_application_bucket_pre_prev',
    'meas_action_comment_concat',
    'meas_action_comment_prev_concat',
    'meas_action_comment_pre_prev_concat',
    'target',
]
final_dataset = final_dataset.loc[:, ordered_columns]

In [None]:
# Display the assembled dataset before encoding.
final_dataset

Next, we are applying one-hot encoding for **meas_accl_paid_in_full_cm** and **meas_accl_appl_status**:

In [None]:
# One-hot encode the two categorical columns; each is replaced by one indicator column per value.
categorical_columns = ['meas_accl_paid_in_full_cm', 'meas_accl_appl_status']
final_dataset = pd.get_dummies(data=final_dataset, columns=categorical_columns)

In [None]:
# List the columns produced by the one-hot encoding.
final_dataset.columns

In [None]:
# Reorder so that the structured features precede the comment texts and `target` is last.
# The indicator names are hard-coded and must match the values present in the data.
encoded_columns = [
    'account_id',
    'snapnum',
    'meas_accl_application_bucket',
    'meas_accl_application_bucket_prev',
    'meas_accl_application_bucket_pre_prev',
    'meas_accl_paid_in_full_cm_No',
    'meas_accl_appl_status_Cancelled',
    'meas_accl_appl_status_Cancelled - Client Rejection',
    'meas_accl_appl_status_Cancelled - Communication Failed',
    'meas_accl_appl_status_Not Fulfilled',
    'meas_action_comment_concat',
    'meas_action_comment_prev_concat',
    'meas_action_comment_pre_prev_concat',
    'target',
]
final_dataset = final_dataset.loc[:, encoded_columns]

In [None]:
# Display the dataset after encoding and column reordering.
final_dataset

In [None]:
# List the final column names.
final_dataset.columns

We are going to split X (features) and y (target) to train and test dataset:

In [None]:
# Feature matrix: every column except `target` (the identifiers are still included here).
feature_columns = [
    'account_id',
    'snapnum',
    'meas_accl_application_bucket',
    'meas_accl_application_bucket_prev',
    'meas_accl_application_bucket_pre_prev',
    'meas_accl_paid_in_full_cm_No',
    'meas_accl_appl_status_Cancelled',
    'meas_accl_appl_status_Cancelled - Client Rejection',
    'meas_accl_appl_status_Cancelled - Communication Failed',
    'meas_accl_appl_status_Not Fulfilled',
    'meas_action_comment_concat',
    'meas_action_comment_prev_concat',
    'meas_action_comment_pre_prev_concat',
]
X = final_dataset.loc[:, feature_columns]

In [None]:
# Target kept as a one-column DataFrame (not a Series).
y = final_dataset.loc[:, ['target']]

In [None]:
# 67/33 split, stratified on the target so both parts keep the same class shares;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

# Later cells add new columns to the feature splits. Working on explicit copies
# avoids pandas' SettingWithCopyWarning, because the frames returned by the split
# are selections taken from X.
X_train = X_train.copy()
X_test = X_test.copy()

In [None]:
# Class shares in the training split.
y_train.groupby(['target']).size().div(len(y_train))

In [None]:
# Class shares in the test split.
y_test.groupby(['target']).size().div(len(y_test))

Next, we are going to concatenate all columns containing comments in order to find our vocabulary:

In [None]:
# One long string containing every training comment from the three snapshot columns.
comment_columns = [
    'meas_action_comment_concat',
    'meas_action_comment_prev_concat',
    'meas_action_comment_pre_prev_concat',
]
word_dictionary = ' '.join(' '.join(X_train[column]) for column in comment_columns)

In [None]:
# Split the combined text on whitespace into individual tokens.
words = word_dictionary.split()

In [None]:
# Total number of whitespace-separated tokens.
len(words)

In [None]:
# Distinct tokens.
words_set = set(words)

In [None]:
# Number of distinct tokens.
len(words_set)

We concatenate together the comments from current, previous and pre-previous snapshots, to create 3-months history:

In [None]:
# Build the 3-month comment history by joining the current, previous and
# pre-previous comment strings with single spaces.
# assign() returns a new frame, so the objects handed back by train_test_split
# are not written to in place; in-place column assignment on such frames can
# raise pandas' SettingWithCopyWarning.
X_train = X_train.assign(
    meas_action_comment_concat_3m=(
        X_train['meas_action_comment_concat'] + ' '
        + X_train['meas_action_comment_prev_concat'] + ' '
        + X_train['meas_action_comment_pre_prev_concat']
    )
)

X_test = X_test.assign(
    meas_action_comment_concat_3m=(
        X_test['meas_action_comment_concat'] + ' '
        + X_test['meas_action_comment_prev_concat'] + ' '
        + X_test['meas_action_comment_pre_prev_concat']
    )
)

We will create a Language object using the spaCy library and enhance the lemmatizer rules with the custom lookup table:

In [None]:
# Load the large Greek spaCy pipeline.
nlp = spacy.load('el_core_news_lg')

# Extra infix rule: also split tokens on a '.' that has no digit directly
# before or after it, so decimal numbers stay in one piece.
nlp.tokenizer.infix_finditer = spacy.util.compile_infix_regex(
    nlp.Defaults.infixes + [r"(?<!\d)\.(?!\d)"]
).finditer

# Switch off the tokenizer's URL matcher.
nlp.tokenizer.url_match = None

# Project helper imported from the lookup_table notebook; per the notebook text
# it extends the lemmatizer with a custom lookup table.
custom_lookup_table(nlp)

In [None]:
# Run the spaCy pipeline on the 3-month training comments, extract the lemmas
# and join them back into one string per row (the text later given to the
# vectorizers).
# assign() evaluates its keyword arguments in order, so each step can read the
# column produced by the previous one, and it returns a new frame instead of
# writing into the object returned by train_test_split.
X_train = X_train.assign(
    meas_action_comment_concat_3m_doc=lambda frame: (
        frame['meas_action_comment_concat_3m'].apply(lambda text: call_spacy_nlp(nlp, text))
    ),
    meas_action_comment_concat_3m_lemmas=lambda frame: (
        frame['meas_action_comment_concat_3m_doc'].apply(build_lemmas_list)
    ),
    meas_action_comment_concat_3m_lemmas_string=lambda frame: (
        frame['meas_action_comment_concat_3m_lemmas'].apply(convert_list_to_string)
    ),
)

In [None]:
# Display X_train, which now also holds the doc, lemma-list and lemma-string columns.
X_train

In [None]:
# Reuse the pipeline already configured for the training set instead of loading
# 'el_core_news_lg' a second time. The original cell built an identically
# configured copy (same extra infix rule, URL matching disabled, same custom
# lookup table), which only doubled the load time and memory footprint.
# The name nlp1 is kept because the test-set cell refers to it.
nlp1 = nlp

In [None]:
# Apply the same NLP steps to the test comments: spaCy doc -> lemma list ->
# lemma string.
# assign() evaluates its keyword arguments in order, so each step can read the
# column produced by the previous one, and it returns a new frame instead of
# writing into the object returned by train_test_split.
X_test = X_test.assign(
    meas_action_comment_concat_3m_doc=lambda frame: (
        frame['meas_action_comment_concat_3m'].apply(lambda text: call_spacy_nlp(nlp1, text))
    ),
    meas_action_comment_concat_3m_lemmas=lambda frame: (
        frame['meas_action_comment_concat_3m_doc'].apply(build_lemmas_list)
    ),
    meas_action_comment_concat_3m_lemmas_string=lambda frame: (
        frame['meas_action_comment_concat_3m_lemmas'].apply(convert_list_to_string)
    ),
)

X_test

In [None]:
# Unigram bag-of-words. The vocabulary is learned from the training lemma
# strings only; the test strings are then encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train.loc[:, 'meas_action_comment_concat_3m_lemmas_string'])
X_test_bow = vectorizer.transform(X_test.loc[:, 'meas_action_comment_concat_3m_lemmas_string'])

In [None]:
# Turn the sparse count matrices into dense DataFrames whose columns are the
# vocabulary terms learned by the vectorizer.
X_train_bow_df_init = pd.DataFrame(
    X_train_bow.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
X_test_bow_df_init = pd.DataFrame(
    X_test_bow.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [None]:
# List the current X_train columns.
X_train.columns

In [None]:
# Structured (non-text) training columns, identifiers included.
X_train_reduced = X_train.loc[:, [
    'account_id',
    'snapnum',
    'meas_accl_application_bucket',
    'meas_accl_application_bucket_prev',
    'meas_accl_application_bucket_pre_prev',
    'meas_accl_paid_in_full_cm_No',
    'meas_accl_appl_status_Cancelled',
    'meas_accl_appl_status_Cancelled - Client Rejection',
    'meas_accl_appl_status_Cancelled - Communication Failed',
    'meas_accl_appl_status_Not Fulfilled',
]]

# Put the structured columns and the bag-of-words counts side by side. Both
# indexes are reset first so that concat pairs the rows by position.
X_train_bow_df = pd.concat(
    [
        X_train_reduced.reset_index(drop=True),
        X_train_bow_df_init.reset_index(drop=True),
    ],
    axis='columns',
)

In [None]:
# Give y_train a fresh 0..n-1 index (the old one is discarded), matching the
# feature frames whose index was reset with reset_index(drop=True) before concat.
y_train = y_train.reset_index(drop=True)

In [None]:
# Structured (non-text) test columns, identifiers included.
X_test_reduced = X_test.loc[:, [
    'account_id',
    'snapnum',
    'meas_accl_application_bucket',
    'meas_accl_application_bucket_prev',
    'meas_accl_application_bucket_pre_prev',
    'meas_accl_paid_in_full_cm_No',
    'meas_accl_appl_status_Cancelled',
    'meas_accl_appl_status_Cancelled - Client Rejection',
    'meas_accl_appl_status_Cancelled - Communication Failed',
    'meas_accl_appl_status_Not Fulfilled',
]]

# Put the structured columns and the bag-of-words counts side by side. Both
# indexes are reset first so that concat pairs the rows by position.
X_test_bow_df = pd.concat(
    [
        X_test_reduced.reset_index(drop=True),
        X_test_bow_df_init.reset_index(drop=True),
    ],
    axis='columns',
)

In [None]:
# Give y_test a fresh 0..n-1 index (the old one is discarded), matching the
# feature frames whose index was reset with reset_index(drop=True) before concat.
y_test = y_test.reset_index(drop=True)

In [None]:
# Drop the account_id and snapnum columns so they are not used as model inputs.
X_train_bow_df_fin = X_train_bow_df.drop(['account_id', 'snapnum'], axis=1)
X_test_bow_df_fin = X_test_bow_df.drop(['account_id', 'snapnum'], axis=1)

In [None]:
# Replace every missing value in the assembled feature matrices with 0.
X_train_bow_df_fin = X_train_bow_df_fin.fillna(0)
X_test_bow_df_fin = X_test_bow_df_fin.fillna(0)

In [None]:
# (rows, columns) of the final bag-of-words training matrix.
X_train_bow_df_fin.shape

In [None]:
# (rows, columns) of the final bag-of-words test matrix.
X_test_bow_df_fin.shape

In [None]:
# Candidate classifiers for the raw (un-normalised) bag-of-words features.
# Every estimator that accepts a seed gets its own fixed random_state.
nb_clf_bow = MultinomialNB()
svc_clf_bow = SVC(probability=True, random_state=33)
ada_clf_bow = AdaBoostClassifier(random_state=36)
tree_clf_bow = DecisionTreeClassifier(random_state=32)
rf_clf_bow = RandomForestClassifier(random_state=34)
xgb_clf_bow = XGBClassifier(random_state=35)

# (label, estimator) pairs in the order they will be evaluated.
models = [
    ('MNB', nb_clf_bow),
    ('SVC', svc_clf_bow),
    ('ABC', ada_clf_bow),
    ('DT', tree_clf_bow),
    ('RF', rf_clf_bow),
    ('XGB', xgb_clf_bow),
]

In [None]:
results_bow = defaultdict(dict)

# Evaluate every candidate on the single train/test split using the
# bag-of-words features, and keep one metrics dict per model label.
for model in models:
    model_name, estimator = model
    # model_eval is imported from common_functions; its result is unpacked
    # here as (recall, precision, f1, accuracy).
    rec, prec, f1_score, acc = model_eval(
        estimator, X_train_bow_df_fin, y_train,
        X_test_bow_df_fin, y_test,
        accuracy_score, classification_report,
    )
    results_bow[model_name] = {
        'accuracy': acc,
        'recall': rec,
        'precision': prec,
        'f1_score': f1_score,
    }
    

In [None]:
# Display the metrics collected per model for the bag-of-words features.
results_bow

In [None]:
# Normalizer rescales each row (sample) to unit norm. The same fitted object is
# applied to the training and the test matrix.
normalizer1 = Normalizer().fit(X_train_bow_df_fin)
X_train_bow_df_fin_norm = normalizer1.transform(X_train_bow_df_fin)
X_test_bow_df_fin_norm = normalizer1.transform(X_test_bow_df_fin)

In [None]:
# Candidate classifiers for the row-normalised bag-of-words features
# (no MultinomialNB in this round).
svc_clf_bow_norm = SVC(probability=True, random_state=42)
ada_clf_bow_norm = AdaBoostClassifier(random_state=45)
tree_clf_bow_norm = DecisionTreeClassifier(random_state=41)
rf_clf_bow_norm = RandomForestClassifier(random_state=43)
xgb_clf_bow_norm = XGBClassifier(random_state=44)

# (label, estimator) pairs in the order they will be evaluated.
models = [
    ('SVC', svc_clf_bow_norm),
    ('ABC', ada_clf_bow_norm),
    ('DT', tree_clf_bow_norm),
    ('RF', rf_clf_bow_norm),
    ('XGB', xgb_clf_bow_norm),
]

In [None]:
results_bow_norm = defaultdict(dict)

# Evaluate every candidate on the single train/test split using the
# normalised bag-of-words features, and keep one metrics dict per model label.
for model in models:
    model_name, estimator = model
    # model_eval is imported from common_functions; its result is unpacked
    # here as (recall, precision, f1, accuracy).
    rec, prec, f1_score, acc = model_eval(
        estimator, X_train_bow_df_fin_norm, y_train,
        X_test_bow_df_fin_norm, y_test,
        accuracy_score, classification_report,
    )
    results_bow_norm[model_name] = {
        'accuracy': acc,
        'recall': rec,
        'precision': prec,
        'f1_score': f1_score,
    }
    

In [None]:
# Display the metrics collected per model for the normalised bag-of-words features.
results_bow_norm

In [None]:
# Class weights passed to the estimators: class 1 counts five times as much as
# class 0 during training.
weights = {0: 1, 1: 5}

tree_clf_bow_norm_weight = DecisionTreeClassifier(random_state=41, class_weight=weights)
svc_clf_bow_norm_weight = SVC(probability=True, random_state=42, class_weight=weights)
rf_clf_bow_norm_weight = RandomForestClassifier(random_state=43, class_weight=weights)

# Only the three estimators built with class_weight above are evaluated here;
# AdaBoost and XGBoost are left out of this weighted round.
models = [
    ('SVC', svc_clf_bow_norm_weight),
    ('DT', tree_clf_bow_norm_weight),
    ('RF', rf_clf_bow_norm_weight),
]

results_bow_norm_weight = defaultdict(dict)

# Evaluate the weighted models on the normalised bag-of-words features and keep
# one metrics dict per model label.
for model in models:
    model_name, estimator = model
    # model_eval is imported from common_functions; its result is unpacked
    # here as (recall, precision, f1, accuracy).
    rec, prec, f1_score, acc = model_eval(
        estimator, X_train_bow_df_fin_norm, y_train,
        X_test_bow_df_fin_norm, y_test,
        accuracy_score, classification_report,
    )
    results_bow_norm_weight[model_name] = {
        'accuracy': acc,
        'recall': rec,
        'precision': prec,
        'f1_score': f1_score,
    }
    

In [None]:
# Display the metrics of the class-weighted models on the normalised bag-of-words features.
results_bow_norm_weight

We will repeat the same methodology using n-grams of length 1 to 3 (`ngram_range=(1, 3)`). We will first train and evaluate on non-normalized data, then on normalized data, and finally with class weights on the normalized data.

In [None]:
# Count uni-, bi- and tri-grams. The vocabulary is learned from the training
# lemma strings only and then reused to encode the test strings.
vectorizer_n_grams = CountVectorizer(ngram_range=(1, 3))

X_train_n_grams = vectorizer_n_grams.fit_transform(
    X_train.loc[:, 'meas_action_comment_concat_3m_lemmas_string']
)
X_train_n_grams_df_init = pd.DataFrame(
    X_train_n_grams.toarray(),
    columns=vectorizer_n_grams.get_feature_names_out(),
)

X_test_n_grams = vectorizer_n_grams.transform(
    X_test.loc[:, 'meas_action_comment_concat_3m_lemmas_string']
)
X_test_n_grams_df_init = pd.DataFrame(
    X_test_n_grams.toarray(),
    columns=vectorizer_n_grams.get_feature_names_out(),
)

In [None]:
# Put the structured columns and the n-gram counts side by side. Both indexes
# are reset first so that concat pairs the rows by position.
X_train_n_grams_df = pd.concat(
    [
        X_train_reduced.reset_index(drop=True),
        X_train_n_grams_df_init.reset_index(drop=True),
    ],
    axis='columns',
)
X_test_n_grams_df = pd.concat(
    [
        X_test_reduced.reset_index(drop=True),
        X_test_n_grams_df_init.reset_index(drop=True),
    ],
    axis='columns',
)

# Drop the account_id and snapnum columns so they are not used as model inputs.
X_train_n_grams_df_fin = X_train_n_grams_df.drop(['account_id', 'snapnum'], axis=1)
X_test_n_grams_df_fin = X_test_n_grams_df.drop(['account_id', 'snapnum'], axis=1)

In [None]:
# Replace every missing value in the assembled n-gram feature matrices with 0.
X_train_n_grams_df_fin = X_train_n_grams_df_fin.fillna(0)
X_test_n_grams_df_fin = X_test_n_grams_df_fin.fillna(0)

In [None]:
# (rows, columns) of the final n-gram training matrix.
X_train_n_grams_df_fin.shape

In [None]:
# Candidate classifiers for the raw (un-normalised) n-gram features.
# Every estimator that accepts a seed gets its own fixed random_state.
nb_clf_ng = MultinomialNB()
svc_clf_ng = SVC(probability=True, random_state=13)
ada_clf_ng = AdaBoostClassifier(random_state=16)
tree_clf_ng = DecisionTreeClassifier(random_state=12)
rf_clf_ng = RandomForestClassifier(random_state=14)
xgb_clf_ng = XGBClassifier(random_state=15)

# (label, estimator) pairs in the order they will be evaluated.
models = [
    ('MNB', nb_clf_ng),
    ('SVC', svc_clf_ng),
    ('ABC', ada_clf_ng),
    ('DT', tree_clf_ng),
    ('RF', rf_clf_ng),
    ('XGB', xgb_clf_ng),
]

In [None]:
results_n_grams = defaultdict(dict)

# Evaluate every candidate on the single train/test split using the n-gram
# features, and keep one metrics dict per model label.
for model in models:
    model_name, estimator = model
    # model_eval is imported from common_functions; its result is unpacked
    # here as (recall, precision, f1, accuracy). The feature-importance cell
    # further down reads fitted attributes of rf_clf_ng, so model_eval is
    # presumably what fits each estimator in place.
    rec, prec, f1_score, acc = model_eval(
        estimator, X_train_n_grams_df_fin, y_train,
        X_test_n_grams_df_fin, y_test,
        accuracy_score, classification_report,
    )
    results_n_grams[model_name] = {
        'accuracy': acc,
        'recall': rec,
        'precision': prec,
        'f1_score': f1_score,
    }
    

In [None]:
# Display the metrics collected per model for the n-gram features.
results_n_grams

In [None]:
# Pair every input feature name of the n-gram random forest with its importance.
# Both attributes exist only on a fitted estimator, so this cell relies on
# rf_clf_ng having been trained earlier (presumably by model_eval in the
# evaluation loop).
importances_df = pd.DataFrame(
    dict(
        feature_names=rf_clf_ng.feature_names_in_,
        importances=rf_clf_ng.feature_importances_,
    )
)

In [None]:
# The 20 features with the highest random-forest importance.
importances_df.sort_values(by=['importances'], ascending=False).iloc[:20]

In [None]:
# Per-column summary statistics of the n-gram training features (DataFrame.describe()).
describe_df = X_train_n_grams_df_fin.describe()

In [None]:
# Mean of every feature over the training rows, largest first. For an
# all-numeric frame the second row of describe() is the one labelled 'mean'.
describe_df.loc['mean'].sort_values(ascending=False)

In [None]:
# Normalizer rescales each row (sample) to unit norm. The same fitted object is
# applied to the n-gram training and test matrices.
normalizer2 = Normalizer().fit(X_train_n_grams_df_fin)
X_train_n_grams_df_fin_norm = normalizer2.transform(X_train_n_grams_df_fin)
X_test_n_grams_df_fin_norm = normalizer2.transform(X_test_n_grams_df_fin)

In [None]:
# Candidate classifiers for the row-normalised n-gram features.
# Every estimator that accepts a seed gets its own fixed random_state.
nb_clf_ng_norm = MultinomialNB()
svc_clf_ng_norm = SVC(probability=True, random_state=23)
ada_clf_ng_norm = AdaBoostClassifier(random_state=26)
tree_clf_ng_norm = DecisionTreeClassifier(random_state=22)
rf_clf_ng_norm = RandomForestClassifier(random_state=24)
xgb_clf_ng_norm = XGBClassifier(random_state=25)

# (label, estimator) pairs in the order they will be evaluated.
models = [
    ('MNB', nb_clf_ng_norm),
    ('SVC', svc_clf_ng_norm),
    ('ABC', ada_clf_ng_norm),
    ('DT', tree_clf_ng_norm),
    ('RF', rf_clf_ng_norm),
    ('XGB', xgb_clf_ng_norm),
]

In [None]:
results_ng_norm = defaultdict(dict)

# Evaluate every candidate on the single train/test split using the
# normalised n-gram features, and keep one metrics dict per model label.
for model in models:
    model_name, estimator = model
    # model_eval is imported from common_functions; its result is unpacked
    # here as (recall, precision, f1, accuracy).
    rec, prec, f1_score, acc = model_eval(
        estimator, X_train_n_grams_df_fin_norm, y_train,
        X_test_n_grams_df_fin_norm, y_test,
        accuracy_score, classification_report,
    )
    results_ng_norm[model_name] = {
        'accuracy': acc,
        'recall': rec,
        'precision': prec,
        'f1_score': f1_score,
    }
    

In [None]:
# Display the metrics collected per model for the normalised n-gram features.
results_ng_norm

In [None]:
# Class weights passed to the estimators: class 1 counts five times as much as
# class 0 during training.
weights = {0: 1, 1: 5}

tree_clf_ng_norm_weight = DecisionTreeClassifier(random_state=61, class_weight=weights)
svc_clf_ng_norm_weight = SVC(probability=True, random_state=62, class_weight=weights)
rf_clf_ng_norm_weight = RandomForestClassifier(random_state=63, class_weight=weights)

# Only the three estimators built with class_weight above are evaluated here;
# AdaBoost and XGBoost are left out of this weighted round.
models = [
    ('SVC', svc_clf_ng_norm_weight),
    ('DT', tree_clf_ng_norm_weight),
    ('RF', rf_clf_ng_norm_weight),
]

results_ng_norm_weight = defaultdict(dict)

# Evaluate the weighted models on the normalised n-gram features and keep one
# metrics dict per model label.
for model in models:
    model_name, estimator = model
    # model_eval is imported from common_functions; its result is unpacked
    # here as (recall, precision, f1, accuracy).
    rec, prec, f1_score, acc = model_eval(
        estimator, X_train_n_grams_df_fin_norm, y_train,
        X_test_n_grams_df_fin_norm, y_test,
        accuracy_score, classification_report,
    )
    results_ng_norm_weight[model_name] = {
        'accuracy': acc,
        'recall': rec,
        'precision': prec,
        'f1_score': f1_score,
    }
    

In [None]:
# Display the metrics of the class-weighted models on the normalised n-gram features.
results_ng_norm_weight