In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install ipynb
!pip install lime
!python -m spacy download el_core_news_lg

%cd '/content/drive/MyDrive/MSc Thesis'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2024-01-25 11:58:06.242086: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-25 11:58:06.242165: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-25 11:58:06.245046: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting el-core-news-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/el_core_news_lg-3.6.0/el_core_news_lg-3.6.0-py3-none-any.whl (568.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [None]:
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import numpy as np
import spacy
import lime
import warnings
from ipynb.fs.full.common_functions import call_spacy_nlp, build_tokens_list, build_lemmas_list, remove_not_alpha_list, convert_list_to_string

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
pd.options.display.max_colwidth = 300

In [None]:
# Execute the lookup-table notebook inside this kernel so that the names it
# defines become available here. Presumably this is where custom_lookup_table
# (called further down when the spaCy pipeline is configured) comes from
# -- TODO confirm.
%run 'lookup_table.ipynb'
# Disabled on purpose: the dataset is read from 'final_dataset.csv' instead.
# Presumably 'Dataset Construction.ipynb' is what produces that file -- verify
# before re-enabling.
# %run 'Dataset Construction.ipynb'

In [None]:
# Load the prepared dataset; dtypes are fixed per column *position* (0-based)
# so that pandas does not have to infer them.
final_dataset = pd.read_csv(
    'final_dataset.csv',
    dtype={
        # integer columns
        0: np.int64,
        1: np.int64,
        10: np.int64,
        # string columns
        2: str,
        3: str,
        7: str,
        8: str,
        9: str,
        # floating-point columns
        4: np.float64,
        5: np.float64,
        6: np.float64,
    },
)

Next, we apply one-hot encoding to **meas_accl_paid_in_full_cm** and **meas_accl_appl_status**:

In [None]:
# Replace each listed categorical column with one indicator column per
# category value (named '<column>_<value>').
final_dataset = pd.get_dummies(
    data=final_dataset,
    columns=[
        'meas_accl_paid_in_full_cm',
        'meas_accl_appl_status',
    ],
)

In [None]:
# Columns kept for modelling, in the order they should appear in the frame.
selected_columns = [
    # identifiers
    'account_id',
    'snapnum',
    # application bucket for the current and the two previous snapshots
    'meas_accl_application_bucket',
    'meas_accl_application_bucket_prev_-1',
    'meas_accl_application_bucket_prev_-2',
    # one-hot indicators
    'meas_accl_paid_in_full_cm_No',
    'meas_accl_paid_in_full_cm_Yes',
    'meas_accl_appl_status_Cancelled',
    'meas_accl_appl_status_Cancelled - Client Rejection',
    'meas_accl_appl_status_Cancelled - Communication Failed',
    'meas_accl_appl_status_Not Fulfilled',
    # free-text comments for the current and the two previous snapshots
    'meas_action_comment_concat_0',
    'meas_action_comment_concat_-1',
    'meas_action_comment_concat_-2',
    # label
    'target',
]
final_dataset = final_dataset[selected_columns]

In [None]:
# Fill missing values column by column: -1 is the sentinel for a missing
# application bucket, and the empty string stands in for a missing comment.
#
# This is done with a single DataFrame.fillna call and a reassignment instead
# of `final_dataset[col].fillna(..., inplace=True)`. The latter is chained
# assignment: under pandas Copy-on-Write it only modifies a temporary Series
# and leaves `final_dataset` unchanged.
final_dataset = final_dataset.fillna({
    'meas_accl_application_bucket': -1,
    'meas_accl_application_bucket_prev_-1': -1,
    'meas_accl_application_bucket_prev_-2': -1,
    'meas_action_comment_concat_0': '',
    'meas_action_comment_concat_-1': '',
    'meas_action_comment_concat_-2': '',
})

Here we take a 1% sample of the dataset, stratified by **target**:

In [None]:
# Fraction of rows drawn from every 'target' class, and the seed that makes
# the draw reproducible across "Restart & Run All".
SAMPLE_FRACTION = 0.01
SAMPLE_SEED = 42

# DataFrameGroupBy.sample draws the same fraction from each class and keeps
# all columns (including 'target'); it replaces the former
# groupby(...).apply(lambda x: x.sample(...)), which was unseeded and relied on
# apply operating on the grouping column (deprecated in recent pandas).
stratified_sample = (
    final_dataset
    .groupby('target')
    .sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

We concatenate the comments from the current, previous, and pre-previous snapshots to create a 3-month history:

In [None]:
# Join the three snapshot comments with single spaces (current first, oldest
# last), then trim the leading/trailing whitespace left by empty comments.
stratified_sample['meas_action_comment_concat_3m'] = (
    stratified_sample['meas_action_comment_concat_0']
    + ' '
    + stratified_sample['meas_action_comment_concat_-1']
    + ' '
    + stratified_sample['meas_action_comment_concat_-2']
).str.strip()

We create a Language object using the spaCy library and extend the lemmatizer rules with the custom lookup table:

In [None]:
# Load the Greek spaCy pipeline.
nlp = spacy.load('el_core_news_lg')

# Add one extra infix rule to the language defaults -- a period that is
# neither preceded nor followed by a digit -- and rebuild the tokenizer's
# infix matcher from the combined pattern list.
period_infix = r"(?<!\d)\.(?!\d)"
infix_regex = spacy.util.compile_infix_regex(nlp.Defaults.infixes + [period_infix])
nlp.tokenizer.infix_finditer = infix_regex.finditer

# Clear the tokenizer's URL-matching hook.
nlp.tokenizer.url_match = None

# Per the notebook text, this extends the lemmatizer rules with the custom
# lookup table; the helper itself is defined outside this cell.
custom_lookup_table(nlp)

In [None]:
# Step 1: pass every 3-month comment string through call_spacy_nlp together
# with the configured pipeline (presumably yielding a spaCy Doc -- confirm in
# common_functions).
stratified_sample['meas_action_comment_concat_3m_doc'] = (
    stratified_sample['meas_action_comment_concat_3m']
    .apply(lambda text: call_spacy_nlp(nlp, text))
)

# Step 2: derive the lemma list of each processed comment.
stratified_sample['meas_action_comment_concat_3m_lemmas'] = (
    stratified_sample['meas_action_comment_concat_3m_doc']
    .apply(build_lemmas_list)
)

# Step 3: turn each lemma list into a single string.
stratified_sample['meas_action_comment_concat_3m_lemmas_string'] = (
    stratified_sample['meas_action_comment_concat_3m_lemmas']
    .apply(convert_list_to_string)
)

In [None]:
# Capture the current working directory as reported by the %pwd magic.
present_directory = %pwd
# Serialize the spaCy pipeline held in `nlp` to that directory.
# NOTE(review): this writes the pipeline files directly into the working
# directory rather than a dedicated sub-folder -- confirm this is intended.
nlp.to_disk(present_directory)

In [None]:
# Checkpoint: persist the whole sample frame (including the derived columns)
# as a pickle in the working directory.
stratified_sample.to_pickle(path='stratified_sample_01_perc.pkl')

In [None]:
# Export only the lemma strings, one record per row, without the index and
# without a header line.
stratified_sample['meas_action_comment_concat_3m_lemmas_string'].to_csv(
    path_or_buf='stratified_sample_01_perc.txt',
    header=False,
    index=False,
)

In [None]:
# Restarting VM: send SIGKILL (-9) to every process this user is allowed to
# signal (PID -1), which also kills the kernel itself. All in-memory state is
# lost and no cell after this one will run, so keep it as the last cell.
!kill -9 -1