In [1]:
import pandas as pd
import numpy as np

In [2]:
# for processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# NLTK resources this notebook needs at run time: the stop-word list, the WordNet
# data used by the lemmatizer, and the Punkt models that word_tokenize loads when
# it is first called (without them word_tokenize raises LookupError).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')      # tokenizer models looked up by older NLTK releases
nltk.download('punkt_tab')  # tokenizer models looked up by newer NLTK releases

# for bag-of-words
from sklearn import feature_extraction, feature_selection, model_selection, naive_bayes, pipeline, manifold, preprocessing, metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

## for train test split
import imblearn

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JYM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JYM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\JYM\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Inject a CSS rule that sets the `.container` element to 100% width so wide
# output is not squeezed into a narrow column. Purely cosmetic: it does not
# touch the data. (Presumably targets the classic Notebook page layout.)
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
# Load the CFPB complaints file (path is relative to the notebook's working
# directory) and print its column index and (rows, columns) shape as a quick
# sanity check of what was read.
cfpb_df = pd.read_csv('../../data/CFPB with Duplicate Marked.csv')
print(cfpb_df.columns)
print(cfpb_df.shape)

  cfpb_df = pd.read_csv('../../data/CFPB with Duplicate Marked.csv')


Index(['Unnamed: 0', 'Date received', 'Product', 'Sub-product', 'Issue',
       'Sub-issue', 'Consumer complaint narrative', 'Company public response',
       'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID', 'narr_len',
       'days_to_today', 'dupi_id', 'dupi_len'],
      dtype='object')
(1300361, 23)


In [5]:
%%time
# Drop duplicates based on the 'dupi_id' column: for each distinct 'dupi_id'
# value only the first row is kept (drop_duplicates' default), then the new
# shape is printed to show how many rows remain.
# NOTE(review): rows with a missing 'dupi_id' would all count as duplicates of
# one another and collapse to a single row - confirm every row has an id.
cfpb_df = cfpb_df.drop_duplicates(subset='dupi_id')
print(cfpb_df.shape)

(1106587, 23)
Wall time: 455 ms


### Some cleaning and normalizing while trying to keep some of the context of the data

Our first try, without any normalization, did not yield good results: the accuracy was below 50% and the recall for debt collection was very low.

In [6]:
## Shared NLTK objects, created once and reused for every narrative

# English stop-word list
lst_stopwords = stopwords.words('english')

# Porter stemmer (suffix stripping)
stemm = nltk.stem.PorterStemmer()

# WordNet-based lemmatizer
lemm = nltk.stem.WordNetLemmatizer()


def nltk_text_normalizer(text):
    '''
    Normalize one complaint narrative into a space-separated string of tokens.

    Steps, in order:
      1. remove the masking runs inserted by CFPB (two or more capital X)
      2. delete every character that is not a word character or whitespace
      3. lower-case and strip the text
      4. tokenize with nltk's word_tokenize
      5. lemmatize each token with the module-level `lemm`, then stem the
         result with the module-level `stemm`

    Stop words are NOT removed; `lst_stopwords` is not applied here.
    word_tokenize and the lemmatizer need their NLTK data to be downloaded
    before this function is called on non-empty text.

    :parameter
        :param text: the raw narrative. Non-string values are converted with
            str(); missing values (None or float NaN) are treated as empty text.
    :return
        str - the normalized text, or '' when nothing is left after cleaning
    '''
    ## A missing narrative must stay empty: str() would otherwise turn
    ## None / NaN into the literal words 'none' / 'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    ## First we clean up the XXXX maskings in the text created by CFPB.
    ## This runs before lower-casing on purpose: the pattern only matches capital X.
    text = re.sub(r'X{2,}', '', str(text).strip())

    ## Remove punctuation and other symbols. Characters are deleted, not replaced
    ## by a space, so "and/or" becomes "andor".
    text = re.sub(r'[^\w\s]', '', text)

    ## Case folding, then trim the whitespace left behind by the removals
    text = text.lower().strip()

    ## Nothing left to tokenize
    if not text:
        return ''

    ## Tokenize (convert from string to list) with nltk
    lst_text = word_tokenize(text)

    ## Lemmatize first (dictionary-based root form), then stem (strip -ing, -ly, ...)
    lst_text = [stemm.stem(lemm.lemmatize(word)) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

def normalize_narr(df):
    '''
    Run nltk_text_normalizer over the 'Consumer complaint narrative' column.

    :parameter
        :param df: DataFrame with a 'Consumer complaint narrative' column
    :return
        the same DataFrame object; the column is overwritten in place
    '''
    narr_col = 'Consumer complaint narrative'
    normalized = df[narr_col].apply(nltk_text_normalizer)
    df[narr_col] = normalized
    return df

In [None]:
%%time
# Normalize every narrative in a single pass over the whole frame.
# The cleaning is purely row-wise, so no grouping is needed. Going through
# groupby(['Product', 'Issue', 'State', 'ZIP code']).apply(...) would leave out
# every row with a missing value in one of those keys (groupby's default
# dropna=True) and depends on mutating each group inside apply, which pandas
# documents as unsupported.
cfpb_df = normalize_narr(cfpb_df)

In [None]:
# Checkpoint the current frame to a CSV in the working directory (row index not
# written), presumably so the normalization step does not have to be re-run.
cfpb_df.to_csv('clean_narr_tmp.csv', index=False)

In [None]:
# Reload the 'clean_narr_tmp.csv' checkpoint; cfpb_df is replaced by the file's contents.
# NOTE(review): read_csv re-infers dtypes and by default reads empty fields back
# as NaN, so narratives that were normalized to '' will come back as missing.
cfpb_df = pd.read_csv('clean_narr_tmp.csv')

In [None]:
# Split dataset into train / dev / test = 75% / 15% / 10%.
# The rows are shuffled once with a fixed seed so the split is reproducible, then
# cut by position. iloc slicing is used instead of np.split because np.split on
# a DataFrame goes through DataFrame.swapaxes, which recent pandas deprecates.
shuffled_df = cfpb_df.sample(len(cfpb_df), random_state = 42)
train_end = int(len(cfpb_df)*0.75)
dev_end = int(len(cfpb_df)*0.9)

train_df = shuffled_df.iloc[:train_end]
dev_df = shuffled_df.iloc[train_end:dev_end]
test_df = shuffled_df.iloc[dev_end:]

# Every row must land in exactly one of the three splits.
assert len(train_df) + len(dev_df) + len(test_df) == len(cfpb_df)

train_df.to_csv('cfpb_train.csv', index=False)
test_df.to_csv('cfpb_test.csv', index=False)
dev_df.to_csv('cfpb_dev.csv', index=False)