### Import Modules

In [1]:
import numpy as np
import pandas as pd
import datetime as dt

# Text
import nltk, re, string
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Fetch the NLTK resources this notebook relies on: the Punkt models used by
# sent_tokenize/word_tokenize, the English stop-word list, and WordNet for the
# lemmatizer.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# requesting both keeps the notebook runnable across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Load Data

In [3]:
# Colab-only: mount Google Drive so the previously processed CSV can be read.
from google.colab import drive

# Absolute mount point, reused for the read below so that loading the file does
# not depend on the kernel's current working directory.
DRIVE_MOUNT_POINT = '/content/gdrive'

drive.mount(DRIVE_MOUNT_POINT, force_remount = True)
processed_df = pd.read_csv(DRIVE_MOUNT_POINT + '/My Drive/IBM/processed_df.csv')

Mounted at /content/gdrive


### Tokenize and Lemmatize

In [4]:
# NOTE: despite its name this is a WordNet lemmatizer, not a stemmer. The name
# is kept because it is a notebook-level global.
stemmer = WordNetLemmatizer()

# Built once up front. Calling stopwords.words('english') for every token (as
# the loop used to) repeated the same lookup and then scanned a list linearly;
# a frozenset gives the same membership answers in constant time.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_HAS_LETTER = re.compile('[a-zA-Z]')
_PUNCTUATION = re.compile(r'[^\w\s]')


def tokenize_and_stem(text):
    """Lower-case, tokenize, filter and lemmatize a piece of text.

    Steps, in order:
      1. Lower-case the text.
      2. Split it into sentences, then into word tokens, with NLTK.
      3. Keep only tokens that contain at least one ASCII letter and are not
         English stop words. The stop-word check is made on the raw token,
         before punctuation is removed.
      4. Remove every character that is neither a word character nor
         whitespace from the kept tokens.
      5. Lemmatize each token with the module-level ``stemmer``. No
         part-of-speech argument is passed, so the lemmatizer's default
         applies.

    Parameters
    ----------
    text : str
        Raw text to process. Any non-string value (for example the float NaN
        that pandas uses for a missing cell) yields an empty list instead of
        raising ``AttributeError``.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    lowers = text.lower()
    tokens = [word for sent in nltk.sent_tokenize(lowers) for word in nltk.word_tokenize(sent)]

    # Filter out tokens without letters (numbers, bare punctuation) and stop
    # words, then strip leftover punctuation from the survivors.
    filtered_tokens = [
        _PUNCTUATION.sub('', token)
        for token in tokens
        if _HAS_LETTER.search(token) and token not in _ENGLISH_STOPWORDS
    ]
    return [stemmer.lemmatize(token) for token in filtered_tokens]

In [5]:
# Run the tokenizer over every complaint narrative and store the resulting
# token lists in a new column.
processed_df['tokenized narrative'] = (
    processed_df['Consumer complaint narrative'].map(tokenize_and_stem)
)

In [17]:
# Preview the first 10 rows to inspect the new 'tokenized narrative' column.
processed_df.head(10)

Unnamed: 0,Sub-product,Consumer complaint narrative,tokenized narrative
0,General-purpose credit card or charge card,CITI BANK in his response to XXXX asked me idi...,"[citi, bank, response, xxxx, asked, idiotic, q..."
1,Other debt,THIS ACCOUNT IS A RESULT OF IDENTITY THEFT.,"[account, result, identity, theft]"
2,Other debt,Pro Collect has sent documents that dont match...,"[pro, collect, sent, document, dont, match, mo..."
3,Medical debt,On XX/XX/2019 I sent PlusFour Inc a certified ...,"[xxxx2019, sent, plusfour, inc, certified, let..."
4,Credit card debt,Client Services is reporting negative informat...,"[client, service, reporting, negative, informa..."
5,Credit reporting,What they have reported : i was late on a paym...,"[reported, late, payment, loan, paid, full, ne..."
6,Other debt,I checked my credit report which shows delinqu...,"[checked, credit, report, show, delinquent, co..."
7,Medical debt,I started this journey XX/XX/XXXX of this year...,"[started, journey, xxxxxxxx, year, checked, cr..."
8,Medical debt,In XX/XX/XXXX I was approved for 100 % financi...,"[xxxxxxxx, approved, financial, aid, xxxx, xxx..."
9,Credit reporting,ARS ACCOUNT RESOLUTIONS are still reporting in...,"[ar, account, resolution, still, reporting, in..."


In [18]:
# Join each token list back into a single space-separated string, then delete
# every run of two or more consecutive 'x' characters from it.
processed_df['stemmed narrative'] = [
    re.sub('x{2,}', '', ' '.join(token_list))
    for token_list in processed_df['tokenized narrative']
]

In [19]:
# Preview the first rows to inspect the new 'stemmed narrative' column.
processed_df.head()

Unnamed: 0,Sub-product,Consumer complaint narrative,tokenized narrative,stemmed narrative
0,General-purpose credit card or charge card,CITI BANK in his response to XXXX asked me idi...,"[citi, bank, response, xxxx, asked, idiotic, q...",citi bank response asked idiotic question ina...
1,Other debt,THIS ACCOUNT IS A RESULT OF IDENTITY THEFT.,"[account, result, identity, theft]",account result identity theft
2,Other debt,Pro Collect has sent documents that dont match...,"[pro, collect, sent, document, dont, match, mo...",pro collect sent document dont match move date...
3,Medical debt,On XX/XX/2019 I sent PlusFour Inc a certified ...,"[xxxx2019, sent, plusfour, inc, certified, let...",2019 sent plusfour inc certified letter via us...
4,Credit card debt,Client Services is reporting negative informat...,"[client, service, reporting, negative, informa...",client service reporting negative information ...


In [20]:
# Drop the intermediate token-list column now that the joined text column
# exists. Rebinding the name (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: executing it a second time
# no longer raises KeyError for the already-removed column.
processed_df = processed_df.drop(columns=['tokenized narrative'], errors='ignore')
processed_df.head()

Unnamed: 0,Sub-product,Consumer complaint narrative,stemmed narrative
0,General-purpose credit card or charge card,CITI BANK in his response to XXXX asked me idi...,citi bank response asked idiotic question ina...
1,Other debt,THIS ACCOUNT IS A RESULT OF IDENTITY THEFT.,account result identity theft
2,Other debt,Pro Collect has sent documents that dont match...,pro collect sent document dont match move date...
3,Medical debt,On XX/XX/2019 I sent PlusFour Inc a certified ...,2019 sent plusfour inc certified letter via us...
4,Credit card debt,Client Services is reporting negative informat...,client service reporting negative information ...


In [21]:
# Colab-only: write the result to the runtime's local disk (without the index
# column), then hand that file to the browser for download.
from google.colab import files

output_csv = 'lemmatized.csv'
processed_df.to_csv(output_csv, index = False)
files.download(output_csv)