# Load and install necessary packages


In [1]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# !pip install --user gensim 
# !pip install --user gensim pyLDAvis wordlcoud

# Read the Dataset into a dataframe with apt encoding

In [3]:
import os, pandas as pd
# Add asset from file system
complaints_df = pd.read_csv(os.environ['DSX_PROJECT_DIR']+'/datasets/Complaints_All_2018_Consolidated_Final_For_upload.csv',
                            encoding='ISO-8859-1')
complaints_df.columns



Index(['ConcernID', 'IssueID', 'ConcernStatus', 'Team', 'ReceivedDate',
       'AcknowledgementDate', 'ResolutionDate', 'ANZReceivedDate',
       'HardshipDate', '21DayLetterDueDate', '45DayLetterDueDate',
       'ResponseDueDate', 'AcknowledgementTurnAroundTime',
       'ResolutionTurnAroundTime', 'IssueType', 'Channel', 'Breach',
       'ConcernDescription', 'IssueDescription', 'ResolutionType',
       'ResolutionDescription', 'EncounterCategory', 'EncounterType',
       'BusinessArea', 'BusinessDepartment', 'ProductCategory', 'ProductType',
       'FOSFlag', 'FOSFileNumber', 'ChargedToBU', 'Tribunal', 'Segment',
       'Clause'],
      dtype='object')

# Explore Data

In [4]:
complaints_df.head(3)

Unnamed: 0,ConcernID,IssueID,ConcernStatus,Team,ReceivedDate,AcknowledgementDate,ResolutionDate,ANZReceivedDate,HardshipDate,21DayLetterDueDate,...,BusinessArea,BusinessDepartment,ProductCategory,ProductType,FOSFlag,FOSFileNumber,ChargedToBU,Tribunal,Segment,Clause
0,964163,1016404,CLOSE,Internal Dispute Team,2017-07-18,2017-05-24,2017-09-05,2017-05-24,,2017-06-13,...,Operations,Other Servicing,Not Product related,Not Product related,,,,,,13
1,965539,1017305,CLOSE,Internal Dispute Team,2017-07-19,2017-06-02,2017-08-24,2017-06-02,2016-12-06,2017-06-22,...,Operations,Collections / Hardship,Business Lending,Other,,,,,Small Business Banking,32
2,967004,1015985,CLOSE,Internal Dispute Team,2017-07-02,2017-06-14,2017-08-23,2017-06-14,,2017-07-04,...,Products,Mortgages,Mortgages,Variable Rate Home Loan,,,,,,13


# Combine the text corpus to create the final dataframe

In [5]:
columns= ['ConcernDescription','IssueDescription','ResolutionDescription']
keywords_df = complaints_df[columns]
keywords_df['keyword_corpus'] = keywords_df[columns[0]]+"."+keywords_df[columns[1]]+"."+keywords_df[columns[2]]
keywords_df['keyword_corpus'].head(3)

0    Unclaimed Monies  - ANZ Account - $1,978,74. I...
1    Customer is unable to pay his business loans s...
2    I am writing to express our grave concern abou...
Name: keyword_corpus, dtype: object

In [6]:
keywords_df['keyword_corpus'][0]

"Unclaimed Monies  - ANZ Account - $1,978,74. I am emailing from London as a former Sydney ANZ customer and have been trying to claim my ANZ bank account money ($1,978.74,) for over 12 months now. Over 12 months ago submitted by post all of the required documentation, including the ANZ unclaimed monies form plus all of the certified Proof of ID documentation and bank account details, together with an ASICs print out showing that $1,978.74 is listed as unclaimed funds under my name and current London address (See Below.) Once again this year I submitted a request by post from London to 'ANZ Unclaimed Monies Melbourne' for the above amount in February 2017 and it was signed for in Melbourne on 7 March 2017 at the below address. Once again I included all of the relevant applications forms, bank details and other required certified supporting documentation. You should have everything you require to complete the transfer..unclaimed money request.CA - 5 September 2017 - Feedback noted. Estat

# Clean the data with regex


In [7]:
import re
from nltk.tokenize import word_tokenize

In [8]:
#remove email
def noemail(string):
    string = re.sub(r'[\w\.-]+@[\w\.-]+\.\w+',' ',string)
    return string

In [9]:
#clean the strings
def cleanstring(string):
    cstring = []
    for x in string:
        if x.isalpha():
            cstring.append(x)
        elif x ==' ':
            cstring.append(x)
        else:
            cstring = cstring
    cstring = ''.join(cstring)
    return cstring

In [10]:
def preprocess(doc):
    doc = nltk.word_tokenize(doc)
    doc = nltk.pos_tag(doc)
    return doc

In [11]:
# remove names
def removeNNP(string):
    doc = []
    for wordy in string:
        if wordy[1]=='NNP':
            doc = doc
        else:
            doc.append(wordy[0])
    return(doc)

In [12]:
#create new column for tokens
keywords_df['keyword_tokens'] = keywords_df['keyword_corpus'].copy()

In [13]:
#fill blanks: NOTE: this is not needed with Nijesh's update
keywords_df['keyword_tokens'] = keywords_df['keyword_corpus'].where((pd.notnull(keywords_df['keyword_tokens'])), "nblank")

In [14]:
#remove emails
keywords_df['keyword_tokens'] = keywords_df['keyword_tokens'].apply(noemail)

In [15]:
#clean to only is alpha
keywords_df['keyword_tokens'] = keywords_df['keyword_tokens'].apply(cleanstring)

# Remove the stopwords using Spacy


In [18]:
#! pip install spacy
! python -m spacy download en

Collecting en_core_web_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1MB)
[K    100% |████████████████████████████████| 11.1MB 54.4MB/s ta 0:00:01
[?25hInstalling collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25l- \ done
[?25hSuccessfully installed en-core-web-sm-2.1.0
[33mYou are using pip version 19.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/opt/conda/lib/python3.5/site-packages/en_core_web_sm -->
/user-home/1068/.local/lib/python3.5/site-packages/spacy/data/en
You can now load the

In [19]:
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load("en")
nlpstop = nlp.Defaults.stop_words

In [20]:
from gensim.parsing.preprocessing import remove_stopwords

In [21]:
def removestop(docs):
    remaining = remove_stopwords(docs)
    return remaining

In [22]:
keywords_df['keyword_tokens'] = keywords_df['keyword_tokens'].apply(removestop)
keywords_df['keyword_tokens'][0]

'Unclaimed Monies ANZ Account I emailing London Sydney ANZ customer trying claim ANZ bank account money months Over months ago submitted post required documentation including ANZ unclaimed monies form plus certified Proof ID documentation bank account details ASICs print showing listed unclaimed funds current London address See Below Once year I submitted request post London ANZ Unclaimed Monies Melbourne February signed Melbourne March address Once I included relevant applications forms bank details required certified supporting documentation You require complete transferunclaimed money requestCA September Feedback noted Estates confirm funds remitted customer August June Referred unclaimed monies deceased estates team Unclaimed Monies team responsed cust june All necessary d'

# Tokenise the data

In [23]:
def tokens(docs):
    tokes = nltk.word_tokenize(docs)
    return tokes

In [26]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /user-
[nltk_data]     home/1068/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
#tokenize
keywords_df['keyword_tokens'] = keywords_df['keyword_tokens'].apply(tokens)

LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/user-home/1068/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************

In [86]:
print(keywords_df['keyword_tokens'].head(5))

0    [Unclaimed, Monies, ANZ, Account, I, emailing,...
1    [Customer, unable, pay, business, loans, expec...
2    [I, writing, express, grave, concern, processi...
3    [Customer, request, return, documents, sensiti...
4    [Customer, disappointed, collections, calls, r...
Name: keyword_tokens, dtype: object
