# 1. DATA LOADING

In [None]:
import json
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt') # necessary for NLTK's tokenizers
nltk.download('averaged_perceptron_tagger') # needed for NLTK's part-of-speech (POS) tagging

import re, spacy, string

import en_core_web_sm
nlp = en_core_web_sm.load() #spaCy for text processing

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Read the raw complaint export. JSON text is UTF-8, so the encoding is given
# explicitly instead of relying on the platform's default text encoding.
with open("/content/customertickets.json", encoding="utf-8") as f:
    data = json.load(f)

# Flatten the nested records into one column per field (nested keys become
# dotted column names such as `_source.issue`)
df = pd.json_normalize(data)

# 2. TEXT PROCESSING


## Feature selection

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78313 entries, 0 to 78312
Data columns (total 22 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   _index                             78313 non-null  object 
 1   _type                              78313 non-null  object 
 2   _id                                78313 non-null  object 
 3   _score                             78313 non-null  float64
 4   _source.tags                       10900 non-null  object 
 5   _source.zip_code                   71556 non-null  object 
 6   _source.complaint_id               78313 non-null  object 
 7   _source.issue                      78313 non-null  object 
 8   _source.date_received              78313 non-null  object 
 9   _source.state                      76322 non-null  object 
 10  _source.consumer_disputed          78313 non-null  object 
 11  _source.product                    78313 non-null  obj

In [None]:
df.head()

Unnamed: 0,index,type,id,score,tags,zip_code,complaint_id,subject,date_received,state,...,company_response,company,submitted_via,date_sent_to_company,company_public_response,sub_product,timely,complaint_what_happened,sub_issue,consumer_consent_provided
0,complaint-public-v2,complaint,3211475,0.0,,90301,3211475,Attempts to collect debt not owed,2019-04-13T12:00:00-05:00,CA,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2019-04-13T12:00:00-05:00,,Credit card debt,Yes,,Debt is not yours,Consent not provided
1,complaint-public-v2,complaint,3229299,0.0,Servicemember,319XX,3229299,Written notification about debt,2019-05-01T12:00:00-05:00,GA,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2019-05-01T12:00:00-05:00,,Credit card debt,Yes,Good morning my name is XXXX XXXX and I apprec...,Didn't receive enough information to verify debt,Consent provided
2,complaint-public-v2,complaint,3199379,0.0,,77069,3199379,"Other features, terms, or problems",2019-04-02T12:00:00-05:00,TX,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2019-04-02T12:00:00-05:00,,General-purpose credit card or charge card,Yes,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Problem with rewards from credit card,Consent provided
3,complaint-public-v2,complaint,2673060,0.0,,48066,2673060,Trouble during payment process,2017-09-13T12:00:00-05:00,MI,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2017-09-14T12:00:00-05:00,,Conventional home mortgage,Yes,,,Consent not provided
4,complaint-public-v2,complaint,3203545,0.0,,10473,3203545,Fees or interest,2019-04-05T12:00:00-05:00,NY,...,Closed with explanation,JPMORGAN CHASE & CO.,Referral,2019-04-05T12:00:00-05:00,,General-purpose credit card or charge card,Yes,,Charged too much interest,


In [None]:
print(df.columns)

Index(['index', 'type', 'id', 'score', 'tags', 'zip_code', 'complaint_id',
       'subject', 'date_received', 'state', 'consumer_disputed', 'product',
       'company_response', 'company', 'submitted_via', 'date_sent_to_company',
       'company_public_response', 'sub_product', 'timely',
       'complaint_what_happened', 'sub_issue', 'consumer_consent_provided'],
      dtype='object')


In [None]:
# Short column names, assigned by position: the list must contain one entry per
# column of the loaded frame, in the frame's column order.
renamed_columns = [
    'index', 'type', 'id', 'score', 'tags', 'zip_code',
    'complaint_id', 'subject', 'date_received', 'state',
    'consumer_disputed', 'product', 'company_response', 'company',
    'submitted_via', 'date_sent_to_company', 'company_public_response',
    'sub_product', 'timely', 'complaint_what_happened', 'sub_issue',
    'consumer_consent_provided',
]
df.columns = renamed_columns

In [None]:
df.head()

Unnamed: 0,index,type,id,score,tags,zip_code,complaint_id,subject,date_received,state,...,company_response,company,submitted_via,date_sent_to_company,company_public_response,sub_product,timely,complaint_what_happened,sub_issue,consumer_consent_provided
0,complaint-public-v2,complaint,3211475,0.0,,90301,3211475,Attempts to collect debt not owed,2019-04-13T12:00:00-05:00,CA,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2019-04-13T12:00:00-05:00,,Credit card debt,Yes,,Debt is not yours,Consent not provided
1,complaint-public-v2,complaint,3229299,0.0,Servicemember,319XX,3229299,Written notification about debt,2019-05-01T12:00:00-05:00,GA,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2019-05-01T12:00:00-05:00,,Credit card debt,Yes,Good morning my name is XXXX XXXX and I apprec...,Didn't receive enough information to verify debt,Consent provided
2,complaint-public-v2,complaint,3199379,0.0,,77069,3199379,"Other features, terms, or problems",2019-04-02T12:00:00-05:00,TX,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2019-04-02T12:00:00-05:00,,General-purpose credit card or charge card,Yes,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Problem with rewards from credit card,Consent provided
3,complaint-public-v2,complaint,2673060,0.0,,48066,2673060,Trouble during payment process,2017-09-13T12:00:00-05:00,MI,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2017-09-14T12:00:00-05:00,,Conventional home mortgage,Yes,,,Consent not provided
4,complaint-public-v2,complaint,3203545,0.0,,10473,3203545,Fees or interest,2019-04-05T12:00:00-05:00,NY,...,Closed with explanation,JPMORGAN CHASE & CO.,Referral,2019-04-05T12:00:00-05:00,,General-purpose credit card or charge card,Yes,,Charged too much interest,


In [None]:
# Print the distinct values of candidate columns to decide which ones to keep.
# type / index / score: not needed.
# company_response / timely: not needed, because the response is what we will provide.
# submitted_via / tags: printed for inspection only.
inspected_columns = (
    'type', 'index', 'score', 'company_response', 'timely', 'submitted_via', 'tags',
)
for column in inspected_columns:
    print(df[column].unique())

['complaint']
['complaint-public-v2']
[0.]
['Closed with explanation' 'Closed with monetary relief'
 'Closed with non-monetary relief' 'Closed with relief' 'Closed'
 'Closed without relief' 'In progress' 'Untimely response']
['Yes' 'No']
['Web' 'Referral' 'Phone' 'Postal mail' 'Fax' 'Email']
[None 'Servicemember' 'Older American' 'Older American, Servicemember']


In [None]:
# Count the rows in which `id` and `complaint_id` hold the same value
int((df['id'] == df['complaint_id']).sum())

78313

In [None]:
# Columns that are not used by the rest of the notebook
unused_columns = [
    'type', 'tags', 'sub_issue', 'index', 'id', 'score', 'zip_code',
    'timely', 'company', 'company_public_response', 'submitted_via',
    'consumer_consent_provided', 'company_response', 'sub_product',
    'product', 'state', 'consumer_disputed',
]
df = df.drop(columns=unused_columns)

In [None]:
df.head()

Unnamed: 0,complaint_id,subject,date_received,date_sent_to_company,complaint_what_happened
0,3211475,Attempts to collect debt not owed,2019-04-13T12:00:00-05:00,2019-04-13T12:00:00-05:00,
1,3229299,Written notification about debt,2019-05-01T12:00:00-05:00,2019-05-01T12:00:00-05:00,Good morning my name is XXXX XXXX and I apprec...
2,3199379,"Other features, terms, or problems",2019-04-02T12:00:00-05:00,2019-04-02T12:00:00-05:00,I upgraded my XXXX XXXX card in XX/XX/2018 and...
3,2673060,Trouble during payment process,2017-09-13T12:00:00-05:00,2017-09-14T12:00:00-05:00,
4,3203545,Fees or interest,2019-04-05T12:00:00-05:00,2019-04-05T12:00:00-05:00,


In [None]:
df.shape

(78313, 5)

## Removing complaints that are blank in both the subject and complaint columns

In [None]:
df.shape

(78313, 5)

In [None]:
# A row that has an empty string in any of these columns is blanked out
# entirely: every cell of that row becomes NaN, not only the empty one.
checked_columns = ['subject', 'date_received', 'date_sent_to_company', 'complaint_what_happened']
has_blank = (df[checked_columns] == '').any(axis=1)
df[has_blank] = np.nan

In [None]:
# Drop the rows in which both `subject` and `complaint_what_happened` are NaN
# (how='all' applies to the two subset columns only); `df` is modified in place.
df.dropna(subset=['subject', 'complaint_what_happened'], how='all', inplace=True)

In [None]:
df.shape

(21072, 5)

## Text cleaning

1. Make the text lowercase
2. Remove punctuation

In [None]:
import re
def clean_text(sent):
    """Lowercase a piece of text and remove its punctuation.

    Args:
        sent: Text to clean. A value that is not a string (for example
            ``None`` or a float ``NaN`` from a missing cell) is treated as
            empty text instead of raising.

    Returns:
        The lowercased text with every character removed that is neither a
        word character (letter, digit, underscore) nor whitespace. An empty
        string is returned for non-string input.
    """
    if not isinstance(sent, str):
        return ''

    sent = sent.lower()  # Text to lowercase

    # Raw string: in a plain literal the backslash-w and backslash-s pairs are
    # invalid escape sequences and trigger a warning on current Python versions.
    pattern = r'[^\w\s]'  # Anything that is not a word character or whitespace
    return re.sub(pattern, '', sent)

In [None]:
# Lowercase and strip punctuation from both free-text columns
for text_column in ('complaint_what_happened', 'subject'):
    df[text_column] = df[text_column].apply(clean_text)

In [None]:
df.head()

Unnamed: 0,complaint_id,subject,date_received,date_sent_to_company,complaint_what_happened
1,3229299,written notification about debt,2019-05-01T12:00:00-05:00,2019-05-01T12:00:00-05:00,good morning my name is xxxx xxxx and i apprec...
2,3199379,other features terms or problems,2019-04-02T12:00:00-05:00,2019-04-02T12:00:00-05:00,i upgraded my xxxx xxxx card in xxxx2018 and w...
10,3233499,incorrect information on your report,2019-05-06T12:00:00-05:00,2019-05-06T12:00:00-05:00,chase card was reported on xxxx2019 however fr...
11,3180294,incorrect information on your report,2019-03-14T12:00:00-05:00,2019-03-15T12:00:00-05:00,on xxxx2018 while trying to book a xxxx xxxx ...
14,3224980,managing an account,2019-04-27T12:00:00-05:00,2019-04-27T12:00:00-05:00,my grand son give me check for 160000 i deposi...


## Lemmatization

In [None]:
def lemmmatize_text(text):
    """Return `text` with each token replaced by its lemma.

    The text is processed with the notebook-level `nlp` pipeline and the
    lemmas of all resulting tokens are joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [None]:
# Lemmatize the narrative into a new `complaint` column and the subject in place
for target_column, source_column in (
    ('complaint', 'complaint_what_happened'),
    ('subject', 'subject'),
):
    df[target_column] = df[source_column].apply(lemmmatize_text)

In [None]:
# The lemmatized text now lives in `complaint`, so the source column is removed
df = df.drop(columns=['complaint_what_happened'])

In [None]:
df

Unnamed: 0,complaint_id,subject,date_received,date_sent_to_company,complaint
1,3229299,write notification about debt,2019-05-01T12:00:00-05:00,2019-05-01T12:00:00-05:00,good morning my name be and I appreciate it ...
2,3199379,other feature term or problem,2019-04-02T12:00:00-05:00,2019-04-02T12:00:00-05:00,I upgrade my card in 2018 and be tell by the...
10,3233499,incorrect information on your report,2019-05-06T12:00:00-05:00,2019-05-06T12:00:00-05:00,chase card be report on 2019 however fraudulen...
11,3180294,incorrect information on your report,2019-03-14T12:00:00-05:00,2019-03-15T12:00:00-05:00,on 2018 while try to book a ticket I com...
14,3224980,manage an account,2019-04-27T12:00:00-05:00,2019-04-27T12:00:00-05:00,my grand son give I check for 160000 I deposit...
...,...,...,...,...,...
78303,3094545,advertising and marketing include promotional ...,2018-12-07T12:00:00-05:00,2018-12-07T12:00:00-05:00,after be a chase card customer for well over a...
78309,3091984,other feature term or problem,2018-12-05T12:00:00-05:00,2018-12-05T12:00:00-05:00,on wednesday I call chas my visa credit car...
78310,3133355,problem with a lender or other company charge ...,2019-01-25T12:00:00-05:00,2019-01-25T12:00:00-05:00,I be not familiar with pay and do not underst...
78311,3110963,other feature term or problem,2018-12-27T12:00:00-05:00,2018-12-27T12:00:00-05:00,I have have flawless credit for 30 yrs I ve ha...


## 4. The customers' personal details have been masked in the dataset with xxxx. Remove the masked text.

In [None]:
# Delete every literal occurrence of the mask token from the complaint text
mask_token = 'xxxx'
df['complaint'] = df['complaint'].str.replace(mask_token, '', regex=False)

In [None]:
df

Unnamed: 0,complaint_id,subject,date_received,date_sent_to_company,complaint
1,3229299,write notification about debt,2019-05-01T12:00:00-05:00,2019-05-01T12:00:00-05:00,good morning my name be and I appreciate it ...
2,3199379,other feature term or problem,2019-04-02T12:00:00-05:00,2019-04-02T12:00:00-05:00,I upgrade my card in 2018 and be tell by the...
10,3233499,incorrect information on your report,2019-05-06T12:00:00-05:00,2019-05-06T12:00:00-05:00,chase card be report on 2019 however fraudulen...
11,3180294,incorrect information on your report,2019-03-14T12:00:00-05:00,2019-03-15T12:00:00-05:00,on 2018 while try to book a ticket I com...
14,3224980,manage an account,2019-04-27T12:00:00-05:00,2019-04-27T12:00:00-05:00,my grand son give I check for 160000 I deposit...
...,...,...,...,...,...
78303,3094545,advertising and marketing include promotional ...,2018-12-07T12:00:00-05:00,2018-12-07T12:00:00-05:00,after be a chase card customer for well over a...
78309,3091984,other feature term or problem,2018-12-05T12:00:00-05:00,2018-12-05T12:00:00-05:00,on wednesday I call chas my visa credit car...
78310,3133355,problem with a lender or other company charge ...,2019-01-25T12:00:00-05:00,2019-01-25T12:00:00-05:00,I be not familiar with pay and do not underst...
78311,3110963,other feature term or problem,2018-12-27T12:00:00-05:00,2018-12-27T12:00:00-05:00,I have have flawless credit for 30 yrs I ve ha...


# 3. FEATURE EXTRACTION

The purpose of TF-IDF features is to represent the importance of a term in a document relative to a collection of documents.

1. They normalise the weight of each word to a value between 0 and 1.
2. A higher value means the word is more important in that document.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary filters:
#   stop_words='english' - drop common English stop words
#   max_df=0.95          - drop terms found in more than 95% of the documents
#   min_df=2             - drop terms found in fewer than 2 documents
tfidf = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=2,
)

In [None]:
# Learn the vocabulary from the lemmatized complaints and build the sparse
# document-term matrix (one row per complaint, one column per term)
dtm = tfidf.fit_transform(df['complaint'])
dtm

<21072x12965 sparse matrix of type '<class 'numpy.float64'>'
	with 1220170 stored elements in Compressed Sparse Row format>

In [None]:
print(dtm)

  (0, 1327)	0.20804894713930847
  (0, 11532)	0.14949658496856288
  (0, 1197)	0.06414397700682926
  (0, 3306)	0.14466067423122175
  (0, 6451)	0.10344886407793524
  (0, 6914)	0.10984623821482416
  (0, 10151)	0.13768201553584736
  (0, 2997)	0.18901005584767386
  (0, 1850)	0.1377584517100997
  (0, 7747)	0.10343185438538349
  (0, 7302)	0.1307580680759404
  (0, 9553)	0.08063818905897885
  (0, 6566)	0.15848767262032518
  (0, 12422)	0.22994925509995698
  (0, 1167)	0.24695055765154886
  (0, 11013)	0.12751351745773576
  (0, 10472)	0.09133276292709772
  (0, 12478)	0.20504031788384117
  (0, 3754)	0.44833843042428806
  (0, 1769)	0.19246358596477756
  (0, 12876)	0.13414887790017987
  (0, 336)	0.17036916878529987
  (0, 10512)	0.10836846299968501
  (0, 2545)	0.24771356260138783
  (0, 2022)	0.14674358880609692
  :	:
  (21071, 12660)	0.09179901232416174
  (21071, 7112)	0.08222478879389493
  (21071, 12928)	0.20930397029231562
  (21071, 8599)	0.24704127459823852
  (21071, 6961)	0.1726344127756443
  (21071

1. (i, j) is a tuple representing the indices of a non-zero element in the matrix.
2. The value after the tuple (i, j) is the TF-IDF score of the term at index j in document i.

In [None]:
len(tfidf.get_feature_names_out())

12965

In [None]:
# Mean TF-IDF weight of every term over all documents. The mean is computed on
# the sparse matrix itself: dtm.toarray() would first materialise the complete
# dense documents x terms array only to average it column by column.
mean_scores = np.asarray(dtm.mean(axis=0)).ravel()
tfidf_scores = list(zip(tfidf.get_feature_names_out(), mean_scores))

# Highest mean weight first
sorted_tfidf_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)

# "Most frequent" in the label below refers to the highest mean TF-IDF weight
print("Top 5 most frequent words:")
for term, score in sorted_tfidf_scores[:5]:
    print(f"{term}: {score}")

Top 5 most frequent words:
chase: 0.08024550631783418
account: 0.0691619998034286
credit: 0.05639800573660306
card: 0.052718163864988704
bank: 0.0452348441006841


# 4. TOPIC MODELLING

NMF (Non-negative Matrix Factorization)


Topic Modeling: NMF is commonly used for topic modeling, where it discovers latent topics within text data and gives insight into the main themes or subjects present in a corpus.

In [None]:
len(tfidf.get_feature_names_out())  #unique terms extracted from the text data using the TF-IDF vectorizer.

12965

In [None]:
from sklearn.decomposition import NMF

# Number of latent topics to extract
num_topics = 5

# Fixed random_state so repeated runs start from the same seed
nmf_model = NMF(random_state=40, n_components=num_topics)
nmf_model.fit(dtm)

Each row represents a topic or latent feature identified by the NMF model.

Each column represents a term (word) in the vocabulary obtained from the TF-IDF vectorizer.

In [None]:
# Topic-term weights learned by the fitted model: one row per topic and one
# column per vocabulary term
basis_matrix = nmf_model.components_
basis_matrix

array([[1.64063749e-03, 1.65883841e-02, 6.23587269e-04, ...,
        0.00000000e+00, 2.79313011e-05, 7.67434478e-05],
       [0.00000000e+00, 2.43209269e-02, 0.00000000e+00, ...,
        6.02327179e-04, 0.00000000e+00, 5.69957037e-05],
       [7.96456368e-04, 0.00000000e+00, 6.32935872e-04, ...,
        2.32998162e-03, 3.22207133e-03, 0.00000000e+00],
       [3.38076184e-03, 1.80026271e-02, 0.00000000e+00, ...,
        3.13224847e-04, 2.34182800e-03, 5.92235057e-04],
       [3.64274687e-03, 4.62579691e-02, 0.00000000e+00, ...,
        0.00000000e+00, 3.00971392e-03, 0.00000000e+00]])

In [None]:
print(basis_matrix.shape)

(5, 12965)


In [None]:
words = np.array(tfidf.get_feature_names_out())

# For every topic, pick the 10 terms with the largest weight (highest first)
top_words_per_topic = [
    words[basis_matrix[i].argsort()[::-1][:10]]
    for i in range(num_topics)
]

# One row per topic, one column per rank
topic_words = pd.DataFrame(
    top_words_per_topic,
    index=[f'Topic {i + 1}' for i in range(num_topics)],
    columns=[f'Word {i + 1}' for i in range(10)],
)

topic_words

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic 1,account,check,bank,chase,money,deposit,close,fund,tell,open
Topic 2,credit,report,card,inquiry,chase,hard,remove,account,apply,score
Topic 3,loan,mortgage,chase,home,modification,property,letter,send,year,document
Topic 4,charge,card,chase,dispute,transaction,purchase,fee,merchant,refund,receive
Topic 5,payment,late,pay,make,balance,fee,month,statement,monthly,credit


From the top 10 words extracted for each topic, we can manually name the topics after the departments of the company, so that each complaint can be forwarded to the right department.


DEPARTMENTS

* Topic 1 = Retail Banking Operations
* Topic 2 = Credit card Management
* Topic 3 = Mortgages/loans
* Topic 4 = Dispute reporting
* Topic 5 = Payment and Billing




In [None]:
# Topic weights of every complaint: one row per document, one column per topic
topic_results = nmf_model.transform(dtm)

# Label each complaint with the index of its highest-weighted topic
df['Topic'] = np.argmax(topic_results, axis=1)

df.head()

Unnamed: 0,complaint_id,subject,date_received,date_sent_to_company,complaint,Topic
1,3229299,write notification about debt,2019-05-01T12:00:00-05:00,2019-05-01T12:00:00-05:00,good morning my name be and I appreciate it ...,2
2,3199379,other feature term or problem,2019-04-02T12:00:00-05:00,2019-04-02T12:00:00-05:00,I upgrade my card in 2018 and be tell by the...,3
10,3233499,incorrect information on your report,2019-05-06T12:00:00-05:00,2019-05-06T12:00:00-05:00,chase card be report on 2019 however fraudulen...,1
11,3180294,incorrect information on your report,2019-03-14T12:00:00-05:00,2019-03-15T12:00:00-05:00,on 2018 while try to book a ticket I com...,1
14,3224980,manage an account,2019-04-27T12:00:00-05:00,2019-04-27T12:00:00-05:00,my grand son give I check for 160000 I deposit...,0


In [None]:
# Department label for each topic index (index i is "Topic i + 1" in the
# top-words table). The labels follow the top words of each topic:
#   0: account / check / bank / deposit         -> Retail Banking Operations
#   1: credit / report / card / inquiry         -> Credit card Management
#   2: loan / mortgage / home / modification    -> Mortgages/loans
#   3: charge / dispute / transaction / refund  -> Dispute reporting
#   4: payment / late / balance / statement     -> Payment and Billing
Topic_names = { 0:"Retail Banking Operations", 1:"Credit card Management", 2:"Mortgages/loans",
               3:"Dispute reporting", 4:"Payment and Billing" }

# Replace the numeric topic index by its department name.
# Run this once per topic assignment: mapping already-named rows yields NaN.
df['Topic'] = df['Topic'].map(Topic_names)
df.head()

Unnamed: 0,complaint_id,subject,date_received,date_sent_to_company,complaint,Topic
1,3229299,write notification about debt,2019-05-01T12:00:00-05:00,2019-05-01T12:00:00-05:00,good morning my name be and I appreciate it ...,Payment and Billing
2,3199379,other feature term or problem,2019-04-02T12:00:00-05:00,2019-04-02T12:00:00-05:00,I upgrade my card in 2018 and be tell by the...,Dispute reporting
10,3233499,incorrect information on your report,2019-05-06T12:00:00-05:00,2019-05-06T12:00:00-05:00,chase card be report on 2019 however fraudulen...,Credit card Management
11,3180294,incorrect information on your report,2019-03-14T12:00:00-05:00,2019-03-15T12:00:00-05:00,on 2018 while try to book a ticket I com...,Credit card Management
14,3224980,manage an account,2019-04-27T12:00:00-05:00,2019-04-27T12:00:00-05:00,my grand son give I check for 160000 I deposit...,Retail Banking Operations


# 5. SAVING THE RESULTS

In [None]:
# Write the labelled complaints to output.csv, without the DataFrame index
df.to_csv('output.csv', index=False)