In [1]:
#Imports
## 3 years worth of data
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import setuptools
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, Normalizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import matplotlib.pyplot as plt
import numpy as np

#Create a spark session
spark = SparkSession.builder.appName("CompanyFinancialComplaints").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/19 21:46:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Loading and Pre-processing

In [2]:
#Load training data into a data frame
complaints_df = spark.read.json('./input/complaints.json')

#Verify
complaints_df.printSchema()
complaints_df.show(5)

                                                                                

root
 |-- _id: string (nullable = true)
 |-- _index: string (nullable = true)
 |-- _score: string (nullable = true)
 |-- _source: struct (nullable = true)
 |    |-- company: string (nullable = true)
 |    |-- company_public_response: string (nullable = true)
 |    |-- company_response: string (nullable = true)
 |    |-- complaint_id: string (nullable = true)
 |    |-- complaint_what_happened: string (nullable = true)
 |    |-- consumer_consent_provided: string (nullable = true)
 |    |-- consumer_disputed: string (nullable = true)
 |    |-- date_received: string (nullable = true)
 |    |-- date_sent_to_company: string (nullable = true)
 |    |-- issue: string (nullable = true)
 |    |-- product: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- sub_issue: string (nullable = true)
 |    |-- sub_product: string (nullable = true)
 |    |-- submitted_via: string (nullable = true)
 |    |-- tags: string (nullable = true)
 |    |-- timely: string (nullable = true)


                                                                                

In [3]:
print(f"df length: {complaints_df.count()}")
complaints_df.show(25)

df length: 71175
+--------+-------------------+------+--------------------+-----+------+
|     _id|             _index|_score|             _source|_type|  sort|
+--------+-------------------+------+--------------------+-----+------+
|10734962|complaint-public-v2|  NULL|{AMERICA FIRST FE...| _doc| [242]|
|10332134|complaint-public-v2|  NULL|{PENNYMAC LOAN SE...| _doc| [281]|
|10749062|complaint-public-v2|  NULL|{Onity Group Inc....| _doc| [292]|
|10310883|complaint-public-v2|  NULL|{AMERISAVE MORTGA...| _doc| [310]|
|11306698|complaint-public-v2|  NULL|{Shellpoint Partn...| _doc| [395]|
|10285294|complaint-public-v2|  NULL|{Mr. Cooper Group...| _doc| [569]|
|10222910|complaint-public-v2|  NULL|{WELLS FARGO & CO...| _doc| [695]|
|10187626|complaint-public-v2|  NULL|{RoundPoint Mortg...| _doc| [711]|
|10213760|complaint-public-v2|  NULL|{Freedom Mortgage...| _doc| [718]|
|10171606|complaint-public-v2|  NULL|{Shellpoint Partn...| _doc| [807]|
| 8161248|complaint-public-v2|  NULL|{M&T BANK 

In [4]:
# Unnest _source and keep all fields
from pyspark.sql.functions import col

columns_to_keep = [
    "_source.company",
    "_source.company_public_response",
    "_source.company_response",
    "_source.complaint_id",
    "_source.complaint_what_happened",
    "_source.consumer_consent_provided",
    "_source.consumer_disputed",
    "_source.date_received",
    "_source.date_sent_to_company",
    "_source.issue",
    "_source.product",
    "_source.state",
    "_source.sub_issue",
    "_source.sub_product",
    "_source.submitted_via",
    "_source.tags",
    "_source.timely",
    "_source.zip_code",
]

# Select and rename columns
unnested_df = complaints_df.select([col(column).alias(column.split(".")[1]) for column in columns_to_keep])

# Show the result
unnested_df.show(truncate=False)

+----------------------------------+-----------------------------------------------------------------------------------------------+-------------------------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
# Grabbing only rows with complaints
clean_df = unnested_df.filter((col("complaint_what_happened").isNotNull()) & (col("complaint_what_happened") != ""))

# Selecting only relevant columns
final_df = clean_df.select(["issue", "complaint_what_happened"])

print(f"df length: {final_df.count()}")
final_df.show()

                                                                                

df length: 38423
+--------------------+-----------------------+
|               issue|complaint_what_happened|
+--------------------+-----------------------+
|Struggling to pay...|   Wells Fargo engag...|
|Trouble during pa...|   Hi, On XX/XX/XXXX...|
|Trouble during pa...|   In XXXX, XXXX, I ...|
|Trouble during pa...|   it started with X...|
|Closing on a mort...|   XXXX who funds th...|
|Trouble during pa...|   My loan was trans...|
|Trouble during pa...|   Dear CFPB, Refere...|
|Struggling to pay...|   Wells Fargo bank ...|
|Trouble during pa...|   Received notice f...|
|Applying for a mo...|   XXXX  first appli...|
|Struggling to pay...|   I requested a for...|
|Struggling to pay...|   We asked for a Fo...|
|Applying for a mo...|   On XX/XX/2023, I ...|
|Trouble during pa...|   So I am not sure ...|
|Trouble during pa...|   XXXX  ENCLOSED EV...|
|Trouble during pa...|   IN RE : UNITED ST...|
|Applying for a mo...|   In 2018 I was man...|
|Closing on a mort...|   I, XXXX XXXX, am .

                                                                                

In [6]:
# Distinct count of target variables values
unique_count_ap = final_df.select("issue").distinct().count()

print(f"Number of unique issues: {unique_count_ap}")

Number of unique issues: 11


                                                                                

In [20]:
from pyspark.sql.functions import regexp_replace

# Removing special characters and integers
cleaned_final_df = final_df.withColumn("cleaned_text", regexp_replace(col("complaint_what_happened"), r"[\$0-9\n]+", ""))

cleaned_words_df = cleaned_final_df.withColumn("cleaned_complaints", regexp_replace(col("cleaned_text"), r"[^\w\s]", "").alias("cleaned_complaints"))

cleaned_words_df.show(5)

+--------------------+-----------------------+---------------+--------------------+--------------------+
|               issue|complaint_what_happened|sentence_length|        cleaned_text|  cleaned_complaints|
+--------------------+-----------------------+---------------+--------------------+--------------------+
|Struggling to pay...|   Wells Fargo engag...|           1519|Wells Fargo engag...|Wells Fargo engag...|
|Trouble during pa...|   Hi, On XX/XX/XXXX...|            860|Hi, On XX/XX/XXXX...|Hi On XXXXXXXX I ...|
|Trouble during pa...|   In XXXX, XXXX, I ...|           2175|In XXXX, XXXX, I ...|In XXXX XXXX I ob...|
|Trouble during pa...|   it started with X...|           1289|it started with X...|it started with X...|
|Closing on a mort...|   XXXX who funds th...|           1742|XXXX who funds th...|XXXX who funds th...|
+--------------------+-----------------------+---------------+--------------------+--------------------+
only showing top 5 rows



## Data preparation & Text Preprocessing

In [23]:
# Create sentence length column
final_preprocessed = cleaned_words_df.withColumn("sentence_length", F.length(F.col("cleaned_complaints")))
# Summary statistics on complaint text
final_preprocessed.select("sentence_length").describe().show()



+-------+------------------+
|summary|   sentence_length|
+-------+------------------+
|  count|             38423|
|   mean|1622.4322411055878|
| stddev|1822.0424563383447|
|    min|                12|
|    max|             30928|
+-------+------------------+



                                                                                

In [24]:
#Use tokenizer to tokenize the text
tokenizer = Tokenizer(inputCol="cleaned_complaints", outputCol="tokens")
tokenized_final_df = tokenizer.transform(final_preprocessed)

#Print tokenized output
tokenized_final_df.select("cleaned_complaints", "tokens").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [25]:
# Create an instance of StopWordsRemover and use it to filter the tokens and remove stop words
# Removing classified private data in complaints
default_stopwords = StopWordsRemover.loadDefaultStopWords("english")
custom_stopwords = default_stopwords + ["xxxx", "xx", "xx/xx/xxxx", "XXX", 
                                        "XXXX", "XX", "XX/XX/XXXX", "xxx",
                                        "xxxxxxxxxx", "XXXXXXXXXX",
                                        "XXXXXXXX", "xxxxxxxx"]

remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
remover = remover.setStopWords(custom_stopwords)
cleaned_final_df = remover.transform(tokenized_final_df)

# Print the cleaned output
cleaned_final_df.select("tokens", "filtered_tokens").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

## Feature Extraction and Model

In [27]:
# TF-IDF calculation using CountVectorizer
vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="vectorized_tokens")
idf = IDF(inputCol="vectorized_tokens", outputCol="tfidf")

# Normalization
normalizer = Normalizer(inputCol="tfidf", outputCol="normalized_features")

# Create and apply pipeline
pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf, normalizer])
processed_data = pipeline.fit(final_preprocessed).transform(final_preprocessed)

# Show the final processed DataFrame with normalized features
processed_data.select("normalized_features").show(truncate=False)

25/01/19 22:30:32 WARN DAGScheduler: Broadcasting large task binary with size 1484.2 KiB


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [28]:
processed_data.show(5)

25/01/19 22:30:40 WARN DAGScheduler: Broadcasting large task binary with size 1501.7 KiB


+--------------------+-----------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               issue|complaint_what_happened|sentence_length|        cleaned_text|  cleaned_complaints|              tokens|     filtered_tokens|   vectorized_tokens|               tfidf| normalized_features|
+--------------------+-----------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Struggling to pay...|   Wells Fargo engag...|           1505|Wells Fargo engag...|Wells Fargo engag...|[wells, fargo, en...|[wells, fargo, en...|(51616,[0,8,9,10,...|(51616,[0,8,9,10,...|(51616,[0,8,9,10,...|
|Trouble during pa...|   Hi, On XX/XX/XXXX...|            834|Hi, On XX/XX/XXXX...|Hi On XXXXXXXX I ...|[hi, on, xxxxxxxx...|[hi, received, le...|(51616,[0,1,3,

                                                                                

In [29]:
# Getting issue labels for encoding in next step
unique_issues = processed_data.select("issue").distinct().count()
name_list = processed_data.select("issue").distinct().rdd.flatMap(lambda x: x).collect()
print(f"Number of unique issues: {unique_issues}")
print(name_list)

                                                                                

Number of unique issues: 11
["Problem with a company's investigation into an existing problem", 'Improper use of your report', 'Incorrect information on your report', 'Applying for a mortgage or refinancing an existing mortgage', 'Problem with fraud alerts or security freezes', 'Struggling to pay mortgage', 'Unable to get your credit report or credit score', 'Trouble during payment process', 'Closing on a mortgage', "Problem with a credit reporting company's investigation into an existing problem", 'Credit monitoring or identity theft protection services']


In [30]:
# Label encoding authors 
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="issue", outputCol="issue_index")
encoded_df = indexer.fit(processed_data).transform(processed_data)

ml_df = encoded_df.filter(F.col("issue_index").isNotNull())

train_data, test_data = ml_df.randomSplit([0.8, 0.2], seed=42)

print(f"df length: {train_data.count()}")
train_data.show(5)

25/01/19 22:31:12 WARN DAGScheduler: Broadcasting large task binary with size 1536.6 KiB
25/01/19 22:31:23 WARN DAGScheduler: Broadcasting large task binary with size 1545.3 KiB


df length: 30752


[Stage 263:>                                                        (0 + 1) / 1]

+--------------------+-----------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|               issue|complaint_what_happened|sentence_length|        cleaned_text|  cleaned_complaints|              tokens|     filtered_tokens|   vectorized_tokens|               tfidf| normalized_features|issue_index|
+--------------------+-----------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|Applying for a mo...|   " I recently appl...|            817|" I recently appl...| I recently appli...|[, i, recently, a...|[, recently, appl...|(51616,[0,1,2,3,7...|(51616,[0,1,2,3,7...|(51616,[0,1,2,3,7...|        2.0|
|Applying for a mo...|   " Quicken Loans '...|            728|" Quicken Loans '...| Quicken Loans  i...|[, quick

                                                                                

### Logistic Regression Model

In [31]:
# Initialize LogisticRegression model
lr = LogisticRegression(featuresCol="normalized_features", labelCol="issue_index")

# Train the model
lr_model = lr.fit(train_data)

25/01/19 22:31:50 WARN DAGScheduler: Broadcasting large task binary with size 1567.2 KiB
25/01/19 22:32:01 WARN DAGScheduler: Broadcasting large task binary with size 1568.3 KiB
25/01/19 22:32:01 WARN DAGScheduler: Broadcasting large task binary with size 1567.8 KiB
25/01/19 22:32:12 WARN DAGScheduler: Broadcasting large task binary with size 1569.0 KiB
25/01/19 22:32:12 WARN DAGScheduler: Broadcasting large task binary with size 1567.8 KiB
25/01/19 22:32:12 WARN DAGScheduler: Broadcasting large task binary with size 1569.0 KiB
25/01/19 22:32:12 WARN DAGScheduler: Broadcasting large task binary with size 1567.8 KiB
25/01/19 22:32:12 WARN DAGScheduler: Broadcasting large task binary with size 1569.0 KiB
25/01/19 22:32:12 WARN DAGScheduler: Broadcasting large task binary with size 1567.8 KiB
25/01/19 22:32:13 WARN DAGScheduler: Broadcasting large task binary with size 1569.0 KiB
25/01/19 22:32:13 WARN DAGScheduler: Broadcasting large task binary with size 1567.8 KiB
25/01/19 22:32:13 WAR

In [32]:
predictions = lr_model.transform(test_data)

# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="issue_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

25/01/19 22:32:51 WARN DAGScheduler: Broadcasting large task binary with size 5.9 MiB

Accuracy: 0.5627688697692609


                                                                                