In [1]:
#Imports
## 3 years worth of data
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import setuptools
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, Normalizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#Create a spark session
spark = SparkSession.builder.appName("CompanyFinancialComplaints").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/22 07:46:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Loading and Pre-processing

In [2]:
#Load training data into a data frame
complaints_df = spark.read.json('./input/complaints.json')

#Verify
complaints_df.printSchema()
complaints_df.show(5)

                                                                                

root
 |-- _id: string (nullable = true)
 |-- _index: string (nullable = true)
 |-- _score: string (nullable = true)
 |-- _source: struct (nullable = true)
 |    |-- company: string (nullable = true)
 |    |-- company_public_response: string (nullable = true)
 |    |-- company_response: string (nullable = true)
 |    |-- complaint_id: string (nullable = true)
 |    |-- complaint_what_happened: string (nullable = true)
 |    |-- consumer_consent_provided: string (nullable = true)
 |    |-- consumer_disputed: string (nullable = true)
 |    |-- date_received: string (nullable = true)
 |    |-- date_sent_to_company: string (nullable = true)
 |    |-- issue: string (nullable = true)
 |    |-- product: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- sub_issue: string (nullable = true)
 |    |-- sub_product: string (nullable = true)
 |    |-- submitted_via: string (nullable = true)
 |    |-- tags: string (nullable = true)
 |    |-- timely: string (nullable = true)


                                                                                

In [3]:
print(f"df length: {complaints_df.count()}")
complaints_df.show(25)

df length: 71175
+--------+-------------------+------+--------------------+-----+------+
|     _id|             _index|_score|             _source|_type|  sort|
+--------+-------------------+------+--------------------+-----+------+
|10734962|complaint-public-v2|  NULL|{AMERICA FIRST FE...| _doc| [242]|
|10332134|complaint-public-v2|  NULL|{PENNYMAC LOAN SE...| _doc| [281]|
|10749062|complaint-public-v2|  NULL|{Onity Group Inc....| _doc| [292]|
|10310883|complaint-public-v2|  NULL|{AMERISAVE MORTGA...| _doc| [310]|
|11306698|complaint-public-v2|  NULL|{Shellpoint Partn...| _doc| [395]|
|10285294|complaint-public-v2|  NULL|{Mr. Cooper Group...| _doc| [569]|
|10222910|complaint-public-v2|  NULL|{WELLS FARGO & CO...| _doc| [695]|
|10187626|complaint-public-v2|  NULL|{RoundPoint Mortg...| _doc| [711]|
|10213760|complaint-public-v2|  NULL|{Freedom Mortgage...| _doc| [718]|
|10171606|complaint-public-v2|  NULL|{Shellpoint Partn...| _doc| [807]|
| 8161248|complaint-public-v2|  NULL|{M&T BANK 

                                                                                

In [4]:
# Unnest _source and keep all fields
from pyspark.sql.functions import col

columns_to_keep = [
    "_source.company",
    "_source.company_public_response",
    "_source.company_response",
    "_source.complaint_id",
    "_source.complaint_what_happened",
    "_source.consumer_consent_provided",
    "_source.consumer_disputed",
    "_source.date_received",
    "_source.date_sent_to_company",
    "_source.issue",
    "_source.product",
    "_source.state",
    "_source.sub_issue",
    "_source.sub_product",
    "_source.submitted_via",
    "_source.tags",
    "_source.timely",
    "_source.zip_code",
]

# Select and rename columns
unnested_df = complaints_df.select([col(column).alias(column.split(".")[1]) for column in columns_to_keep])

# Show the result
unnested_df.show(truncate=False)

+----------------------------------+-----------------------------------------------------------------------------------------------+-------------------------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [5]:
# Grabbing only rows with complaints
clean_df = unnested_df.filter((col("complaint_what_happened").isNotNull()) & (col("complaint_what_happened") != ""))

# Selecting only relevant columns
final_df = clean_df.select(["issue", "complaint_what_happened"])

print(f"df length: {final_df.count()}")
final_df.show()

                                                                                

df length: 38423
+--------------------+-----------------------+
|               issue|complaint_what_happened|
+--------------------+-----------------------+
|Struggling to pay...|   Wells Fargo engag...|
|Trouble during pa...|   Hi, On XX/XX/XXXX...|
|Trouble during pa...|   In XXXX, XXXX, I ...|
|Trouble during pa...|   it started with X...|
|Closing on a mort...|   XXXX who funds th...|
|Trouble during pa...|   My loan was trans...|
|Trouble during pa...|   Dear CFPB, Refere...|
|Struggling to pay...|   Wells Fargo bank ...|
|Trouble during pa...|   Received notice f...|
|Applying for a mo...|   XXXX  first appli...|
|Struggling to pay...|   I requested a for...|
|Struggling to pay...|   We asked for a Fo...|
|Applying for a mo...|   On XX/XX/2023, I ...|
|Trouble during pa...|   So I am not sure ...|
|Trouble during pa...|   XXXX  ENCLOSED EV...|
|Trouble during pa...|   IN RE : UNITED ST...|
|Applying for a mo...|   In 2018 I was man...|
|Closing on a mort...|   I, XXXX XXXX, am .

                                                                                

In [6]:
# Distinct count of target variables values
unique_count_ap = final_df.select("issue").distinct().count()

print(f"Number of unique issues: {unique_count_ap}")

Number of unique issues: 11


In [7]:
from pyspark.sql.functions import regexp_replace

# Removing special characters and integers
cleaned_final_df = final_df.withColumn("cleaned_text", regexp_replace(col("complaint_what_happened"), r"[\$0-9\n]+", ""))

cleaned_words_df = cleaned_final_df.withColumn("cleaned_complaints", regexp_replace(col("cleaned_text"), r"[^\w\s]", "").alias("cleaned_complaints"))

cleaned_words_df.show(5)

+--------------------+-----------------------+--------------------+--------------------+
|               issue|complaint_what_happened|        cleaned_text|  cleaned_complaints|
+--------------------+-----------------------+--------------------+--------------------+
|Struggling to pay...|   Wells Fargo engag...|Wells Fargo engag...|Wells Fargo engag...|
|Trouble during pa...|   Hi, On XX/XX/XXXX...|Hi, On XX/XX/XXXX...|Hi On XXXXXXXX I ...|
|Trouble during pa...|   In XXXX, XXXX, I ...|In XXXX, XXXX, I ...|In XXXX XXXX I ob...|
|Trouble during pa...|   it started with X...|it started with X...|it started with X...|
|Closing on a mort...|   XXXX who funds th...|XXXX who funds th...|XXXX who funds th...|
+--------------------+-----------------------+--------------------+--------------------+
only showing top 5 rows



## Data preparation & Text Preprocessing

In [8]:
# Create sentence length column
final_preprocessed = cleaned_words_df.withColumn("sentence_length", F.length(F.col("cleaned_complaints")))
# Summary statistics on complaint text
final_preprocessed.select("sentence_length").describe().show()



+-------+------------------+
|summary|   sentence_length|
+-------+------------------+
|  count|             38423|
|   mean|1622.4322411055878|
| stddev|1822.0424563383447|
|    min|                12|
|    max|             30928|
+-------+------------------+



                                                                                

In [9]:
#Use tokenizer to tokenize the text
tokenizer = Tokenizer(inputCol="cleaned_complaints", outputCol="tokens")
tokenized_final_df = tokenizer.transform(final_preprocessed)

#Print tokenized output
tokenized_final_df.select("cleaned_complaints", "tokens").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [10]:
# Create an instance of StopWordsRemover and use it to filter the tokens and remove stop words
# Removing classified private data in complaints
default_stopwords = StopWordsRemover.loadDefaultStopWords("english")
custom_stopwords = default_stopwords + ["xxxx", "xx", "xx/xx/xxxx", "XXX", 
                                        "XXXX", "XX", "XX/XX/XXXX", "xxx",
                                        "xxxxxxxxxx", "XXXXXXXXXX",
                                        "XXXXXXXX", "xxxxxxxx"]

remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
remover = remover.setStopWords(custom_stopwords)
cleaned_final_df = remover.transform(tokenized_final_df)

# Print the cleaned output
cleaned_final_df.select("tokens", "filtered_tokens").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Feature Extraction and Model

In [None]:
# TF-IDF calculation using CountVectorizer
vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="vectorized_tokens")
idf = IDF(inputCol="vectorized_tokens", outputCol="tfidf")

# Normalization
normalizer = Normalizer(inputCol="tfidf", outputCol="normalized_features")

# Create and apply pipeline
pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf, normalizer])
processed_data = pipeline.fit(final_preprocessed).transform(final_preprocessed)

# Show the final processed DataFrame with normalized features
processed_data.select("normalized_features").show(truncate=False)

In [None]:
processed_data.show(5)

In [None]:
# Getting issue labels for encoding in next step
unique_issues = processed_data.select("issue").distinct().count()
name_list = processed_data.select("issue").distinct().rdd.flatMap(lambda x: x).collect()
print(f"Number of unique issues: {unique_issues}")
print(name_list)

In [None]:
# Label encoding authors 
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="issue", outputCol="issue_index")
encoded_df = indexer.fit(processed_data).transform(processed_data)

ml_df = encoded_df.filter(F.col("issue_index").isNotNull())
# Filtering for issues with enough records to train on
fltrd_df = ml_df.filter(F.col('issue_index').isin([0.0, 1.0, 2.0, 3.0]))
# Getting counts on issues
issue_counts = fltrd_df.groupBy("issue_index").count().orderBy("issue_index")

issue_counts.show(12)

In [None]:
final_cleaned = fltrd_df.select(["issue_index", "normalized_features"])

final_cleaned.show(5)

In [None]:
train_data, test_data = final_cleaned.randomSplit([0.8, 0.2], seed=42)

print(f"df length: {train_data.count()}")
train_data.show(5)

### Logistic Regression Model

In [None]:
# Initialize LogisticRegression model
lr = LogisticRegression(featuresCol="normalized_features", labelCol="issue_index")

# Train the model
lr_model = lr.fit(train_data)

In [None]:
predictions = lr_model.transform(test_data)

# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="issue_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

### Naive Bayes

In [None]:
from pyspark.ml.classification import NaiveBayes

# Initialize NaiveBayes
nb = NaiveBayes(featuresCol="normalized_features", labelCol="issue_index")

# Train the Naive Bayes model
nb_model = nb.fit(train_data)


In [None]:
# Make predictions
nb_predictions = nb_model.transform(test_data)

# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="issue_index", predictionCol="prediction", metricName="accuracy")
nb_accuracy = evaluator.evaluate(nb_predictions)
print(f"Naive Bayes Accuracy: {nb_accuracy:.5f}")

In [None]:
# Apply confusion matrix code to NB model
# Want to see where correct/incorrect predictions are
confusion_matrix = nb_predictions.groupBy("issue_index", "prediction").count().orderBy("issue_index", "prediction")

total_predictions = nb_predictions.count()

confusion_matrix = confusion_matrix.withColumn(
    "percentage",
    (F.col("count") / total_predictions) * 100
)

confusion_matrix = confusion_matrix.orderBy("issue_index", "prediction")

#Display the confusion matrix
confusion_matrix.show()

## Random Forest Classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol="normalized_features", labelCol="issue_index", numTrees=10)

rf_model = rf.fit(train_data)


In [None]:
predictions_rf = rf_model.transform(test_data)

# Evaluate accuracy
evaluator_rf = MulticlassClassificationEvaluator(labelCol="issue_index", predictionCol="prediction", metricName="accuracy")
accuracy_rf = evaluator_rf.evaluate(predictions_rf)
print(f"Accuracy: {accuracy_rf}")

## TESTING

In [None]:
#Use tokenizer to tokenize the text
tokenizer = Tokenizer(inputCol="cleaned_complaints", outputCol="tokens")
tokenized_final_df = tokenizer.transform(final_preprocessed)

#Print tokenized output
tokenized_final_df.select("cleaned_complaints", "tokens").show(5, truncate=False)

In [None]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
#Use tokenizer to tokenize the text
tokenizer = Tokenizer(inputCol="cleaned_complaints", outputCol="tokens")

hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")
# Normalization
normalizer = Normalizer(inputCol="tfidf", outputCol="normalized_features")

In [None]:
cleaned_final_df.show(5)

In [None]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")

In [None]:
# TF-IDF calculation using CountVectorizer
vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="vectorized_tokens")
idf = IDF(inputCol="vectorized_tokens", outputCol="tfidf")

# Normalization
normalizer = Normalizer(inputCol="tfidf", outputCol="normalized_features")

# Create and apply pipeline
pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf, normalizer])
processed_data = pipeline.fit(final_preprocessed).transform(final_preprocessed)

# Show the final processed DataFrame with normalized features
processed_data.select("normalized_features").show(truncate=False)

In [None]:
cleaned_words_df.show(5)

In [11]:
# Start
preprocessed_final = cleaned_words_df.select("issue", "cleaned_complaints")

preprocessed_final.show(5)

+--------------------+--------------------+
|               issue|  cleaned_complaints|
+--------------------+--------------------+
|Struggling to pay...|Wells Fargo engag...|
|Trouble during pa...|Hi On XXXXXXXX I ...|
|Trouble during pa...|In XXXX XXXX I ob...|
|Trouble during pa...|it started with X...|
|Closing on a mort...|XXXX who funds th...|
+--------------------+--------------------+
only showing top 5 rows



In [14]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="issue", outputCol="issue_index")
encoded_df = indexer.fit(preprocessed_final).transform(preprocessed_final)

ml_df = encoded_df.filter(F.col("issue_index").isNotNull())
# Filtering for issues with enough records to train on
fltrd_df = ml_df.filter(F.col('issue_index').isin([0.0, 1.0, 2.0, 3.0]))

final_process_df = fltrd_df.select(['issue', 'cleaned_complaints'])

final_process_df.show(12)

+--------------------+--------------------+
|               issue|  cleaned_complaints|
+--------------------+--------------------+
|Struggling to pay...|Wells Fargo engag...|
|Trouble during pa...|Hi On XXXXXXXX I ...|
|Trouble during pa...|In XXXX XXXX I ob...|
|Trouble during pa...|it started with X...|
|Closing on a mort...|XXXX who funds th...|
|Trouble during pa...|My loan was trans...|
|Trouble during pa...|Dear CFPB Referen...|
|Struggling to pay...|Wells Fargo bank ...|
|Trouble during pa...|Received notice f...|
|Applying for a mo...|XXXX  first appli...|
|Struggling to pay...|I requested a for...|
|Struggling to pay...|We asked for a Fo...|
+--------------------+--------------------+
only showing top 12 rows



                                                                                

In [16]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
#Use tokenizer to tokenize the text
tokenizer = Tokenizer(inputCol="cleaned_complaints", outputCol="tokens")

default_stopwords = StopWordsRemover.loadDefaultStopWords("english")
custom_stopwords = default_stopwords + ["xxxx", "xx", "xx/xx/xxxx", "XXX", 
                                        "XXXX", "XX", "XX/XX/XXXX", "xxx",
                                        "xxxxxxxxxx", "XXXXXXXXXX",
                                        "XXXXXXXX", "xxxxxxxx"]

remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

hashingTF = HashingTF(inputCol="filtered_tokens", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

issue_indexer = StringIndexer(inputCol="issue", outputCol="indexed_issue")

rf_classifier = RandomForestClassifier(labelCol="indexed_issue", featuresCol="features", numTrees=10)

# Step 6: Create a Pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, issue_indexer, rf_classifier])

# Step 7: Train-Test Split
train_df, test_df = final_process_df.randomSplit([0.8, 0.2], seed=42)

# Step 8: Train the Model
model = pipeline.fit(train_df)

# Step 9: Make Predictions
predictions = model.transform(test_df)

evaluator = MulticlassClassificationEvaluator(labelCol="indexed_issue", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.2f}")



CodeCache: size=131072Kb used=45674Kb max_used=45682Kb free=85397Kb
 bounds [0x00000001099f8000, 0x000000010c6e8000, 0x00000001119f8000]
 total_blobs=16392 nmethods=15287 adapters=1015
 compilation: disabled (not enough contiguous free space left)




Accuracy: 0.59


                                                                                

In [17]:
# Actual start to the project


from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
#Use tokenizer to tokenize the text
tokenizer = Tokenizer(inputCol="cleaned_complaints", outputCol="tokens")

default_stopwords = StopWordsRemover.loadDefaultStopWords("english")
custom_stopwords = default_stopwords + ["xxxx", "xx", "xx/xx/xxxx", "XXX", 
                                        "XXXX", "XX", "XX/XX/XXXX", "xxx",
                                        "xxxxxxxxxx", "XXXXXXXXXX",
                                        "XXXXXXXX", "xxxxxxxx"]

remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

# Term Frequency-Inverse Document Frequency
hashingTF = HashingTF(inputCol="filtered_tokens", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

issue_indexer = StringIndexer(inputCol="issue", outputCol="indexed_issue")

lr_classifier = LogisticRegression(featuresCol="features", labelCol="indexed_issue")

# Step 6: Create a Pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, issue_indexer, lr_classifier])

# Step 7: Train-Test Split
train_df, test_df = final_process_df.randomSplit([0.8, 0.2], seed=42)

# Step 8: Train the Model
model = pipeline.fit(train_df)

# Step 9: Make Predictions
predictions = model.transform(test_df)

evaluator = MulticlassClassificationEvaluator(labelCol="indexed_issue", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.2f}")




25/01/22 07:55:27 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS

Accuracy: 0.74


                                                                                

In [33]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(featuresCol="features", labelCol="indexed_issue", thresholds=[0.25, 0.25, 0.2, 0.2], smoothing=0.1)

# Step 6: Create a Pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, issue_indexer, nb])

# Step 7: Train-Test Split
train_df, test_df = final_process_df.randomSplit([0.8, 0.2], seed=42)

# Step 8: Train the Model
model = pipeline.fit(train_df)

# Step 9: Make Predictions
predictions = model.transform(test_df)

evaluator = MulticlassClassificationEvaluator(labelCol="indexed_issue", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.2f}")



Accuracy: 0.72


                                                                                

In [24]:
# Display Predictions
predictions.select("cleaned_complaints","indexed_issue","issue", "prediction").show()

[Stage 312:>                                                        (0 + 1) / 1]

+--------------------+-------------+--------------------+----------+
|  cleaned_complaints|indexed_issue|               issue|prediction|
+--------------------+-------------+--------------------+----------+
| Attached is my l...|          2.0|Applying for a mo...|       2.0|
| How were you dis...|          2.0|Applying for a mo...|       2.0|
| I have a USDAgua...|          2.0|Applying for a mo...|       1.0|
| Inflated Apprais...|          2.0|Applying for a mo...|       3.0|
| Seller agreed to...|          2.0|Applying for a mo...|       3.0|
| Theyre attaching...|          2.0|Applying for a mo...|       0.0|
| days since I rep...|          2.0|Applying for a mo...|       2.0|
|A I was fighting ...|          2.0|Applying for a mo...|       1.0|
|A loan refinance ...|          2.0|Applying for a mo...|       2.0|
|A mortgage compan...|          2.0|Applying for a mo...|       3.0|
|A mortgage loan o...|          2.0|Applying for a mo...|       2.0|
|A previous compla...|          2.

                                                                                