In [1]:
#Imports
## 3 years worth of data
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import setuptools
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, Normalizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import matplotlib.pyplot as plt
import numpy as np

#Create a spark session
spark = SparkSession.builder.appName("CompanyFinancialComplaints").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/19 01:17:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
#Load training data into a data frame
complaints_df = spark.read.json('./data/complaints.json')

#Verify
complaints_df.printSchema()
complaints_df.show(5)


                                                                                

root
 |-- _id: string (nullable = true)
 |-- _index: string (nullable = true)
 |-- _score: string (nullable = true)
 |-- _source: struct (nullable = true)
 |    |-- company: string (nullable = true)
 |    |-- company_public_response: string (nullable = true)
 |    |-- company_response: string (nullable = true)
 |    |-- complaint_id: string (nullable = true)
 |    |-- complaint_what_happened: string (nullable = true)
 |    |-- consumer_consent_provided: string (nullable = true)
 |    |-- consumer_disputed: string (nullable = true)
 |    |-- date_received: string (nullable = true)
 |    |-- date_sent_to_company: string (nullable = true)
 |    |-- issue: string (nullable = true)
 |    |-- product: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- sub_issue: string (nullable = true)
 |    |-- sub_product: string (nullable = true)
 |    |-- submitted_via: string (nullable = true)
 |    |-- tags: string (nullable = true)
 |    |-- timely: string (nullable = true)


                                                                                

+--------+-------------------+------+--------------------+-----+-----+
|     _id|             _index|_score|             _source|_type| sort|
+--------+-------------------+------+--------------------+-----+-----+
|10734962|complaint-public-v2|  NULL|{AMERICA FIRST FE...| _doc|[242]|
|10332134|complaint-public-v2|  NULL|{PENNYMAC LOAN SE...| _doc|[281]|
|10749062|complaint-public-v2|  NULL|{Onity Group Inc....| _doc|[292]|
|10310883|complaint-public-v2|  NULL|{AMERISAVE MORTGA...| _doc|[310]|
|11306698|complaint-public-v2|  NULL|{Shellpoint Partn...| _doc|[395]|
+--------+-------------------+------+--------------------+-----+-----+
only showing top 5 rows



In [3]:
print(f"df length: {complaints_df.count()}")
complaints_df.show(25)

df length: 71175
+--------+-------------------+------+--------------------+-----+------+
|     _id|             _index|_score|             _source|_type|  sort|
+--------+-------------------+------+--------------------+-----+------+
|10734962|complaint-public-v2|  NULL|{AMERICA FIRST FE...| _doc| [242]|
|10332134|complaint-public-v2|  NULL|{PENNYMAC LOAN SE...| _doc| [281]|
|10749062|complaint-public-v2|  NULL|{Onity Group Inc....| _doc| [292]|
|10310883|complaint-public-v2|  NULL|{AMERISAVE MORTGA...| _doc| [310]|
|11306698|complaint-public-v2|  NULL|{Shellpoint Partn...| _doc| [395]|
|10285294|complaint-public-v2|  NULL|{Mr. Cooper Group...| _doc| [569]|
|10222910|complaint-public-v2|  NULL|{WELLS FARGO & CO...| _doc| [695]|
|10187626|complaint-public-v2|  NULL|{RoundPoint Mortg...| _doc| [711]|
|10213760|complaint-public-v2|  NULL|{Freedom Mortgage...| _doc| [718]|
|10171606|complaint-public-v2|  NULL|{Shellpoint Partn...| _doc| [807]|
| 8161248|complaint-public-v2|  NULL|{M&T BANK 

In [4]:
# Unnest _source and keep all fields
from pyspark.sql.functions import col

columns_to_keep = [
    "_source.company",
    "_source.company_public_response",
    "_source.company_response",
    "_source.complaint_id",
    "_source.complaint_what_happened",
    "_source.consumer_consent_provided",
    "_source.consumer_disputed",
    "_source.date_received",
    "_source.date_sent_to_company",
    "_source.issue",
    "_source.product",
    "_source.state",
    "_source.sub_issue",
    "_source.sub_product",
    "_source.submitted_via",
    "_source.tags",
    "_source.timely",
    "_source.zip_code",
]

# Select and rename columns
unnested_df = complaints_df.select([col(column).alias(column.split(".")[1]) for column in columns_to_keep])

# Show the result
unnested_df.show(truncate=False)

+----------------------------------+-----------------------------------------------------------------------------------------------+-------------------------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [5]:
print(f"df length: {unnested_df.count()}")
unnested_df.show(25)

df length: 71175
+--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+--------------------+--------------------+--------------------+--------+-----+--------------------+--------------------+-------------+--------------+------+--------+
|             company|company_public_response|    company_response|complaint_id|complaint_what_happened|consumer_consent_provided|consumer_disputed|       date_received|date_sent_to_company|               issue| product|state|           sub_issue|         sub_product|submitted_via|          tags|timely|zip_code|
+--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+--------------------+--------------------+--------------------+--------+-----+--------------------+--------------------+-------------+--------------+------+--------+
|AMERICA FIRST FED...|                   

In [5]:
unique_count_ap = complaints_df.select("Issue").distinct().count()

print(f"Number of unique complaints: {unique_count_ap}")



Number of unique complaints: 390111


                                                                                

