In [1]:
from pyspark.sql import SparkSession
import getpass
# Resolve the current OS user so the per-user warehouse path is not hard-coded.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session running on YARN.
# The warehouse directory was previously an f-string with no placeholder and a
# fixed user id, which left `username` unused; it now follows the current user.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# DDL-style schema string for the raw loans CSV: one "<column> <type>" entry
# per field, joined with ", " in file column order.
loans_schema = ", ".join(
    [
        "loan_id string",
        "member_id string",
        "loan_amount float",
        "funded_amount float",
        "loan_term_months string",
        "interest_rate float",
        "monthly_installment float",
        "issue_date string",
        "loan_status string",
        "loan_purpose string",
        "loan_title string",
    ]
)

In [1]:
# Read the raw loans CSV (first line is a header) using the explicit
# `loans_schema` instead of schema inference.
# Needs the `spark` session from the setup cell; without it this cell raises
# NameError on a fresh kernel.
loans_raw_df = (
    spark.read
    .schema(loans_schema)
    .option("header", True)
    .format("csv")
    .load("/user/itv010110/lendingclubprojectJ/raw/loans_data_csv")
)

NameError: name 'spark' is not defined

In [4]:
# Bare expression as the last line so the notebook renders a preview of the raw frame.
loans_raw_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title
68407277,6d5091b3fcaaeb4ea...,3600.0,3600.0,36 months,13.99,123.03,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation
68355089,b5e7938b0a2da4cea...,24700.0,24700.0,36 months,11.99,820.28,Dec-2015,Fully Paid,small_business,Business
68341763,91060b858433e8a61...,20000.0,20000.0,60 months,10.78,432.66,Dec-2015,Fully Paid,home_improvement,
66310712,cab1fa9f533688b0a...,35000.0,35000.0,60 months,14.85,829.9,Dec-2015,Current,debt_consolidation,Debt consolidation
68476807,f74e401c1ab0adf78...,10400.0,10400.0,60 months,22.45,289.91,Dec-2015,Fully Paid,major_purchase,Major purchase
68426831,8aef4bb29d609d8d6...,11950.0,11950.0,36 months,13.44,405.18,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation
68476668,538b4653da3b1e814...,20000.0,20000.0,36 months,9.17,637.58,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation
67275481,b24d55f21390533c5...,20000.0,20000.0,36 months,8.49,631.26,Dec-2015,Fully Paid,major_purchase,Major purchase
68466926,1035c5401b0ca76d0...,10000.0,10000.0,36 months,6.49,306.45,Dec-2015,Fully Paid,credit_card,Credit card refin...
68616873,cb0f1777593e77909...,8000.0,8000.0,36 months,11.48,263.74,Dec-2015,Fully Paid,credit_card,Credit card refin...


In [5]:
# Print column names, types and nullability to confirm the explicit schema was applied.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [6]:
from pyspark.sql.functions import current_timestamp

In [7]:
# Add an `ingest_date` column holding the timestamp at which the data is processed.
loans_df_ingestd = loans_raw_df.withColumn(
    "ingest_date",
    current_timestamp(),
)

In [8]:
# Preview the frame to check that the new `ingest_date` column is present.
loans_df_ingestd

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
68407277,6d5091b3fcaaeb4ea...,3600.0,3600.0,36 months,13.99,123.03,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:34:...
68355089,b5e7938b0a2da4cea...,24700.0,24700.0,36 months,11.99,820.28,Dec-2015,Fully Paid,small_business,Business,2024-05-14 10:34:...
68341763,91060b858433e8a61...,20000.0,20000.0,60 months,10.78,432.66,Dec-2015,Fully Paid,home_improvement,,2024-05-14 10:34:...
66310712,cab1fa9f533688b0a...,35000.0,35000.0,60 months,14.85,829.9,Dec-2015,Current,debt_consolidation,Debt consolidation,2024-05-14 10:34:...
68476807,f74e401c1ab0adf78...,10400.0,10400.0,60 months,22.45,289.91,Dec-2015,Fully Paid,major_purchase,Major purchase,2024-05-14 10:34:...
68426831,8aef4bb29d609d8d6...,11950.0,11950.0,36 months,13.44,405.18,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:34:...
68476668,538b4653da3b1e814...,20000.0,20000.0,36 months,9.17,637.58,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:34:...
67275481,b24d55f21390533c5...,20000.0,20000.0,36 months,8.49,631.26,Dec-2015,Fully Paid,major_purchase,Major purchase,2024-05-14 10:34:...
68466926,1035c5401b0ca76d0...,10000.0,10000.0,36 months,6.49,306.45,Dec-2015,Fully Paid,credit_card,Credit card refin...,2024-05-14 10:34:...
68616873,cb0f1777593e77909...,8000.0,8000.0,36 months,11.48,263.74,Dec-2015,Fully Paid,credit_card,Credit card refin...,2024-05-14 10:34:...


In [9]:
# Register the frame as the temp view "loans" so it can be queried with spark.sql.
loans_df_ingestd.createOrReplaceTempView("loans")

In [10]:
# Total number of rows in the "loans" view, taken before any cleaning step.
spark.sql("select count(*) from loans")

count(1)
2260701


In [11]:
# Inspect the rows whose loan_amount is null to see what kind of records they are.
spark.sql("select * from loans where loan_amount is null")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-05-14 10:35:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-05-14 10:35:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-05-14 10:35:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-05-14 10:35:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-05-14 10:35:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-05-14 10:35:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-05-14 10:35:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-05-14 10:35:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-05-14 10:35:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-05-14 10:35:...


In [12]:
# Columns that must all be non-null for a loan row to be kept.
columns_to_check = [
    "loan_amount",
    "funded_amount",
    "loan_term_months",
    "interest_rate",
    "monthly_installment",
    "issue_date",
    "loan_status",
    "loan_purpose",
]

In [13]:
# Drop every row that has a null in any of the required columns
# (`dropna` is the DataFrame-level alias of `na.drop`).
loans_filtered_df = loans_df_ingestd.dropna(subset=columns_to_check)

In [14]:
# Row count after the null filter; compare with the earlier total to see how many rows were dropped.
loans_filtered_df.count()

2260667

In [15]:
# Re-point the "loans" temp view at the filtered frame (replaces the earlier view of the same name).
loans_filtered_df.createOrReplaceTempView("loans")

In [16]:
# Preview the filtered frame.
loans_filtered_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
68407277,6d5091b3fcaaeb4ea...,3600.0,3600.0,36 months,13.99,123.03,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68355089,b5e7938b0a2da4cea...,24700.0,24700.0,36 months,11.99,820.28,Dec-2015,Fully Paid,small_business,Business,2024-05-14 10:35:...
68341763,91060b858433e8a61...,20000.0,20000.0,60 months,10.78,432.66,Dec-2015,Fully Paid,home_improvement,,2024-05-14 10:35:...
66310712,cab1fa9f533688b0a...,35000.0,35000.0,60 months,14.85,829.9,Dec-2015,Current,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68476807,f74e401c1ab0adf78...,10400.0,10400.0,60 months,22.45,289.91,Dec-2015,Fully Paid,major_purchase,Major purchase,2024-05-14 10:35:...
68426831,8aef4bb29d609d8d6...,11950.0,11950.0,36 months,13.44,405.18,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68476668,538b4653da3b1e814...,20000.0,20000.0,36 months,9.17,637.58,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
67275481,b24d55f21390533c5...,20000.0,20000.0,36 months,8.49,631.26,Dec-2015,Fully Paid,major_purchase,Major purchase,2024-05-14 10:35:...
68466926,1035c5401b0ca76d0...,10000.0,10000.0,36 months,6.49,306.45,Dec-2015,Fully Paid,credit_card,Credit card refin...,2024-05-14 10:35:...
68616873,cb0f1777593e77909...,8000.0,8000.0,36 months,11.48,263.74,Dec-2015,Fully Paid,credit_card,Credit card refin...,2024-05-14 10:35:...


In [17]:
from pyspark.sql.functions import regexp_replace, col

In [18]:
# Convert the textual term (e.g. "36 months") into whole years:
# strip the literal "months", cast to int, divide by 12, then cast back to int.
term_in_months = regexp_replace(col("loan_term_months"), "months", "").cast("int")
term_in_years = (term_in_months / 12).cast("int")

# Overwrite the column with the year value, then rename it to match its new meaning.
loans_term_modified_df = (
    loans_filtered_df
    .withColumn("loan_term_months", term_in_years)
    .withColumnRenamed("loan_term_months", "loan_term_years")
)

In [19]:
# Preview the frame to check the converted `loan_term_years` values.
loans_term_modified_df

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
68407277,6d5091b3fcaaeb4ea...,3600.0,3600.0,3,13.99,123.03,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68355089,b5e7938b0a2da4cea...,24700.0,24700.0,3,11.99,820.28,Dec-2015,Fully Paid,small_business,Business,2024-05-14 10:35:...
68341763,91060b858433e8a61...,20000.0,20000.0,5,10.78,432.66,Dec-2015,Fully Paid,home_improvement,,2024-05-14 10:35:...
66310712,cab1fa9f533688b0a...,35000.0,35000.0,5,14.85,829.9,Dec-2015,Current,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68476807,f74e401c1ab0adf78...,10400.0,10400.0,5,22.45,289.91,Dec-2015,Fully Paid,major_purchase,Major purchase,2024-05-14 10:35:...
68426831,8aef4bb29d609d8d6...,11950.0,11950.0,3,13.44,405.18,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68476668,538b4653da3b1e814...,20000.0,20000.0,3,9.17,637.58,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
67275481,b24d55f21390533c5...,20000.0,20000.0,3,8.49,631.26,Dec-2015,Fully Paid,major_purchase,Major purchase,2024-05-14 10:35:...
68466926,1035c5401b0ca76d0...,10000.0,10000.0,3,6.49,306.45,Dec-2015,Fully Paid,credit_card,Credit card refin...,2024-05-14 10:35:...
68616873,cb0f1777593e77909...,8000.0,8000.0,3,11.48,263.74,Dec-2015,Fully Paid,credit_card,Credit card refin...,2024-05-14 10:35:...


In [20]:
# Confirm the schema after the conversion: `loan_term_years` should be an integer column.
loans_term_modified_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_years: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [21]:
# Re-point the "loans" temp view at the frame with the converted term column.
loans_term_modified_df.createOrReplaceTempView("loans")

In [22]:
# Preview the current "loans" view through SQL.
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
68407277,6d5091b3fcaaeb4ea...,3600.0,3600.0,3,13.99,123.03,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68355089,b5e7938b0a2da4cea...,24700.0,24700.0,3,11.99,820.28,Dec-2015,Fully Paid,small_business,Business,2024-05-14 10:35:...
68341763,91060b858433e8a61...,20000.0,20000.0,5,10.78,432.66,Dec-2015,Fully Paid,home_improvement,,2024-05-14 10:35:...
66310712,cab1fa9f533688b0a...,35000.0,35000.0,5,14.85,829.9,Dec-2015,Current,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68476807,f74e401c1ab0adf78...,10400.0,10400.0,5,22.45,289.91,Dec-2015,Fully Paid,major_purchase,Major purchase,2024-05-14 10:35:...
68426831,8aef4bb29d609d8d6...,11950.0,11950.0,3,13.44,405.18,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68476668,538b4653da3b1e814...,20000.0,20000.0,3,9.17,637.58,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
67275481,b24d55f21390533c5...,20000.0,20000.0,3,8.49,631.26,Dec-2015,Fully Paid,major_purchase,Major purchase,2024-05-14 10:35:...
68466926,1035c5401b0ca76d0...,10000.0,10000.0,3,6.49,306.45,Dec-2015,Fully Paid,credit_card,Credit card refin...,2024-05-14 10:35:...
68616873,cb0f1777593e77909...,8000.0,8000.0,3,11.48,263.74,Dec-2015,Fully Paid,credit_card,Credit card refin...,2024-05-14 10:35:...


In [23]:
# List the distinct loan_purpose values to spot entries that are not real categories.
spark.sql("select distinct(loan_purpose) from loans")

loan_purpose
"guaranteed!"""
and if they are a...
never had any tro...
<br/><br/>Lending...
Bank of America c...
stocks
please feel free ...
I became his prim...
brakes
on one of the bus...


In [24]:
# Count loans per purpose, most frequent first, to identify the main categories.
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [25]:
# Accepted loan_purpose categories; any value outside this list is treated as "other".
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [26]:
from pyspark.sql.functions import when

In [27]:
# Keep loan_purpose when it is one of the accepted categories; otherwise map it to "other".
is_known_purpose = col("loan_purpose").isin(loan_purpose_lookup)
loans_purpose_modified = loans_term_modified_df.withColumn(
    "loan_purpose",
    when(is_known_purpose, col("loan_purpose")).otherwise("other"),
)

In [28]:
# Preview the frame after normalising loan_purpose.
loans_purpose_modified

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
68407277,6d5091b3fcaaeb4ea...,3600.0,3600.0,3,13.99,123.03,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68355089,b5e7938b0a2da4cea...,24700.0,24700.0,3,11.99,820.28,Dec-2015,Fully Paid,small_business,Business,2024-05-14 10:35:...
68341763,91060b858433e8a61...,20000.0,20000.0,5,10.78,432.66,Dec-2015,Fully Paid,home_improvement,,2024-05-14 10:35:...
66310712,cab1fa9f533688b0a...,35000.0,35000.0,5,14.85,829.9,Dec-2015,Current,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68476807,f74e401c1ab0adf78...,10400.0,10400.0,5,22.45,289.91,Dec-2015,Fully Paid,major_purchase,Major purchase,2024-05-14 10:35:...
68426831,8aef4bb29d609d8d6...,11950.0,11950.0,3,13.44,405.18,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
68476668,538b4653da3b1e814...,20000.0,20000.0,3,9.17,637.58,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-14 10:35:...
67275481,b24d55f21390533c5...,20000.0,20000.0,3,8.49,631.26,Dec-2015,Fully Paid,major_purchase,Major purchase,2024-05-14 10:35:...
68466926,1035c5401b0ca76d0...,10000.0,10000.0,3,6.49,306.45,Dec-2015,Fully Paid,credit_card,Credit card refin...,2024-05-14 10:35:...
68616873,cb0f1777593e77909...,8000.0,8000.0,3,11.48,263.74,Dec-2015,Fully Paid,credit_card,Credit card refin...,2024-05-14 10:35:...


In [29]:
# Re-point the "loans" temp view at the frame with normalised loan_purpose values.
loans_purpose_modified.createOrReplaceTempView("loans")

In [30]:
# Re-run the per-purpose counts on the normalised view; unknown values should now be folded into "other".
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [31]:
from pyspark.sql.functions import count

In [32]:
# Same per-purpose count as the SQL query, expressed with the DataFrame API:
# group by purpose, count rows as `total`, sort by `total` descending.
(
    loans_purpose_modified
    .groupBy("loan_purpose")
    .agg(count("*").alias("total"))
    .orderBy(col("total").desc())
)

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [33]:
# Persist the cleaned loans as CSV with a header row, replacing any previous output at the path.
(
    loans_purpose_modified.write
    .format("csv")
    .option("header", True)
    .option("path", "/user/itv010110/lendingclubprojectJ/cleaned/loans_csv")
    .mode("overwrite")
    .save()
)

In [34]:
# Persist the cleaned loans as Parquet, replacing any previous output at the path.
# This cell previously used format("csv"), so CSV files were written into the
# `loans_parquet` directory. The CSV-only `header` option is dropped because it
# does not apply to Parquet output.
(
    loans_purpose_modified.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv010110/lendingclubprojectJ/cleaned/loans_parquet")
    .save()
)