## Import libraries

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

## Create a SparkSession object

In [14]:
# Configure the session step by step, then create it (or reuse one that is
# already active) against the master at spark://spark-master:7077.
session_builder = SparkSession.builder.appName("perform-joins")
session_builder = session_builder.master("spark://spark-master:7077")
session_builder = session_builder.config("spark.executor.memory", "512m")
spark = session_builder.getOrCreate()

## Create the DataFrames

In [15]:
# Directory holding the credit-card CSV files, relative to this notebook.
CREDIT_CARD_DATA_DIR = "../../data/Credit Card"


def read_credit_card_csv(file_name):
    """Load one CSV file from CREDIT_CARD_DATA_DIR as a DataFrame.

    The first line of the file is treated as the header and the literal
    text "null" is read as a missing value. No schema and no inferSchema
    option are set here (Spark's CSV reader then presumably falls back to
    string columns -- confirm before doing numeric work on the result).

    Args:
        file_name: Name of the CSV file inside CREDIT_CARD_DATA_DIR.

    Returns:
        The DataFrame returned by the notebook's `spark` session reader.
    """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .option("nullValue", "null")
        .load(f"{CREDIT_CARD_DATA_DIR}/{file_name}")
    )


# One DataFrame per source file; all four share the same reader options.
cards_df = read_credit_card_csv("CardBase.csv")
customers_df = read_credit_card_csv("CustomerBase.csv")
transactions_df = read_credit_card_csv("TransactionBase.csv")
fraud_df = read_credit_card_csv("FraudBase.csv")


## Inner join Cards and Customers (every card has an associated customer)

In [16]:
# Attach customer attributes to each card. Joining on the shared column
# name (a string) yields a single Cust_ID column in the result.
customer_cards_df = cards_df.join(customers_df, on="Cust_ID", how="inner")

customer_cards_df.show()



+-------+-------------------+-----------+------------+---+----------------+----------------------+
|Cust_ID|        Card_Number|Card_Family|Credit_Limit|Age|Customer_Segment|Customer_Vintage_Group|
+-------+-------------------+-----------+------------+---+----------------+----------------------+
|CC55858|2868-5606-5152-5706|       Gold|       27000| 30|         Diamond|                   VG1|
|CC46077|6876-7378-4945-3251|       Gold|       44000| 49|         Diamond|                   VG1|
|CC46484|5556-4557-4566-1540|       Gold|       45000| 49|         Diamond|                   VG1|
|CC59340|5618-9718-9367-2102|       Gold|       14000| 25|         Diamond|                   VG1|
|CC62994|1652-7516-1273-1992|   Platinum|      180000| 48|         Diamond|                   VG1|
|CC43841|7212-8665-7734-5918|   Platinum|       55000| 30|         Diamond|                   VG1|
|CC21312|7837-4036-5999-1672|       Gold|       24000| 45|         Diamond|                   VG1|
|CC90510|6

### Alternate approach if they have different column names

## Left Outer Join transactions and frauds (not all transactions are frauds)

In [17]:
# Keep every transaction and attach its fraud record when one exists;
# transactions with no match get null in the fraud-side columns.
# NOTE(review): because the condition is a column expression rather than a
# column name, the result carries Transaction_ID from both inputs. Consider
# on="Transaction_ID" if a single key column is wanted downstream.
transaction_id_match = transactions_df["Transaction_ID"] == fraud_df["Transaction_ID"]
trans_frauds_df = transactions_df.join(fraud_df, on=transaction_id_match, how="left_outer")

trans_frauds_df.show()

+--------------+----------------+-------------------+-----------------+-------------------+--------------+----------+
|Transaction_ID|Transaction_Date|     Credit_Card_ID|Transaction_Value|Transaction_Segment|Transaction_ID|Fraud_Flag|
+--------------+----------------+-------------------+-----------------+-------------------+--------------+----------+
|  CTID28830551|       24-Apr-16|1629-9566-3285-2123|            23649|              SEG25|          null|      null|
|  CTID45504917|       11-Feb-16|3697-6001-4909-5350|            26726|              SEG16|          null|      null|
|  CTID47312290|        1-Nov-16|5864-4475-3659-1440|            22012|              SEG14|          null|      null|
|  CTID25637718|       28-Jan-16|5991-4421-8476-3804|            37637|              SEG17|          null|      null|
|  CTID66743960|       17-Mar-16|1893-8853-9900-8478|             5113|              SEG14|          null|      null|
|  CTID22308010|       15-May-16|5206-5979-9383-4538|   

## Identify customers who have experienced a fraudulent transaction (complex conditions)

### Join customer_cards_df with trans_frauds_df (with a condition where Fraud_Flag is not null)

In [18]:
# Pair each card with its transactions, but only those transactions whose
# Fraud_Flag is populated (i.e. that matched a row in the fraud table).
joinExpression = (
    (customer_cards_df["Card_Number"] == trans_frauds_df["Credit_Card_ID"])
    & trans_frauds_df["Fraud_Flag"].isNotNull()
)

customer_with_fraud_df = customer_cards_df.join(
    trans_frauds_df,
    joinExpression,
    "inner",
)

# Show the first five rows without truncating long cell values.
customer_with_fraud_df.show(n=5, truncate=False)

+-------+-------------------+-----------+------------+---+----------------+----------------------+--------------+----------------+-------------------+-----------------+-------------------+--------------+----------+
|Cust_ID|Card_Number        |Card_Family|Credit_Limit|Age|Customer_Segment|Customer_Vintage_Group|Transaction_ID|Transaction_Date|Credit_Card_ID     |Transaction_Value|Transaction_Segment|Transaction_ID|Fraud_Flag|
+-------+-------------------+-----------+------------+---+----------------+----------------------+--------------+----------------+-------------------+-----------------+-------------------+--------------+----------+
|CC87306|5734-5619-8469-4044|Gold       |36000       |30 |Diamond         |VG1                   |CTID26555772  |11-Jan-16       |5734-5619-8469-4044|683              |SEG22              |CTID26555772  |1         |
|CC87034|6722-7299-6082-7974|Gold       |34000       |36 |Platinum        |VG2                   |CTID30763806  |17-Dec-16       |6722-7299-

## Other scenarios

### Right outer join

## Full outer join

## Cross Join

## Broadcast Join

## Multiple Join conditions

### Performing Inner join on multiple columns

In [19]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()