In [1]:
import getpass
import os

from pyspark.sql import SparkSession
from pyspark.sql import SparkSession, functions, types
# Path to the MySQL Connector/J jar that is put on the driver classpath so the
# JDBC reads/writes below can load "com.mysql.jdbc.Driver".
# Override it on other machines by setting MYSQL_CONNECTOR_JAR; the default is
# the location this notebook was originally developed against.
mysql_connector_jar = os.environ.get(
    "MYSQL_CONNECTOR_JAR",
    "/home/jim/spark-2.4.0-bin-hadoop2.7/jars/mysql-connector-java-5.1.49.jar",
)

# Create the session for this notebook (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName("Transform&Load")
    .config("spark.driver.extraClassPath", mysql_connector_jar)
    .getOrCreate()
)

# Only log warnings and above to keep cell output readable.
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext

### Read the data stored in a MySQL database table into a Spark DataFrame

In [2]:
# Database credentials are taken from the environment instead of being stored
# in the notebook. If INSURANCE_DB_PASSWORD is not set, the password is asked
# for interactively so it never ends up in the saved file.
db_user = os.environ.get("INSURANCE_DB_USER", "jsully")
db_password = os.environ.get("INSURANCE_DB_PASSWORD") or getpass.getpass(
    "MySQL password for {}: ".format(db_user)
)

# Load the raw `Insurance_data` table from the local `Insurance` database.
df = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:mysql://localhost/Insurance")
    .option("driver", "com.mysql.jdbc.Driver")
    .option("dbtable", "Insurance_data")
    .option("user", db_user)
    .option("password", db_password)
    .load()
)

In [3]:
# Quick sanity check: first five rows of the first eight columns only,
# so the printed table stays narrow enough to read.
preview_columns = df.columns[:8]
df.select(preview_columns).show(5)

+-----------+-------+-----------------------+---+--------+--------+-----------------+-----------------+
|Customer_ID|   City|Customer Lifetime Value|Age|Response|Coverage|Effective To Date|Employment_Status|
+-----------+-------+-----------------------+---+--------+--------+-----------------+-----------------+
|    AA23674|Chennai|                9996.58| 44|     Yes| Premium|       18-04-2019|         Employed|
|    AA37549| Indore|                4009.22| 45|     Yes|Extended|       12-04-2019|         On leave|
|    AA39942|   Pune|                5805.17| 27|      No| Premium|       18-08-2020|          Retired|
|    AA55170| Mumbai|                 5874.0| 35|      No| Premium|       21-08-2020|         On leave|
|    AA63877| Mumbai|                7854.84| 38|     Yes| Premium|       11-11-2018|         Employed|
+-----------+-------+-----------------------+---+--------+--------+-----------------+-----------------+
only showing top 5 rows



## Transform the Data
### Now we shall transform the data. The first step is to convert all categorical data to numerical, so that the ML model can process it more easily
### The Spark StringIndexer can be used here.
(https://spark.apache.org/docs/latest/ml-features.html#stringindexer)

In [7]:
from pyspark.ml.feature import StringIndexer

# Map each distinct city name to a numeric label stored in a new
# "CityIndex" column, keeping the original "City" column for now.
indexer = StringIndexer(inputCol="City", outputCol="CityIndex")
city_index_model = indexer.fit(df)
df = city_index_model.transform(df)

# Show the resulting city -> index mapping.
df.select('City', 'CityIndex').distinct().show()

+---------+---------+
|     City|CityIndex|
+---------+---------+
|Hyderabad|      6.0|
|  Kolkata|      4.0|
|   Indore|      5.0|
|     Pune|      1.0|
|Bengaluru|      8.0|
|   Bhopal|      3.0|
|   Mumbai|      7.0|
|  Chennai|      0.0|
|    Delhi|      2.0|
+---------+---------+



### Do this for all categorical columns

In [8]:
# Columns to be label-encoded. Each one gets a companion "<name> Index" column.
# NOTE(review): 'Total Claim Amount' looks like a numeric quantity by name;
# confirm it is really meant to be treated as categorical.
categ_cols = [
    'Response', 'Coverage', 'Education', 'Employment_Status', 'Gender',
    'Location_Code', 'Marital Status', 'Policy_Type', 'Policy_Rating',
    'Renew_Offer_Type', 'Sales_Channel', 'Total Claim Amount', 'Feedback',
    'Job', 'Company', 'Credit Card Provider',
]

# One StringIndexer per categorical column.
indexer_list = [
    StringIndexer(inputCol=col_name, outputCol=col_name + " Index")
    for col_name in categ_cols
]

# Fit and apply the indexers one after another; each pass appends one
# index column to the DataFrame.
for col_indexer in indexer_list:
    fitted = col_indexer.fit(df)
    df = fitted.transform(df)

### Drop the original categorical columns

In [22]:
# Keep only the columns that are not one of the original (string-valued)
# categorical columns; "City" was indexed separately, so remove it as well.
columns_to_remove = set(categ_cols) | {'City'}
remaining_columns = [name for name in df.columns if name not in columns_to_remove]
df = df.select(remaining_columns)

# Number of columns left after the clean-up.
len(df.columns)

33

#### There is a datetime column which is actually of type String. We can discard the day and month as they are unlikely to influence policy renewal choices when faced with multi-year insurance policies. We will retain only the year and also change the format from string to numeric.

In [44]:
# The table loaded from MySQL names this column "Effective To Date" (with
# spaces), while the rest of the notebook refers to "Effective_to_Date".
# Rename it here so the cell also works on a fresh kernel; the guard makes the
# rename a no-op if the column already carries the underscore name.
if "Effective To Date" in df.columns and "Effective_to_Date" not in df.columns:
    df = df.withColumnRenamed("Effective To Date", "Effective_to_Date")

# Dates are strings of the form DD-MM-YYYY; keep only the third
# "-"-separated part, i.e. the year (still a string at this point).
df = df.withColumn(
    "Effective_to_Date",
    functions.split("Effective_to_Date", "-").getItem(2),
)

# List the distinct years that remain.
df.select('Effective_to_Date').distinct().show()

+-----------------+
|Effective_to_Date|
+-----------------+
|             2020|
|             2019|
|             2018|
+-----------------+



In [45]:
# Convert the year, which is still a string after the split, to an integer column.
year_as_int = functions.col("Effective_to_Date").cast(types.IntegerType())
df = df.withColumn("Effective_to_Date", year_as_int)

## Load Data
### Export the DataFrame to a database table in MySQL

In [70]:
# Database credentials are taken from the environment instead of being stored
# in the notebook. If INSURANCE_DB_PASSWORD is not set, the password is asked
# for interactively so it never ends up in the saved file.
db_user = os.environ.get("INSURANCE_DB_USER", "jsully")
db_password = os.environ.get("INSURANCE_DB_PASSWORD") or getpass.getpass(
    "MySQL password for {}: ".format(db_user)
)

# Persist the transformed DataFrame as the `Insurance_numdata` table.
# mode('errorifexists') makes the write fail rather than overwrite or append
# if that table is already present in the database.
(
    df.write
    .format("jdbc")
    .option("url", "jdbc:mysql://localhost/Insurance")
    .option("driver", "com.mysql.jdbc.Driver")
    .option("dbtable", "Insurance_numdata")
    .option("user", db_user)
    .option("password", db_password)
    .mode('errorifexists')
    .save()
)