In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [2]:
# starting the Spark server
spark = (
    SparkSession.builder.master("local[1]")
    .appName("titanic_assignment")
    .getOrCreate()
)

23/05/18 12:04:03 WARN Utils: Your hostname, all-MS-7D35 resolves to a loopback address: 127.0.1.1; using 192.168.1.116 instead (on interface enp2s0)
23/05/18 12:04:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/18 12:04:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/18 12:04:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/18 12:04:04 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/05/18 12:04:04 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
# reading the CSV file
df = spark.read.csv("./data/titanic.csv", inferSchema=True)

In [4]:
df.show()

+---+---+---+--------------------+------+----+---+---+----------------+-------+----+----+-------------------+
|_c0|_c1|_c2|                 _c3|   _c4| _c5|_c6|_c7|             _c8|    _c9|_c10|_c11|               _c12|
+---+---+---+--------------------+------+----+---+---+----------------+-------+----+----+-------------------+
|  1|  0|  3|Braund, Mr. Owen ...|  male|  22|  1|  0|       A/5 21171|   7.25|null|   S|2020-01-01 13:45:25|
|  2|  1|  1|Cumings, Mrs. Joh...|female|  38|  1|  0|        PC 17599|71.2833| C85|   C|2020-01-01 13:44:48|
|  3|  1|  3|Heikkinen, Miss. ...|female|  26|  0|  0|STON/O2. 3101282|  7.925|null|   S|2020-01-01 13:38:11|
|  4|  1|  1|Futrelle, Mrs. Ja...|female|  35|  1|  0|          113803|   53.1|C123|   S|2020-01-01 13:32:00|
|  5|  0|  3|Allen, Mr. Willia...|  male|  35|  0|  0|          373450|   8.05|null|   S|2020-01-01 13:36:30|
|  6|  0|  3|    Moran, Mr. James|  male|null|  0|  0|          330877| 8.4583|null|   Q|2020-01-01 13:31:39|
|  7|  0| 

In [5]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: timestamp (nullable = true)



In [6]:
# renaming some columns
df = df.withColumnRenamed("_c0", "id") \
       .withColumnRenamed("_c3", "Name") \
       .withColumnRenamed("_c4", "gender") \
       .withColumnRenamed("_c5", "age") 

#dropping the timestamp column
df = df.drop("_c12")
df.show(10)

+---+---+---+--------------------+------+----+---+---+----------------+-------+----+----+
| id|_c1|_c2|                Name|gender| age|_c6|_c7|             _c8|    _c9|_c10|_c11|
+---+---+---+--------------------+------+----+---+---+----------------+-------+----+----+
|  1|  0|  3|Braund, Mr. Owen ...|  male|  22|  1|  0|       A/5 21171|   7.25|null|   S|
|  2|  1|  1|Cumings, Mrs. Joh...|female|  38|  1|  0|        PC 17599|71.2833| C85|   C|
|  3|  1|  3|Heikkinen, Miss. ...|female|  26|  0|  0|STON/O2. 3101282|  7.925|null|   S|
|  4|  1|  1|Futrelle, Mrs. Ja...|female|  35|  1|  0|          113803|   53.1|C123|   S|
|  5|  0|  3|Allen, Mr. Willia...|  male|  35|  0|  0|          373450|   8.05|null|   S|
|  6|  0|  3|    Moran, Mr. James|  male|null|  0|  0|          330877| 8.4583|null|   Q|
|  7|  0|  1|McCarthy, Mr. Tim...|  male|  54|  0|  0|           17463|51.8625| E46|   S|
|  8|  0|  3|Palsson, Master. ...|  male|   2|  3|  1|          349909| 21.075|null|   S|
|  9|  1| 

# Calculating minimum, maximum and average values for numerical columns

In [7]:
# printing unique values in _c1
df.select("_c1").distinct().collect()

[Row(_c1=1), Row(_c1=0)]

In [8]:
# printing unique values in _c1
df.select("_c2").distinct().collect()

[Row(_c2=1), Row(_c2=3), Row(_c2=2)]

### From the above two results, we can safely assume that both these columns are CATEGORICAL 
#### since they only contain 2 or 3 unique values and were label encoded

In [9]:
df = df.withColumnRenamed("_c1", "category_1") \
       .withColumnRenamed("_c2", "category_2")  

In [10]:
df.select("_c6").distinct().collect()

[Row(_c6=1),
 Row(_c6=3),
 Row(_c6=5),
 Row(_c6=4),
 Row(_c6=8),
 Row(_c6=2),
 Row(_c6=0)]

In [11]:
df.select("_c7").distinct().collect()

[Row(_c7=1),
 Row(_c7=6),
 Row(_c7=3),
 Row(_c7=5),
 Row(_c7=4),
 Row(_c7=2),
 Row(_c7=0)]

In [12]:
df.show()

+---+----------+----------+--------------------+------+----+---+---+----------------+-------+----+----+
| id|category_1|category_2|                Name|gender| age|_c6|_c7|             _c8|    _c9|_c10|_c11|
+---+----------+----------+--------------------+------+----+---+---+----------------+-------+----+----+
|  1|         0|         3|Braund, Mr. Owen ...|  male|  22|  1|  0|       A/5 21171|   7.25|null|   S|
|  2|         1|         1|Cumings, Mrs. Joh...|female|  38|  1|  0|        PC 17599|71.2833| C85|   C|
|  3|         1|         3|Heikkinen, Miss. ...|female|  26|  0|  0|STON/O2. 3101282|  7.925|null|   S|
|  4|         1|         1|Futrelle, Mrs. Ja...|female|  35|  1|  0|          113803|   53.1|C123|   S|
|  5|         0|         3|Allen, Mr. Willia...|  male|  35|  0|  0|          373450|   8.05|null|   S|
|  6|         0|         3|    Moran, Mr. James|  male|null|  0|  0|          330877| 8.4583|null|   Q|
|  7|         0|         1|McCarthy, Mr. Tim...|  male|  54|  0|

### Checking for null values in all the numerical columns

In [13]:
df.filter(col("_c6").isNull()).show()

+---+----------+----------+----+------+---+---+---+---+---+----+----+
| id|category_1|category_2|Name|gender|age|_c6|_c7|_c8|_c9|_c10|_c11|
+---+----------+----------+----+------+---+---+---+---+---+----+----+
+---+----------+----------+----+------+---+---+---+---+---+----+----+



In [14]:
df.filter(col("_c7").isNull()).show()

+---+----------+----------+----+------+---+---+---+---+---+----+----+
| id|category_1|category_2|Name|gender|age|_c6|_c7|_c8|_c9|_c10|_c11|
+---+----------+----------+----+------+---+---+---+---+---+----+----+
+---+----------+----------+----+------+---+---+---+---+---+----+----+



In [15]:
df.filter(col("_c9").isNull()).show()

+---+----------+----------+----+------+---+---+---+---+---+----+----+
| id|category_1|category_2|Name|gender|age|_c6|_c7|_c8|_c9|_c10|_c11|
+---+----------+----------+----+------+---+---+---+---+---+----+----+
+---+----------+----------+----+------+---+---+---+---+---+----+----+



In [16]:
df.filter(col("age").isNull()).show()

+---+----------+----------+--------------------+------+----+---+---+---------------+--------+----+----+
| id|category_1|category_2|                Name|gender| age|_c6|_c7|            _c8|     _c9|_c10|_c11|
+---+----------+----------+--------------------+------+----+---+---+---------------+--------+----+----+
|  6|         0|         3|    Moran, Mr. James|  male|null|  0|  0|         330877|  8.4583|null|   Q|
| 18|         1|         2|Williams, Mr. Cha...|  male|null|  0|  0|         244373|    13.0|null|   S|
| 20|         1|         3|Masselmani, Mrs. ...|female|null|  0|  0|           2649|   7.225|null|   C|
| 27|         0|         3|Emir, Mr. Farred ...|  male|null|  0|  0|           2631|   7.225|null|   C|
| 29|         1|         3|"O'Dwyer, Miss. E...|female|null|  0|  0|         330959|  7.8792|null|   Q|
| 30|         0|         3| Todoroff, Mr. Lalio|  male|null|  0|  0|         349216|  7.8958|null|   S|
| 32|         1|         1|Spencer, Mrs. Wil...|female|null|  1|

### We observe that out of all the numerical columns, only age has null values
### To handle these values, we shall impute them with the average of the age column

In [17]:
avg_age = int(df.select(avg("age")).collect()[0][0])
avg_age

29

In [18]:
df = df.fillna({'age':avg_age})
df.show()

+---+----------+----------+--------------------+------+---+---+---+----------------+-------+----+----+
| id|category_1|category_2|                Name|gender|age|_c6|_c7|             _c8|    _c9|_c10|_c11|
+---+----------+----------+--------------------+------+---+---+---+----------------+-------+----+----+
|  1|         0|         3|Braund, Mr. Owen ...|  male| 22|  1|  0|       A/5 21171|   7.25|null|   S|
|  2|         1|         1|Cumings, Mrs. Joh...|female| 38|  1|  0|        PC 17599|71.2833| C85|   C|
|  3|         1|         3|Heikkinen, Miss. ...|female| 26|  0|  0|STON/O2. 3101282|  7.925|null|   S|
|  4|         1|         1|Futrelle, Mrs. Ja...|female| 35|  1|  0|          113803|   53.1|C123|   S|
|  5|         0|         3|Allen, Mr. Willia...|  male| 35|  0|  0|          373450|   8.05|null|   S|
|  6|         0|         3|    Moran, Mr. James|  male| 29|  0|  0|          330877| 8.4583|null|   Q|
|  7|         0|         1|McCarthy, Mr. Tim...|  male| 54|  0|  0|      

In [19]:
df.filter(col("age").isNull()).show()

+---+----------+----------+----+------+---+---+---+---+---+----+----+
| id|category_1|category_2|Name|gender|age|_c6|_c7|_c8|_c9|_c10|_c11|
+---+----------+----------+----+------+---+---+---+---+---+----+----+
+---+----------+----------+----+------+---+---+---+---+---+----+----+



### Now, we can calculate the max, min, and average of all numerical columns

In [20]:
max_age = int(df.agg({"age": "max"}).collect()[0][0])
min_age = int(df.agg({"age": "min"}).collect()[0][0])
avg_age = int(df.agg({"age":"avg"}).collect()[0][0])

In [21]:
{"max_age":max_age, "min_age":min_age, "avg_age":avg_age} 

{'max_age': 80, 'min_age': 0, 'avg_age': 29}

In [22]:
max_c6 = int(df.agg({"_c6": "max"}).collect()[0][0])
min_c6 = int(df.agg({"_c6": "min"}).collect()[0][0])
avg_c6 = int(df.agg({"_c6":"avg"}).collect()[0][0])

In [23]:
{"max_c6":max_c6, "min_c6":min_c6, "avg_c6":avg_c6} 

{'max_c6': 8, 'min_c6': 0, 'avg_c6': 0}

In [24]:
max_c7 = int(df.agg({"_c7": "max"}).collect()[0][0])
min_c7 = int(df.agg({"_c7": "min"}).collect()[0][0])
avg_c7 = int(df.agg({"_c7":"avg"}).collect()[0][0])

In [25]:
{"max_c7":max_c7, "min_c7":min_c7, "avg_c7":avg_c7} 

{'max_c7': 6, 'min_c7': 0, 'avg_c7': 0}

In [26]:
max_c9 = int(df.agg({"_c9": "max"}).collect()[0][0])
min_c9 = int(df.agg({"_c9": "min"}).collect()[0][0])
avg_c9 = int(df.agg({"_c9":"avg"}).collect()[0][0])

In [27]:
{"max_c9":max_c9, "min_c9":min_c9, "avg_c9":avg_c9} 

{'max_c9': 512, 'min_c9': 0, 'avg_c9': 32}

# Applying UDF that will change the last letter of every word in categorical columns to “1”.

In [28]:
# creating a User Defined Function
def change_to_1(value):
    if value is None:
        return('1')
    
    words = value.split()
    result = []
    for word in words:
        new_word = word[:-1] + '1'
        result.append(new_word)
        

    final_result = ' '.join(result)
    
    return final_result

In [29]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- category_1: integer (nullable = true)
 |-- category_2: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)



In [30]:
# changing the datatype of 'category_1' and 'category_2' columns from integer to string
df2 = df.withColumn("category_1",col("category_1").cast(StringType())) \
    .withColumn("category_2",col("category_2").cast(StringType()))

df2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- category_1: string (nullable = true)
 |-- category_2: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)



In [31]:
# registering the UDF
change_word_udf = udf(change_to_1, StringType())

# applying the UDF to the categorical columns
df_with_changed_last_word = df2.withColumn('category_1', change_word_udf('category_1')) \
                               .withColumn('category_2', change_word_udf('category_2')) \
                               .withColumn('gender', change_word_udf('gender')) \
                               .withColumn('_c8', change_word_udf('_c8')) \
                               .withColumn('_c10', change_word_udf('_c10')) \
                               .withColumn('_c11', change_word_udf('_c11'))
'''
    we did not consider Name to be a categorical column
'''
# Show the updated DataFrame
df_with_changed_last_word.show()

+---+----------+----------+--------------------+------+---+---+---+----------------+-------+----+----+
| id|category_1|category_2|                Name|gender|age|_c6|_c7|             _c8|    _c9|_c10|_c11|
+---+----------+----------+--------------------+------+---+---+---+----------------+-------+----+----+
|  1|         1|         1|Braund, Mr. Owen ...|  mal1| 22|  1|  0|       A/1 21171|   7.25|   1|   1|
|  2|         1|         1|Cumings, Mrs. Joh...|femal1| 38|  1|  0|        P1 17591|71.2833| C81|   1|
|  3|         1|         1|Heikkinen, Miss. ...|femal1| 26|  0|  0|STON/O21 3101281|  7.925|   1|   1|
|  4|         1|         1|Futrelle, Mrs. Ja...|femal1| 35|  1|  0|          113801|   53.1|C121|   1|
|  5|         1|         1|Allen, Mr. Willia...|  mal1| 35|  0|  0|          373451|   8.05|   1|   1|
|  6|         1|         1|    Moran, Mr. James|  mal1| 29|  0|  0|          330871| 8.4583|   1|   1|
|  7|         1|         1|McCarthy, Mr. Tim...|  mal1| 54|  0|  0|      

                                                                                

### the dataframe is already sorted by the first column

In [32]:
# now saving this df to a parquet file
df_with_changed_last_word.write.parquet('./final_df_parquet')

                                                                                