In [1]:
# Always needs to be done on the Raspberry Pi:
# findspark.init() is given the local Spark install path before pyspark is imported
import findspark

findspark.init('/home/baxman/spark-2.4.7-bin-hadoop2.7')

from pyspark.sql import SparkSession

# Reuse an existing session if there is one, otherwise create one named 'hyundai'
spark = (
    SparkSession.builder
    .appName('hyundai')
    .getOrCreate()
)

In [None]:
'''The aim here is to predict the number of crew required for ships based on their features. All features appear important.
Cruise_line data needs to be converted from a string to a number because it counts a lot for the crew usage.
Only linear regression will be used here, for simplicity.'''

In [2]:
# Import LinReg from MLlib
from pyspark.ml.regression import LinearRegression

In [3]:
# Load the cruise ship dataset from CSV: the first row supplies the column names
# and Spark infers each column's type from the values
data = spark.read.csv(
    '/home/baxman/Codes/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the first rows of the loaded data
data.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [5]:
# Print the column names and inferred types of the loaded data
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [18]:
# Map each Cruise_line string to a numeric index (new column: cruise_line_index)
# NOTE(review): the index is only a category id, yet the regression below treats it as a
# numeric magnitude - consider one-hot encoding it instead; confirm with the author.

from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(
    inputCol="Cruise_line",
    outputCol="cruise_line_index",
)
# Learn the string -> index mapping from the data, then apply it to the same data
indexer_model = indexer.fit(data)
indexed = indexer_model.transform(data)

In [19]:
# Preview the data with the added cruise_line_index column
indexed.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_line_index|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|              1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|              1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|              1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|       

In [20]:
# Setting up data for usage in MLlib

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
# List the available columns; not ALL of them are needed as features (e.g. the string ones)
indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_line_index']

In [27]:
# Columns used as model inputs: every numeric column except the label 'crew',
# plus the indexed cruise line (the raw string columns are left out)
feature_columns = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'cruise_line_index',
]

# Assemble those columns into a single vector column named 'features' for MLlib
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [28]:
# Apply the assembler to the indexed data; this adds the 'features' vector column
output = assembler.transform(indexed)

In [30]:
# Check the schema of the assembled data (the new 'features' column is of type vector)
output.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- cruise_line_index: double (nullable = false)
 |-- features: vector (nullable = true)



In [31]:
# Keep only what the model needs: the assembled feature vector and the label 'crew'

final_data = output.select(['features', 'crew'])

In [32]:
# Preview the (features, crew) pairs used for modelling
final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [36]:
# Train/test split: roughly 80% of the rows for training, 20% for testing.
# The split is random, so a fixed seed is passed to make it - and every metric
# computed from it further down - repeatable across kernel restarts
# (for the same input data and partitioning).
training_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [37]:
# Summary statistics (count, mean, stddev, min, max) of the label in the training split
training_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               123|
|   mean| 7.832032520325214|
| stddev|3.5578711997968115|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [38]:
# Summary statistics (count, mean, stddev, min, max) of the label in the test split
test_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|               35|
|   mean|7.661142857142855|
| stddev|3.351721920436237|
|    min|             0.59|
|    max|             13.6|
+-------+-----------------+



In [40]:
# Configure a linear regression that reads the assembled 'features' column and
# predicts 'crew', then fit it on the training split only

lr = LinearRegression(featuresCol='features', labelCol='crew')
lr_model = lr.fit(training_data)

In [41]:
# Evaluate the fitted model on the held-out test split

test_results = lr_model.evaluate(test_data)

In [42]:
# Residuals on the test split (actual crew minus predicted crew)
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| -0.8448755683926414|
|-0.29553353702649154|
|-0.22625082438666766|
| -0.5081998678606983|
|-0.23209551652924176|
|  1.0544708322420036|
|-0.14059105389222637|
|  1.6939164436315348|
|-0.13956503775907425|
| -0.3515124459735137|
| -0.1996768185042601|
|   0.946989857375037|
| 0.09501238512145704|
|  0.8315704760289027|
| -1.1416645127258622|
| 0.29870512364335733|
| 0.31529516662649826|
| -1.0393802464757025|
| 0.28411067772909604|
|  0.5736023287921874|
+--------------------+
only showing top 20 rows



In [43]:
# Show the root mean squared error (RMSE) on the test split
test_results.rootMeanSquaredError

0.6324210153646302

In [49]:
# Show the R-squared (R2) score on the test split
test_results.r2

0.963350694085215

In [51]:
# Check the correlation between crew size and passenger count over the full dataset.
# Finding: very strong correlation between crew number and passenger numbers - as expected.

from pyspark.sql.functions import corr

crew_passenger_corr = corr('crew', 'passengers')
data.select(crew_passenger_corr).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [45]:
# Summary statistics of the label over the full dataset, for comparison with the
# training and test splits and to put the RMSE in context

final_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [46]:
# Predict on unlabeled data: simulate it by keeping only the feature vectors
# of the test split (the 'crew' label is dropped)

unlabeled_data = test_data.select(['features'])
unlabeled_data.show(n=20, truncate=True)

+--------------------+
|            features|
+--------------------+
|[6.0,30.276999999...|
|[6.0,158.0,43.7,1...|
|[7.0,158.0,43.7,1...|
|[8.0,110.0,29.74,...|
|[9.0,59.058,17.0,...|
|[9.0,113.0,26.74,...|
|[9.0,116.0,26.0,9...|
|[10.0,46.0,7.0,6....|
|[10.0,68.0,10.8,7...|
|[10.0,81.76899999...|
|[11.0,86.0,21.24,...|
|[11.0,108.977,26....|
|[12.0,42.0,14.8,7...|
|[13.0,91.0,20.32,...|
|[13.0,138.0,31.14...|
|[14.0,30.27699999...|
|[14.0,33.0,4.9,5....|
|[14.0,63.0,14.4,7...|
|[14.0,83.0,17.5,9...|
|[15.0,70.367,20.5...|
+--------------------+
only showing top 20 rows



In [47]:
# Predict crew numbers based on the features; transform() adds a 'prediction' column

predictions = lr_model.transform(unlabeled_data)

In [48]:
# Show each feature vector next to its predicted crew value
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[6.0,30.276999999...| 4.394875568392641|
|[6.0,158.0,43.7,1...|13.895533537026491|
|[7.0,158.0,43.7,1...|13.826250824386667|
|[8.0,110.0,29.74,...|12.108199867860698|
|[9.0,59.058,17.0,...| 7.632095516529242|
|[9.0,113.0,26.74,...|11.325529167757997|
|[9.0,116.0,26.0,9...|11.140591053892226|
|[10.0,46.0,7.0,6....| 2.776083556368465|
|[10.0,68.0,10.8,7...| 6.499565037759075|
|[10.0,81.76899999...| 8.771512445973514|
|[11.0,86.0,21.24,...|  9.49967681850426|
|[11.0,108.977,26....|11.053010142624963|
|[12.0,42.0,14.8,7...| 6.704987614878543|
|[13.0,91.0,20.32,...| 9.158429523971098|
|[13.0,138.0,31.14...|12.901664512725862|
|[14.0,30.27699999...|3.4312948763566427|
|[14.0,33.0,4.9,5....| 2.924704833373502|
|[14.0,63.0,14.4,7...| 6.649380246475703|
|[14.0,83.0,17.5,9...| 9.165889322270903|
|[15.0,70.367,20.5...| 8.626397671207812|
+--------------------+------------