In [0]:
# Instantaite spark Session
from pyspark.sql import SparkSession

In [0]:
# Create spark instance
spark = SparkSession.builder.appName('lr_project').getOrCreate()

In [0]:
# Load the data
df = spark.read.csv('/FileStore/tables/cruise_ship_info.csv', inferSchema=True, header=True)

In [0]:
[print(row) for row in df.head(2)[1]]

Quest
Azamara
6
30.276999999999997
6.94
5.94
3.55
42.64
3.55
Out[14]: [None, None, None, None, None, None, None, None, None]

In [0]:
df.columns

Out[8]: ['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [0]:
# Importing linear regression
from pyspark.ml.regression import LinearRegression

In [0]:
# Prepare our dataset for spark machine readable format
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
df.columns

Out[17]: ['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [0]:
df.groupBy('Cruise_line').count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
indexer = StringIndexer(inputCol='Cruise_line',outputCol='cruise_cat')
indexed = indexer.fit(df).transform(df)
indexed.head(2)

Out[57]: [Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0)]

In [0]:
# Assembling features into vector
assembler = VectorAssembler(inputCols=['Age','Tonnage','passengers','length','cabins','passenger_density','cruise_cat',], outputCol='features')

In [0]:
type(assembler)

Out[24]: pyspark.ml.feature.VectorAssembler

In [0]:
# Now transforming the assembler to the dataframe
output = assembler.transform(indexed)

In [0]:
output.head(1)

Out[60]: [Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0, features=DenseVector([6.0, 30.277, 6.94, 5.94, 3.55, 42.64, 16.0]))]

In [0]:
# Getting the right columns
final_df = output.select(['features','crew'])

In [0]:
final_df.show(n=2)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
+--------------------+----+
only showing top 2 rows



In [0]:
# Train test division
train_data, test_data = final_df.randomSplit([0.8, 0.2])

In [0]:
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               133|
|   mean| 7.626090225563921|
| stddev|3.3115303233862674|
|    min|              0.59|
|    max|              13.6|
+-------+------------------+



In [0]:
test_data.describe().show()

+-------+----------------+
|summary|            crew|
+-------+----------------+
|  count|              25|
|   mean|          8.6884|
| stddev|4.35790630157801|
|    min|             1.8|
|    max|            21.0|
+-------+----------------+



In [0]:
# Fitting the model
lr = LinearRegression(labelCol='crew')
lr_model = lr.fit(train_data)

In [0]:
# Evaluate Model
evaluate = lr_model.evaluate(test_data)

In [0]:
evaluate.r2, evaluate.rootMeanSquaredError

Out[70]: (0.8633709237550353, 1.5782836985436364)

In [0]:
evaluate.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|  0.6866160706923488|
|  0.6003454000228494|
|  0.5267237726813381|
| -0.4788986677841969|
| -0.9742268687589011|
| -0.4308820429533817|
|  0.7045825635922345|
|   7.233748795168413|
|-0.42379834184464915|
|-0.03452709360135042|
| -1.0369631367375263|
|-0.45554556354549725|
| -1.0232775833784498|
|  0.5925780408764378|
|   -0.39911429216796|
| -0.5832855685888054|
|  0.6773529254467157|
|-0.20111727079962316|
|   0.686440836120509|
| -1.2380297527591437|
+--------------------+
only showing top 20 rows



In [0]:
final_df.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [0]:
# Prediction for unlabeled data
unlabeled_data= test_data.select('features')

In [0]:
prediction = lr_model.transform(unlabeled_data)

In [0]:
prediction.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[4.0,220.0,54.0,1...| 20.31338392930765|
|[6.0,110.23899999...| 10.89965459997715|
|[9.0,81.0,21.44,9...| 9.473276227318662|
|[10.0,86.0,21.14,...| 9.678898667784196|
|[10.0,138.0,31.14...|  12.8242268687589|
|[11.0,90.09,25.01...| 8.910882042953382|
|[11.0,91.0,20.32,...| 9.285417436407766|
|[11.0,110.0,29.74...|11.866251204831588|
|[12.0,58.6,15.66,...| 7.423798341844649|
|[13.0,30.27699999...|  4.03452709360135|
|[13.0,138.0,31.14...|12.796963136737526|
|[14.0,101.509,27....|10.455545563545497|
|[16.0,74.137,19.5...|  8.62327758337845|
|[16.0,77.71300000...| 8.497421959123562|
|[17.0,70.0,20.76,...|  7.59911429216796|
|[18.0,51.004,9.4,...| 6.033285568588806|
|[18.0,70.367,20.5...| 8.522647074553284|
|[19.0,16.8,2.96,5...|2.3011172707996232|
|[19.0,70.367,20.5...|  8.51355916387949|
|[23.0,14.745,3.08...|3.0380297527591438|
+--------------------+------------

In [0]:
test_data.select('crew').show()

+-----+
| crew|
+-----+
| 21.0|
| 11.5|
| 10.0|
|  9.2|
|11.85|
| 8.48|
| 9.99|
| 19.1|
|  7.0|
|  4.0|
|11.76|
| 10.0|
|  7.6|
| 9.09|
|  7.2|
| 5.45|
|  9.2|
|  2.1|
|  9.2|
|  1.8|
+-----+
only showing top 20 rows

