In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook, registered under the
# application name 'linreg_project'.
spark = (
    SparkSession.builder
    .appName('linreg_project')
    .getOrCreate()
)

In [3]:
# Load the cruise-ship CSV: the first row supplies the column names and
# column types are inferred from the values.
data = spark.read.csv(
    'cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Display the column names of the loaded DataFrame.
data.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [5]:
# Print each column's type and nullability to confirm the inferred schema.
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [7]:
# Show every field of the first record, one value per line, to get a feel
# for the raw data.
first_row = data.head(1)[0]
for value in first_row:
    print(value)

Journey
Azamara
6
30.276999999999997
6.94
5.94
3.55
42.64
3.55


In [15]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [27]:
# Encode the categorical 'Cruise_line' string column as a numeric index
# column named 'Cruise_line_idx'.
str_indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_line_idx')

# This cell reassigns `data`, so on a re-run `data` already contains
# 'Cruise_line_idx' and the transform would fail because the output column
# exists. Dropping it first (a no-op when the column is absent) makes the
# cell safe to execute any number of times.
data_without_idx = data.drop('Cruise_line_idx')
data = str_indexer.fit(data_without_idx).transform(data_without_idx)

In [28]:
# Input columns that are packed into the single 'features' vector column.
# NOTE(review): 'Age' is not among the inputs -- confirm that is intentional.
feature_cols = [
    'Cruise_line_idx',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
]
assemb = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Append the assembled vector column to the indexed data.
assemb_data = assemb.transform(data)

# Display the first assembled row as a sanity check.
assemb_data.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Cruise_line_idx=16.0, features=DenseVector([16.0, 30.277, 6.94, 5.94, 3.55, 42.64]))]

In [32]:
# Split the assembled data roughly 70% / 30% into training and test sets.
# A fixed seed keeps the split (and therefore the metrics reported below)
# reproducible across kernel restarts; without it every run samples a
# different partition of the rows.
train_data, test_data = assemb_data.randomSplit([0.7, 0.3], seed=42)

In [31]:
from pyspark.ml.regression import LinearRegression

In [35]:
# Configure a linear regression that predicts 'crew' from the assembled
# 'features' vector column (spelled out here; it is the estimator's default).
lin_reg = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

# Fit the estimator on the training split only.
linreg_model = lin_reg.fit(train_data)

In [37]:
# Evaluate the fitted model on the held-out test split. The returned object
# exposes metrics such as `r2` and `rootMeanSquaredError`, read in later cells.
# NOTE(review): despite its name, this appears to be an evaluation summary
# rather than a DataFrame of per-row predictions.
predictions = linreg_model.evaluate(test_data)

In [39]:
# Coefficient of determination (R^2) of the model on the test split.
predictions.r2

0.9591272792008518

In [40]:
# Root mean squared error of the model on the test split.
predictions.rootMeanSquaredError

0.6800155135327185