In [0]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start one with this app name.
spark = (
    SparkSession.builder
    .appName("PySpark Linear Regression")
    .getOrCreate()
)

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Read the LibSVM-formatted sample file into a DataFrame, then preview the first rows.
libsvm_reader = spark.read.format('libsvm')
training = libsvm_reader.load("/FileStore/tables/sample_linear_regression_data.txt")
training.show()

In [0]:
lr = LinearRegression(featuresCol = 'features', labelCol = 'label', predictionCol = 'prediction' ) # replace these with the real column names of your data

In [0]:
ldModel = lr.fit(training)

In [0]:
ldModel.coefficients

In [0]:
ldModel.intercept

In [0]:
#my_data = my_data.drop(*['Batsman', 'Bowler', 'Id'])
## import sql function pyspark
# import pyspark.sql.functions as f

# # null values in each column
# data_agg = my_data.agg(*[f.count(f.when(f.isnull(c), c)).alias(c) for c in my_data.columns])
# data_agg.show()

In [0]:
training_summary = ldModel.summary

In [0]:
training_summary.rootMeanSquaredError

In [0]:
training_summary.r2

In [0]:
# this is copied from https://runawayhorse001.github.io/LearningApacheSpark/regression.html
# thank George Feng for such a good spark learning file.

def modelsummary(model):
    """Print an R-style coefficient table and fit metrics for a fitted linear model.

    Parameters
    ----------
    model
        A fitted model exposing ``coefficients`` (iterable of numbers),
        ``intercept`` (number) and ``summary``. The summary must provide
        ``coefficientStandardErrors``, ``tValues``, ``pValues``,
        ``meanSquaredError``, ``rootMeanSquaredError``, ``r2`` and
        ``totalIterations``.

    Returns
    -------
    None
        Everything is written to standard output. One table row is printed per
        entry of ``summary.pValues``; the estimates are the coefficients
        followed by the intercept, so the intercept lines up with the last row.
    """
    print("Note: the last rows are the information for Intercept")
    print("##", "-------------------------------------------------")
    print("##", "  Estimate   |   Std.Error | t Values  |  P-value")

    # Coefficients first, intercept last, matching the row order of the table.
    estimates = list(model.coefficients) + [model.intercept]
    summary = model.summary

    # Read each statistics sequence once rather than on every loop iteration
    # (on a Spark summary each attribute read is presumably a call into the
    # JVM -- TODO confirm).
    std_errors = summary.coefficientStandardErrors
    t_values = summary.tValues
    p_values = summary.pValues

    for estimate, std_error, t_value, p_value in zip(estimates, std_errors, t_values, p_values):
        print(
            "##",
            '{:10.6f}'.format(estimate),
            '{:10.6f}'.format(std_error),
            '{:8.3f}'.format(t_value),
            '{:10.6f}'.format(p_value),
        )

    print("##", '---')
    print(
        "##",
        "Mean squared error: % .6f" % summary.meanSquaredError,
        ", RMSE: % .6f" % summary.rootMeanSquaredError,
    )
    # Keep this literal on a single line: a backslash continuation inside the
    # string would embed the source indentation in the printed text.
    print(
        "##",
        "Multiple R-squared: %f" % summary.r2,
        ", Total iterations: %i" % summary.totalIterations,
    )

In [0]:
modelsummary(ldModel)

In [0]:
all_data = spark.read.format('libsvm').load("/FileStore/tables/sample_linear_regression_data.txt")

In [0]:
# Fixed seed so the 70/30 split, and every metric computed from it, is the same on each run.
train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
train_data.describe().show()

In [0]:
test_data.describe().show()

In [0]:
correct_model = lr.fit(train_data)

In [0]:
test_results = correct_model.evaluate(test_data)

In [0]:
test_results.r2

In [0]:
test_results.rootMeanSquaredError

In [0]:
unlabeled_data = test_data.select('features')

In [0]:
unlabeled_data.show()

In [0]:
predictions = correct_model.transform(unlabeled_data)

In [0]:
predictions.show()  # once deployed to predict on new data, we can't evaluate, because there are no true labels to compare against

In [0]:
 from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [0]:
# Reuse the active Spark session if there is one; otherwise start one with this app name.
spark = (
    SparkSession.builder
    .appName("PySpark Linear Regression")
    .getOrCreate()
)


In [0]:
data = spark.read.csv("/FileStore/tables/Ecommerce_Customers.csv", inferSchema = True, header = True)

In [0]:
data.printSchema()

In [0]:
data.head(1)[0]

In [0]:
# Print every field value of the first row, one per line.
first_row = data.head(1)[0]
for value in first_row:
    print(value)

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# only the numerical columns are used as features for now
data.columns

In [0]:
# Combine the four selected input columns into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=['Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'],
    outputCol='features',
)

In [0]:
output = assembler.transform(data)

In [0]:
output.printSchema()

In [0]:
output.head(1)

In [0]:
final_data = output.select('features','Yearly Amount Spent')

In [0]:
final_data.show()

In [0]:
# Fixed seed so the 70/30 split, and every metric computed from it, is the same on each run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
train_data.describe().show()

In [0]:
test_data.describe().show()

In [0]:
lr = LinearRegression(labelCol = 'Yearly Amount Spent')  # use default values if not specified

In [0]:
lr_model = lr.fit(train_data)

In [0]:
test_results = lr_model.evaluate(test_data)

In [0]:
test_results.residuals.show()

In [0]:
test_results.rootMeanSquaredError

In [0]:
test_results.r2

In [0]:
final_data.describe().show()

In [0]:
unlabeled_data = test_data.select('features')

In [0]:
unlabeled_data.show()

In [0]:
# Run the fitted model over the feature-only rows and display the result.
predictions = lr_model.transform(unlabeled_data)
predictions.show()

In [0]:
 data = spark.read.csv("/FileStore/tables/cruise_ship_info.csv", header = True, inferSchema = True)

In [0]:
data.printSchema()

In [0]:
# Print the first five rows, leaving two blank lines after each one.
for ship in data.head(5):
    print(ship, end="\n\n\n")

In [0]:
data.groupBy('Cruise_line').count().show()

In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
# Encode the 'Cruise_line' strings as numeric indices in a new 'cruise_cat' column.
indexer = StringIndexer(inputCol = 'Cruise_line', outputCol = 'cruise_cat')
# The indexer is fitted on, and then applied to, the same full DataFrame.
indexed = indexer.fit(data).transform(data)

In [0]:
indexed.head(1)  # label encoding

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
indexed.columns

In [0]:
# Combine the ship attributes and the indexed cruise line into a single 'features' vector column.
# NOTE(review): 'cruise_cat' is a StringIndexer label index, i.e. a nominal category, yet a
# linear model will treat it as an ordered magnitude -- consider one-hot encoding it first;
# confirm whether this is intended.
assembler = VectorAssembler(inputCols= ['Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density', 
 'cruise_cat'], 
  outputCol = 'features'
)

In [0]:
output = assembler.transform(indexed)

In [0]:
output.select('features', 'crew').show()

In [0]:
final_data =output.select('features', 'crew')

In [0]:
# Fixed seed so the 70/30 split, and every metric computed from it, is the same on each run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
 train_data.describe().show()

In [0]:
 test_data.describe().show()

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
lr = LinearRegression(labelCol = 'crew')

In [0]:
lr_model = lr.fit(train_data)

In [0]:
# NOTE: despite the name, these are evaluation results computed on the held-out test_data,
# not on the training set.
train_model = lr_model.evaluate(test_data)

In [0]:
train_model.r2

In [0]:
train_model.rootMeanSquaredError  # this looks pretty good; compare it with the describe() output below

In [0]:
train_data.describe().show()

In [0]:
train_model.meanAbsoluteError

In [0]:
from pyspark.sql.functions import corr #pearson correlation
data.describe().show()

In [0]:
data.select(corr('crew', 'passengers')).show()

In [0]:
data.select(corr('crew', 'cabins')).show()