In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 32 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 55.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=cb05a8575e1d451ecfa5e158a2b93d1cb608d0194088d3b339cc26e34fd73dad
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [33]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [34]:
# Create the Spark session and read the data
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('app').getOrCreate()

# The first CSV line holds the column names; column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/drive/MyDrive/Colab Notebooks/SAA/DATASETS/cereal.csv')
)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- mfr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- calories: integer (nullable = true)
 |-- protein: integer (nullable = true)
 |-- fat: integer (nullable = true)
 |-- sodium: integer (nullable = true)
 |-- fiber: double (nullable = true)
 |-- carbo: double (nullable = true)
 |-- sugars: integer (nullable = true)
 |-- potass: integer (nullable = true)
 |-- vitamins: integer (nullable = true)
 |-- shelf: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- cups: double (nullable = true)
 |-- rating: double (nullable = true)



In [35]:
# Features: keep three predictor columns followed by the target column.
# "rating" must stay last because the vectorisation step reads the label from the final position.
df = df.select("calories", "fiber", "sugars", "rating")

In [36]:
# Transform rows into (feature vector, label) pairs
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Every column except the last goes into a dense feature vector;
# the last column becomes the label.
trainingData = (
    df.rdd
    .map(lambda row: (Vectors.dense(row[:-1]), row[-1]))
    .toDF(["features", "label"])
)

In [37]:
# Train/test split with 0.7 / 0.3 weights and a fixed seed
train, test = trainingData.randomSplit([0.7, 0.3], seed=43)

# Report how many rows landed in each split
print(f"Training Dataset Count: {train.count()}")
print(f"Test Dataset Count: {test.count()}")

Training Dataset Count: 52
Test Dataset Count: 25


In [38]:
# Linear regression model
from pyspark.ml.regression import LinearRegression

# Regularised linear regression (regParam / elasticNetParam), capped at 10 iterations
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the estimator on the training split
lrModel = lr.fit(train)

# Show the learned coefficients and intercept
print(f"Coefficients: {lrModel.coefficients}")
print(f"Intercept: {lrModel.intercept}")

# Metrics computed over the training set
trainingSummary = lrModel.summary
print(f"numIterations: {trainingSummary.totalIterations:d}")
print(f"RMSE: {trainingSummary.rootMeanSquaredError:f}")
print(f"r2: {trainingSummary.r2:f}")

Coefficients: [-0.12966964814549908,2.4599553518649344,-1.8011152310769376]
Intercept: 63.68853155386343
numIterations: 8
RMSE: 5.321817
r2: 0.861852


In [39]:
# Predictions on the test split
from pyspark.ml.evaluation import RegressionEvaluator

# Apply the fitted model to the held-out rows and preview the first 10 predictions
lr_predictions = lrModel.transform(test)
lr_predictions.select("prediction", "label", "features").show(10)

# Evaluate the predictions against the labels using the r2 metric
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="r2",
)
print(
    f"R Squared (R2) on test data = {lr_evaluator.evaluate(lr_predictions):g}"
)

+------------------+---------+----------------+
|        prediction|    label|        features|
+------------------+---------+----------------+
| 57.20504914658847|60.756112|  [50.0,0.0,0.0]|
| 60.69482575781831|68.235885|  [80.0,3.0,0.0]|
| 46.13148253803675|55.333142|  [90.0,2.0,6.0]|
| 59.39812927636331|72.801787|  [90.0,3.0,0.0]|
|23.704838273159453|35.252444|[100.0,0.0,15.0]|
| 49.57929162902458|45.863324| [100.0,1.0,2.0]|
| 44.17594593579376|44.330856| [100.0,1.0,5.0]|
| 44.83478605658176|45.328074| [100.0,2.0,6.0]|
| 44.83478605658176|49.511874| [100.0,2.0,6.0]|
| 52.69808710167751|46.658844| [100.0,3.0,3.0]|
+------------------+---------+----------------+
only showing top 10 rows

R Squared (R2) on test data = 0.798318
