In [7]:
import pyspark as ps
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline

In [26]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [2]:
sc = spark.sparkContext

In [6]:
df = spark.read.csv('s3a://jm-uk-bucket/studentVle.csv', 
                    header=True,
                   inferSchema=True)

In [8]:
df.show()

+-----------+-----------------+----------+-------+----+---------+
|code_module|code_presentation|id_student|id_site|date|sum_click|
+-----------+-----------------+----------+-------+----+---------+
|        AAA|            2013J|     28400| 546652| -10|        4|
|        AAA|            2013J|     28400| 546652| -10|        1|
|        AAA|            2013J|     28400| 546652| -10|        1|
|        AAA|            2013J|     28400| 546614| -10|       11|
|        AAA|            2013J|     28400| 546714| -10|        1|
|        AAA|            2013J|     28400| 546652| -10|        8|
|        AAA|            2013J|     28400| 546876| -10|        2|
|        AAA|            2013J|     28400| 546688| -10|       15|
|        AAA|            2013J|     28400| 546662| -10|       17|
|        AAA|            2013J|     28400| 546890| -10|        1|
|        AAA|            2013J|     28400| 547011| -10|        1|
|        AAA|            2013J|     28400| 547013| -10|        1|
|        A

In [9]:
df.printSchema()

root
 |-- code_module: string (nullable = true)
 |-- code_presentation: string (nullable = true)
 |-- id_student: integer (nullable = true)
 |-- id_site: integer (nullable = true)
 |-- date: integer (nullable = true)
 |-- sum_click: integer (nullable = true)



In [13]:
df.describe().show()

+-------+-----------+-----------------+-----------------+------------------+-----------------+------------------+
|summary|code_module|code_presentation|       id_student|           id_site|             date|         sum_click|
+-------+-----------+-----------------+-----------------+------------------+-----------------+------------------+
|  count|   10655280|         10655280|         10655280|          10655280|         10655280|          10655280|
|   mean|       null|             null|733333.5668717293|  738323.416399569|95.17399955702713|3.7169458709672574|
| stddev|       null|             null|582705.9825107378|131219.62216193537|76.07130084050284| 8.849046655101438|
|    min|        AAA|            2013B|             6516|            526721|              -25|                 1|
|    max|        GGG|            2014J|          2698588|           1049562|              269|              6977|
+-------+-----------+-----------------+-----------------+------------------+------------

In [17]:
vec_ass = VectorAssembler(inputCols=['id_site', 'date'],
                         outputCol = 'features')

In [18]:
df_vector = vec_ass.transform(df)

In [19]:
df_vector.show()

+-----------+-----------------+----------+-------+----+---------+----------------+
|code_module|code_presentation|id_student|id_site|date|sum_click|        features|
+-----------+-----------------+----------+-------+----+---------+----------------+
|        AAA|            2013J|     28400| 546652| -10|        4|[546652.0,-10.0]|
|        AAA|            2013J|     28400| 546652| -10|        1|[546652.0,-10.0]|
|        AAA|            2013J|     28400| 546652| -10|        1|[546652.0,-10.0]|
|        AAA|            2013J|     28400| 546614| -10|       11|[546614.0,-10.0]|
|        AAA|            2013J|     28400| 546714| -10|        1|[546714.0,-10.0]|
|        AAA|            2013J|     28400| 546652| -10|        8|[546652.0,-10.0]|
|        AAA|            2013J|     28400| 546876| -10|        2|[546876.0,-10.0]|
|        AAA|            2013J|     28400| 546688| -10|       15|[546688.0,-10.0]|
|        AAA|            2013J|     28400| 546662| -10|       17|[546662.0,-10.0]|
|   

In [21]:
scaler = MinMaxScaler(inputCol='features', outputCol='scaledfeatures')

In [22]:
scaler_transformer = scaler.fit(df_vector)

In [23]:
scaled_data = scaler_transformer.transform(df_vector)

In [27]:
scaled_data.show(5)

+-----------+-----------------+----------+-------+----+---------+----------------+--------------------+
|code_module|code_presentation|id_student|id_site|date|sum_click|        features|      scaledfeatures|
+-----------+-----------------+----------+-------+----+---------+----------------+--------------------+
|        AAA|            2013J|     28400| 546652| -10|        4|[546652.0,-10.0]|[0.03812057585384...|
|        AAA|            2013J|     28400| 546652| -10|        1|[546652.0,-10.0]|[0.03812057585384...|
|        AAA|            2013J|     28400| 546652| -10|        1|[546652.0,-10.0]|[0.03812057585384...|
|        AAA|            2013J|     28400| 546614| -10|       11|[546614.0,-10.0]|[0.03804789601427...|
|        AAA|            2013J|     28400| 546714| -10|        1|[546714.0,-10.0]|[0.03823915874998...|
+-----------+-----------------+----------+-------+----+---------+----------------+--------------------+
only showing top 5 rows



In [28]:
train, test = scaled_data.randomSplit([0.7, 0.3], seed=1234)

In [31]:
lr = LinearRegression(featuresCol='features', labelCol='sum_click', regParam=0.3)

In [32]:
lrModel = lr.fit(train)

In [33]:
lrModel.coefficients

DenseVector([0.0, 0.0027])

In [34]:
summ = lrModel.summary

In [37]:
summ.r2

0.0005974244594275646