## PySpark TP2 : Abre de décision et forêt aléatoire

#### Kostadinovic Nemanja 

***

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import LinearRegression


from pyspark.ml.feature import StringIndexer, VectorIndexer, StandardScaler, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.sql.functions import col

## Exercice 1 : Arbre de décision (sample libsvm)

In [2]:
data = spark.read.format("libsvm").load('/usr/local/Cellar/apache-spark/3.0.1/libexec/data/mllib/sample_libsvm_data.txt')

In [3]:
#data.collect()

In [4]:
type(data)

pyspark.sql.dataframe.DataFrame

In [5]:
(training_data, test_data) = data.randomSplit([0.7, 0.3])

In [6]:
dtree = DecisionTreeClassifier(featuresCol='features', labelCol='label', maxDepth=5, impurity='gini')

In [7]:
model = dtree.fit(training_data)

In [8]:
predictions = model.transform(test_data)

In [9]:
predictions.show(5)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 5 rows



In [10]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % ((1.0 - accuracy)*100))

Test Error = 6.06061


In [11]:
model.explainParams

<bound method Params.explainParams of DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c040821ca7e1, depth=1, numNodes=3, numClasses=2, numFeatures=692>

In [12]:
model.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c040821ca7e1, depth=1, numNodes=3, numClasses=2, numFeatures=692\n  If (feature 406 <= 126.5)\n   Predict: 0.0\n  Else (feature 406 > 126.5)\n   Predict: 1.0\n'

## Exercice 2 : Arbre de décision (duke breast cancer)

***

In [13]:
data = spark.read.format("libsvm").load('data/duke')
(training_data, test_data) = data.randomSplit([0.7, 0.3])
dtree = DecisionTreeClassifier(featuresCol='features', labelCol='label', maxDepth=7, impurity='gini')
model = dtree.fit(training_data)
predictions = model.transform(test_data)

In [14]:
predictions.show(5)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(7129,[0,1,2,3,4,...|   [13.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(7129,[0,1,2,3,4,...|   [13.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(7129,[0,1,2,3,4,...|   [0.0,15.0]|  [0.0,1.0]|       1.0|
|  0.0|(7129,[0,1,2,3,4,...|   [13.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(7129,[0,1,2,3,4,...|   [0.0,15.0]|  [0.0,1.0]|       1.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 5 rows



In [15]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % ((1.0 - accuracy)*100))

Test Error = 40


In [16]:
model.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4dcb470c90ce, depth=2, numNodes=5, numClasses=2, numFeatures=7129\n  If (feature 2389 <= 0.0686425)\n   Predict: 1.0\n  Else (feature 2389 > 0.0686425)\n   If (feature 0 <= 1.012746)\n    Predict: 0.0\n   Else (feature 0 > 1.012746)\n    Predict: 1.0\n'

## Exercice 3 : Forêt aléatoire/RandomForest (libsvm data) 

In [17]:
data = spark.read.format("libsvm").load('/usr/local/Cellar/apache-spark/3.0.1/libexec/data/mllib/sample_libsvm_data.txt')
(training_data, test_data) = data.randomSplit([0.7, 0.3])

In [18]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=3, impurity='gini')

In [19]:
model = rf.fit(training_data)

In [20]:
predictions = model.transform(test_data)

In [21]:
predictions.show(5)

+-----+--------------------+-------------+--------------------+----------+
|label|            features|rawPrediction|         probability|prediction|
+-----+--------------------+-------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|    [3.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0|(692,[121,122,123...|    [3.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|    [3.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|    [3.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|    [2.0,1.0]|[0.66666666666666...|       0.0|
+-----+--------------------+-------------+--------------------+----------+
only showing top 5 rows



In [22]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % ((1.0 - accuracy)*100))
print(accuracy)

Test Error = 0
1.0


In [23]:
model.toDebugString

'RandomForestClassificationModel: uid=RandomForestClassifier_36f3860c49cb, numTrees=3, numClasses=2, numFeatures=692\n  Tree 0 (weight 1.0):\n    If (feature 434 <= 70.5)\n     Predict: 0.0\n    Else (feature 434 > 70.5)\n     Predict: 1.0\n  Tree 1 (weight 1.0):\n    If (feature 324 <= 60.0)\n     If (feature 518 <= 21.0)\n      Predict: 0.0\n     Else (feature 518 > 21.0)\n      Predict: 1.0\n    Else (feature 324 > 60.0)\n     If (feature 517 <= 43.0)\n      Predict: 0.0\n     Else (feature 517 > 43.0)\n      Predict: 1.0\n  Tree 2 (weight 1.0):\n    If (feature 379 <= 59.0)\n     If (feature 271 <= 9.5)\n      If (feature 662 <= 7.0)\n       Predict: 1.0\n      Else (feature 662 > 7.0)\n       Predict: 0.0\n     Else (feature 271 > 9.5)\n      Predict: 0.0\n    Else (feature 379 > 59.0)\n     If (feature 680 <= 141.5)\n      Predict: 1.0\n     Else (feature 680 > 141.5)\n      Predict: 0.0\n'

## Exercice 4 : Forêt aléatoire/RandomForest (Covtype)

In [24]:
data = spark.read.format("libsvm").load('data/covtype.libsvm.binary')

In [25]:
data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(54,[0,1,2,3,5,6,...|
|  1.0|(54,[0,1,2,3,4,5,...|
|  2.0|(54,[0,1,2,3,4,5,...|
|  2.0|(54,[0,1,2,3,4,5,...|
|  1.0|(54,[0,1,2,3,4,5,...|
+-----+--------------------+
only showing top 5 rows



In [26]:
(training_data, test_data) = data.randomSplit([0.7, 0.3])
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=5, impurity='gini')
model = rf.fit(training_data)
predictions = model.transform(test_data)

In [27]:
predictions.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  1.0|(54,[0,1,2,3,4,5,...|[0.0,2.2062167068...|[0.0,0.4412433413...|       2.0|
|  1.0|(54,[0,1,2,3,4,5,...|[0.0,2.2062167068...|[0.0,0.4412433413...|       2.0|
|  1.0|(54,[0,1,2,3,4,5,...|[0.0,2.2062167068...|[0.0,0.4412433413...|       2.0|
|  1.0|(54,[0,1,2,3,4,5,...|[0.0,2.2062167068...|[0.0,0.4412433413...|       2.0|
|  1.0|(54,[0,1,2,3,4,5,...|[0.0,2.2062167068...|[0.0,0.4412433413...|       2.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [28]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % ((1.0 - accuracy)*100))

Test Error = 26.3171


In [29]:
model.numClasses

3

## Exercice 5 : Forêt aléatoire RedWine Quality

#### Import et préprocessing en Python

In [30]:
data_wine = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=';')

In [31]:
data_wine.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


In [32]:
data_wine.loc[data_wine['quality'] < 5, 'Classe'] = 0 #mauvaise qualité

In [33]:
data_wine.loc[data_wine['quality'] >= 5, 'Classe'] = 1 #bonne qualité

In [34]:
data_wine = data_wine.drop('quality',axis=1) #on en a plus besoin

In [35]:
data_wine.sample(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,Classe
988,7.7,0.39,0.12,1.7,0.097,19.0,27.0,0.99596,3.16,0.49,9.4,1.0
1557,6.6,0.855,0.02,2.4,0.062,15.0,23.0,0.99627,3.54,0.6,11.0,1.0


#### Passage à Spark

In [36]:
df_features = spark.createDataFrame(data_wine, list(data_wine.columns))
df_features.show(2)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|Classe|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|   1.0|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|   1.0|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+------+
only showing top 2 rows



In [37]:
assembler = VectorAssembler(inputCols = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'], outputCol = 'features')

df = assembler.transform(df_features)
df = df.select('features','Classe')
df.show(6)

+--------------------+------+
|            features|Classe|
+--------------------+------+
|[7.4,0.7,0.0,1.9,...|   1.0|
|[7.8,0.88,0.0,2.6...|   1.0|
|[7.8,0.76,0.04,2....|   1.0|
|[11.2,0.28,0.56,1...|   1.0|
|[7.4,0.7,0.0,1.9,...|   1.0|
|[7.4,0.66,0.0,1.8...|   1.0|
+--------------------+------+
only showing top 6 rows



In [38]:
scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeatures').fit(df)
df_scaled = scaler.transform(df)
df_scaled.show(6)

+--------------------+------+--------------------+
|            features|Classe|      scaledFeatures|
+--------------------+------+--------------------+
|[7.4,0.7,0.0,1.9,...|   1.0|[4.25019565141442...|
|[7.8,0.88,0.0,2.6...|   1.0|[4.47993595689628...|
|[7.8,0.76,0.04,2....|   1.0|[4.47993595689628...|
|[11.2,0.28,0.56,1...|   1.0|[6.43272855349210...|
|[7.4,0.7,0.0,1.9,...|   1.0|[4.25019565141442...|
|[7.4,0.66,0.0,1.8...|   1.0|[4.25019565141442...|
+--------------------+------+--------------------+
only showing top 6 rows



In [39]:
(training_data, test_data) = df_scaled.randomSplit([0.7, 0.3])

In [40]:
rf = RandomForestClassifier(featuresCol='scaledFeatures', labelCol='Classe',numTrees=3, impurity='gini')

In [41]:
model = rf.fit(training_data)

In [42]:
predictions = model.transform(test_data)

In [43]:
predictions.show(5)

+--------------------+------+--------------------+--------------------+--------------------+----------+
|            features|Classe|      scaledFeatures|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+--------------------+----------+
|[5.2,0.32,0.25,1....|   1.0|[2.98662397126419...|[0.05862396522536...|[0.01954132174178...|       1.0|
|[5.2,0.34,0.0,1.8...|   1.0|[2.98662397126419...|[0.06217152985906...|[0.02072384328635...|       1.0|
|[5.6,0.5,0.09,2.3...|   1.0|[3.21636427674605...|[0.06217152985906...|[0.02072384328635...|       1.0|
|[5.6,0.5,0.09,2.3...|   1.0|[3.21636427674605...|[0.06217152985906...|[0.02072384328635...|       1.0|
|[5.6,0.85,0.05,1....|   1.0|[3.21636427674605...|[0.08899665719352...|[0.02966555239784...|       1.0|
+--------------------+------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [44]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="Classe", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % ((1.0 - accuracy)*100))

Test Error = 3.73626


#### Comparaison avec le RandomForest de sklearn sur redwine data

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [46]:
X = data_wine.drop('Classe', axis=1)
X.sample(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1439,7.3,0.67,0.02,2.2,0.072,31.0,92.0,0.99566,3.32,0.68,11.066667
544,14.3,0.31,0.74,1.8,0.075,6.0,15.0,1.0008,2.86,0.79,8.4


In [47]:
y = data_wine['Classe']
y.sample(2)

239    1.0
402    1.0
Name: Classe, dtype: float64

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [49]:
clf = RandomForestClassifier(n_estimators=3,criterion='gini')

In [50]:
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=3)

In [51]:
prediction = clf.predict(X_test)

In [52]:
print("Test Error = %g" % ((1.0 - accuracy_score(y_test, prediction))*100))

Test Error = 4.92424


## Exercice 6 : Régréssion linéaire

In [53]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer

In [54]:
data_lpsa = spark.read.csv('/usr/local/Cellar/apache-spark/3.0.1/libexec/data/mllib/ridge-data/lpsa.data', sep=' ')

In [55]:
#data_lpsa.collect()

In [56]:
data_lpsa.columns

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8']

In [57]:
data_lpsa = data_lpsa.select(*(col(c).cast("float").alias(c) for c in data_lpsa.columns))

In [58]:
assembler = VectorAssembler(inputCols = ['_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8'], outputCol = 'features')
data = assembler.transform(data_lpsa).select('features','_c0')
data.show(6)

+--------------------+----------+
|            features|       _c0|
+--------------------+----------+
|[-1.6373556852340...|-0.4307829|
|[-1.9889804124832...|-0.1625189|
|[-1.5788189172744...|-0.1625189|
|[-2.1669170856475...|-0.1625189|
|[-0.5078744888305...| 0.3715636|
|[-2.0361285209655...| 0.7654678|
+--------------------+----------+
only showing top 6 rows



In [59]:
data = data.selectExpr("features as features", "_c0 as label")

In [60]:
train,test = data.randomSplit([0.75, 0.25])

In [61]:
model = LinearRegression(maxIter=100)

In [62]:
lrModel = model.fit(train)

In [63]:
print("Coefficients: " + str(lrModel.coefficients))
print("\nIntercept: " + str(lrModel.intercept))

Coefficients: [0.8463532973898743,0.055231493095207695,-0.14388277767264118,0.2805316537322125,0.33357173139950064,-0.49861394474532983,-0.17412877943973856,0.4547890514296306]

Intercept: 2.40495913259291


In [64]:
trainSummary = lrModel.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("\nr2: %f" % trainSummary.r2)

RMSE: 0.678600

r2: 0.708419


In [65]:
predictions = lrModel.transform(test)

In [66]:
predictions.show()

+--------------------+----------+------------------+
|            features|     label|        prediction|
+--------------------+----------+------------------+
|[-2.2883305549621...| 1.2669476|0.7509031379017597|
|[-1.5788189172744...|-0.1625189|0.5901593859027527|
|[-1.5240614414215...| 2.9626925| 0.873161758945654|
|[-0.9907206892967...| 1.5993876|  1.81874373922193|
|[-0.9034516811370...| 1.9242487|1.8726805015959123|
|[-0.7103073596954...| 1.8000582|2.2283874834717188|
|[-0.6852776408195...|  2.806386|1.9682179715933543|
|[-0.5078744888305...| 0.3715636|1.7426454864024101|
|[-0.2189723700284...| 2.6567569| 2.508676105649968|
|[-0.1324315369129...| 2.9729753|2.3886997381162915|
|[-0.0633337870240...|  2.008214|2.4070629458405133|
|[0.06202037259936...| 2.1575594|0.8712527665663519|
|[0.11549910157918...| 2.5217206|  2.53690878816636|
|[0.22349804639816...| 1.2669476|2.2509795626525437|
|[0.26634132862091...| 2.5533438| 2.799888206618476|
|[0.57088696956634...| 2.9204698|3.13204317002

## Exercice 7 : Régréssion linéaire

In [68]:
data = spark.read.csv('./data/fertility_Diagnosis.txt', sep=',')
data.show()

+-----+----+---+---+---+---+---+---+----+---+
|  _c0| _c1|_c2|_c3|_c4|_c5|_c6|_c7| _c8|_c9|
+-----+----+---+---+---+---+---+---+----+---+
|-0.33|0.69|  0|  1|  1|  0|0.8|  0|0.88|  N|
|-0.33|0.94|  1|  0|  1|  0|0.8|  1|0.31|  O|
|-0.33| 0.5|  1|  0|  0|  0|  1| -1| 0.5|  N|
|-0.33|0.75|  0|  1|  1|  0|  1| -1|0.38|  N|
|-0.33|0.67|  1|  1|  0|  0|0.8| -1| 0.5|  O|
|-0.33|0.67|  1|  0|  1|  0|0.8|  0| 0.5|  N|
|-0.33|0.67|  0|  0|  0| -1|0.8| -1|0.44|  N|
|-0.33|   1|  1|  1|  1|  0|0.6| -1|0.38|  N|
|    1|0.64|  0|  0|  1|  0|0.8| -1|0.25|  N|
|    1|0.61|  1|  0|  0|  0|  1| -1|0.25|  N|
|    1|0.67|  1|  1|  0| -1|0.8|  0|0.31|  N|
|    1|0.78|  1|  1|  1|  0|0.6|  0|0.13|  N|
|    1|0.75|  1|  1|  1|  0|0.8|  1|0.25|  N|
|    1|0.81|  1|  0|  0|  0|  1| -1|0.38|  N|
|    1|0.94|  1|  1|  1|  0|0.2| -1|0.25|  N|
|    1|0.81|  1|  1|  0|  0|  1|  1| 0.5|  N|
|    1|0.64|  1|  0|  1|  0|  1| -1|0.38|  N|
|    1|0.69|  1|  0|  1|  0|0.8| -1|0.25|  O|
|    1|0.75|  1|  1|  1|  0|  1|  

In [69]:
indexer = StringIndexer(inputCol='_c9', outputCol="_c10") 
indexed = indexer.fit(data).transform(data) 
indexed.show()

+-----+----+---+---+---+---+---+---+----+---+----+
|  _c0| _c1|_c2|_c3|_c4|_c5|_c6|_c7| _c8|_c9|_c10|
+-----+----+---+---+---+---+---+---+----+---+----+
|-0.33|0.69|  0|  1|  1|  0|0.8|  0|0.88|  N| 0.0|
|-0.33|0.94|  1|  0|  1|  0|0.8|  1|0.31|  O| 1.0|
|-0.33| 0.5|  1|  0|  0|  0|  1| -1| 0.5|  N| 0.0|
|-0.33|0.75|  0|  1|  1|  0|  1| -1|0.38|  N| 0.0|
|-0.33|0.67|  1|  1|  0|  0|0.8| -1| 0.5|  O| 1.0|
|-0.33|0.67|  1|  0|  1|  0|0.8|  0| 0.5|  N| 0.0|
|-0.33|0.67|  0|  0|  0| -1|0.8| -1|0.44|  N| 0.0|
|-0.33|   1|  1|  1|  1|  0|0.6| -1|0.38|  N| 0.0|
|    1|0.64|  0|  0|  1|  0|0.8| -1|0.25|  N| 0.0|
|    1|0.61|  1|  0|  0|  0|  1| -1|0.25|  N| 0.0|
|    1|0.67|  1|  1|  0| -1|0.8|  0|0.31|  N| 0.0|
|    1|0.78|  1|  1|  1|  0|0.6|  0|0.13|  N| 0.0|
|    1|0.75|  1|  1|  1|  0|0.8|  1|0.25|  N| 0.0|
|    1|0.81|  1|  0|  0|  0|  1| -1|0.38|  N| 0.0|
|    1|0.94|  1|  1|  1|  0|0.2| -1|0.25|  N| 0.0|
|    1|0.81|  1|  1|  0|  0|  1|  1| 0.5|  N| 0.0|
|    1|0.64|  1|  0|  1|  0|  1

In [70]:
indexed = indexed.drop('_c9')

In [71]:
data = indexed.select(*(col(c).cast("float").alias(c) for c in indexed.columns))

In [72]:
data.show()

+-----+----+---+---+---+----+---+----+----+----+
|  _c0| _c1|_c2|_c3|_c4| _c5|_c6| _c7| _c8|_c10|
+-----+----+---+---+---+----+---+----+----+----+
|-0.33|0.69|0.0|1.0|1.0| 0.0|0.8| 0.0|0.88| 0.0|
|-0.33|0.94|1.0|0.0|1.0| 0.0|0.8| 1.0|0.31| 1.0|
|-0.33| 0.5|1.0|0.0|0.0| 0.0|1.0|-1.0| 0.5| 0.0|
|-0.33|0.75|0.0|1.0|1.0| 0.0|1.0|-1.0|0.38| 0.0|
|-0.33|0.67|1.0|1.0|0.0| 0.0|0.8|-1.0| 0.5| 1.0|
|-0.33|0.67|1.0|0.0|1.0| 0.0|0.8| 0.0| 0.5| 0.0|
|-0.33|0.67|0.0|0.0|0.0|-1.0|0.8|-1.0|0.44| 0.0|
|-0.33| 1.0|1.0|1.0|1.0| 0.0|0.6|-1.0|0.38| 0.0|
|  1.0|0.64|0.0|0.0|1.0| 0.0|0.8|-1.0|0.25| 0.0|
|  1.0|0.61|1.0|0.0|0.0| 0.0|1.0|-1.0|0.25| 0.0|
|  1.0|0.67|1.0|1.0|0.0|-1.0|0.8| 0.0|0.31| 0.0|
|  1.0|0.78|1.0|1.0|1.0| 0.0|0.6| 0.0|0.13| 0.0|
|  1.0|0.75|1.0|1.0|1.0| 0.0|0.8| 1.0|0.25| 0.0|
|  1.0|0.81|1.0|0.0|0.0| 0.0|1.0|-1.0|0.38| 0.0|
|  1.0|0.94|1.0|1.0|1.0| 0.0|0.2|-1.0|0.25| 0.0|
|  1.0|0.81|1.0|1.0|0.0| 0.0|1.0| 1.0| 0.5| 0.0|
|  1.0|0.64|1.0|0.0|1.0| 0.0|1.0|-1.0|0.38| 0.0|
|  1.0|0.69|1.0|0.0|

In [73]:
data.columns

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c10']

In [74]:
assembler = VectorAssembler(inputCols = ['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8'], outputCol = 'features')
data = assembler.transform(data).select('features','_c10')
data.show(6)

+--------------------+----+
|            features|_c10|
+--------------------+----+
|[-0.3300000131130...| 0.0|
|[-0.3300000131130...| 1.0|
|[-0.3300000131130...| 0.0|
|[-0.3300000131130...| 0.0|
|[-0.3300000131130...| 1.0|
|[-0.3300000131130...| 0.0|
+--------------------+----+
only showing top 6 rows



In [75]:
data = data.selectExpr("features as features", "_c10 as label")

In [76]:
train,test = data.randomSplit([0.75, 0.25])

In [77]:
model = LinearRegression(maxIter=100)

In [78]:
lrModel = model.fit(train)

In [79]:
print("Coefficients: " + str(lrModel.coefficients))
print("\nIntercept: " + str(lrModel.intercept))

Coefficients: [0.03598957965088354,0.3860661315644554,-0.0005654692699647865,-0.11784704431071193,-0.036734795329805794,-0.041828883663344925,-0.40464073593127525,0.015095558803912107,0.20769899709470827]

Intercept: 0.18978600024481734


In [80]:
trainSummary = lrModel.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("\nr2: %f" % trainSummary.r2)

RMSE: 0.300867

r2: 0.093373


In [81]:
predictions = lrModel.transform(test)

In [82]:
predictions.show()

+--------------------+-----+--------------------+
|            features|label|          prediction|
+--------------------+-----+--------------------+
|[-1.0,0.529999971...|  0.0| 0.10805932115099928|
|[-1.0,0.529999971...|  0.0|-0.14208398430308977|
|[-1.0,0.529999971...|  0.0|-0.00150338585027...|
|[-1.0,0.579999983...|  0.0| 0.17747585330427118|
|[-1.0,0.639999985...|  0.0| 0.02086395848546918|
|[-1.0,0.670000016...|  0.0|-0.09803617886582472|
|[-1.0,0.720000028...|  0.0|-0.06455748962786934|
|[-1.0,0.779999971...|  0.0|0.049816182107487506|
|[-1.0,1.0,1.0,0.0...|  0.0|  0.2698737019627987|
|[-0.3300000131130...|  0.0| 0.15342139723723314|
|[-0.3300000131130...|  0.0|0.004259578489254373|
|[-0.3300000131130...|  0.0|-0.00457701452193...|
|[-0.3300000131130...|  0.0| 0.27227873197999397|
|[-0.3300000131130...|  1.0| 0.25928099146820877|
|[1.0,0.5600000023...|  0.0| 0.11305840871357574|
|[1.0,0.6700000166...|  0.0|  0.1160628797555846|
|[1.0,0.6700000166...|  0.0|  0.1485303605955668|
