In [1]:
# Echo the `spark` object as the cell output.
# NOTE(review): assumes the notebook runtime pre-defines `spark`; nothing in this notebook creates it — confirm.
spark

In [2]:
%sh
# Install the third-party packages this notebook imports.
# The extras specifiers are quoted: unquoted square brackets are a shell glob
# pattern and could be expanded against file names in the working directory.
# NOTE(review): versions are not pinned, so a re-run may pick up different releases — consider pinning.
pip install koalas
pip install lxml
pip install pymongo
pip install 'pymongo[srv]'
pip install 'pymongo[tls]'
pip install dnspython
pip install mlflow

In [3]:
import databricks.koalas as ks
import lxml
from pymongo import MongoClient
from pyspark.sql.types import *
import pyspark.sql
from pyspark.sql import SQLContext
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.regression import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import mlflow
import pandas as pd

In [4]:
# Connect to MongoDB using a connection string taken from the environment, so
# that the user name and password are never stored in the notebook source.
# Any password that was previously committed in this cell should be treated as
# compromised and rotated.
import os

mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string, e.g. "
        "mongodb+srv://<user>:<password>@<host>/dataPI_db?retryWrites=true&w=majority"
    )

client = MongoClient(mongodb_uri)

In [5]:
# Open the 'dataPI_db' database and its 'performance' collection.
db = client['dataPI_db']
info_p = db['performance']

# Count the documents in the collection (the result is not stored).
info_p.count_documents({})

# Materialise every document into pandas, drop Mongo's '_id' column,
# then wrap the result in a Koalas DataFrame.
pandas_df = pd.DataFrame(list(info_p.find()))
df = pandas_df.drop(columns=['_id'])
df_koalas = ks.DataFrame(df)
df_koalas.head()

# Encode the target / label / y column as an integer: 'Home' -> 0, 'Away' -> 1.
winner_codes = {'Home': 0, 'Away': 1}
df_koalas['Winner'] = df_koalas['Winner'].replace(winner_codes).astype(int)

In [6]:
# Manual feature reduction: keep only a hand-picked subset of columns,
# ending with the 'Winner' target column.
selected_columns = [
    'D-1%', 'D-BO', 'D-CG', 'D-CM', 'D-IF', 'D-MI',
    'F-BO', 'F-CM', 'F-FA', 'F-GA', 'F-KE', 'F-MI', 'F-RB',
    'M-1%', 'M-GA', 'M-KE', 'M-RB',
    'R-1%', 'R-BO', 'R-FF', 'R-GA', 'R-HB', 'R-HO', 'R-MI', 'R-RB', 'R-TK', 'R-UP',
    'RLP',
    'Same / Neutral Venue',
    'Winner',
]
df = df_koalas[selected_columns]

df.head()

In [7]:
# Machine Learning Section

# Convert the reduced Koalas frame into a Spark DataFrame; the feature
# assembler in the next cell transforms this `test` frame.
# NOTE(review): despite its name, `test` holds the full data set, not a test split.
test = df.to_spark()

In [8]:
from pyspark.ml.feature import StringIndexer

# Indexer from 'Winner' to 'label'. It is only collected in `stages`;
# this cell neither fits nor applies it.
label_stringIdx = StringIndexer(inputCol='Winner', outputCol='label')

numericCols = [
    'D-1%', 'D-BO', 'D-CG', 'D-CM', 'D-IF', 'D-MI',
    'F-BO', 'F-CM', 'F-FA', 'F-GA', 'F-KE', 'F-MI', 'F-RB',
    'M-1%', 'M-GA', 'M-KE', 'M-RB',
    'R-1%', 'R-BO', 'R-FF', 'R-GA', 'R-HB', 'R-HO', 'R-MI', 'R-RB', 'R-TK', 'R-UP',
    'RLP',
]
assemblerInputs = numericCols

# Packs the listed columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

stages = [label_stringIdx, assembler]

# Apply the assembler directly: the scaler and models in later cells read
# the "features" column it adds.
dataset = assembler.transform(test)

In [9]:
# 70/30 random split with a fixed seed, then print the row count of each part.
split_weights = [0.7, 0.3]
trainingData, testData = dataset.randomSplit(split_weights, seed=100)
for split in (trainingData, testData):
    print(split.count())

In [10]:
# Re-import Spark's StandardScaler: the import cell later imports a class of the
# same name from sklearn.preprocessing, which would otherwise shadow this one.
from pyspark.ml.feature import StandardScaler

# Centre and scale the 'features' vector into 'scaled_features'.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
    withMean=True,
    withStd=True,
)

# Fit on the training split only.
scaler_model = scaler.fit(trainingData)

In [11]:
# Apply the fitted scaler to both splits.
scaled_trainingData, scaled_testData = (
    scaler_model.transform(split) for split in (trainingData, testData)
)

In [12]:
# Rename the integer-encoded 'Winner' column to 'label' in both splits.
scaled_trainingData, scaled_testData = (
    split.withColumnRenamed('Winner', 'label')
    for split in (scaled_trainingData, scaled_testData)
)


In [13]:
# Fit and Evaluate Models

# Logistic regression on the scaled feature vector, limited to 10 iterations.
lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', maxIter=10)
lrModel = lr.fit(scaled_trainingData)

In [14]:
# Score the held-out split with the fitted logistic-regression model.
predictions = lrModel.transform(scaled_testData)

# No rename is needed here: 'Winner' was already renamed to 'label' on the scaled splits.

In [15]:
# Attribute access on the predictions DataFrame.
# NOTE(review): presumably this only echoes a column reference, not row values — looks like leftover exploration; confirm.
predictions.prediction

In [16]:
# Show the label next to the model's predicted class and probability columns.
output_columns = ['label', 'prediction', 'probability']
selected = predictions.select(*output_columns)
display(selected)

In [17]:
# Area under ROC needs a continuous score to sweep thresholds over. Feeding the
# hard 0/1 'prediction' column collapses the curve to a single operating point,
# so the evaluator reads the model's 'rawPrediction' column instead.
# labelCol and metricName are spelled out for readability; both are the defaults.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

# Despite its name, `baseline_model` holds the baseline metric value, not a model.
baseline_model = evaluator.evaluate(predictions)

In [18]:
# Here I want to try a cross-validated grid search to find better parameters

In [19]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyper-parameter grid for the logistic regression: 3 x 2 x 3 = 18 combinations.
# aggregationDepth is deliberately not part of the grid: it only sets the depth
# of the treeAggregate used while fitting, so sweeping it multiplied the number
# of fits by three without changing the model being scored.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .addGrid(lr.fitIntercept, [False, True])
    .addGrid(lr.regParam, [0.01, 0.5, 2.0])
    .build()
)

In [20]:
# Create a 5-fold CrossValidator.
# The seed fixes the fold assignment so repeated runs evaluate the same folds;
# 100 matches the seed used for the train/test split.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=100,
)

# Run cross validation: one fit per parameter combination per fold.
cvModel = cv.fit(scaled_trainingData)

# The original author reported this cell can take about 15 minutes to run.
# A warning about mlflow logging may be printed while it runs; the author reports it can be ignored.

In [21]:
# Score both scaled splits with the cross-validated model.
predict_train, predict_test = (
    cvModel.transform(split) for split in (scaled_trainingData, scaled_testData)
)

In [22]:
# Evaluate both prediction sets first, then report them next to the baseline value.
auc_train = evaluator.evaluate(predict_train)
auc_test = evaluator.evaluate(predict_test)

print('The area under ROC for the training set after CV Grid is: {}'.format(auc_train))
print('The area under ROC for the test set after CV Grid is {}'.format(auc_test))

# NOTE(review): the leading '\r' is kept as written; a '\n' may have been intended — confirm.
print('\rThe area under ROC for the original (baseline) model is {}'.format(baseline_model))

In [23]:
# End of CrossGrid Validation section

In [24]:
# ML REGRESSION ANALYSIS

# Here we're going to keep the same features, but we are going to change the label to 'Net Score' instead of the binary classification problem of simply which team won.

In [25]:
# Same column subset as the classification frame, but ending with the
# 'Net Score' target instead of 'Winner'.
regression_columns = [
    'D-1%', 'D-BO', 'D-CG', 'D-CM', 'D-IF', 'D-MI',
    'F-BO', 'F-CM', 'F-FA', 'F-GA', 'F-KE', 'F-MI', 'F-RB',
    'M-1%', 'M-GA', 'M-KE', 'M-RB',
    'R-1%', 'R-BO', 'R-FF', 'R-GA', 'R-HB', 'R-HO', 'R-MI', 'R-RB', 'R-TK', 'R-UP',
    'RLP',
    'Same / Neutral Venue',
    'Net Score',
]
df_regression = df_koalas[regression_columns]

In [26]:
# Convert the regression column subset from Koalas into a Spark DataFrame.
spark_regression_df = df_regression.to_spark()

In [27]:
from pyspark.ml.feature import VectorAssembler

stages = []

# Feature columns for the regression model.
numericCols_regression = [
    'D-1%', 'D-BO', 'D-CG', 'D-CM', 'D-IF', 'D-MI',
    'F-BO', 'F-CM', 'F-FA', 'F-GA', 'F-KE', 'F-MI', 'F-RB',
    'M-1%', 'M-GA', 'M-KE', 'M-RB',
    'R-1%', 'R-BO', 'R-FF', 'R-GA', 'R-HB', 'R-HO', 'R-MI', 'R-RB', 'R-TK', 'R-UP',
    'RLP',
    'Same / Neutral Venue',
]
assemblerInputs_regression = numericCols_regression
assembler_regression = VectorAssembler(inputCols=assemblerInputs_regression, outputCol="features")

# Transform with `assembler_regression`, not the classification `assembler`, so
# that 'Same / Neutral Venue' is part of the feature vector and this cell does
# not depend on state left behind by the classification section.
# NOTE(review): assumes 'Same / Neutral Venue' is numeric or boolean, as VectorAssembler requires — confirm.
regression_dataset = assembler_regression.transform(spark_regression_df)

In [28]:
# 70/30 random split of the regression data with a fixed seed, then print each part's row count.
regression_split_weights = [0.7, 0.3]
linear_trainingData, linear_testData = regression_dataset.randomSplit(regression_split_weights, seed=100)
for split in (linear_trainingData, linear_testData):
    print(split.count())

In [29]:
# Re-import Spark's StandardScaler: the import cell later imports a class of the
# same name from sklearn.preprocessing, which would otherwise shadow this one.
from pyspark.ml.feature import StandardScaler

# NOTE(review): this rebinds `scaler` and `scaler_model`, which the
# classification cells also use; re-running those cells afterwards would pick
# up the regression scaler.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
    withMean=True,
    withStd=True,
)

# Fit on the regression training split only.
scaler_model = scaler.fit(linear_trainingData)

In [30]:
# Scale both regression splits, then rename the 'Net Score' target to 'label'.
scaled_linear_trainingData, scaled_linear_testData = (
    scaler_model.transform(split).withColumnRenamed('Net Score', 'label')
    for split in (linear_trainingData, linear_testData)
)

In [31]:
# Linear regression on the scaled feature vector, with the renamed 'label'
# column (originally 'Net Score') as the target.
algo = LinearRegression(labelCol='label', featuresCol='scaled_features')

In [32]:
# Fit the linear regression on the scaled regression training split.
model = algo.fit(scaled_linear_trainingData)

In [33]:
# Evaluate the fitted regression model on the scaled held-out split; the
# returned summary exposes the metrics printed in the next cell.
evaluation_summary = model.evaluate(scaled_linear_testData)
# (An earlier draft evaluated on the classification frame `test`; it was left disabled.)

In [34]:
# Report the held-out regression metrics. The second label names the metric
# that is actually printed (rootMeanSquaredError).
print("Mean absolute error: " + str(evaluation_summary.meanAbsoluteError))
print('Root mean squared error: ' + str(evaluation_summary.rootMeanSquaredError))
print('R2: ' + str(evaluation_summary.r2))

In [35]:
# Score the scaled held-out regression split.
# NOTE(review): this rebinds `predictions`, which the classification cells also
# use for the logistic-regression output; re-running those cells afterwards
# would see the regression frame instead.
predictions = model.transform(scaled_linear_testData)

In [36]:
# Show the regression predictions using the notebook's `display` helper.
display(predictions)