In [6]:
# findspark.init() runs before the pyspark import below
# (presumably so that pyspark is importable in this environment — confirm).
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Create the Spark session named "model_building", or reuse one that is
# already active (getOrCreate).
spark = (
    SparkSession.builder
    .appName("model_building")
    .getOrCreate()
)

In [57]:
# Read the Parquet dataset stored at `path` into a Spark DataFrame.
# NOTE(review): "aggeragate2" looks misspelled, but it has to match the stored
# object name, so it is left untouched — confirm against the bucket.
path = 's3://502pubg/clean/aggeragate2.parquet'
data = spark.read.parquet(path)

In [58]:
# Show the column names, types and nullability of the loaded DataFrame.
data.printSchema()

root
 |-- game_size: integer (nullable = true)
 |-- match_mode: string (nullable = true)
 |-- party_size: integer (nullable = true)
 |-- player_assists: integer (nullable = true)
 |-- player_dbno: integer (nullable = true)
 |-- player_dist_ride: integer (nullable = true)
 |-- player_dist_walk: integer (nullable = true)
 |-- player_dmg: integer (nullable = true)
 |-- player_kills: integer (nullable = true)
 |-- player_survive_time: integer (nullable = true)
 |-- team_placement: integer (nullable = true)
 |-- matchmode_index: double (nullable = true)
 |-- label: double (nullable = true)
 |-- gamesize_index: double (nullable = true)
 |-- partysize_index: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- stdfeatures: vector (nullable = true)



In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, \
    StringIndexer, IndexToString, VectorAssembler, StandardScaler

# Linear regression

In [10]:
from pyspark.ml.regression import LinearRegression

In [100]:
# Prepare the three pipeline stages for the linear-regression model.

# Stage 1: assemble the predictor columns into a single vector column.
# handleInvalid="skip" drops rows with invalid values instead of raising.
feature_cols = [
    'player_assists', 'player_dbno', 'player_dist_ride',
    'player_dist_walk', 'player_dmg', 'player_kills',
    'matchmode_index', "gamesize_index", "partysize_index",
]
vectorAssembler_features = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip",
)

# Stage 2: scale each feature to unit standard deviation, without centering.
scal = StandardScaler(
    inputCol='features',
    outputCol='stdfeatures',
    withStd=True,
    withMean=False,
)

# Stage 3: elastic-net regularised linear regression on the scaled features,
# predicting team_placement.
model_fit = LinearRegression(
    featuresCol='stdfeatures',
    labelCol='team_placement',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [101]:
# Keep only the raw predictors and the target. The loaded data already has
# `features` / `stdfeatures` columns (see the schema above), which would clash
# with the output columns the pipeline stages create.
df = data.select(['player_assists','player_dbno','player_dist_ride',
               'player_dist_walk','player_dmg','player_kills','matchmode_index',
               "gamesize_index","partysize_index", "team_placement"])

# 70/30 train/test split. The fixed seed makes the split, and therefore every
# metric reported below, reproducible across kernel restarts; without it each
# run samples different rows.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [102]:
# Build the pipeline: assemble features -> scale -> fit the estimator.
pipeline_stages = [vectorAssembler_features, scal, model_fit]
pipeline = Pipeline(stages=pipeline_stages)

Model training

In [104]:
# Fit every pipeline stage on the training split; the result is the fitted
# pipeline model used for the inspection and scoring cells below.
linear_std = pipeline.fit(train_df)

In [107]:
# The estimator is the last pipeline stage, so the last fitted stage is the
# trained regression model; report its learned parameters.
linear_std_model = linear_std.stages[-1]
print("Coefficients: {}".format(linear_std_model.coefficients))
print("Intercept: {}".format(linear_std_model.intercept))

Coefficients: [-0.5273252206732283,3.2100739399024434,-5.328388928736673,-0.9313110566766926,-3.5683834637655365,-3.1494299580433993,0.0,-0.5582667229764633,12.847622943525609]
Intercept: 19.08657108957678


Training summary

In [108]:
# Goodness of fit on the training data, taken from the fitted model's summary.
linear_std_trainingSummary = linear_std_model.summary
print("RMSE: {:f}".format(linear_std_trainingSummary.rootMeanSquaredError))
print("r2: {:f}".format(linear_std_trainingSummary.r2))

RMSE: 13.427718
r2: 0.566009


Testing summary (standardized features)

In [120]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out rows with the *fitted* pipeline, so the test data goes
# through the assembler and the scaler that was fitted on the training split.
linear_std_predictions = linear_std.transform(test_df)

linear_std_evaluator = RegressionEvaluator(predictionCol="prediction",
                 labelCol="team_placement",metricName="r2")

print("R Squared (R2) on test data = %g" % linear_std_evaluator.evaluate(linear_std_predictions))

# RMSE on the same transformed test rows. Previously the scaler was re-fitted
# on the test split, so the model was evaluated on features scaled differently
# from the ones it was trained on. test_df was also overwritten, so re-running
# the cell failed because the `features` column already existed.
# `prediction` is dropped because evaluate() adds that column itself.
linear_std_test_result = linear_std_model.evaluate(
    linear_std_predictions.drop("prediction"))
print("Root Mean Squared Error (RMSE) on test data = %g" % linear_std_test_result.rootMeanSquaredError)

R Squared (R2) on test data = 0.565071


# Decision tree regression

First, fit the decision tree on the standardized features.

In [94]:
from pyspark.ml.regression import DecisionTreeRegressor

In [75]:
# Prepare the three pipeline stages for the decision-tree model.

# Stage 1: assemble the predictor columns into a single vector column.
# handleInvalid="skip" drops rows with invalid values instead of raising.
feature_cols = [
    'player_assists', 'player_dbno', 'player_dist_ride',
    'player_dist_walk', 'player_dmg', 'player_kills',
    'matchmode_index', "gamesize_index", "partysize_index",
]
vectorAssembler_features = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip",
)

# Stage 2: scale each feature to unit standard deviation, without centering.
scal = StandardScaler(
    inputCol='features',
    outputCol='stdfeatures',
    withStd=True,
    withMean=False,
)

# Stage 3: decision-tree regressor on the scaled features, predicting
# team_placement.
model_fit = DecisionTreeRegressor(
    labelCol="team_placement",
    featuresCol="stdfeatures",
)

In [76]:
# Keep only the raw predictors and the target. The loaded data already has
# `features` / `stdfeatures` columns (see the schema above), which would clash
# with the output columns the pipeline stages create.
df = data.select(['player_assists','player_dbno','player_dist_ride',
               'player_dist_walk','player_dmg','player_kills','matchmode_index',
               "gamesize_index","partysize_index", "team_placement"])

# 70/30 train/test split. The fixed seed makes the split, and therefore the
# reported test RMSE and feature importances, reproducible across kernel
# restarts; without it each run samples different rows.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [77]:
# Build the pipeline: assemble features -> scale -> fit the estimator.
pipeline_stages = [vectorAssembler_features, scal, model_fit]
pipeline = Pipeline(stages=pipeline_stages)

In [78]:
# Fit every pipeline stage on the training split; the result is the fitted
# pipeline model used for scoring and feature-importance inspection below.
tree_std = pipeline.fit(train_df)

Evaluate model

In [79]:
# Score the held-out rows with the fitted pipeline.
tree_std_predictions = tree_std.transform(test_df)

# Compare `prediction` against the true team_placement using RMSE.
tree_std_evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="team_placement",
)
rmse = tree_std_evaluator.evaluate(tree_std_predictions)

print("Root Mean Squared Error (RMSE) on test data = {:g}".format(rmse))

Root Mean Squared Error (RMSE) on test data = 7.07401


Feature importances (standardized features)

In [97]:
# Feature importances of the fitted tree (the last pipeline stage). Vector
# positions correspond to the order of the VectorAssembler's inputCols.
tree_std.stages[-1].featureImportances

SparseVector(9, {2: 0.0113, 3: 0.5218, 4: 0.0022, 7: 0.002, 8: 0.4627})

In [98]:
# Peek at one row of the selected columns to match column names to the
# positions in the importance vector.
df.take(1)

[Row(player_assists=0, player_dbno=0, player_dist_ride=3737, player_dist_walk=1443, player_dmg=0, player_kills=0, matchmode_index=0.0, gamesize_index=1.0, partysize_index=0.0, team_placement=3)]

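Walking distance (`player_dist_walk`, importance ≈ 0.52) and party size (`partysize_index`, importance ≈ 0.46) dominate the tree's feature importances; the remaining features contribute almost nothing.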