# Himanshi Sharma
# Issues
1. I was not able to split the dataset into train and test sets after applying RFormula: Spark reported that the column 'label' could not be found, even though I had set labelCol = 'label' in the RFormula. On reviewing my code I found that I had misspelled NA_Sales as NA_sales, which caused the error.
2. I had a hard time figuring out how to compute the correlation between all of the numeric variables; I used Google to arrive at the code that computes the correlation.

In [1]:
# Data exploration
from pyspark.sql.functions import col, log, expr, corr
from pyspark.ml.stat import Correlation
# Data preparation
from pyspark.ml.feature import RFormula, StringIndexer, VectorAssembler, VectorIndexer, OneHotEncoder
from pyspark.ml import Pipeline
# Model building
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
# Model evaluation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics
# Parameter tuning
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit
# Feature engineering
from pyspark.ml.feature import Normalizer, ChiSqSelector

# Data Preparation

In [2]:
# Read the CSV: first row is the header, column types are inferred by Spark
videoGameSales = (
    spark.read
    .option("header", True)
    .option("inferSchema", "true")
    .csv("NA_sales_filtered_reformatted.csv")
)
# Show the inferred schema, then preview the first five rows
videoGameSales.printSchema()
videoGameSales.show(5)

root
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Critic_Score: integer (nullable = true)
 |-- Critic_Count: integer (nullable = true)
 |-- User_Score: integer (nullable = true)
 |-- User_Count: integer (nullable = true)
 |-- NA_Sales: double (nullable = true)

+------------------+--------+------------+------+------------+------------+----------+----------+--------+
|              Name|Platform|       Genre|Rating|Critic_Score|Critic_Count|User_Score|User_Count|NA_Sales|
+------------------+--------+------------+------+------------+------------+----------+----------+--------+
| Final Fantasy VII|      PS|Role-Playing|     T|          92|          20|        91|      1282|    3.01|
|   Final Fantasy X|     PS2|Role-Playing|     T|          92|          53|        86|      1056|    2.91|
|        The Sims 3|      PC|  Simulation|     T|          86|          75|        75|      

# Data Exploration

In [3]:
# summary statistics
# Count, mean, stddev, min and quartiles for the target (NA_Sales) and the
# four numeric score/count columns.
videoGameSales.select("NA_Sales","Critic_Score","Critic_Count","User_Score", "User_Count").summary().show()

+-------+-------------------+------------------+-----------------+-----------------+------------------+
|summary|           NA_Sales|      Critic_Score|     Critic_Count|       User_Score|        User_Count|
+-------+-------------------+------------------+-----------------+-----------------+------------------+
|  count|               6345|              6345|             6345|             6345|              6345|
|   mean| 0.3542301024428651|  69.8193853427896|28.96674546887313|70.76548463356974|152.73096926713947|
| stddev|0.49440400387695516|13.945811492959752|19.09859425551718|14.43843215185036| 545.9624829141369|
|    min|               0.01|                13|                3|                5|                 4|
|    25%|               0.08|                61|               14|               64|                10|
|    50%|               0.17|                72|               25|               74|                24|
|    75%|               0.41|                80|               4

In [35]:
## correlation
from itertools import combinations

# Numeric columns to correlate; NA_Sales (the target) is listed first.
numeric_col_names = ["NA_Sales", "Critic_Score", "Critic_Count", "User_Score", "User_Count"]

# Correlation for every unordered pair of numeric columns, keyed "A-B".
# combinations() yields the pairs in the same order they were previously
# written out by hand.
correlation = {
    "-".join(pair): videoGameSales.corr(*pair)
    for pair in combinations(numeric_col_names, 2)
}
# A bare expression in the middle of a cell is not displayed, so print it.
print(correlation)

### Same correlations as one matrix
# Assemble the numeric columns into a single vector column, then compute the
# full Pearson correlation matrix in one pass.
numeric_col = VectorAssembler(inputCols=numeric_col_names, outputCol="features").transform(videoGameSales)
print(Correlation.corr(numeric_col, "features", method='pearson').collect()[0][0])

DenseMatrix([[1.        , 0.35297362, 0.34350463, 0.14535079, 0.26839294],
             [0.35297362, 1.        , 0.39039934, 0.58454759, 0.24368767],
             [0.34350463, 0.39039934, 1.        , 0.1927621 , 0.34805984],
             [0.14535079, 0.58454759, 0.1927621 , 1.        , 0.01239754],
             [0.26839294, 0.24368767, 0.34805984, 0.01239754, 1.        ]])


In [7]:
# category count and avg NA_sales by category
# One tuple drives both passes so the column list is written only once.
categorical_columns = ("Name", "Platform", "Genre", "Rating")

# Number of rows per distinct value of each categorical column.
for column_name in categorical_columns:
    videoGameSales.groupBy(column_name).count().show()

# Average NA_Sales per distinct value of each categorical column.
for column_name in categorical_columns:
    videoGameSales.groupBy(column_name).mean("NA_Sales").show()

+--------------------+-----+
|                Name|count|
+--------------------+-----+
|Legacy of Kain: S...|    1|
|                RIFT|    1|
|  The Last Airbender|    2|
|Men of War: Assau...|    1|
|Back to the Futur...|    2|
|       Kakuto Chojin|    1|
|          Dungeons 2|    1|
|All-Star Baseball...|    3|
|Sherlock Holmes: ...|    1|
|ESPN X Games Skat...|    1|
|RalliSport Challe...|    1|
|The Elder Scrolls...|    2|
|Call of Duty Blac...|    1|
|Godzilla: Save th...|    2|
|     Mister Mosquito|    1|
|      Birds of Steel|    2|
|      Arcana Heart 3|    1|
|Battlestar Galactica|    1|
|Samurai Shodown A...|    1|
|Stoked: Big Air E...|    1|
+--------------------+-----+
only showing top 20 rows

+--------+-----+
|Platform|count|
+--------+-----+
|      PC|  304|
|     PS3|  749|
|      PS|  150|
|     PS2| 1135|
|     3DS|  141|
|    WiiU|   81|
|     PS4|  210|
|     PSP|  379|
|    XOne|  158|
|    X360|  848|
|     GBA|  238|
|     Wii|  475|
|      GC|  353|
|     

In [8]:
## dropping the 'Name' field
# The groupBy("Name") preview above shows only 1-3 rows per title, so the
# column is presumably too close to a unique identifier to be a useful
# feature -- NOTE(review): confirm this was the reason for dropping it.
videoGameSales_1 = videoGameSales.drop("Name")
videoGameSales_1.show(5)

+--------+------------+------+------------+------------+----------+----------+--------+
|Platform|       Genre|Rating|Critic_Score|Critic_Count|User_Score|User_Count|NA_Sales|
+--------+------------+------+------------+------------+----------+----------+--------+
|      PS|Role-Playing|     T|          92|          20|        91|      1282|    3.01|
|     PS2|Role-Playing|     T|          92|          53|        86|      1056|    2.91|
|      PC|  Simulation|     T|          86|          75|        75|       886|    0.99|
|      PS|Role-Playing|     T|          90|          24|        85|       644|    2.28|
|      PS|    Fighting|     T|          96|          15|        90|       367|    3.27|
+--------+------------+------+------------+------------+----------+----------+--------+
only showing top 5 rows



# Model Building and Evaluation

## RFormula

In [9]:
## RFormula
# "NA_Sales ~ ." : NA_Sales becomes the label and every other column of the
# input DataFrame becomes a feature. The results are written to the
# "features" and "label" columns.
formula = RFormula(
    formula="NA_Sales ~ .",
    featuresCol="features",
    labelCol="label")

In [10]:
# Fit the formula on the Name-less data and apply it to the same data, which
# adds the "features" and "label" columns; show(5, False) prints them untruncated.
videoGameSales_fitted = formula.fit(videoGameSales_1).transform(videoGameSales_1)
videoGameSales_fitted.show(5, False)

+--------+------------+------+------------+------------+----------+----------+--------+---------------------------------------------------------------+-----+
|Platform|Genre       |Rating|Critic_Score|Critic_Count|User_Score|User_Count|NA_Sales|features                                                       |label|
+--------+------------+------+------------+------------+----------+----------+--------+---------------------------------------------------------------+-----+
|PS      |Role-Playing|T     |92          |20          |91        |1282      |3.01    |(34,[12,18,26,30,31,32,33],[1.0,1.0,1.0,92.0,20.0,91.0,1282.0])|3.01 |
|PS2     |Role-Playing|T     |92          |53          |86        |1056      |2.91    |(34,[0,18,26,30,31,32,33],[1.0,1.0,1.0,92.0,53.0,86.0,1056.0]) |2.91 |
|PC      |Simulation  |T     |86          |75          |75        |886       |0.99    |(34,[8,23,26,30,31,32,33],[1.0,1.0,1.0,86.0,75.0,75.0,886.0])  |0.99 |
|PS      |Role-Playing|T     |90          |24       

In [11]:
# 70/30 train/test split of the RFormula-prepared data; the fixed seed keeps
# the split reproducible between runs.
train, test = videoGameSales_fitted.select('label','features').randomSplit([0.7,0.3], seed=2018)

In [12]:
# Look up the ML attribute metadata of the "features" column; it maps each
# vector index to a feature name (e.g. idx 0 -> Platform_PS2), which is needed
# to interpret the "feature N" references in the model printouts below.
videoGameSales_fitted.schema["features"].metadata["ml_attr"]["attrs"]

{'binary': [{'idx': 0, 'name': 'Platform_PS2'},
  {'idx': 1, 'name': 'Platform_X360'},
  {'idx': 2, 'name': 'Platform_PS3'},
  {'idx': 3, 'name': 'Platform_XB'},
  {'idx': 4, 'name': 'Platform_Wii'},
  {'idx': 5, 'name': 'Platform_DS'},
  {'idx': 6, 'name': 'Platform_PSP'},
  {'idx': 7, 'name': 'Platform_GC'},
  {'idx': 8, 'name': 'Platform_PC'},
  {'idx': 9, 'name': 'Platform_GBA'},
  {'idx': 10, 'name': 'Platform_PS4'},
  {'idx': 11, 'name': 'Platform_XOne'},
  {'idx': 12, 'name': 'Platform_PS'},
  {'idx': 13, 'name': 'Platform_3DS'},
  {'idx': 14, 'name': 'Platform_PSV'},
  {'idx': 15, 'name': 'Genre_Action'},
  {'idx': 16, 'name': 'Genre_Sports'},
  {'idx': 17, 'name': 'Genre_Shooter'},
  {'idx': 18, 'name': 'Genre_Role-Playing'},
  {'idx': 19, 'name': 'Genre_Racing'},
  {'idx': 20, 'name': 'Genre_Platform'},
  {'idx': 21, 'name': 'Genre_Misc'},
  {'idx': 22, 'name': 'Genre_Fighting'},
  {'idx': 23, 'name': 'Genre_Simulation'},
  {'idx': 24, 'name': 'Genre_Adventure'},
  {'idx': 25

In [13]:
# Preview the RFormula training split (label + sparse feature vector).
train.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
| 0.01|(34,[0,15,28,30,3...|
| 0.01|(34,[0,15,28,30,3...|
| 0.01|(34,[0,16,27,30,3...|
| 0.01|(34,[0,17,28,30,3...|
| 0.01|(34,[0,17,29,30,3...|
+-----+--------------------+
only showing top 5 rows



In [14]:
# function to evaluate regression model (target field name defaults to 'label')
def RegressionModelsEvaluator(name, predictions, labelCol="label"):
    """Print MAE, RMSE and R2 for a DataFrame of regression predictions.

    Args:
        name: Text printed in front of the metrics (e.g. "GLR:").
        predictions: DataFrame containing a "prediction" column and the
            column named by ``labelCol``.
        labelCol: Name of the column holding the observed target values.
            Defaults to "label", so existing two-argument calls behave
            exactly as before; pass "NA_Sales" for pipeline-prepared data.
    """
    # RegressionMetrics is given (prediction, observation) pairs, in that order.
    predictionAndLabels = predictions.select("prediction", labelCol).rdd
    metrics = RegressionMetrics(predictionAndLabels)
    print(name, "MAE:", metrics.meanAbsoluteError, "RMSE:", metrics.rootMeanSquaredError, "R2:", metrics.r2)

In [15]:
# Evaluate a regression model whose target column is named 'NA_Sales'
def RegressionModelsEvaluator2(name, predictions):
    """Print MAE, RMSE and R2 for predictions scored against 'NA_Sales'.

    Args:
        name: Text printed in front of the metrics.
        predictions: DataFrame containing "prediction" and "NA_Sales" columns.
    """
    # (prediction, observation) pairs, in that order
    scored_pairs = predictions.select("prediction", "NA_Sales").rdd
    metrics = RegressionMetrics(scored_pairs)
    mae = metrics.meanAbsoluteError
    rmse = metrics.rootMeanSquaredError
    r_squared = metrics.r2
    print(name, "MAE:", mae, "RMSE:", rmse, "R2:", r_squared)

### I) Linear Regression (Data Prepared with RFormula)

In [16]:
## Model Building and Evaluation
# Linear regression on the RFormula-prepared training split.
lr = LinearRegression(labelCol='label')
lrModel = lr.fit(train)
# Coefficients are positional; they line up with the feature-vector indices.
print("Coefficients:", lrModel.coefficients)
print("Intercept:", lrModel.intercept)
print("p-values", lrModel.summary.pValues)
print("  ")
# NOTE: despite its name, `predictions` holds the evaluation summary returned
# by evaluate() on the test split (it exposes MAE/RMSE/R2 directly), not a
# DataFrame of predictions.
predictions = lrModel.evaluate(test)
print("LR:","MAE:",round(predictions.meanAbsoluteError,2),"RMSE:",round(predictions.rootMeanSquaredError,2),"R2:",round(predictions.r2,2))

Coefficients: [0.14144911059260423,0.06808900107952881,0.023592157339702678,-0.03006992152725865,0.16949649861109534,0.08495969301930362,-0.011855918189083698,0.04110208630155655,-0.3400316142695568,0.09314940849938402,-0.1135709689900666,0.14198006080346065,0.37354305971575885,-0.07595010921191243,-0.15732967065962994,0.05826798646304408,0.042841743389255435,0.059196392231643254,-0.02414501640426887,0.03148308625143497,0.033937764287222734,0.14849528406525192,0.0471275249892468,0.10294109976235097,-0.07232883309930105,-0.12253323586891206,0.13626869492616717,0.22488435308269955,0.10793640962520645,0.17750847170095282,0.009869371067262253,0.006489410936566472,-0.002486482027059155,0.00021697737602888252]
Intercept: -0.6294084337106871
p-values [0.015085641009881856, 0.24578433750195483, 0.6883809228846667, 0.6174679049882656, 0.0051059778231579145, 0.16401360640328821, 0.8483017071317955, 0.5089617138251881, 1.4831715589025407e-07, 0.15249024802401046, 0.08589710513735715, 0.0417680615

## Pipeline Approach

In [17]:
## Pipeline approach
# Preparing stages
# Each categorical column gets a StringIndexer (string -> numeric index)
# followed by a OneHotEncoder (index -> sparse indicator vector).
s1_platform_indexer = StringIndexer(inputCol = 'Platform', outputCol = 'PlatformIndex')
s1_platform_encoder = OneHotEncoder(inputCol = 'PlatformIndex', outputCol = 'PlatformVec')
s2_genre_indexer = StringIndexer(inputCol = 'Genre', outputCol = 'GenreIndex')
s2_genre_encoder = OneHotEncoder(inputCol = 'GenreIndex', outputCol = 'GenreVec')
s3_rating_indexer = StringIndexer(inputCol = 'Rating', outputCol = 'RatingIndex')
s3_rating_encoder = OneHotEncoder(inputCol = 'RatingIndex', outputCol = 'RatingVec')
# Concatenate the encoded categoricals and the four numeric columns into one
# "features" vector. NA_Sales (the target) is deliberately left out.
s4_assembler = VectorAssembler(inputCols=['PlatformVec', 'GenreVec', 'RatingVec', "Critic_Score","Critic_Count","User_Score", "User_Count"], outputCol="features")

In [18]:
## pipeline approach
# Order matters: each encoder must follow its indexer, and the assembler must
# come last because it consumes the encoders' output columns.
vgs_stages = [s1_platform_indexer, s1_platform_encoder,s2_genre_indexer, s2_genre_encoder,s3_rating_indexer,s3_rating_encoder,s4_assembler]
vgs_pipeline = Pipeline(stages = vgs_stages)
pipelineModel = vgs_pipeline.fit(videoGameSales_1)
# NOTE: from here on `vgs_pipeline` is rebound from the Pipeline object to the
# transformed DataFrame; later cells rely on it being the DataFrame.
vgs_pipeline = pipelineModel.transform(videoGameSales_1)
vgs_pipeline.show(5)

+--------+------------+------+------------+------------+----------+----------+--------+-------------+---------------+----------+--------------+-----------+-------------+--------------------+
|Platform|       Genre|Rating|Critic_Score|Critic_Count|User_Score|User_Count|NA_Sales|PlatformIndex|    PlatformVec|GenreIndex|      GenreVec|RatingIndex|    RatingVec|            features|
+--------+------------+------+------------+------------+----------+----------+--------+-------------+---------------+----------+--------------+-----------+-------------+--------------------+
|      PS|Role-Playing|     T|          92|          20|        91|      1282|    3.01|         12.0|(15,[12],[1.0])|       3.0|(11,[3],[1.0])|        0.0|(4,[0],[1.0])|(34,[12,18,26,30,...|
|     PS2|Role-Playing|     T|          92|          53|        86|      1056|    2.91|          0.0| (15,[0],[1.0])|       3.0|(11,[3],[1.0])|        0.0|(4,[0],[1.0])|(34,[0,18,26,30,3...|
|      PC|  Simulation|     T|          86|  

In [20]:
#splitting the data in train and test
# Same 70/30 ratio and seed as the RFormula split; here the target column
# keeps its original name 'NA_Sales' instead of 'label'.
train_pipe, test_pipe = vgs_pipeline.select('NA_Sales', 'features').randomSplit([0.7, 0.3], seed=2018)

In [22]:
# Preview the pipeline-prepared training split (NA_Sales + sparse feature vector).
train_pipe.show(5)

+--------+--------------------+
|NA_Sales|            features|
+--------+--------------------+
|    0.01|(34,[0,15,28,30,3...|
|    0.01|(34,[0,15,28,30,3...|
|    0.01|(34,[0,16,27,30,3...|
|    0.01|(34,[0,17,28,30,3...|
|    0.01|(34,[0,17,29,30,3...|
+--------+--------------------+
only showing top 5 rows



### II) Linear Regression (data prepared with pipeline approach)

In [23]:
# Model building and evaluation (data prepared with the pipeline method)
# Same linear regression as above, but the label column is 'NA_Sales'.
lr1 = LinearRegression(labelCol='NA_Sales')
lrModel_pipe = lr1.fit(train_pipe)
print("Coefficients:", lrModel_pipe.coefficients)
print("Intercept:", lrModel_pipe.intercept)
print("p-values", lrModel_pipe.summary.pValues)
print("  ")
# `predictions_pipe` is the evaluation summary returned by evaluate() on the
# test split, not a DataFrame of predictions.
predictions_pipe = lrModel_pipe.evaluate(test_pipe)
print("LR:","MAE:",round(predictions_pipe.meanAbsoluteError,2),"RMSE:",round(predictions_pipe.rootMeanSquaredError,2),"R2:",round(predictions_pipe.r2,2))

Coefficients: [0.14144911059260423,0.06808900107952881,0.023592157339702678,-0.03006992152725865,0.16949649861109534,0.08495969301930362,-0.011855918189083698,0.04110208630155655,-0.3400316142695568,0.09314940849938402,-0.1135709689900666,0.14198006080346065,0.37354305971575885,-0.07595010921191243,-0.15732967065962994,0.05826798646304408,0.042841743389255435,0.059196392231643254,-0.02414501640426887,0.03148308625143497,0.033937764287222734,0.14849528406525192,0.0471275249892468,0.10294109976235097,-0.07232883309930105,-0.12253323586891206,0.13626869492616717,0.22488435308269955,0.10793640962520645,0.17750847170095282,0.009869371067262253,0.006489410936566472,-0.002486482027059155,0.00021697737602888252]
Intercept: -0.6294084337106871
p-values [0.015085641009881856, 0.24578433750195483, 0.6883809228846667, 0.6174679049882656, 0.0051059778231579145, 0.16401360640328821, 0.8483017071317955, 0.5089617138251881, 1.4831715589025407e-07, 0.15249024802401046, 0.08589710513735715, 0.0417680615

### III) Generalized Linear Model (Data Preparation: RFormula)

In [24]:
## Generalized Linear Model (data preparation: RFormula)
# No family/link is specified, so the estimator's defaults are used.
glr = GeneralizedLinearRegression(labelCol='label')
glrModel = glr.fit(train)
print("Coefficients:",glrModel.coefficients)
print("Intercept:",glrModel.intercept)
print("p-values", glrModel.summary.pValues)
print("")
# Score the held-out split and report MAE / RMSE / R2 against 'label'.
predictions2 = glrModel.transform(test)
RegressionModelsEvaluator("GLR:",predictions2)

Coefficients: [0.14144911059260423,0.06808900107952881,0.023592157339702678,-0.03006992152725865,0.16949649861109534,0.08495969301930362,-0.011855918189083698,0.04110208630155655,-0.3400316142695568,0.09314940849938402,-0.1135709689900666,0.14198006080346065,0.37354305971575885,-0.07595010921191243,-0.15732967065962994,0.05826798646304408,0.042841743389255435,0.059196392231643254,-0.02414501640426887,0.03148308625143497,0.033937764287222734,0.14849528406525192,0.0471275249892468,0.10294109976235097,-0.07232883309930105,-0.12253323586891206,0.13626869492616717,0.22488435308269955,0.10793640962520645,0.17750847170095282,0.009869371067262253,0.006489410936566472,-0.002486482027059155,0.00021697737602888252]
Intercept: -0.6294084337106871
p-values [0.015085641009881856, 0.24578433750195483, 0.6883809228846667, 0.6174679049882656, 0.0051059778231579145, 0.16401360640328821, 0.8483017071317955, 0.5089617138251881, 1.4831715589025407e-07, 0.15249024802401046, 0.08589710513735715, 0.0417680615

###  IV) Generalized Linear Model (Data Preparation: pipeline)

In [25]:
## Generalized Linear Model (data preparation: pipeline)
# NOTE: this rebinds `glr`, replacing the RFormula-based estimator defined in
# the previous section.
glr = GeneralizedLinearRegression(labelCol='NA_Sales')
glrModel_pipe = glr.fit(train_pipe)
print("Coefficients:",glrModel_pipe.coefficients)
print("Intercept:",glrModel_pipe.intercept)
print("p-values", glrModel_pipe.summary.pValues)
print("")
# Score the held-out split and report MAE / RMSE / R2 against 'NA_Sales'.
predictions_pipe2 = glrModel_pipe.transform(test_pipe)
RegressionModelsEvaluator2("GLR:",predictions_pipe2)

Coefficients: [0.14144911059260423,0.06808900107952881,0.023592157339702678,-0.03006992152725865,0.16949649861109534,0.08495969301930362,-0.011855918189083698,0.04110208630155655,-0.3400316142695568,0.09314940849938402,-0.1135709689900666,0.14198006080346065,0.37354305971575885,-0.07595010921191243,-0.15732967065962994,0.05826798646304408,0.042841743389255435,0.059196392231643254,-0.02414501640426887,0.03148308625143497,0.033937764287222734,0.14849528406525192,0.0471275249892468,0.10294109976235097,-0.07232883309930105,-0.12253323586891206,0.13626869492616717,0.22488435308269955,0.10793640962520645,0.17750847170095282,0.009869371067262253,0.006489410936566472,-0.002486482027059155,0.00021697737602888252]
Intercept: -0.6294084337106871
p-values [0.015085641009881856, 0.24578433750195483, 0.6883809228846667, 0.6174679049882656, 0.0051059778231579145, 0.16401360640328821, 0.8483017071317955, 0.5089617138251881, 1.4831715589025407e-07, 0.15249024802401046, 0.08589710513735715, 0.0417680615

### V) Decision Tree Regression (Data Preparation : RFormula)

In [26]:
## Decision Tree Regression
# with maxDepth = 4, Data Preparation: RFormula
dtr = DecisionTreeRegressor(labelCol='label', maxDepth=4)
dtrModel = dtr.fit(train)
# Print the learned splits; "feature N" refers to an index in the feature vector.
print(dtrModel.toDebugString)
predictions3 = dtrModel.transform(test)
RegressionModelsEvaluator("DT_rf:",predictions3)

# with default settings,  Data Preparation: RFormula
dtr2 = DecisionTreeRegressor(labelCol='label')
predictions4 = dtr2.fit(train).transform(test)
RegressionModelsEvaluator("DT2_rf:",predictions4)

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_49f290331174111419e8) of depth 4 with 31 nodes
  If (feature 33 <= 65.5)
   If (feature 33 <= 19.5)
    If (feature 27 in {0.0})
     If (feature 4 in {0.0})
      Predict: 0.1285830784913352
     Else (feature 4 not in {0.0})
      Predict: 0.23761061946902662
    Else (feature 27 not in {0.0})
     If (feature 33 <= 12.5)
      Predict: 0.1974137931034483
     Else (feature 33 > 12.5)
      Predict: 0.31580487804878066
   Else (feature 33 > 19.5)
    If (feature 30 <= 78.5)
     If (feature 21 in {0.0})
      Predict: 0.25295918367346926
     Else (feature 21 not in {0.0})
      Predict: 0.6142222222222226
    Else (feature 30 > 78.5)
     If (feature 16 in {0.0})
      Predict: 0.4131600000000002
     Else (feature 16 not in {0.0})
      Predict: 0.6814117647058822
  Else (feature 33 > 65.5)
   If (feature 30 <= 86.5)
    If (feature 8 in {1.0})
     If (feature 33 <= 1106.5)
      Predict: 0.06863157894736843
     Else (feature

### VI) Decision Tree Regression (Data Preparation: Pipeline)

In [27]:
## Decision Tree Regression
# with maxDepth = 4, Data Preparation: Pipeline
dtr_pipe = DecisionTreeRegressor(labelCol='NA_Sales', maxDepth=4)
dtrModel_pipe = dtr_pipe.fit(train_pipe)
# Print the learned splits; "feature N" refers to an index in the feature vector.
print(dtrModel_pipe.toDebugString)
predictions_pipe3 = dtrModel_pipe.transform(test_pipe)
RegressionModelsEvaluator2("DT_pipe:",predictions_pipe3)

# with default settings,  Data Preparation: Pipeline
dtr2_pipe = DecisionTreeRegressor(labelCol='NA_Sales')
predictions_pipe4 = dtr2_pipe.fit(train_pipe).transform(test_pipe)
RegressionModelsEvaluator2("DT2_pipe:",predictions_pipe4)

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4b9aa24d99d7a7473a84) of depth 4 with 31 nodes
  If (feature 33 <= 65.5)
   If (feature 33 <= 19.5)
    If (feature 27 in {0.0})
     If (feature 4 in {0.0})
      Predict: 0.1285830784913352
     Else (feature 4 not in {0.0})
      Predict: 0.23761061946902662
    Else (feature 27 not in {0.0})
     If (feature 33 <= 12.5)
      Predict: 0.1974137931034483
     Else (feature 33 > 12.5)
      Predict: 0.31580487804878066
   Else (feature 33 > 19.5)
    If (feature 30 <= 78.5)
     If (feature 21 in {0.0})
      Predict: 0.25295918367346926
     Else (feature 21 not in {0.0})
      Predict: 0.6142222222222226
    Else (feature 30 > 78.5)
     If (feature 16 in {0.0})
      Predict: 0.4131600000000002
     Else (feature 16 not in {0.0})
      Predict: 0.6814117647058822
  Else (feature 33 > 65.5)
   If (feature 30 <= 86.5)
    If (feature 8 in {1.0})
     If (feature 33 <= 1106.5)
      Predict: 0.06863157894736843
     Else (feature

### VII) Random Forest Regression(Data Preparation: RFormula)

In [28]:
## Random Forest Regression
# with numTrees = 3,maxDepth = 3 and Data Preparation: RFormula
# A deliberately small forest so the printed trees stay readable.
rfr = RandomForestRegressor(labelCol='label', numTrees=3, maxDepth=3)
rfrModel = rfr.fit(train)
print(rfrModel.toDebugString)
predictions5 = rfrModel.transform(test)
RegressionModelsEvaluator("RF:",predictions5)

# with default settings,  Data Preparation: RFormula
# Baseline for comparison against the small forest above.
rfr2 = RandomForestRegressor(labelCol='label')
predictions6 = rfr2.fit(train).transform(test)
RegressionModelsEvaluator("RF2:",predictions6)

RandomForestRegressionModel (uid=RandomForestRegressor_4858beb4ace5908f2717) with 3 trees
  Tree 0 (weight 1.0):
    If (feature 30 <= 84.5)
     If (feature 4 in {0.0})
      If (feature 33 <= 40.5)
       Predict: 0.2000703642384106
      Else (feature 33 > 40.5)
       Predict: 0.43840927920463957
     Else (feature 4 not in {0.0})
      If (feature 18 in {1.0})
       Predict: 0.15733333333333333
      Else (feature 18 not in {1.0})
       Predict: 0.4
    Else (feature 30 > 84.5)
     If (feature 33 <= 46.5)
      If (feature 33 <= 19.5)
       Predict: 0.32875
      Else (feature 33 > 19.5)
       Predict: 0.5119736842105262
     Else (feature 33 > 46.5)
      If (feature 31 <= 64.5)
       Predict: 0.8721543408360128
      Else (feature 31 > 64.5)
       Predict: 1.349405940594059
  Tree 1 (weight 1.0):
    If (feature 33 <= 65.5)
     If (feature 33 <= 24.5)
      If (feature 26 in {1.0})
       Predict: 0.14029601029601033
      Else (feature 26 not in {1.0})
       Predict: 0

### VIII) Random Forest Regression(Data Preparation: Pipeline)

In [29]:
## Random Forest Regression
# with numTrees = 3, maxDepth = 3, and Data Preparation: Pipeline
# A deliberately small forest so the printed trees stay readable.
rfr_pipe = RandomForestRegressor(labelCol='NA_Sales', numTrees=3, maxDepth=3)
rfrModel_pipe = rfr_pipe.fit(train_pipe)
print(rfrModel_pipe.toDebugString)

predictions_pipe5 = rfrModel_pipe.transform(test_pipe)
RegressionModelsEvaluator2("RF:",predictions_pipe5)


# with default settings,  Data Preparation: Pipeline
# Baseline for comparison against the small forest above.
rfr2_pipe = RandomForestRegressor(labelCol='NA_Sales')
predictions_pipe6 = rfr2_pipe.fit(train_pipe).transform(test_pipe)
RegressionModelsEvaluator2("RF2:",predictions_pipe6)

RandomForestRegressionModel (uid=RandomForestRegressor_48e8a37c577f116da963) with 3 trees
  Tree 0 (weight 1.0):
    If (feature 30 <= 84.5)
     If (feature 4 in {0.0})
      If (feature 33 <= 40.5)
       Predict: 0.2000703642384106
      Else (feature 33 > 40.5)
       Predict: 0.43840927920463957
     Else (feature 4 not in {0.0})
      If (feature 18 in {1.0})
       Predict: 0.15733333333333333
      Else (feature 18 not in {1.0})
       Predict: 0.4
    Else (feature 30 > 84.5)
     If (feature 33 <= 46.5)
      If (feature 33 <= 19.5)
       Predict: 0.32875
      Else (feature 33 > 19.5)
       Predict: 0.5119736842105262
     Else (feature 33 > 46.5)
      If (feature 31 <= 64.5)
       Predict: 0.8721543408360128
      Else (feature 31 > 64.5)
       Predict: 1.349405940594059
  Tree 1 (weight 1.0):
    If (feature 33 <= 65.5)
     If (feature 33 <= 24.5)
      If (feature 26 in {1.0})
       Predict: 0.14029601029601033
      Else (feature 26 not in {1.0})
       Predict: 0

### IX) Gradient-boosted Tree Regression (Data Preparation: RFormula)

In [30]:
## Gradient-boosted Tree Regression
# with maxDepth = 3, maxIter = 3, and Data Preparation: RFormula
# maxIter=3 yields three boosted trees (see the printed model header).
gbtr = GBTRegressor(labelCol='label', maxDepth=3, maxIter=3)
gbtrModel = gbtr.fit(train)
print(gbtrModel.toDebugString)
predictions7 = gbtrModel.transform(test)
RegressionModelsEvaluator("GB:",predictions7)

# with default settings,  Data Preparation: RFormula
# Baseline for comparison against the small ensemble above.
gbtr2 = GBTRegressor(labelCol='label')
predictions8 = gbtr2.fit(train).transform(test)
RegressionModelsEvaluator("GB2:",predictions8)

GBTRegressionModel (uid=GBTRegressor_4614856b941a9006bb66) with 3 trees
  Tree 0 (weight 1.0):
    If (feature 33 <= 65.5)
     If (feature 33 <= 19.5)
      If (feature 27 in {0.0})
       Predict: 0.1398446069469835
      Else (feature 27 not in {0.0})
       Predict: 0.2262040332147095
     Else (feature 33 > 19.5)
      If (feature 30 <= 78.5)
       Predict: 0.27049622437971954
      Else (feature 30 > 78.5)
       Predict: 0.48122388059701493
    Else (feature 33 > 65.5)
     If (feature 30 <= 86.5)
      If (feature 8 in {1.0})
       Predict: 0.12744525547445257
      Else (feature 8 not in {1.0})
       Predict: 0.6019614921780988
     Else (feature 30 > 86.5)
      If (feature 8 in {1.0})
       Predict: 0.4282051282051282
      Else (feature 8 not in {1.0})
       Predict: 1.193375
  Tree 1 (weight 0.1):
    If (feature 33 <= 526.5)
     If (feature 28 in {1.0})
      If (feature 33 <= 65.5)
       Predict: -0.15868956398902256
      Else (feature 33 > 65.5)
       Predict: 

### X) Gradient-boosted Tree Regression (Data Preparation: Pipeline)

In [31]:
## Gradient-boosted Tree Regression
# with maxDepth = 3, maxIter = 3, and Data Preparation: Pipeline
# maxIter=3 yields three boosted trees (see the printed model header).
gbtr_pipe = GBTRegressor(labelCol='NA_Sales', maxDepth=3, maxIter=3)
gbtrModel_pipe = gbtr_pipe.fit(train_pipe)
print(gbtrModel_pipe.toDebugString)
predictions_pipe7 = gbtrModel_pipe.transform(test_pipe)
RegressionModelsEvaluator2("GB:",predictions_pipe7)

# with default settings,  Data Preparation: Pipeline
# Baseline for comparison against the small ensemble above.
gbtr2_pipe = GBTRegressor(labelCol='NA_Sales')
predictions_pipe8 = gbtr2_pipe.fit(train_pipe).transform(test_pipe)
RegressionModelsEvaluator2("GB2:",predictions_pipe8)

GBTRegressionModel (uid=GBTRegressor_43dbb9eb9855ed7550a5) with 3 trees
  Tree 0 (weight 1.0):
    If (feature 33 <= 65.5)
     If (feature 33 <= 19.5)
      If (feature 27 in {0.0})
       Predict: 0.1398446069469835
      Else (feature 27 not in {0.0})
       Predict: 0.2262040332147095
     Else (feature 33 > 19.5)
      If (feature 30 <= 78.5)
       Predict: 0.27049622437971954
      Else (feature 30 > 78.5)
       Predict: 0.48122388059701493
    Else (feature 33 > 65.5)
     If (feature 30 <= 86.5)
      If (feature 8 in {1.0})
       Predict: 0.12744525547445257
      Else (feature 8 not in {1.0})
       Predict: 0.6019614921780988
     Else (feature 30 > 86.5)
      If (feature 8 in {1.0})
       Predict: 0.4282051282051282
      Else (feature 8 not in {1.0})
       Predict: 1.193375
  Tree 1 (weight 0.1):
    If (feature 33 <= 526.5)
     If (feature 28 in {1.0})
      If (feature 33 <= 65.5)
       Predict: -0.15868956398902256
      Else (feature 33 > 65.5)
       Predict: 

## Linear Regression Model Tuning

In [32]:
# Parameter tuning
# Linear Regression
lr_parameterTuning = LinearRegression(labelCol='label')
# ParamGrid 1
# 2 x 2 x 2 = 8 candidate parameter combinations.
paramGrid = (ParamGridBuilder()
             .addGrid(lr_parameterTuning.regParam, [0.0, 0.5])
             .addGrid(lr_parameterTuning.maxIter, [10, 50])
             .addGrid(lr_parameterTuning.elasticNetParam, [0.0, 0.3])
             .build())
# Candidates are compared on mean absolute error.
evaluator = RegressionEvaluator(labelCol='label', metricName='mae')

# Alternative kept for reference: 5-fold cross-validation instead of a single split.
#cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
# Single train/validation split: 60% of `train` fits each candidate, the
# remaining 40% scores it.
lrTuned = TrainValidationSplit(estimator=lr_parameterTuning, estimatorParamMaps=paramGrid, evaluator=evaluator, trainRatio=0.6)

#cvModel = cv.fit(train)
lrTunedModel = lrTuned.fit(train)
#cv_predictions = cvModel.transform(test)
# Score the selected model on the held-out test split.
split_predictions = lrTunedModel.transform(test)
RegressionModelsEvaluator("LR:",split_predictions)

LR: MAE: 0.27102462170078284 RMSE: 0.43330749971570104 R2: 0.2027087674035042


In [33]:
## Feature Generation
# Add the natural log of User_Score as an extra candidate feature.
vgs_fg = videoGameSales_1.withColumn("User Score Logarithm",log(col("User_Score")))
vgs_fg.show(5)
formula_2 = RFormula(formula="NA_Sales ~ .", featuresCol="features", labelCol="label")
# Fit the formula defined for this cell. Previously the earlier `formula`
# object was fitted here and formula_2 was left unused.
vgs_2_fitted = formula_2.fit(vgs_fg).transform(vgs_fg)
# Same 70/30 ratio and seed as the earlier splits.
train_fg, test_fg = vgs_2_fitted.select('label', 'features').randomSplit([0.7, 0.3], seed=2018)
lr_fg = LinearRegression(labelCol='label', featuresCol='features',maxIter=10)
lr_fg_Model = lr_fg.fit(train_fg)
print("Coefficients:", lr_fg_Model.coefficients)
print("Intercept:", lr_fg_Model.intercept)
print("p-values", lr_fg_Model.summary.pValues)
# Score the held-out split and report MAE / RMSE / R2.
predictions_fg = lr_fg_Model.transform(test_fg)
print("  ")
RegressionModelsEvaluator("LR:",predictions_fg)

+--------+------------+------+------------+------------+----------+----------+--------+--------------------+
|Platform|       Genre|Rating|Critic_Score|Critic_Count|User_Score|User_Count|NA_Sales|User Score Logarithm|
+--------+------------+------+------------+------------+----------+----------+--------+--------------------+
|      PS|Role-Playing|     T|          92|          20|        91|      1282|    3.01|    4.51085950651685|
|     PS2|Role-Playing|     T|          92|          53|        86|      1056|    2.91|   4.454347296253507|
|      PC|  Simulation|     T|          86|          75|        75|       886|    0.99|    4.31748811353631|
|      PS|Role-Playing|     T|          90|          24|        85|       644|    2.28|   4.442651256490317|
|      PS|    Fighting|     T|          96|          15|        90|       367|    3.27|   4.499809670330265|
+--------+------------+------+------------+------------+----------+----------+--------+--------------------+
only showing top 5 