## Random Forest - Model Fitting
### University of Virginia
### DS 5110: Big Data Systems
### By: TeamBike

Create session: 16 cores, 128 GB RAM

#### Process is similar to the logistic regression model, though model never ran with CV

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder\
    .master("local") \
    .appName("models") \
    .config('spark.cores.max', '16') \
    .getOrCreate()



'''
spark = SparkSession \
    .builder\
    .master("local") \
    .appName("models") \
    .config("spark.executor.memory", '80g') \
    .config('spark.executor.cores', '16') \
    .config('spark.cores.max', '16') \
    .config("spark.driver.memory",'40g') \
    .getOrCreate()
'''

#spark = SparkSession \
#    .builder.getOrCreate()


sc = SparkSession.sparkContext

In [3]:
%matplotlib inline

## This model is essentially a challenger model to the logistic regression we also ran. The EDA exists in the `log_reg_NBmodel.ipynb` file, which goes over the neighborhood mapping and start-to-end distribution. 

### Data pipeline is unchanged from the logistic-regression file, so no need to run this code chunk

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

cats = ['day', 'time_bin', 'peak_commute', 'month', 'start_neighborhood']

# The index of string vlaues multiple columns
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in cats
]

# The encode of indexed vlaues multiple columns
encoders = [OneHotEncoder(dropLast=False,inputCol=indexer.getOutputCol(),
            outputCol="{0}_encoded".format(indexer.getOutputCol())) 
    for indexer in indexers
]

# Vectorizing encoded values
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders],outputCol="features")

label_indexer = StringIndexer(inputCol='end_neighborhood', outputCol= 'n_index')

label_encoder = OneHotEncoder(inputCol='n_index',outputCol= 'label')
                              
#label_assember = VectorAssembler(inputCols=,outputCol="label")

pipeline = Pipeline(stages=indexers + [label_indexer] + encoders+ [label_encoder] + [assembler])
model=pipeline.fit(df_group)
transformed = model.transform(df_group)
transformed.show(5)

### Load transformed data (output in Logistic Regression file)

In [6]:
#transformed.write.parquet("pipelined_data_NBs.parquet")

In [7]:
transformed = spark.read.parquet("pipelined_data_NBs.parquet");

### Split the data: Using 60/20/20 split is more than sufficient -- the model takes a very long time to fit due to the number of observations, even with 60% as training. 

In [8]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier

#randomly split data into training and test dataset
(train_data, test_data, hold_out) = transformed.randomSplit([0.60, 0.20, 0.20], seed = 33)

### Run the random-forest model. I do not use CV as this model takes a very long time to run and I often ran out of resources. Although CV may have helped slightly, I do not think it would have provided much of a boost. Modeling end neighborhoods is difficult due to the fact that most rides are intra-neighborhood, and as such, most predictions predict that a ride will end in the same area where it started. 

In [33]:
# train RandomForest model

rf = RandomForestClassifier(labelCol="n_index", featuresCol="features")
rf_model = rf.fit(train_data)

In [34]:
# Make predictions on test data
predictions = rf_model.transform(test_data)

In [35]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="n_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

gbtModel = rf_model.stages[2]
print(gbtModel)  # summary only

Test Error = 0.368
StringIndexerModel: uid=StringIndexer_9e604145725b, handleInvalid=error


In [36]:
predictions.columns

['start_neighborhood',
 'end_neighborhood',
 'day',
 'time_bin',
 'peak_commute',
 'month',
 'hour',
 'day_indexed',
 'time_bin_indexed',
 'peak_commute_indexed',
 'month_indexed',
 'start_neighborhood_indexed',
 'n_index',
 'day_indexed_encoded',
 'time_bin_indexed_encoded',
 'peak_commute_indexed_encoded',
 'month_indexed_encoded',
 'start_neighborhood_indexed_encoded',
 'label',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [41]:
rf_model.save("rf_nbs")

In [38]:
preds = predictions.select('start_neighborhood', 'end_neighborhood', 'day', 'time_bin', 'peak_commute', 'month', 'hour', 'n_index', 'prediction')

### See that the prediction accuracy is about 63%. I will cover model analysis in another notebook. 

In [39]:
predictions.select('start_neighborhood', 'end_neighborhood', 'day', 'time_bin', 'peak_commute', 'month', 'hour', 'n_index', 'prediction')\
.where(preds.start_neighborhood == preds.end_neighborhood).count()/preds.count()

0.6326516755198247

### Model files are saved for analysis