In [1]:
# Base location of the processed train/val/test CSV files read by the data-loading cell.
path = "mnt/blobmount/"

In [2]:
# IMPORTS
import matplotlib.pyplot as plt
import numpy as np
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.regression import LinearRegression, LinearRegressionModel, RandomForestRegressor, RandomForestRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import SparseVector

In [3]:
def parse_csv(s):
  """Split one comma-separated line and convert every field to a float.

  Returns the fields as a list of floats, in their original order.
  A non-numeric (or empty) field makes float() raise ValueError.
  """
  return [float(field) for field in s.split(',')]

def filter_outliers(vals):
  """Decide whether a parsed row is kept (True) or dropped as an outlier (False).

  A row is kept only when both its label (first field) and its p_score
  (second-to-last field) lie inside the closed interval [-7.0, 103.0].
  """
  def in_range(value):
    return -7.0 <= value <= 103.0

  # all() over a generator stops at the first failing field, so p_score is
  # only looked at once the label has passed.
  return all(in_range(vals[position]) for position in (0, -2))

def normalize(vals):
  """Rescale three trailing fields of a parsed row in place and return the row.

  Each listed field is replaced by (value - offset) / scale. The constants are
  presumably a precomputed mean and standard deviation of each column --
  TODO confirm where they were computed.
  """
  # (position counted from the end, offset, scale); applied in this order.
  rescaling = (
    (-1, 17.13712707395722, 32.30891863874633),   # num_words
    (-2, 5.390716682669022, 13.664352804935564),  # p_score
    (-4, 9761.223722159446, 50008.29178605555),   # p_time
  )
  for position, offset, scale in rescaling:
    vals[position] = (vals[position] - offset)/scale
  return vals
  
def build_tup(vals):
  """Convert a parsed row into a (label, SparseVector) pair.

  Feature layout:
    * int(vals[1]) (subreddit_id) is used directly as the index of a single
      entry with value 1;
    * the remaining fields vals[2:] occupy consecutive indices starting at
      100, and only non-zero values are stored.

  NOTE(review): this layout assumes subreddit_id is in [0, 100) so it cannot
  land on one of the numeric feature slots -- confirm against the data.
  """
  numeric_offset = 100
  numeric_fields = vals[2:]

  entries = [(int(vals[1]), 1)]
  entries.extend(
    (numeric_offset + position, value)
    for position, value in enumerate(numeric_fields)
    if value
  )

  vector_size = numeric_offset + len(numeric_fields)
  return (vals[0], SparseVector(vector_size, entries))
  
def read_csv(filepath):
  """Load one processed CSV file as a DataFrame with 'label' and 'features' columns.

  The file is read as text through the notebook-global SparkContext `sc`
  (requesting at least 96 partitions); each line is then parsed,
  outlier-filtered, normalised and converted to a (label, vector) tuple.
  """
  lines = sc.textFile(filepath, minPartitions=96)
  parsed_rows = lines.map(parse_csv)
  kept_rows = parsed_rows.filter(filter_outliers)
  scaled_rows = kept_rows.map(normalize)
  labelled_points = scaled_rows.map(build_tup)
  return labelled_points.toDF(['label', 'features'])

In [4]:
# Load the three splits through the same pipeline and cache each one, since
# the fitting and evaluation cells below scan them repeatedly.
train_df, val_df, test_df = [
  read_csv(path + filename).cache()
  for filename in (
    "full_processed_train.csv",
    "full_processed_val.csv",
    "full_processed_test.csv",
  )
]

In [5]:
# Shared evaluator used by every cell below. It is configured for RMSE; other
# metrics are requested per call by passing a metricName override to evaluate().
regEval = RegressionEvaluator(metricName="rmse")

def print_evaluation(df):
  """Print RMSE, MAE and R^2 for a predictions DataFrame, two decimals each.

  Uses the notebook-global evaluator `regEval`. RMSE is taken from a plain
  evaluate() call; MAE and R^2 are obtained by passing a metricName override.
  """
  def evaluate_as(metric_name):
    return regEval.evaluate(df, {regEval.metricName: metric_name})

  # Each metric is computed and printed before the next one is evaluated.
  print("RMSE: %.2f" % regEval.evaluate(df))
  print("MAE: {0:.2f}".format(evaluate_as("mae")))
  print("R^2: {0:.2f}".format(evaluate_as("r2")))

### Baseline

In [7]:
# Baseline model: predict the average training label for every test row.
average_row = train_df.selectExpr("avg(label)").first()
train_label_average = average_row['avg(label)']

def constant_prediction(row):
  """Pair the constant baseline prediction with the row's true label."""
  return (train_label_average, row['label'])

baseline_predictions_rdd = test_df.rdd.map(constant_prediction)
baseline_predictions_df = sqlContext.createDataFrame(
  baseline_predictions_rdd,
  ["prediction", "label"],
)

print_evaluation(baseline_predictions_df)

### Linear Regression

In [9]:
# Select the best L1-regularised linear regression by hold-out validation.
# NOTE: despite the "cv" in the variable name this is not k-fold
# cross-validation: every candidate is fit on train_df and scored once on
# val_df with regEval (RMSE); the lowest score wins.
best_score, best_param, lr_cv_model = float('inf'), None, None

# Candidate regParam values: 10, 1, 0.1, ..., 1e-5, and finally 0.
for reg_param in [10**i for i in range(1, -6, -1)] + [0]:
  lr = LinearRegression(maxIter=100, regParam=reg_param, elasticNetParam=1.0)
  model = lr.fit(train_df)

  predictions_df = model.transform(val_df)
  score = regEval.evaluate(predictions_df)

  # A NaN score never compares as smaller, so such candidates are skipped.
  if score < best_score:
    best_score = score
    best_param = reg_param
    lr_cv_model = model

# Fail with a clear message instead of an AttributeError on None below.
if lr_cv_model is None:
  raise ValueError("No linear regression candidate produced a finite validation RMSE")

print("Best regParam: %s (validation RMSE: %.4f)" % (best_param, best_score))

# Persist under the same base location as the input data so that changing
# `path` moves the model together with everything else.
lr_cv_model.write().overwrite().save(path + "lr_cv_model")

In [10]:
# Reload the persisted model so this cell can run without repeating the
# search above. The location is derived from `path` rather than a second
# hard-coded copy of the same directory.
lr_cv_model = LinearRegressionModel.load(path + "lr_cv_model")

# Model evaluation on the held-out test split
lr_predictions_df = lr_cv_model.transform(test_df)
print_evaluation(lr_predictions_df)

### Random Forests

In [12]:
# Select the best random forest by hold-out validation.
# NOTE: despite the "cv" in the variable name this is not k-fold
# cross-validation: every (numTrees, maxDepth) candidate is fit on train_df
# and scored once on val_df with regEval (RMSE); the lowest score wins.
best_score, best_trees, best_depth, rf_cv_model = float('inf'), None, None, None

# NOTE(review): datetime is not used in this cell; the import is kept in case
# a later cell relies on it.
from datetime import datetime

# Random forests are stochastic; a fixed seed keeps repeated runs of the
# search comparable with each other.
RF_SEED = 42

for trees in [10, 20, 30]:
  for depth in [5, 10, 15]:
    rf = RandomForestRegressor(numTrees=trees, maxDepth=depth, seed=RF_SEED)
    model = rf.fit(train_df)

    predictions_df = model.transform(val_df)
    score = regEval.evaluate(predictions_df)

    # A NaN score never compares as smaller, so such candidates are skipped.
    if score < best_score:
      best_score = score
      best_trees = trees
      best_depth = depth
      rf_cv_model = model

# Fail with a clear message instead of an AttributeError on None below.
if rf_cv_model is None:
  raise ValueError("No random forest candidate produced a finite validation RMSE")

print("Best numTrees: %s, maxDepth: %s (validation RMSE: %.4f)"
      % (best_trees, best_depth, best_score))

# Persist under the same base location as the input data so that changing
# `path` moves the model together with everything else.
rf_cv_model.write().overwrite().save(path + "rf_cv_model")

In [13]:
# Reload the persisted model so this cell can run without repeating the
# search above. The location is derived from `path` rather than a second
# hard-coded copy of the same directory.
rf_cv_model = RandomForestRegressionModel.load(path + "rf_cv_model")

# Model evaluation on the held-out test split
rf_predictions_df = rf_cv_model.transform(test_df)
print_evaluation(rf_predictions_df)

### Visualizations / Miscellaneous

In [15]:
def unpack(row):
  """Flatten a prediction row into [label, prediction, residual].

  The residual is label - prediction.
  """
  label, prediction = row['label'], row['prediction']
  return [label, prediction, label - prediction]

# Bring every random-forest prediction to the driver as one array row each:
# column 0 = label, column 1 = prediction, column 2 = residual.
collected_rows = rf_predictions_df.rdd.map(unpack).collect()
data = np.array(collected_rows)

In [16]:
# Scatter of true label (x axis) against predicted value (y axis), drawn with
# very small, almost transparent markers and equal axis scaling.
labels, predictions = data[:, 0], data[:, 1]

fig, ax = plt.subplots()
ax.scatter(labels, predictions, s=0.1, alpha=0.01)
ax.set_aspect("equal")
ax.set(xlabel="Label", ylabel="Prediction", title="Label vs Prediction")
display(fig)

In [17]:
# Histogram of residuals using unit-wide bins with edges at -20, -19, ..., 40.
residuals = data[:, 2]
bin_edges = list(range(-20, 41))

fig, ax = plt.subplots()
ax.hist(residuals, bins=bin_edges)
ax.set(xlabel="Residual", ylabel="Frequency", title="Residuals (Label - Prediction)")
display(fig)

In [18]:
# Print every (feature index, importance) pair of the random forest, ordered
# from the most to the least important feature.
importances = rf_cv_model.featureImportances
feature_count = len(importances)
pairs = [(index, importances[index]) for index in range(feature_count)]

def negated_importance(pair):
  """Sort key: negating the importance makes an ascending sort descending."""
  return -pair[1]

sorted_pairs = sorted(pairs, key=negated_importance)

for pair in sorted_pairs:
  print(pair)