In [1]:
%matplotlib inline

In [2]:
# IMPORTS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.regression import LinearRegression, LinearRegressionModel, RandomForestRegressor, RandomForestRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import SparseVector
from pyspark.sql.types import IntegerType
import matplotlib.pyplot as plt
import numpy as np

In [3]:
def parse_csv(s):
  """Convert one comma-separated line into a list of floats.

  Args:
    s: A string of comma-separated numeric fields.

  Returns:
    A list with one float per field, in the order they appear in `s`.

  Raises:
    ValueError: If any field cannot be parsed by `float` (this includes
      empty fields).
  """
  return [float(field) for field in s.split(',')]

def extract_relevant_feats(vals):
  """Pick the label and the three features used downstream from a parsed row.

  Args:
    vals: An indexable sequence of numeric values (one parsed CSV row).

  Returns:
    A dict with keys 'label' (first element, truncated to int),
    'num_words' (last element), 'p_score' (second-to-last element) and
    'p_time' (fourth-to-last element), the latter three as floats.

  Raises:
    IndexError: If `vals` has fewer than four elements.
  """
  # NOTE(review): position -3 is skipped on purpose or by accident? The column
  # layout of the input file is not visible here -- confirm p_time is at -4.
  label = int(vals[0])
  num_words = float(vals[-1])
  p_score = float(vals[-2])
  p_time = float(vals[-4])
  return {
    'label': label,
    'num_words': num_words,
    'p_score': p_score,
    'p_time': p_time
  }
  
def read_csv(filepath):
  """Load a headerless numeric CSV into a DataFrame of the relevant columns.

  Each text line is parsed with `parse_csv` and reduced to a dict by
  `extract_relevant_feats`; the resulting RDD of dicts is converted with
  `toDF()`.

  Args:
    filepath: Location passed straight to `sc.textFile`.

  Returns:
    The DataFrame produced by `toDF()` on the mapped records.
  """
  # `sc` is not defined in this notebook -- presumably the SparkContext
  # injected by the hosting environment; verify before running elsewhere.
  lines = sc.textFile(filepath, minPartitions=96)
  records = lines.map(parse_csv).map(extract_relevant_feats)
  return records.toDF()

In [4]:
# Base location of the input data; the training file name is appended to it.
path = "mnt/blobmount/"
train_location = path + "full_processed_train"

# Cache the training frame: the cells that follow run several actions on it.
df = read_csv(train_location).cache()

### Outlier Removal

In [6]:
# Compute the label's range and its 1st / 99th percentiles in a single
# aggregation. Issuing one selectExpr with all four expressions scans `df`
# once instead of launching a separate Spark job per statistic; the values
# are the same as when each expression is evaluated on its own.
# A global aggregate always yields exactly one row, and a Row unpacks like a
# tuple, so the four results can be bound directly.
minimum, maximum, bot_percentile, top_percentile = df.selectExpr(
    'MIN(label)',
    'MAX(label)',
    'percentile(label, 0.01)',
    'percentile(label, 0.99)',
).first()

print('minimum: ', minimum)
print('maximum: ', maximum)
print('0.01 percentile: ', bot_percentile)
print('0.99 percentile: ', top_percentile)

In [7]:
# Keep only rows whose label AND p_score both lie inside
# [bot_percentile, top_percentile] (bounds inclusive).
# NOTE(review): the bounds applied to p_score are the percentiles computed
# from `label`, not from p_score itself -- confirm this is intended.
filtered_df = df.filter(
    (df.label >= bot_percentile) & (df.label <= top_percentile)
    & (df.p_score >= bot_percentile) & (df.p_score <= top_percentile)
)

### Features

In [9]:
# Report mean and standard deviation of every column after outlier removal.
# All AVG/STD expressions are evaluated in ONE aggregation so the filtered
# frame is scanned once, rather than running two Spark jobs per feature.
summary_feats = ['label', 'num_words', 'p_score', 'p_time']
summary_row = filtered_df.selectExpr(
    *['{}({})'.format(fn, feat) for feat in summary_feats for fn in ('AVG', 'STD')]
).first()

# summary_row is laid out as [AVG(f0), STD(f0), AVG(f1), STD(f1), ...].
for idx, feat in enumerate(summary_feats):
  avg, std = summary_row[2 * idx], summary_row[2 * idx + 1]
  print(feat)
  print('average: ', avg)
  print('std: ', std)
  print()