Lecture Notes:

- Number of sample partitions (folds): K

1. Cross-validation
- Bias-Variance trade off
- for prediction error estimation
- CV for regression
    - Estimate CV(K), which is the average of all MSE_k
- CV for classification
    - Estimate CV(K), which is the average of all Err_k (error rate)
- for determining the best parameter

2. Bootstrap
- Resampling with replacement
- for SE estimation, not good for prediction error estimation
- bootstrap confidence interval
    - resampling 1000 times from the original data
    - build model for each sample and compute RSE/MSE
    - compute the mean and std of the set of 1000 RSE/MSE
    - or use 95% threshold to get lower and upper bound of the set of 1000 RSE/MSE, i.e. 2.5-97.5 percentile of the set
- Pr(one observation is not in the bootstrap sample) = (1-1/n)^n, which approaches 1/e ≈ 0.368 for large n (around 1/3)

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import math
import findspark
from pyspark import SparkContext
from pyspark.sql import SparkSession, DataFrame, DataFrameReader
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.types import DoubleType
from pyspark.ml.classification import LogisticRegression

%matplotlib inline

# Point findspark at the local Spark installation.
# The path is machine-specific: edit it to match your own setup.
# NOTE(review): the pyspark imports above run before this call, so they must
# already resolve without it -- confirm whether this call should come first.
findspark.init(spark_home="/Users/lhd0430/Downloads/spark")

In [2]:
# Create (or reuse) a Spark session running on the local master
spark = (
    SparkSession.builder
    .master("local")
    .appName("Linear Regression Model")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

In [7]:
# Load the Smarket CSV as a pyspark.sql.DataFrame, using the first row as the
# header and letting Spark infer the column types.
data = spark.read.csv("../data/Smarket.csv", header=True, inferSchema=True)
data.cache()
print(data.count())  # row count before removing incomplete rows
# Spark DataFrames are immutable: dropna() returns a NEW DataFrame and leaves
# the original untouched, so the result must be rebound for the rows
# containing nulls to actually be removed.
data = data.dropna()
print(data.count())  # row count after removing incomplete rows

1250
1250


In [8]:
# Encode the label column: the strings 'Up'/'Down' become '1'/'0', and the
# column is then cast to double.
relabelled = data.replace(['Up', 'Down'], ['1', '0'], 'Direction')
data = relabelled.withColumn(
    'Direction',
    relabelled["Direction"].cast(DoubleType()),
)

In [14]:
# CV

# Pack every column except the last two into a single vector column
feature_cols = data.columns[:-2]
vecAssembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = vecAssembler.transform(data)

# Logistic regression on "Direction", scored by area under the ROC curve
lr = LogisticRegression(featuresCol="features", labelCol="Direction")
evaAUR = BinaryClassificationEvaluator(
    labelCol="Direction",
    metricName="areaUnderROC",
)

# 3-fold cross-validation over the maxIter grid.
# Note: CrossValidator is similar to sklearn GridSearchCV. It needs an
# estimator, an evaluator, and the grid of tuning parameters.
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaAUR,
    numFolds=3,
)
ml = cv.fit(df)
# NOTE(review): index 0 should be the first grid entry (maxIter=0), not
# necessarily the best candidate -- confirm this is the metric intended.
print(ml.avgMetrics[0])

# Predict on the same DataFrame the cross-validator was fitted on
predict = ml.transform(df)

# Evaluate those predictions with the same AUC evaluator
aur = evaAUR.evaluate(predict)
print(aur)


0.5
0.5512258520979454


In [17]:
# Bootstrap
n_iter = 10  # number of bootstrap resamples

lr = LogisticRegression(featuresCol="features", labelCol="Direction")
evaAUR = BinaryClassificationEvaluator(
    labelCol="Direction",
    metricName="areaUnderROC",
)

# One AUC value per resample
stats = []
for _ in range(n_iter):
    # Sample with replacement at fraction 1.0
    df_resample = df.sample(withReplacement=True, fraction=1.0)
    # Fit on the resample and score the fitted model on that same resample
    ml = lr.fit(df_resample)
    predict = ml.transform(df_resample)
    aur = evaAUR.evaluate(predict)
    stats.append(aur)

# Summarise the bootstrap distribution of the AUC
auc_mean = np.mean(stats)
auc_std = np.std(stats)
print(auc_mean, auc_std)


0.553814246093 0.0150243990288


In [18]:
# Stop the Spark session held in `spark`
spark.stop()