In [1]:
# Read the source table into a Spark DataFrame and render it in the notebook.
hkong = spark.table(
    "hkongpercent_1_ord_currency_35acb_csv"
)
display(hkong)

In [2]:
from pyspark.ml.feature import VectorAssembler
# Feature columns are the ones at positions 1..310 of the table (position 0 is
# skipped); the assembler packs them into a single "features" vector column.
X = hkong.columns[1:311]
assembler = VectorAssembler(outputCol="features", inputCols=X)

In [3]:
from pyspark.ml.classification import LogisticRegression
import pyspark.ml.evaluation as evals
import pyspark.ml.tuning as tune
import numpy as np

# Logistic regression estimator with library-default settings.
lr = LogisticRegression()

# Models are scored by area under the ROC curve.
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")

# Hyperparameter grid: regParam from 0 up to (not including) 0.1 in steps of 0.01,
# crossed with elasticNetParam at its two extremes (0 and 1).
grid = (
    tune.ParamGridBuilder()
    .addGrid(lr.regParam, np.arange(0, .1, .01))
    .addGrid(lr.elasticNetParam, [0, 1])
    .build()
)

# Cross-validator that searches the grid above for the best logistic regression.
cv = tune.CrossValidator(
    estimator=lr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
)

In [4]:
from pyspark.sql.functions import col
# Baseline: fit the untuned LogisticRegression (`lr`) once per target column and
# collect the test-set area under ROC for each target in `scores`.
scores = []
target = hkong.columns[311:]  # every column from position 311 onward is treated as a label
for i in target:
  # Keep the first 311 columns plus the current target, renaming the target to 'label'.
  data = hkong.select(hkong.columns[:311] + [i])
  new_data = data.select(*[col(s).alias('label') if s == i else s for s in data.columns])
  new_data = assembler.transform(new_data)
  # Fixed seed so the 70/30 split is the same on every run.
  (training, test) = new_data.randomSplit([0.7, 0.3], seed=100)
  models = lr.fit(training)
  # Call form of print works on both Python 2 and Python 3.
  print(i)
  test_results = models.transform(test)
  # Evaluate once and reuse the value for both the printout and the list.
  auc = evaluator.evaluate(test_results)
  print(auc)
  scores.append(auc)

print(scores)


In [5]:
# Average of the per-target scores currently held in `scores`; when the cells are
# run in order this is the result of the untuned logistic-regression loop.
lr_score = np.asarray(scores).mean()
lr_score

In [6]:
from pyspark.sql.functions import col
# Tuned logistic regression: run the cross-validated grid search for the first
# three target columns only and record each best model's test-set area under ROC.
# NOTE(review): `cv` is expected to be the LogisticRegression CrossValidator here;
# a later cell rebinds `cv` to a random-forest one, so run the cells in order.
scores = []
target = hkong.columns[311:]
for i in target[0:3]:
  # Keep the first 311 columns plus the current target, renaming the target to 'label'.
  data = hkong.select(hkong.columns[:311] + [i])
  new_data = data.select(*[col(s).alias('label') if s == i else s for s in data.columns])
  new_data = assembler.transform(new_data)
  # Fixed seed so the 70/30 split is the same on every run.
  (training, test) = new_data.randomSplit([0.7, 0.3], seed=100)
  models = cv.fit(training)
  best_lr = models.bestModel
  # Call form of print works on both Python 2 and Python 3.
  print(i)
  print(best_lr)
  test_results = best_lr.transform(test)
  # Evaluate once and reuse the value for both the printout and the list.
  auc = evaluator.evaluate(test_results)
  print(auc)
  scores.append(auc)

print(scores)


In [7]:
from pyspark.ml.classification import RandomForestClassifier
import pyspark.ml.evaluation as evals
import pyspark.ml.tuning as tune
import numpy as np

# RandomForestClassifier is only imported in this cell; it is instantiated in the
# cells that follow. The evaluator scores models by area under the ROC curve.
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")



In [8]:
from pyspark.sql.functions import col
# Baseline random forest: fit a default RandomForestClassifier once per target
# column and collect the test-set area under ROC for each target in `scores`.
scores = []
target = hkong.columns[311:]
# The estimator does not depend on the target, so it is created once up front.
rfc = RandomForestClassifier()
for i in target:
  # Keep the first 311 columns plus the current target, renaming the target to 'label'.
  data = hkong.select(hkong.columns[:311] + [i])
  new_data = data.select(*[col(s).alias('label') if s == i else s for s in data.columns])
  new_data = assembler.transform(new_data)
  # Fixed seed so the 70/30 split is the same on every run.
  (training, test) = new_data.randomSplit([0.7, 0.3], seed=100)
  models = rfc.fit(training)

  # Call form of print works on both Python 2 and Python 3.
  print(i)

  test_results = models.transform(test)
  # Evaluate once and reuse the value for both the printout and the list.
  auc = evaluator.evaluate(test_results)
  print(auc)
  scores.append(auc)

print(scores)


In [9]:
# Average of the per-target scores currently held in `scores`.
mean_score = np.asarray(scores).mean()
print(mean_score)

In [10]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Random forest estimator with library-default settings.
rfc = RandomForestClassifier()

# Hyperparameter grid: 3 depths x 2 bin counts x 2 forest sizes = 12 combinations.
paramGrid = ParamGridBuilder()
paramGrid = paramGrid.addGrid(rfc.maxDepth, [2, 4, 6])
paramGrid = paramGrid.addGrid(rfc.maxBins, [20, 60])
paramGrid = paramGrid.addGrid(rfc.numTrees, [5, 20])
paramGrid = paramGrid.build()

# Cross-validator that searches the grid above, scored by the existing `evaluator`.
cv = CrossValidator(
    estimator=rfc,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
)

In [11]:
# Start from two fresh, independent lists: per-target scores and best models.
scores, best = [], []

In [12]:
from pyspark.sql.functions import col

# Tuned random forest: cross-validate over the hyperparameter grid for each target
# column and append each best model's test-set area under ROC to `scores`.
target = hkong.columns[311:]

# The estimator, grid and cross-validator do not depend on the target column,
# so they are built once here instead of on every loop iteration.
rfc = RandomForestClassifier()
paramGrid = (ParamGridBuilder()
             .addGrid(rfc.maxDepth, [2, 4, 6])
             .addGrid(rfc.maxBins, [20, 60])
             .addGrid(rfc.numTrees, [5, 20])
             .build())
cv = CrossValidator(estimator=rfc, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# NOTE(review): only targets from index 28 onward are processed. `scores` is reset
# to an empty list in an earlier cell, so on a fresh top-to-bottom run it will not
# contain results for the first 28 targets -- confirm this is intended.
for i in target[28:]:
  # Keep the first 311 columns plus the current target, renaming the target to 'label'.
  data = hkong.select(hkong.columns[:311] + [i])
  new_data = data.select(*[col(s).alias('label') if s == i else s for s in data.columns])
  new_data = assembler.transform(new_data)
  # Fixed seed so the 70/30 split is the same on every run.
  (training, test) = new_data.randomSplit([0.7, 0.3], seed=100)
  models = cv.fit(training)
  best_rfc = models.bestModel
  test_results = best_rfc.transform(test)
  scores.append(evaluator.evaluate(test_results))

print(scores)

In [13]:
# Show the per-target scores accumulated so far.
print(scores)

In [14]:
# Average of the per-target scores currently held in `scores`; when the cells are
# run in order this is the result of the tuned random-forest loop.
rfc_score = np.asarray(scores).mean()
print(rfc_score)