In [1]:
from pyspark.sql import functions as F
from pyspark.sql import types
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

import pandas as pd
import numpy as np
from IPython.display import display

from collections import defaultdict
from sklearn import metrics

In [2]:
training_set = sqlContext.sql("select * from ignite.training_set_id").select([
        'features', 'label', 
    ]).coalesce(100) # Reducing from current 10k partitions

In [4]:
pd_train = training_set.sample(False,0.03).toPandas() #Trying to get around million randomly picked records out of 28M+ records

In [5]:
for index in range(391):#As the feature sparse vector size is 391
    pd_train['feature'+str(index)] =  pd_train['features'].map(lambda x: x[index])

In [6]:
pd_train.drop('features',axis=1,inplace=True)
pd_train.fillna(0,inplace=True)

In [7]:
X = pd_train.drop('label',axis=1)
y=np.asarray(pd_train['label'], dtype="|S6")

In [8]:
from sklearn.ensemble import ExtraTreesClassifier
forest= ExtraTreesClassifier(n_estimators=250,random_state=0)
forest.fit(X, y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [9]:
feature_pairs = zip(X.columns,forest.feature_importances_)

In [17]:
importances = forest.feature_importances_
# the code below is little fancy to create graphical representation of these feature importances (might not be useful if we have aweful lot of input features)
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print feature_pairs[indices[f]][0],":", feature_pairs[indices[f]][1]


Feature ranking:
feature8 : 0.0671308715213
feature5 : 0.0620835886224
feature1 : 0.0556265324792
feature2 : 0.0464993777563
feature0 : 0.0442142574458
feature390 : 0.0320681711291
feature10 : 0.0301485881497
feature7 : 0.0291598519171
feature4 : 0.0289612660977
feature11 : 0.0219027190394
feature13 : 0.018954819322
feature41 : 0.014508562841
feature3 : 0.0138618761727
feature6 : 0.0137255491682
feature30 : 0.0133963908546
feature24 : 0.0129804414661
feature27 : 0.0128285668272
feature9 : 0.01242176723
feature43 : 0.0109576840983
feature42 : 0.0109439451476
feature23 : 0.0106341615385
feature32 : 0.0105202715445
feature26 : 0.0104471196965
feature29 : 0.0103735059338
feature12 : 0.00989737428861
feature33 : 0.0096682891435
feature111 : 0.00962091206865
feature180 : 0.00890800245234
feature110 : 0.00825914125183
feature40 : 0.00774716960731
feature181 : 0.00759798144282
feature112 : 0.00752221582061
feature113 : 0.0073041707452
feature46 : 0.00713846914064
feature21 : 0.00691449129608
f

In [19]:
#Getting only the top important features
for f in range(X.shape[1]):
    if feature_pairs[indices[f]][1]>np.average(importances):
        print feature_pairs[indices[f]][0],":", feature_pairs[indices[f]][1]

feature8 : 0.0671308715213
feature5 : 0.0620835886224
feature1 : 0.0556265324792
feature2 : 0.0464993777563
feature0 : 0.0442142574458
feature390 : 0.0320681711291
feature10 : 0.0301485881497
feature7 : 0.0291598519171
feature4 : 0.0289612660977
feature11 : 0.0219027190394
feature13 : 0.018954819322
feature41 : 0.014508562841
feature3 : 0.0138618761727
feature6 : 0.0137255491682
feature30 : 0.0133963908546
feature24 : 0.0129804414661
feature27 : 0.0128285668272
feature9 : 0.01242176723
feature43 : 0.0109576840983
feature42 : 0.0109439451476
feature23 : 0.0106341615385
feature32 : 0.0105202715445
feature26 : 0.0104471196965
feature29 : 0.0103735059338
feature12 : 0.00989737428861
feature33 : 0.0096682891435
feature111 : 0.00962091206865
feature180 : 0.00890800245234
feature110 : 0.00825914125183
feature40 : 0.00774716960731
feature181 : 0.00759798144282
feature112 : 0.00752221582061
feature113 : 0.0073041707452
feature46 : 0.00713846914064
feature21 : 0.00691449129608
feature250 : 0.006