In [1]:
import sys
import math
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+

from pyspark.sql import SparkSession, functions, types
cluster_seeds = ['199.60.17.188', '199.60.17.216']
spark = SparkSession.builder.appName('cassandra read tables and etl') \
    .config('spark.cassandra.connection.host', ','.join(cluster_seeds)).getOrCreate()
assert spark.version >= '2.3' # make sure we have Spark 2.3+
sc = spark.sparkContext

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import RegressionEvaluator

In [12]:
ready_train = spark.read.option("inferSchema", True).csv('/Users/michaelyang/Downloads/all_data/application_train.csv',
                                   header=True)

In [13]:
ready_train = ready_train.limit(50000)

In [14]:
ready_train = ready_train.withColumn('CREDIT_INCOME_PERCENT', ready_train.AMT_CREDIT / ready_train.AMT_INCOME_TOTAL)
ready_train = ready_train.withColumn('ANNUITY_INCOME_PERCENT', ready_train.AMT_ANNUITY/ ready_train.AMT_INCOME_TOTAL)
ready_train = ready_train.withColumn('CREDIT_TERM', ready_train.AMT_ANNUITY/ ready_train.AMT_CREDIT)

In [15]:
from ETL import ETL, get_dummies_spark
train_cat_encoded = get_dummies_spark(ready_train,'SK_ID_CURR','ready_train')
numerical_feats = [f for f, t in ready_train.dtypes if t != 'string']
train_num_df = ready_train.select(numerical_feats)
ready_train = train_cat_encoded.join(train_num_df,on = 'SK_ID_CURR')

In [16]:
import pandas as pd

In [6]:
from pyspark.mllib.stat import Statistics
import pandas as pd

# df = sqlCtx.read.format('com.databricks.spark.csv').option('header', 'true').option('inferschema', 'true').load('corr_test.csv')
df = ready_train.drop('SK_ID_CURR').drop('TARGET')
col_names = df.columns
features = df.rdd.map(lambda row: row[0:])
corr_mat=Statistics.corr(features, method="pearson")
corr_df = pd.DataFrame(corr_mat)
corr_df.index, corr_df.columns = col_names, col_names

In [9]:
import numpy as np
upper = corr_df.where(np.triu(np.ones(corr_df.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

In [12]:
for col in ready_train.columns:
    if col in to_drop:
        ready_train.drop(col)

In [29]:
to_drop = []
for col in ready_train.columns:
    if ready_train.filter(ready_train[col].isNull()).count() > 50000:
        to_drop.append(col)

KeyboardInterrupt: 

In [50]:
from pyspark.sql.functions import isnull, when, count, col

temp = ready_train.select([count(when(isnull(c), c)).alias(c) for c in ready_train.columns]).collect()

In [61]:
size = ready_train.count()
threshold = 0.75 * size
to_drop = []
for col in ready_train.columns:
    if temp[0][col] > threshold:
        to_drop.append(col)

to_drop

[]

In [6]:
# assembler
features = list(set(ready_train.columns) - set(['SK_ID_CURR','TARGET']))
feature_assembler = VectorAssembler(inputCols=features ,outputCol='features')
# classifier
classifier = RandomForestClassifier(labelCol='TARGET',maxBins=60,maxDepth=6)
# pipeline
credit_pipeline = Pipeline(stages=[feature_assembler, classifier])
# fit and see feature importance
credit_model = credit_pipeline.fit(ready_train.na.fill(-999))

In [17]:
x, y = (list(x) for x in zip(*sorted(zip(credit_model.stages[-1].
                                         featureImportances, features), reverse = False)))

{0.0: 'e_ready_train_WEEKDAY_APPR_PROCESS_START_SUNDAY',
 1.0244084585201484e-05: 'e_ready_train_WEEKDAY_APPR_PROCESS_START_MONDAY',
 1.6952551364207237e-05: 'e_ready_train_ORGANIZATION_TYPE_Electricity',
 1.753458508379548e-05: 'e_ready_train_FLAG_OWN_REALTY_N',
 2.2130415251301406e-05: 'e_ready_train_ORGANIZATION_TYPE_Industry: type 11',
 3.477008669750142e-05: 'e_ready_train_NAME_HOUSING_TYPE_Office apartment',
 3.778397902382731e-05: 'e_ready_train_WEEKDAY_APPR_PROCESS_START_WEDNESDAY',
 3.890529739265786e-05: 'FLAG_DOCUMENT_20',
 4.212310735551919e-05: 'e_ready_train_NAME_HOUSING_TYPE_Co-op apartment',
 4.565910968164815e-05: 'e_ready_train_WALLSMATERIAL_MODE_Wooden',
 5.138345168496126e-05: 'e_ready_train_NAME_TYPE_SUITE_Family',
 5.228396772057454e-05: 'e_ready_train_OCCUPATION_TYPE_Realty agents',
 5.67999772272969e-05: 'e_ready_train_ORGANIZATION_TYPE_Industry: type 3',
 5.9250104204871894e-05: 'e_ready_train_NAME_EDUCATION_TYPE_Incomplete higher',
 6.12087331991179e-05: 'e_re

In [22]:
feature_importances = pd.DataFrame({'feature': features, 
                                    'importance': credit_model.stages[-1].featureImportances})\
    .sort_values('importance', ascending = False)
zero_features = list(feature_importances[feature_importances['importance'] == 0.0]['feature'])
zero_features

['e_ready_train_NAME_INCOME_TYPE_Unemployed',
 'FLAG_DOCUMENT_11',
 'e_ready_train_NAME_INCOME_TYPE_Student',
 'e_ready_train_FONDKAPREMONT_MODE_reg oper spec account',
 'e_ready_train_ORGANIZATION_TYPE_Trade: type 7',
 'e_ready_train_NAME_EDUCATION_TYPE_Academic degree',
 'e_ready_train_NAME_INCOME_TYPE_Maternity leave',
 'e_ready_train_ORGANIZATION_TYPE_Industry: type 7',
 'e_ready_train_ORGANIZATION_TYPE_Legal Services',
 'e_ready_train_ORGANIZATION_TYPE_Security Ministries',
 'e_ready_train_ORGANIZATION_TYPE_Realtor',
 'e_ready_train_ORGANIZATION_TYPE_Culture',
 'e_ready_train_ORGANIZATION_TYPE_Telecom',
 'FLAG_DOCUMENT_13',
 'e_ready_train_OCCUPATION_TYPE_Private service staff',
 'e_ready_train_ORGANIZATION_TYPE_Medicine',
 'e_ready_train_ORGANIZATION_TYPE_Industry: type 13',
 'e_ready_train_ORGANIZATION_TYPE_Industry: type 5',
 'e_ready_train_ORGANIZATION_TYPE_Kindergarten',
 'e_ready_train_WEEKDAY_APPR_PROCESS_START_SUNDAY',
 'FLAG_DOCUMENT_2',
 'e_ready_train_ORGANIZATION_TYPE_

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [19]:
features = list(set(ready_train.columns) - set(['SK_ID_CURR', 'TARGET']))
feature_assembler = VectorAssembler(inputCols=features, outputCol='features')
classifier = GBTClassifier(labelCol='TARGET', maxBins=60)
pipeline = Pipeline(stages=[feature_assembler, classifier])
grid = ParamGridBuilder().addGrid(classifier.maxDepth,[3,4]).build()
evaluator = BinaryClassificationEvaluator(labelCol='TARGET')
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator, numFolds= 5)
cv.fit(ready_train.fillna(-999))

CrossValidatorModel_4900ac24f0fd215f8c23