In [1]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from dask.distributed import Client
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from tensorflow.keras.layers import Dense
# NOTE(review): this wrapper path matches the `build_fn=` usage in the
# cross-validation cell, but it was removed from newer TensorFlow releases —
# confirm against the pinned TensorFlow version.
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# scikit-learn dropped its vendored joblib in 0.23; fall back to the standalone package.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

In [2]:
# Read the training set, then separate the label column from the feature columns.
training_data = pd.read_csv('train.csv')
target = training_data['Target']
data = training_data.drop(columns=['Target'])

In [3]:
# Clean the data: keep only the numeric columns and fill missing values with 0.
# Peek at the non-numeric columns that are about to be dropped. Jupyter renders
# only the last expression of a cell, so this preview is not displayed as written.
data.select_dtypes(exclude=[np.number]).head()
# `np.object` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# `object` is the equivalent dtype spec and works on every NumPy version.
data = data.select_dtypes(include=[np.number], exclude=[object]).fillna(0)

In [4]:
# Map every column name to its positional index, then keep the names whose
# position is one of the hand-picked feature indices (column order is preserved).
cols = data.columns.values
dicts = {name: position for position, name in enumerate(cols)}

selected_positions = [98, 135, 134, 131, 118, 133, 109, 132, 94, 2]
features = [name for name, position in dicts.items() if position in selected_positions]

In [5]:
# Hold out 20% of the rows as a test set, using only the selected feature
# columns; the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    target,
    test_size=0.2,
    random_state=42,
)

In [6]:
#Start a local Dask client
# Client() is constructed with no arguments, so no scheduler address is passed here.
# The handle is kept in `client`; a later cell selects the 'dask' joblib backend.
client = Client()

In [None]:
#Use Dask to parallelize tasks
def baseline_model():
    """Build and compile the baseline feed-forward classifier.

    Returns:
        A compiled ``tf.keras.Sequential`` with one 4-unit ReLU hidden layer
        (``input_dim=10``) followed by a 4-unit softmax output layer, compiled
        with categorical cross-entropy loss, the Adam optimizer and an
        accuracy metric.
    """
    # NOTE(review): input_dim=10 presumably matches the ten selected feature
    # columns and the 4 output units the number of target classes — confirm.
    model = tf.keras.Sequential()
    model.add(Dense(4, input_dim=10, activation='relu'))
    model.add(Dense(4, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# scikit-learn compatible wrapper that builds a fresh network for each fit.
estimator = KerasClassifier(
    build_fn=baseline_model,
    epochs=10,
    batch_size=5,
    verbose=0,
)

# Seed the shuffled folds (same seed as the train/test split) so the fold
# assignment is identical on every Restart & Run All.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Only the cross-validation call needs the Dask joblib backend; the model
# definition and estimator setup do not have to live inside the context.
with joblib.parallel_backend('dask'):
    results = cross_val_score(estimator, X_train, y_train, cv=kfold)

print(f'Accuracy for Baseline Model: {round(results.mean()*100, 2)}%')

In [None]:
#Save the trained model as a pickle string, load pickled model + use it to make predictions
# `model` only ever existed as a local variable inside baseline_model(), and
# cross_val_score fits clones of `estimator`, so nothing at notebook scope has
# been trained yet. Fit the estimator on the training split before serializing.
estimator.fit(X_train, y_train)
model = estimator
# NOTE(review): whether a Keras-backed estimator can be pickled depends on the
# installed TensorFlow/Keras version — confirm, or use the Keras save/load APIs.
saved_model = pickle.dumps(model)
# In-memory round trip only; never unpickle bytes that come from an untrusted source.
rf_from_pickle = pickle.loads(saved_model)
rf_from_pickle.predict(X_test)