In [1]:
from pyspark.sql import SparkSession
from random import randint
from pyspark.sql.types import IntegerType, StructField, StructType
import pyspark.sql.functions as F
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
import numpy as np

In [2]:
import os
import sys

# Point PySpark at the interpreter that runs this kernel, so the Python
# processes Spark launches use the same environment as the notebook.
os.environ['PYSPARK_PYTHON'] = sys.executable

In [3]:
# Start (or reuse) the Spark session that every CSV read below goes through.
spark = (
    SparkSession.builder
    .appName('SparkTitanic.com')
    .getOrCreate()
)

Reading the data

In [4]:
# Load the training CSV: the first line is a header, column types are inferred.
# for production: '../input/titanic/train.csv'
train_df = spark.read.csv(
    './input/titanic/train.csv',
    header=True,
    inferSchema=True,
)

Cleaning the data

In [5]:
# Keep only the two model features (Sex, Age) and the label (Survived).
train_df = train_df.select(F.col("Sex"), F.col("Age"), F.col("Survived"))

In [6]:
# Drop passengers whose Age is missing.
train_df = train_df.filter(F.col("Age").isNotNull())

In [7]:
# Drop passengers whose Sex is missing.
train_df = train_df.filter(F.col("Sex").isNotNull())

In [8]:
# Drop rows that have no Survived label.
train_df = train_df.filter(F.col("Survived").isNotNull())

In [9]:
# Encode Sex numerically: "male" -> 0.0, every other value -> 1.0.
# NOTE(review): this overwrites the string column in place, so re-running the
# cell on an already-encoded frame no longer sees "male"; run it once per load.
train_df = train_df.withColumn(
    "Sex",
    F.when(F.col("Sex") == "male", 0.0).otherwise(1.0),
)

In [10]:
# Pull features and labels to the driver in ONE Spark job. Collecting them in
# two separate jobs runs the plan twice and only keeps x_train[i] aligned with
# y_train[i] if both jobs return rows in the same order.
_train_rows = train_df.select("Sex", "Age", "Survived").collect()
x_train = [[r["Sex"], r["Age"]] for r in _train_rows]  # one [Sex, Age] pair per passenger
y_train = [r["Survived"] for r in _train_rows]         # matching 0/1 label
del _train_rows


In [11]:
# The Spark frame is no longer needed once the data lives in Python lists.
del train_df

In [12]:
# Augment the training set with 500 synthetic passengers labelled by a fixed
# rule: every woman (Sex == 1) survives, and so does every male under 15.
# Temporaries use names that cannot collide with the `prediction` function
# defined later in the notebook, so re-running this cell never clobbers it.
for _ in range(500):
    synth_sex = randint(0, 1)      # 0 = male, 1 = female (same encoding as the real data)
    synth_age = randint(0, 100)    # raw age in years; scaled later with the real ages
    synth_label = 1 if (synth_sex == 1 or (synth_sex == 0 and synth_age < 15)) else 0
    x_train.append([synth_sex, synth_age])
    y_train.append(synth_label)
del synth_sex, synth_age, synth_label

In [13]:
# Normalization: Sex is already 0/1, Age is scaled by 0.01.
x_train = np.array(
    [[sex, age * 0.01] for sex, age in x_train],
    dtype="float32",
)

In [14]:
# One-hot encode the 0/1 labels into two columns (one per class).
y_train = to_categorical(y_train, num_classes=2)

Training

In [15]:
# Start an empty layer stack; the cells below append layers one at a time.
model = Sequential()

In [16]:
# Input layer: takes the two features (Sex, scaled Age); no activation, so linear.
model.add(Dense(units=128, input_dim=2))

In [17]:
# Normalize the activations of the previous layer; momentum=0.8 controls how
# fast the moving mean/variance statistics are updated.
model.add(BatchNormalization(momentum=0.8))

In [18]:
# Wide ReLU layer.
# NOTE(review): 2084 units may be a typo for 2048 — confirm.
model.add(Dense(units=2084, activation='relu'))

In [19]:
# Batch normalization after the wide ReLU layer (same momentum as above).
model.add(BatchNormalization(momentum=0.8))

In [20]:
# 128-unit layer with no activation, i.e. a purely linear projection.
model.add(Dense(units=128))

In [21]:
# Wide tanh layer.
# NOTE(review): 2084 units may be a typo for 2048 — confirm.
model.add(Dense(units=2084, activation='tanh'))

In [22]:
# Batch normalization after the wide tanh layer (momentum=0.8 as elsewhere).
model.add(BatchNormalization(momentum=0.8))

In [23]:
# 128-unit sigmoid layer.
model.add(Dense(units=128, activation='sigmoid'))

In [24]:
# 512-unit tanh layer.
model.add(Dense(units=512, activation='tanh'))

In [25]:
# Last batch normalization in the stack, applied to the 512-unit tanh output.
model.add(BatchNormalization(momentum=0.8))

In [26]:
# Final hidden layer: 128 sigmoid units feeding the 2-unit output layer.
model.add(Dense(units=128, activation='sigmoid'))

In [27]:
# Output layer: one unit per class (index 0 = died, index 1 = survived).
# softmax turns the two raw scores into probabilities, which is what a
# crossentropy loss with the default from_logits=False expects; without it the
# loss is computed on unbounded values. The arg-max of the output is unchanged.
model.add(Dense(2, activation='softmax'))

In [28]:
# Train with Adam on a binary crossentropy over the two one-hot label columns,
# and report accuracy during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [29]:
# Fit on the real + synthetic passengers: 150 passes over the data, 500 samples per batch.
model.fit(x=x_train, y=y_train, epochs=150, batch_size=500)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7f3ed0132b10>

In [30]:
# Training arrays are no longer needed once the model is fitted.
del x_train, y_train

Prediction

In [31]:
def prediction(row):
    """Predict survival for a single passenger row.

    The row is encoded exactly like the training data (Sex: male -> 0.,
    female -> 1.; Age scaled by 0.01) and passed to the global Keras ``model``.

    Args:
        row: object with ``Sex`` ('male', 'female' or anything else/None)
            and ``Age`` (number or None) attributes, e.g. a Spark ``Row``.

    Returns:
        int: 0 if the model scores class 0 strictly higher than class 1,
        otherwise 1.

    Note:
        Missing values are imputed at random (Sex from {0, 1}, Age from
        0..100 years), so results for incomplete rows are not deterministic.
    """
    if row.Sex == 'female':
        sex = 1.
    elif row.Sex == 'male':
        sex = 0.
    else:
        # Unknown / missing sex: pick one at random.
        sex = randint(0, 1) * 1.

    if row.Age is None:
        # Missing age: draw a random age and apply the same 0.01 scaling.
        age = randint(0, 100) * 0.01
    else:
        age = row.Age * 0.01

    # Pass an explicit (1, 2) float32 batch. A bare Python list can be read by
    # Keras as a list of separate model inputs rather than one batch.
    features = np.array([[sex, age]], dtype="float32")
    # verbose=0: this runs once per passenger, so keep the progress bar quiet.
    scores = model.predict(features, verbose=0)[0]

    return 0 if scores[0] > scores[1] else 1

In [32]:
# Load the test CSV: the first line is a header, column types are inferred.
# for production: '../input/titanic/test.csv'
test_df = spark.read.csv(
    './input/titanic/test.csv',
    header=True,
    inferSchema=True,
)

In [33]:
# Empty result frame with the submission layout: two nullable integer columns.
output_df = spark.createDataFrame(
    spark.sparkContext.emptyRDD(),
    StructType([
        StructField('PassengerId', IntegerType()),
        StructField('Survived', IntegerType()),
    ]),
)

In [34]:
# Score every test passenger on the driver, then build the result frame in ONE
# call. Creating a single-row DataFrame and union-ing it per passenger produces
# a plan with one nested union per row, which is very slow to build and run.
# Re-running this cell rebuilds the frame instead of appending duplicates.
# NOTE(review): assumes PassengerId is read as an integer column — confirm.
output_df = spark.createDataFrame(
    [(row.PassengerId, prediction(row)) for row in test_df.collect()],
    schema=output_df.schema,
)

Output

In [35]:
# Bring the predictions to the driver as a pandas frame and write the
# submission file without the pandas index column.
(
    output_df
    .toPandas()
    .to_csv('./submission.csv', index=False)
)