In [1]:
from pyspark.sql import SparkSession
from random import randint
from pyspark.sql.types import IntegerType, StructField, StructType
import pyspark.sql.functions as F
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from keras.layers import Dense
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
import os
import sys

# PYSPARK_PYTHON is set to the interpreter running this kernel so that
# PySpark launches its Python processes with the same executable.
os.environ.update({'PYSPARK_PYTHON': sys.executable})

In [3]:
# Reuse an already-running session if one exists, otherwise start a new one
# registered under this application name.
session_builder = SparkSession.builder.appName('SparkTitanic.com')
spark = session_builder.getOrCreate()

Reading the data

In [4]:
# Load the training set: the first CSV line provides the column names and
# Spark infers each column's type from the data.
train_df = spark.read.csv(
    './input/titanic/train.csv',  # for production: '../input/titanic/train.csv'
    header=True,
    inferSchema=True,
)

Cleaning the data

In [5]:
# Keep only the three model features plus the label column.
kept_columns = ["Sex", "Age", "Pclass", "Survived"]
train_df = train_df.select(*kept_columns)

In [6]:
# Drop passengers whose Age is null.
train_df = train_df.filter(F.col("Age").isNotNull())

In [7]:
# Drop passengers whose Sex is null.
train_df = train_df.filter(F.col("Sex").isNotNull())

In [8]:
# Drop rows without a label (Survived is null).
train_df = train_df.filter(F.col("Survived").isNotNull())

In [9]:
# Encode Sex numerically: "male" becomes 0.0, every other value becomes 1.0.
is_male = F.col("Sex") == "male"
sex_encoded = F.when(is_male, 0.).otherwise(1.)
train_df = train_df.withColumn("Sex", sex_encoded)

In [10]:
# Bring the cleaned rows to the driver with a single collect(). Collecting the
# features and the labels in two separate actions would run the Spark pipeline
# twice and pair them up only by assuming both jobs return rows in the same
# order; splitting one collected result keeps each label with its own row.
collected_rows = train_df.select("Sex", "Age", "Pclass", "Survived").collect()
x_train = [[row["Sex"], row["Age"], row["Pclass"]] for row in collected_rows]
y_train = [row["Survived"] for row in collected_rows]


In [11]:
# The Spark DataFrame reference is no longer needed once the rows are collected.
del train_df

In [12]:
# Normalization: Sex is kept as-is, Age becomes log(1 + Age) / log(100)
# (so an age of 99 maps to 1.0), and Pclass becomes (Pclass - 1) / 2.
log_age_scale = np.log(100)
x_train = np.array(
    [
        [sex, np.log(1 + age) / log_age_scale, (pclass - 1) / 2]
        for sex, age, pclass in x_train
    ],
    dtype="float32",
)

In [13]:
# One-hot encode the labels into two columns, one per class.
y_train = to_categorical(y_train, num_classes=2)

In [14]:
# Hold out 30% of the samples for evaluation; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(x_train, y_train, test_size=0.30, random_state=40)
x_train, x_test, y_train, y_test = split_parts

Training

In [15]:
# Small fully connected classifier: 3 input features -> 16 -> 12 -> 2 softmax
# outputs (one per class).
model = Sequential([
    Dense(16, input_dim=3, activation='relu'),
    Dense(12, activation='relu'),
    Dense(2, activation='softmax'),
])

In [16]:
# The network ends in a 2-unit softmax and is trained on one-hot labels, so
# categorical cross-entropy is the matching loss. With exactly two classes its
# value coincides with the binary form, but the categorical loss states the
# intent and makes the 'accuracy' metric resolve to categorical accuracy.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [17]:
# Train for 12 epochs in mini-batches of 5, reshuffling every epoch;
# verbose=2 prints one summary line per epoch.
fit_options = dict(batch_size=5, epochs=12, shuffle=True, verbose=2)
model.fit(x_train, y_train, **fit_options)

Epoch 1/12
100/100 - 0s - loss: 0.6331 - accuracy: 0.6733 - 398ms/epoch - 4ms/step
Epoch 2/12
100/100 - 0s - loss: 0.5664 - accuracy: 0.7776 - 61ms/epoch - 609us/step
Epoch 3/12
100/100 - 0s - loss: 0.5214 - accuracy: 0.8056 - 75ms/epoch - 754us/step
Epoch 4/12
100/100 - 0s - loss: 0.4895 - accuracy: 0.8076 - 73ms/epoch - 731us/step
Epoch 5/12
100/100 - 0s - loss: 0.4705 - accuracy: 0.7936 - 74ms/epoch - 737us/step
Epoch 6/12
100/100 - 0s - loss: 0.4596 - accuracy: 0.7916 - 77ms/epoch - 766us/step
Epoch 7/12
100/100 - 0s - loss: 0.4498 - accuracy: 0.7936 - 101ms/epoch - 1ms/step
Epoch 8/12
100/100 - 0s - loss: 0.4482 - accuracy: 0.7936 - 79ms/epoch - 790us/step
Epoch 9/12
100/100 - 0s - loss: 0.4438 - accuracy: 0.7896 - 76ms/epoch - 757us/step
Epoch 10/12
100/100 - 0s - loss: 0.4399 - accuracy: 0.7896 - 72ms/epoch - 715us/step
Epoch 11/12
100/100 - 0s - loss: 0.4386 - accuracy: 0.7956 - 71ms/epoch - 710us/step
Epoch 12/12
100/100 - 0s - loss: 0.4348 - accuracy: 0.7936 - 74ms/epoch - 74

<keras.callbacks.History at 0x7fed48208210>

In [18]:
# Score the held-out split; the result lists the loss followed by the
# accuracy metric requested at compile time.
model.evaluate(x=x_test, y=y_test, batch_size=20)



[0.4808778762817383, 0.7627906799316406]

In [17]:
# Training arrays are not used after fitting; drop the references.
del x_train, y_train

In [None]:
# Evaluation arrays are not used after scoring; drop the references.
del x_test, y_test

Prediction

In [18]:
def prediction(row):
    """Predict the survival class of one passenger row with the trained model.

    The features are encoded the same way as the training data: Sex as 0.0
    for 'male' and 1.0 for 'female', Age as log(1 + Age) / log(100), and
    Pclass as (Pclass - 1) / 2. Missing values are replaced by a random draw,
    so the result is not deterministic for rows that have gaps.

    Args:
        row: An object exposing ``Sex``, ``Age`` and ``Pclass`` attributes
            (for example a Spark ``Row``). ``Age`` and ``Pclass`` may be None.

    Returns:
        int: Index of the highest-scoring model output.
    """
    if row.Sex == 'female':
        sex = 1.
    elif row.Sex == 'male':
        sex = 0.
    else:
        # Missing or unrecognised value: pick one of the two codes at random.
        sex = randint(0, 1) * 1.

    age = row.Age
    if age is None:  # identity test; `== None` depends on the operand's __eq__
        age = randint(0, 100)
    age = np.log(1 + age) / np.log(100)

    pclass = row.Pclass
    if pclass is None:
        pclass = randint(1, 3)
    pclass = (pclass - 1) / 2

    # Hand Keras an explicit batch of shape (1, 3) in the training dtype; a
    # plain list of tuples leaves the interpretation of the nesting to Keras.
    features = np.array([[sex, age, pclass]], dtype="float32")
    # verbose=0 keeps predict() from printing progress for every single row.
    scores = model.predict(features, verbose=0)[0]
    return int(np.argmax(scores))

In [19]:
# Load the test set: the first CSV line provides the column names and Spark
# infers each column's type from the data.
test_df = spark.read.csv(
    './input/titanic/test.csv',  # for production: '../input/titanic/test.csv'
    header=True,
    inferSchema=True,
)

In [20]:
# Start from an empty DataFrame that already has the submission layout:
# two nullable integer columns, PassengerId and Survived.
submission_schema = (
    StructType()
    .add('PassengerId', IntegerType(), True)
    .add('Survived', IntegerType(), True)
)
no_rows = spark.sparkContext.parallelize([])
output_df = spark.createDataFrame(no_rows, submission_schema)

In [21]:
# Score every test passenger on the driver, then append all results in one
# step. Building a one-row DataFrame per passenger and chaining union() calls
# creates a separate RDD and schema inference for each row and a query plan
# that grows with every passenger.
predicted_rows = [(row.PassengerId, prediction(row)) for row in test_df.collect()]
output_df = output_df.union(
    spark.createDataFrame(predicted_rows, schema=output_df.schema)
)

Output

In [None]:
# Convert the predictions to pandas on the driver and write the submission
# file without the pandas index column.
submission_pdf = output_df.toPandas()
submission_pdf.to_csv('./submission.csv', index=False)