# Graph Data Science with the Titanic Dataset

You have probably heard about Titanic and you may be aware of the Titanic dataset. The purpose of this notebook is to explain you the 'graph data science' using it.   
Please use this notebook in accordance with the slides, where the graph concepts are explained in relation to the Titanic dataset

### Import Libraries

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import nxneo4j as nx
import numpy as np
from neo4j import GraphDatabase

### Create Graph

In [6]:
#NODES
driver = GraphDatabase.driver(uri="bolt://localhost:11004",auth=("neo4j","neo"))
G = nx.DiGraph(driver)

G.delete_all()
G.identifier_property = 'PassengerId'
G.node_label = 'Passenger'

nodes = pd.read_csv('/Users/ybaktir/Desktop/titanic/nodes.csv',index_col='PassengerId')
nodes.loc[nodes.index > 892,'Survived'] = np.nan


G.add_nodes_from([(str(k), v) for k,v in nodes.to_dict('index').items()])

In [7]:
#RELATIONSHIPS
relationships = pd.read_csv('/Users/ybaktir/Desktop/titanic/relationships.csv')
for i in range(len(relationships)):
    type = str(relationships.loc[i]['type'])
    if type == 'nan':
        G.relationship_type = 'CONNECTED'
    else:
        G.relationship_type = type
    G.add_edge(str(relationships.loc[i]['u_id']),str(relationships.loc[i]['v_id']))

Display the graph:

In [8]:
nx.draw(G,limit=25)

### Encode categorical features

In [9]:
nodes = pd.read_csv('/Users/ybaktir/Desktop/titanic/titanic.csv')

In [10]:
nodes['Pclass3']= (nodes['Pclass'] == 3) * 1
nodes['Pclass2']= (nodes['Pclass'] == 2) * 1
nodes['Pclass1']= (nodes['Pclass'] == 1) * 1

In [11]:
nodes.Sex.replace({'female':1,'male':0},inplace=True)

In [12]:
nodes['Age'].fillna(nodes['Age'].median(),inplace=True)

In [13]:
nodes['Fare'].fillna(nodes['Fare'].median(),inplace=True)

In [14]:
nodes['Embarked'].fillna('S',inplace=True)

In [15]:
nodes['Embarked_Q'] = (nodes['Embarked'] == 'Q') * 1
nodes['Embarked_C'] = (nodes['Embarked'] == 'C') * 1
nodes['Embarked_S'] = (nodes['Embarked'] == 'S') * 1

In [16]:
remove_list = ['PassengerId', 'Pclass', 'Name', 'Ticket', 'Cabin', 'Embarked']

In [17]:
train_cols = [i for i in nodes.columns if i not in remove_list]

In [18]:
train_cols

['Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Survived',
 'Pclass3',
 'Pclass2',
 'Pclass1',
 'Embarked_Q',
 'Embarked_C',
 'Embarked_S']

In [19]:
### Train, Test sets (in accordance with Kaggle)

In [20]:
train = nodes[nodes['PassengerId'] <= 892][train_cols]
test  = nodes[nodes['PassengerId'] > 892][train_cols]

In [21]:
y_train = train.pop('Survived')
y_test  = test.pop('Survived')

In [22]:
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [23]:
accuracy_score(clf.predict(test),y_test)

0.7649880095923262

In [24]:
print('Feature Importances')
pd.DataFrame(zip(train.columns,clf.feature_importances_))

Feature Importances


Unnamed: 0,0,1
0,Sex,0.466557
1,Age,0.069164
2,SibSp,0.037545
3,Parch,0.03079
4,Fare,0.167964
5,Pclass3,0.108627
6,Pclass2,0.021526
7,Pclass1,0.057441
8,Embarked_Q,0.007661
9,Embarked_C,0.018306


# GRAPH EXTENDED FEATURES

In [25]:
from neo4j import GraphDatabase
driver = GraphDatabase.driver(uri="bolt://localhost:11004",auth=("neo4j","neo"))

Connection Size:

In [26]:
query = """
MATCH (n1)
OPTIONAL MATCH (n1)-[]->(n2)
RETURN toInteger(n1.PassengerId) as PassengerId, count(n2) as connection_size"""
connection_size = driver.session().run(query).data()

Average Survived:

In [27]:
query = f"""
MATCH (n1)
OPTIONAL MATCH (n1)-[]->(n2)
where TOFLOAT(n2.Survived) = n2.Survived
RETURN distinct toInteger(n1.PassengerId) as PassengerId, AVG(n2.Survived) as avg_survived
"""
avg_survived = driver.session().run(query).data()

Mother Survived:

In [28]:
query = f"""
MATCH (n1)
OPTIONAL MATCH (n1)-[:MOTHER]->(n2)
where TOFLOAT(n2.Survived) = n2.Survived
RETURN distinct toInteger(n1.PassengerId) as PassengerId, AVG(n2.Survived) as mother_survived
"""
mother_survived = driver.session().run(query).data()

Father Survied:

In [29]:
query = f"""
MATCH (n1)
OPTIONAL MATCH (n1)-[:FATHER]->(n2)
where TOFLOAT(n2.Survived) = n2.Survived
RETURN distinct toInteger(n1.PassengerId) as PassengerId, AVG(n2.Survived) as father_survived
"""
father_survived = driver.session().run(query).data()

Same Gender Survived:

In [30]:
query = """
MATCH (n1)
OPTIONAL MATCH (n1)-[]->(n2)
where (TOFLOAT(n2.Survived) = n2.Survived) and (n1.Sex = n2.Sex)
RETURN distinct toInteger(n1.PassengerId) as PassengerId, AVG(n2.Survived) as same_gender_survived
"""
same_gender_survived = driver.session().run(query).data()

Similar Age Survived:

In [31]:
query = """
MATCH (n1)
OPTIONAL MATCH (n1)-[]->(n2)
where (TOFLOAT(n2.Survived) = n2.Survived) and (n2.Age >= n1.Age -5) and (n2.Age <= n1.Age +5)
RETURN distinct toInteger(n1.PassengerId) as PassengerId, AVG(n2.Survived) as similar_age_survived
"""
similar_age_survived = driver.session().run(query).data()

Father:

In [32]:
query = f"""

MATCH ()-[:FATHER]->(n2)
RETURN distinct toInteger(n2.PassengerId) as PassengerId, 1 as father
"""
father = driver.session().run(query).data()

Mother:

In [33]:
query = f"""
MATCH ()-[:MOTHER]->(n2)
RETURN distinct toInteger(n2.PassengerId) as PassengerId, 1 as mother
"""
mother = driver.session().run(query).data()

Sister:

In [34]:
query = f"""
MATCH ()-[:SISTER]->(n2)
RETURN distinct toInteger(n2.PassengerId) as PassengerId, 1 as sister
"""
sister = driver.session().run(query).data()

Daughter:

In [35]:
query = f"""
MATCH ()-[:DAUGHTER]->(n2)
RETURN distinct toInteger(n2.PassengerId) as PassengerId, 1 as daughter
"""
daughter = driver.session().run(query).data()

Brother:

In [36]:
query = f"""
MATCH ()-[:BROTHER]->(n2)
RETURN distinct toInteger(n2.PassengerId) as PassengerId, 1 as brother
"""
brother = driver.session().run(query).data()

Wife:

In [37]:
query = f"""
MATCH ()-[:WIFE]->(n2)
RETURN distinct toInteger(n2.PassengerId) as PassengerId, 1 as wife
"""
wife = driver.session().run(query).data()

Husband:

In [38]:
query = f"""
MATCH ()-[:HUSBAND]->(n2)
RETURN distinct toInteger(n2.PassengerId) as PassengerId, 1 as husband
"""
husband = driver.session().run(query).data()

Son:

In [39]:
query = """
MATCH ()-[:SON]->(n2)
RETURN distinct toInteger(n2.PassengerId) as PassengerId, 1 as son
"""
son = driver.session().run(query).data()

Gds graph for Node Embeddings:

In [40]:
query = """CALL gds.graph.create(
  'persons',
  {
    Passenger: {
      label: 'Passenger',
      properties: {
        Age: {
          property: 'Age',
          defaultValue: 1.0
        },
        Sex: {
          property: 'Sex',
          defaultValue: 1.0
        },
        SibSp: {
         property: 'SibSp',
         defaultValue: 1.0
       },
        Parch: {
         property: 'Parch',
         defaultValue: 1.0
       },
        Fare: {
         property: 'Fare',
         defaultValue: 1.0
       },
        Pclass3: {
         property: 'Pclass3',
         defaultValue: 1.0
       },
        Pclass2: {
         property: 'Pclass2',
         defaultValue: 1.0
       },
        Pclass1: {
         property: 'Pclass1',
         defaultValue: 1.0
       },
        Embarked_Q: {
         property: 'Embarked_Q',
         defaultValue: 1.0
       },
        Embarked_C: {
         property: 'Embarked_C',
         defaultValue: 1.0
       },
        Embarked_S: {
         property: 'Embarked_S',
         defaultValue: 1.0
     }
     }
    }
  }, {
    R: {
      type: '*',
      orientation: 'UNDIRECTED'
    }
})"""
driver.session().run(query)

<neo4j.work.result.Result at 0x1a1d9840d0>

Node2vec:

In [41]:
query = """
CALL gds.alpha.node2vec.stream('persons', {embeddingSize:4,
    iterations:10})
YIELD nodeId, embedding
RETURN toInteger(gds.util.asNode(nodeId).PassengerId) AS PassengerId, embedding"""
embedding = driver.session().run(query).data()

length = len(embedding[0]['embedding']) 

columns = []
for i in range(length):
    columns.append('embedding'+str(i+1))

embedding_df = pd.DataFrame(pd.DataFrame(embedding)['embedding'].tolist(),columns=columns)

embedding_df['PassengerId'] = pd.DataFrame(embedding)['PassengerId']

GraphSage:

In [42]:
query = """CALL gds.alpha.graphSage.stream(
  'persons',
  {
    nodePropertyNames: ['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass3', 'Pclass2', 'Pclass1', 'Embarked_Q', 'Embarked_C', 'Embarked_S'],
    aggregator: 'mean',
    activationFunction: 'sigmoid',
    embeddingSize: 5,
    sampleSizes: [25, 10],
    degreeAsProperty: true,
    maxIterations:75
  }
)
YIELD nodeId, embeddings
RETURN toInteger(gds.util.asNode(nodeId).PassengerId) AS PassengerId, embeddings as graphsage"""
graphsage = driver.session().run(query).data()
length = len(graphsage[0]['graphsage']) 

columns = []
for i in range(length):
    columns.append('graphsage'+str(i+1))

graphsage_df = pd.DataFrame(pd.DataFrame(graphsage)['graphsage'].tolist(),columns=columns)

graphsage_df['PassengerId'] = pd.DataFrame(graphsage)['PassengerId']

In [43]:
for feature in [connection_size,avg_survived,mother_survived,father,mother,sister,daughter,brother,wife,husband,son,same_gender_survived,similar_age_survived]:
    nodes = nodes.merge(pd.DataFrame(feature),how='left')

In [44]:
nodes = nodes.merge(embedding_df,how='left')
nodes = nodes.merge(graphsage_df,how='left')

In [45]:
nodes.fillna(0,inplace=True)

In [46]:
remove_list = ['PassengerId', 'Pclass', 'Name', 'Ticket', 'Cabin', 'Embarked']

train_cols = [i for i in nodes.columns if i not in remove_list]

In [47]:
train = nodes[nodes['PassengerId'] <= 892][train_cols]
test  = nodes[nodes['PassengerId'] > 892][train_cols]

In [48]:
y_train = train.pop('Survived')
y_test  = test.pop('Survived')

In [49]:
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [50]:
from sklearn.metrics import accuracy_score

accuracy_score(clf.predict(test),y_test)

0.7841726618705036

In [51]:
from sklearn.feature_selection import RFE

In [52]:
estimator = RandomForestClassifier(max_depth=4, random_state=0)

In [53]:
selector = RFE(estimator, n_features_to_select=10, step=1)

In [54]:
selector = selector.fit(train, y_train)

In [55]:
selector_df = pd.DataFrame(zip(train.columns,selector.support_,selector.ranking_))

In [56]:
train_cols = selector_df[selector_df[1]][0].tolist() + ['Survived']

In [57]:
train = nodes[nodes['PassengerId'] <= 892][train_cols]
test  = nodes[nodes['PassengerId'] > 892][train_cols]

y_train = train.pop('Survived')
y_test  = test.pop('Survived')

clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(train, y_train)

from sklearn.metrics import accuracy_score

accuracy_score(clf.predict(test),y_test)

0.8009592326139089

In [58]:
pd.DataFrame(zip(train.columns,clf.feature_importances_))

Unnamed: 0,0,1
0,Sex,0.319473
1,Age,0.038091
2,Fare,0.058859
3,Pclass3,0.050317
4,connection_size,0.032635
5,avg_survived,0.119693
6,same_gender_survived,0.21879
7,graphsage2,0.05024
8,graphsage4,0.040064
9,graphsage5,0.071838
