In [1]:
# !pip install neo4j

In [2]:
from neo4j import GraphDatabase
import pandas as pd
import numpy as np

##### Setup ODBC using neo4j

In [3]:
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "jhunaidneo4j")

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

##### Understand the schema

In [4]:
# get database schema
with driver.session() as session:
    # visualize database schema
    result = session.run("CALL db.schema.visualization")
    for record in result:
        print(record)

<Record nodes=[<Node element_id='-1' labels=frozenset({'Person'}) properties={'name': 'Person', 'indexes': [], 'constraints': []}>] relationships=[<Relationship element_id='-1' nodes=(<Node element_id='-1' labels=frozenset({'Person'}) properties={'name': 'Person', 'indexes': [], 'constraints': []}>, <Node element_id='-1' labels=frozenset({'Person'}) properties={'name': 'Person', 'indexes': [], 'constraints': []}>) type='Date' properties={}>]>


##### Determine the ratio of males to females

In [5]:
# determine the ratio of females to males
with driver.session() as session:
    # get count of genders in the database and divide by total
    db_query = """MATCH (n:Person) WHERE (n.gender) is not null RETURN n.gender as gender"""
    result = session.run(db_query)

    males = 0
    females = 0
    for record in result:
        # find the gender of the record
        gender = record.get('gender')
        if gender == 0:
            males += 1
        else:
            females += 1

# print the ratio of males to females
print(f"Males: {males}; Females: {females}")
ratio = males / females
print(ratio)
print(ratio.as_integer_ratio())

Males: 268; Females: 274
0.9781021897810219
(8809961314856153, 9007199254740992)


It is almost a 1:1 set of males to females, so the dataset is balanced in regards to gender.

##### Determine the percentage of matches

In [6]:
with driver.session() as session:
    db_query = "MATCH p=()-[r:Date]->() RETURN p LIMIT 25"
    result = session.run(db_query)
    matches_found = 0
    for record in result:
        # get the match
        p = record.get('p')
        p = p.graph
        # get the matches
        matches = [r['match'] for r in p.relationships if r.type == 'Date']
        # if there are any matches, then the two nodes are connected
        if any(matches):
            matches_found += 1
    # print the % of matches found
    print(matches_found/len(matches))

0.96


According to the data, there seems to be 96% of dates that were matches.

##### Reasoning for selected applicable columns towards supervised learning

I think the columns I might use to identify a match would be: id, gender, race, age, age_diff, and match. These few columns should be sufficient. The relevance that id provides is the one to many aspect in which a single participant may go on multiple speed dates. Gender, race, age, and age_diff should help paint a picture of the demographics of the speed daters and which combinations of demographics led to success, which is indicated by the match column.

##### Query the relevant data according to the selected columns

In [7]:
db_query = "MATCH p=()-[r:Date]->() RETURN p LIMIT 25"
data = {}

def unpack_df(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Unpacks a column in a dataframe into its own columns"""
    temp_df = df[col].apply(pd.Series)
    if col == 'src_speed_dater':
        temp_col = 'src_'
    elif col == 'trg_speed_dater':
        temp_col = 'trg_'
    else:
        temp_col = ''
    # use the first element of the tuple as the column name
    temp_df.columns = [temp_col + str(tup[0]) for tup in temp_df.iloc[0]]
    # use the second element of the tuple as the value for each row
    temp_df = temp_df.applymap(lambda x: x[1])
    # drop the original column
    df = df.drop(col, axis=1)
    # join the two dataframes
    df = df.join(temp_df)

    return df

with driver.session() as session:
    result = session.run(db_query)
    records = [record for record in result]
    for record in records:
        g = record.get('p').graph
        rels = [rel for rel in g.relationships if rel.type == 'Date']
        # get start node id, gender, race, age
        for rel in rels:
            data[rel.element_id] = {}
            data[rel.element_id]['src_speed_dater'] = rel.start_node.items()
            data[rel.element_id]['trg_speed_dater'] = rel.end_node.items()
            data[rel.element_id]['result'] = rel.items()

        
    # create a pandas dataframe to store the id, gender, race, age, age_diff, and match columns
    df = pd.DataFrame.from_dict(data, orient='index')
    # unpack the columns into their own columns and drop the original columns from the dataframe, making the first element of the tuple the column name and the second element the value
    df = unpack_df(df, 'src_speed_dater')
    df = unpack_df(df, 'trg_speed_dater')
    df = unpack_df(df, 'result')

    # reindex the dataframe
    df = df.reset_index().drop('index', axis=1)

    # drop the race_diff and int_corr columns
    df.drop(['race_diff','int_corr'], axis=1, inplace=True)

In [8]:
df.head()

Unnamed: 0,src_gender,src_race,src_id,src_age,trg_gender,trg_race,trg_id,trg_age,age_diff,match
0,0,4,1,21,1,2,20,24,3,0
1,0,4,1,21,1,2,19,28,7,1
2,0,4,1,21,1,2,18,27,6,0
3,0,4,1,21,1,2,17,30,9,0
4,0,4,1,21,1,2,16,25,4,0


##### Create a supervised model to predict a match

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [10]:
# split the data into training and testing sets
X = df.drop('match', axis=1)
y = df['match']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# instantiate the model
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
clf = rfc.fit(X_train, y_train)
preds = clf.predict(X_test)

In [12]:
# print the accuracy score
acc = accuracy_score(y_test, preds)
print(acc)

0.2
