In [112]:
import pandas as pd
import os
import numpy as np

# Load the data

In [113]:
a = os.path.join("..", "data", "Speed Dating Data.csv")

df = pd.read_csv(a, encoding='ISO-8859-1')



In [114]:
# iid: 	unique subject number, group(wave id gender)

people_participated = len(df['iid'].unique())
dates_occured = len(df.index)

print(str(people_participated), "people have participated\n" + str(dates_occured), "dates occured")

551 people have participated
8378 dates occured


# missing values

In [115]:
missing = pd.DataFrame(df.isnull().sum() / len(df), columns=['missing_fraction'])
missing = missing.sort_values(by='missing_fraction', ascending=True)

with(open(os.path.join("..", "info", "missing_values.txt"), "w")) as f:
    f.write(missing.to_string())



In [116]:
# cannot guess: id, pid
df_changed = df.dropna(subset=['id', 'pid']).copy()

# replace missing field of study to other (18)
df_changed.loc[:, 'field'] = df_changed['field'].fillna(18)



# races
def random_else_rase(row):
    possible_races = [r for r in range(1, 7) if r != row['race']]
    return np.random.choice(possible_races)


df_changed.loc[:, 'race'] = df_changed['race'].fillna(6)

# where race_o (race of a partner) is NaN
mask_nan = df_changed['race_o'].isna()

# samerace == True -> race_o = race
df_changed.loc[mask_nan & (df_changed['samerace'] == True), 'race_o'] = df_changed.loc[mask_nan & (df_changed['samerace'] == True), 'race']

# samerace == False -> random race != race
df_changed.loc[mask_nan & (df_changed['samerace'] == False), 'race_o'] = df_changed.loc[mask_nan & (df_changed['samerace'] == False)].apply(random_else_rase, axis=1)

In [117]:
len(df_changed.index)


8367

In [118]:
# those and _o are missing
attributes = ['like', 'fun', 'shar', 'attr', 'prob']

# copy
partner_df = df_changed[['iid', 'pid'] + attributes].copy()

# change pid with iid
partner_df = partner_df.rename(columns={'iid': 'pid', 'pid': 'iid'})
partner_df.columns = ['iid', 'pid'] + [f"{attr}_o_fill" for attr in attributes]

# merge
merged = df_changed.merge(partner_df, on=['iid', 'pid'], how='left')

# fill
for attr in attributes:
    merged[f"{attr}_o"] = merged[f"{attr}_o"].fillna(merged[f"{attr}_o_fill"])

# delete
merged.drop(columns=[f"{attr}_o_fill" for attr in attributes], inplace=True)

# save
df_changed = merged


In [119]:
# drop those which are missing
df_changed = df_changed.dropna(subset=['like', 'like_o', 'fun', 'fun_o', 'shar', 'shar_o', 'attr', 'attr_o', 'prob', 'prob_o'])

df_to_split = df_changed.copy()

In [120]:
len(df_to_split.index)


7165

In [121]:
missing = pd.DataFrame(df_to_split.isnull().sum() / len(df), columns=['missing_fraction'])
missing = missing.sort_values(by='missing_fraction', ascending=True)

with(open(os.path.join("..", "info", "changed_missing_values.txt"), "w")) as f:
    f.write(missing.to_string())

In [122]:
from sklearn.model_selection import train_test_split


df_to_split['pair_key'] = df_to_split.apply(lambda row: frozenset([int(row['iid']), int(row['pid'])]), axis=1)
unique_pairs = df_to_split['pair_key'].drop_duplicates().tolist()

train_pairs, test_pairs = train_test_split(unique_pairs, test_size=0.2, random_state=42)

train_pairs = set(train_pairs)
test_pairs = set(test_pairs)

train_df = df_to_split[df_to_split['pair_key'].isin(train_pairs)].drop(columns=['pair_key'])
test_df = df_to_split[df_to_split['pair_key'].isin(test_pairs)].drop(columns=['pair_key'])

print("Train size:", len(train_df))
print("Test size:", len(test_df))


Train size: 5731
Test size: 1434
