In [195]:
import pandas as pd
import os
import numpy as np

# Load the data

In [196]:
a = os.path.join("..", "data", "Speed Dating Data.csv")

df = pd.read_csv(a, encoding='ISO-8859-1')



In [197]:
# iid: 	unique subject number, group(wave id gender)

people_participated = len(df['iid'].unique())
dates_occured = len(df.index)

print(str(people_participated), "people have participated\n" + str(dates_occured), "dates occured")

551 people have participated
8378 dates occured


# missing values

In [198]:
missing = pd.DataFrame(df.isnull().sum() / len(df), columns=['missing_fraction'])
missing = missing.sort_values(by='missing_fraction', ascending=True)

with(open(os.path.join("..", "info", "missing_values.txt"), "w")) as f:
    f.write(missing.to_string())



## Race and PID IID

In [199]:
# cannot guess: id, pid
df_changed = df.dropna(subset=['id', 'pid']).copy()


# races
def random_else_rase(row):
    possible_races = [r for r in range(1, 7) if r != row['race']]
    return np.random.choice(possible_races)


df_changed.loc[:, 'race'] = df_changed['race'].fillna(6)

# where race_o (race of a partner) is NaN
mask_nan = df_changed['race_o'].isna()

# samerace == True -> race_o = race
df_changed.loc[mask_nan & (df_changed['samerace'] == True), 'race_o'] = df_changed.loc[mask_nan & (df_changed['samerace'] == True), 'race']

# samerace == False -> random race != race
df_changed.loc[mask_nan & (df_changed['samerace'] == False), 'race_o'] = df_changed.loc[mask_nan & (df_changed['samerace'] == False)].apply(random_else_rase, axis=1)

In [200]:
len(df_changed.index)


8367

## attributes and *_o

In [201]:
# those and _o are missing
attributes = ['like', 'fun', 'shar', 'attr', 'prob', 'intel', 'sinc', 'amb', 'age', 'met']

# copy
partner_df = df_changed[['iid', 'pid'] + attributes].copy()

# change pid with iid
partner_df = partner_df.rename(columns={'iid': 'pid', 'pid': 'iid'})
partner_df.columns = ['iid', 'pid'] + [f"{attr}_o_fill" for attr in attributes]

# merge
merged = df_changed.merge(partner_df, on=['iid', 'pid'], how='left')

# fill
for attr in attributes:
    merged[f"{attr}_o"] = merged[f"{attr}_o"].fillna(merged[f"{attr}_o_fill"])

# delete
merged.drop(columns=[f"{attr}_o_fill" for attr in attributes], inplace=True)

# save
df_changed = merged


In [202]:
# drop those which are missing
df_changed = df_changed.dropna(subset=['like', 'like_o', 'fun', 'fun_o', 'shar', 'shar_o',
                                       'attr', 'attr_o', 'prob', 'prob_o', 'intel', 'intel_o',
                                       'sinc', 'sinc_o', 'amb', 'amb_o', 'age', 'age_o', 'met', 'met_o'])


In [203]:
len(df_changed.index)


6734

## Attributes scale 1-10

In [204]:
# How often do you go out (not necessarily on dates)?
df_changed.loc[:, 'go_out'] = df_changed['go_out'].fillna(0) # no answear

#How important is it to you that a person you date be of the same racial/ethnic background?
df_changed.loc[:, 'imprace'] = df_changed['imprace'].fillna(0) # no answear

#How important is it to you (on a scale of 1-10) that a person you date be of the same religious background?
df_changed.loc[:, 'imprelig'] = df_changed['imprelig'].fillna(0) # no answear

# Interests
intreset_columns = ['sports', 'tvsports', 'exercise', 'dining', 'museums',
'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies',
'concerts', 'music', 'shopping', 'yoga']
df_changed.loc[:, intreset_columns] = df_changed.loc[:, intreset_columns].fillna(0)


#Overall, on a scale of 1-10, how happy do you expect to be with the people you meet during the speed-dating event?
df_changed.loc[:, 'exphappy'] = df_changed['exphappy'].fillna(0) # no answear

# Out of the 20 people you will meet, how many do you expect will be interested in dating you?
df_changed.loc[:, 'expnum'] = df_changed['expnum'].fillna(0) # no answear

#Overall, how satisfied were you with the people you met? (1=not at all satisfied, 10=extremely satisfied)
df_changed.loc[:, 'satis_2'] = df_changed['satis_2'].fillna(0) # no answear



# What do you look for in the opposite sex?
## Subjects filled out 3-4 weeks after they had been sent their matches
## 6 attributes

In [205]:
columns_1_3 = ['attr1_3', 'sinc1_3', 'intel1_3', 'fun1_3', 'amb1_3', 'shar1_3']

# Now, think back to your yes/no decisions during the night of the Speed Dating event.
columns_7_3 = ['attr7_3', 'sinc7_3', 'intel7_3', 'fun7_3', 'amb7_3', 'shar7_3']

#Now we want to know what you think MOST of your fellow men/women look for in the opposite sex. (1-10
columns_4_3 = ['attr4_3', 'sinc4_3', 'intel4_3', 'fun4_3', 'amb4_3', 'shar4_3']

#What do you think the opposite sex looks for in a date? (1-10)
columns_2_3 = ['attr2_3', 'sinc2_3', 'intel2_3', 'fun2_3', 'amb2_3', 'shar2_3']

# Please rate your opinion of your own attributes, on a scale of 1-10 (1= awful and 10=great).
columns_3_3 = ['attr3_3', 'sinc3_3', 'intel3_3', 'fun3_3', 'amb3_3'] # !!! NO SHAR

#And finally, how do you think others perceive you?
columns_5_3 = ['attr5_3', 'sinc5_3', 'intel5_3', 'fun5_3', 'amb5_3'] # !!! NO SHAR

df_changed.loc[:, columns_1_3] = df_changed.loc[:, columns_1_3].fillna(0)
df_changed.loc[:, columns_7_3] = df_changed.loc[:, columns_7_3].fillna(0)
df_changed.loc[:, columns_4_3] = df_changed.loc[:, columns_4_3].fillna(0)
df_changed.loc[:, columns_2_3] = df_changed.loc[:, columns_2_3].fillna(0)
df_changed.loc[:, columns_3_3] = df_changed.loc[:, columns_3_3].fillna(0)
df_changed.loc[:, columns_5_3] = df_changed.loc[:, columns_5_3].fillna(0)



## Survey filled out by students that are interested in participating in order to register for the event.
## 6 Attributes

In [206]:
# We want to know what you look for in the opposite sex.
columns_1_1 = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']

# Now we want to know what you think MOST of your fellow men/women look for in the opposite sex.
columns4_1 = ['attr4_1', 'sinc4_1', 'intel4_1', 'fun4_1', 'amb4_1', 'shar4_1']

# What do you think the opposite sex looks for in a date?
columns2_1 = ['attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1']

#How do you think you measure up?
columns3_1 = ['attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1']

#And finally, how do you think others perceive you?
columns5_1 = ['attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1']

df_changed.loc[:, columns_1_1] = df_changed.loc[:, columns_1_1].fillna(0)
df_changed.loc[:, columns4_1] = df_changed.loc[:, columns4_1].fillna(0)
df_changed.loc[:, columns2_1] = df_changed.loc[:, columns2_1].fillna(0)
df_changed.loc[:, columns3_1] = df_changed.loc[:, columns3_1].fillna(0)
df_changed.loc[:, columns5_1] = df_changed.loc[:, columns5_1].fillna(0)



## Survey is filled out the day after participating in the event.  Subjects must have submitted this in order to be sent their matches.
## 6 attributes

In [207]:
#Now, think back to your yes/no decisions during the Speed Dating event.
# Try to distribute the 100 points among these six attributes in the way that best reflects the actual importance of
# these attributes inyour decisions
columns_7_2 = ['attr7_2', 'sinc7_2', 'intel7_2', 'fun7_2', 'amb7_2', 'shar7_2']

# We want to know what you look for in the opposite sex.
columns_1_2 = ['attr1_2', 'sinc1_2', 'intel1_2', 'fun1_2', 'amb1_2', 'shar1_2']

# What do you think MOST of your fellow men/women look for in the opposite sex?
columns_4_2 = ['attr4_2', 'sinc4_2', 'intel4_2', 'fun4_2', 'amb4_2', 'shar4_2']

# What do you think the opposite sex looks for in a date?
columns_2_2 = ['attr2_2', 'sinc2_2', 'intel2_2', 'fun2_2', 'amb2_2', 'shar2_2']

# How do you think you measure up?
columns_3_2 = ['attr3_2', 'sinc3_2', 'intel3_2', 'fun3_2', 'amb3_2']

# And finally, how do you think others perceive you?
columns_5_2 = ['attr5_2', 'sinc5_2', 'intel5_2', 'fun5_2', 'amb5_2']

df_changed.loc[:, columns_7_2] = df_changed.loc[:, columns_7_2].fillna(0)
df_changed.loc[:, columns_1_2] = df_changed.loc[:, columns_1_2].fillna(0)
df_changed.loc[:, columns_4_2] = df_changed.loc[:, columns_4_2].fillna(0)
df_changed.loc[:, columns_2_2] = df_changed.loc[:, columns_2_2].fillna(0)
df_changed.loc[:, columns_3_2] = df_changed.loc[:, columns_3_2].fillna(0)
df_changed.loc[:, columns_5_2] = df_changed.loc[:, columns_5_2].fillna(0)


## Half way through meeting all potential dates during the night of the event on their scorecard:
## 6 Attributes

In [208]:
#Please rate the importance of the following attributes in a potential date on a scale of 1-10
columns_1_s = ['attr1_s', 'sinc1_s', 'intel1_s', 'fun1_s', 'amb1_s', 'shar1_s']
#Please rate your opinion of your own attributes, on a scale of 1-10 (1=awful, 10=great) --Be honest!
columns_3_s = ['attr3_s', 'sinc3_s', 'intel3_s', 'fun3_s', 'amb3_s']

df_changed.loc[:, columns_1_s] = df_changed.loc[:, columns_1_s].fillna(0)
df_changed.loc[:, columns_3_s] = df_changed.loc[:, columns_3_s].fillna(0)


## pf_o_* attributes

In [209]:
partner_pref_cols = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']
partner_pref_targets = ['pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha']

partner_pref_df = df_changed[['iid', 'pid'] + partner_pref_cols].copy()

partner_pref_df = partner_pref_df.rename(columns={'iid': 'pid', 'pid': 'iid'})
partner_pref_df.columns = ['iid', 'pid'] + [f"{col}_from_partner" for col in partner_pref_cols]

df_changed = df_changed.merge(partner_pref_df, on=['iid', 'pid'], how='left')

for source_col, target_col in zip(partner_pref_cols, partner_pref_targets):
    df_changed[target_col] = df_changed[target_col].fillna(df_changed[f"{source_col}_from_partner"])

df_changed.drop(columns=[f"{col}_from_partner" for col in partner_pref_cols], inplace=True)


# Other attributes

In [210]:

# What is your primary goal in participating in this event?
df_changed.loc[:, 'goal'] = df_changed['goal'].fillna(8) # other goal

# In general, how frequently do you go on dates?
df_changed.loc[:, 'date'] = df_changed['date'].fillna(0) # no answear

df_changed = df_changed.drop(columns=['career']) # we do not need text

df_changed.loc[:, 'career_c'] = df_changed['career_c'].fillna(15) # other


# Four minutes is:
df_changed.loc[:, 'length'] = df_changed['length'].fillna(0) # no answear

# The number of Speed "Dates" you had was:
df_changed.loc[:, 'numdat_2'] = df_changed['numdat_2'].fillna(0) # no answear


# How many have you contacted to set up a date??
df_changed.loc[:, 'you_call'] = df_changed['you_call'].fillna(0) # no answear
# How many have contacted you?
df_changed.loc[:, 'them_cal'] = df_changed['them_cal'].fillna(0) # no answear


# Have you been on a date with any of your matches?
df_changed.loc[:, 'date_3'] = df_changed['date_3'].fillna(0) # no answear

#How many of your matches have you been on a date with so far?
df_changed.loc[:, 'numdat_3'] = df_changed['numdat_3'].fillna(0) # no answear
df_changed.loc[:, 'num_in_3'] = df_changed['num_in_3'].fillna(0) # no answear


# correlation between participant’s and partner’s ratings of interests in Time 1
df_changed.loc[:, 'int_corr'] = df_changed['int_corr'].fillna(0) # no answear

#How many matches do you estimate you will get (a match occurs when you and your partner both check “Yes” next to decision)?
df_changed.loc[:, 'match_es'] = df_changed['match_es'].fillna(0) # no answear


# DUNNO

In [211]:
# replace missing field of study to other (18)
df_changed = df_changed.drop(columns=['field']) # we do not need text
df_changed.loc[:, 'field_cd'] = df_changed['field_cd'].fillna(18)
#TODO nie wiem czy mozna zastąpić nieuzupełnione field_cd jako other? być może ktoś nie zaznaczył bo nie jest na uczelni?
#TODO nie wiem co to znaczy undergrd (podejrzewam, że to dla osób ze szkół średnich?)
#TODO cos trzeba zrobic z mn_sat
#TODO cos trzeba zrobic z tuition

df_changed.loc[:, 'mn_sat'] = df_changed['mn_sat'].fillna(0)
df_changed.loc[:, 'tuition'] = df_changed['tuition'].fillna(0)
#mam przeczucie ze to wszystko ma ze sobą związek ale nie wiem jak to połączyć


#TODO
'''
from              0.001790
zipcode           0.096324
positin1          0.179279
undergra          0.331463
income            0.386608
'''


'\nfrom              0.001790\nzipcode           0.096324\npositin1          0.179279\nundergra          0.331463\nincome            0.386608\n'

## check missing values once again

In [212]:
df_to_split = df_changed.copy()

missing = pd.DataFrame(df_to_split.isnull().sum() / len(df), columns=['missing_fraction'])
missing = missing.sort_values(by='missing_fraction', ascending=True)

with(open(os.path.join("..", "info", "changed_missing_values.txt"), "w")) as f:
    f.write(missing.to_string())

# Test/train split

In [213]:
from sklearn.model_selection import train_test_split


df_to_split['pair_key'] = df_to_split.apply(lambda row: frozenset([int(row['iid']), int(row['pid'])]), axis=1)
unique_pairs = df_to_split['pair_key'].drop_duplicates().tolist()

train_pairs, test_pairs = train_test_split(unique_pairs, test_size=0.2, random_state=42)

train_pairs = set(train_pairs)
test_pairs = set(test_pairs)

train_df = df_to_split[df_to_split['pair_key'].isin(train_pairs)].drop(columns=['pair_key'])
test_df = df_to_split[df_to_split['pair_key'].isin(test_pairs)].drop(columns=['pair_key'])

print("Train size:", len(train_df))
print("Test size:", len(test_df))


Train size: 5387
Test size: 1347
