In [1]:
import warnings

import numpy as np
import pandas as pd

import utility as utl

# Silence every warning for the rest of the session: no category or message
# filter is given, so nothing is exempt.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# read the OkCupid 2015 user table from disk and summarise its layout
profiles_path = '../source_data/okcupid_2015/user_data_public.csv'
profiles = pd.read_csv(profiles_path, sep=',', low_memory=False)
profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68371 entries, 0 to 68370
Columns: 2625 entries, q2 to CA_items
dtypes: float64(59), object(2566)
memory usage: 1.3+ GB


In [3]:
# entries that are exactly '-' are treated as missing values
df = profiles.replace('-', np.nan)

In [4]:
# consolidate "looking for people who want" values that are similar from the
# perspective of users looking for dates.
# this re-coding is an oversimplification for other purposes.
# labels that are already in their recoded form map to themselves so the cell
# can be re-executed on its own output.
want_recode = {
    np.nan: np.nan,
    'All who like bi men': 'All who like bi men',
    'All who like bi women': 'All who like bi women',
    'Bi men': 'Bi men',  # for code re-execution convenience
    'Bi men only': 'Bi men',  # does not occur in data set
    'Bi women': 'Bi women',  # for code re-execution convenience
    'Bi women only': 'Bi women',
    'Bi men and women': 'Bi men and women',
    'Everybody': 'Everyone',
    'Everyone': 'Everyone',
    'Gay men only': 'Men who like men',
    'Gay women only': 'Women who like women',
    'Men': 'Men',
    'Men who like men': 'Men who like men',
    'Men who like women': 'Men who like women',
    'Straight men only': 'Men who like women',
    'Straight women only': 'Women who like men',
    'Women': 'Women',
    'Women who like men': 'Women who like men',
    'Women who like women': 'Women who like women',
}

# A direct want_recode[x] lookup only finds the NaN key when x is the very
# same object as np.nan (NaN != NaN), so any other NaN float would raise
# KeyError. Series.map keeps missing answers as NaN without that dependency.
lf_want_raw = df['lf_want']
unmapped = set(lf_want_raw.dropna().unique()) - set(want_recode)
if unmapped:
    # Series.map would quietly turn unknown labels into NaN; keep failing
    # loudly with the exception type the direct dict lookup used to raise.
    raise KeyError(
        f'lf_want values missing from want_recode: {sorted(map(str, unmapped))}')
df['lf_want'] = lf_want_raw.map(want_recode)


def binary_coder(true_code):
    """Build an encoder that flags membership in ``true_code``.

    The returned callable hands a float NaN back unchanged; for any other
    argument it yields 1 when the argument is contained in ``true_code``
    and 0 otherwise.
    """
    def encode(value):
        is_missing = isinstance(value, float) and np.isnan(value)
        if is_missing:
            return value
        return int(value in true_code)

    return encode


# recoded lf_want labels that count towards each 0/1 indicator column
lf_men_set = [
    'All who like bi men',
    'All who like bi women',
    'Bi men',
    'Bi men and women',
    'Everyone',
    'Men',
    'Men who like men',
    'Men who like women',
]

lf_women_set = [
    'All who like bi men',
    'All who like bi women',
    'Bi women',
    'Bi men and women',
    'Everyone',
    'Women who like men',
    'Women who like women',
    'Women',
]

lf_who_like_men_set = [
    'All who like bi men',
    'Bi men',
    'Bi women',
    'Bi men and women',
    'Men who like men',
    'Women who like men',
]

lf_who_like_women_set = [
    'All who like bi women',
    'Bi men',
    'Bi women',
    'Bi men and women',
    'Men who like women',
    'Women who like women',
]

# one indicator column per label list; binary_coder leaves NaN untouched
for flag_column, accepted_wants in (
    ("lf_men", lf_men_set),
    ("lf_women", lf_women_set),
    ("lf_who_like_men", lf_who_like_men_set),
    ("lf_who_like_women", lf_who_like_women_set),
):
    df[flag_column] = df['lf_want'].apply(binary_coder(accepted_wants))

In [5]:
# code multi-label variables: pass each source column through
# utl.multi_dummies with its column prefix and append the results to df.
# the 'For  ' lead-in is stripped from lf_for before coding.
frames = [
    df,
    *(
        utl.multi_dummies(series, prefix)
        for series, prefix in (
            (df['d_ethnicity'], 'eth_'),
            (df['lf_for'].str.replace('For  ', ''), 'for_'),
            (df['d_gender'], 'g_'),
        )
    ),
]
df = pd.concat(frames, axis='columns')

In [6]:
# list features in consideration: a column is kept when its name starts with
# one of the prefixes below and it is not explicitly ignored.
# NOTE(review): no visible step creates 'orientation_' columns — confirm the
# prefix is still needed.
prefixes = ['q', 'd_', 'lf_', 'eth_', 'for_', 'g_', 'orientation_']
ignore = [
    'd_age',
    'd_city',
    'd_country',
    'd_ethnicity',
    'd_gender',
    'd_languages',
    'd_orientation',
    'd_username',
    'lf_for',
    'lf_max_age',
    'lf_min_age',
    'lf_want',
]
features = []
for column_name in df.columns:
    if utl.match_any_prefix(column_name, prefixes) and column_name not in ignore:
        features.append(column_name)
df = df[features]

In [7]:
# code remaining non-numeric features in-place
object_cols = df.select_dtypes(include='object').columns.tolist()
# q columns should be treated as categorical even when they contain numeric values
object_cols += [x for x in features if x.startswith(
    'q') and (x not in object_cols)]

# parallel lists that become the rows of the q_codes lookup table:
# (column name, integer code, original value as text)
feature = []
code = []
text = []
for col in object_cols:
    if not col.startswith("q"):
        try:
            df[col] = df[col].str.lower()
        except AttributeError:
            # the .str accessor rejects columns that do not hold strings;
            # leave those as they are and factorize the raw values
            pass

    # factorize marks missing values with -1 by default; the explicit
    # na_sentinel argument was removed in pandas 2.0, so it is not passed
    df[col], uniques = pd.factorize(df[col])

    nans = [utl.value_is_nan(c) for c in uniques]
    if any(nans):
        print(f'NaN value found in {col}')
        print(uniques)

    # the position of a value in `uniques` is the integer code stored in df
    for i, c in enumerate(uniques):
        code.append(i)
        feature.append(col)
        # feather can't handle semicolons
        text.append(str(c).replace(';', ','))

q_codes = pd.DataFrame({'qid': feature, 'code': code, 'text': text})

In [8]:
# drop uninformative columns: turn the -1 sentinel back into NaN so missing
# entries are not counted as a value, then discard every column left with
# fewer than two distinct values.
# NOTE(review): the replacement touches all columns, so a genuine -1 in a
# column that was not factorized would also become NaN — confirm none exist.
df = df.replace(-1, np.nan)
n_unique = df.nunique()
no_info_columns = df.columns[n_unique.lt(2)]
df = df.drop(no_info_columns, axis='columns')

In [9]:
# count missing values per row, before they are replaced below
missing_count = df.isna().sum(axis=1)

# use -1 for missing values in final representation
filled = df.replace(to_replace=np.nan, value=-1)

# int8 only holds -128..127 and astype() does not range-check, so a value
# outside that range (e.g. a feature with more than 128 categories) would be
# corrupted by the cast. Pick the narrowest signed integer type that holds
# every value; when everything fits this is int8, exactly as before.
code_dtype = 'int8'
if filled.size:
    lowest = filled.min().min()
    highest = filled.max().max()
    code_dtype = next(
        (
            candidate
            for candidate in ('int8', 'int16', 'int32', 'int64')
            if np.iinfo(candidate).min <= lowest
            and highest <= np.iinfo(candidate).max
        ),
        'int64',
    )
    if code_dtype != 'int8':
        # printed rather than warned: warnings are globally suppressed
        print(f'values range from {lowest} to {highest}; '
              f'storing as {code_dtype} instead of int8')
df = filled.astype(code_dtype)

# added after the cast so the count keeps its own integer dtype
df['missing_count'] = missing_count

In [10]:
# save profiles
# renumber the rows 0..n-1 and discard the old index in one step, instead of
# materialising it as an 'index' column and dropping that again (which fails
# when the index is named and removes a real column if one is called 'index')
df = df.reset_index(drop=True)
df.to_feather('../processed_data/profiles.feather')

In [11]:
# write the (qid, code, text) lookup table next to the profiles file
q_codes_path = '../processed_data/q_codes.feather'
q_codes.to_feather(q_codes_path)