In [2]:
# Mount Google Drive into the Colab filesystem so files under
# /content/drive/ can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
# NOTE(review): this repeats the mount from the previous cell; the path differs
# only by the trailing slash and the recorded output reports the drive as
# already mounted, so this cell looks redundant.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import random
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

In [5]:
# Survey schema consumed by generate_fake_data: question number -> answer format.
questions = {
    # "multi_choice": one 0/1 column is generated per available choice.
    1: {"type": "multi_choice", "num_choices": 4},
    # "mono_choice": a single column holding the index of the chosen option.
    2: {"type": "mono_choice", "num_choices": 4},
}

In [6]:
def generate_fake_data(questions, num_rows=50, seed=None):
    """Build a DataFrame of random survey answers for a question schema.

    Parameters
    ----------
    questions : dict
        Maps a question id to a dict with the keys ``"type"`` and
        ``"num_choices"``. A ``"multi_choice"`` question yields one column per
        choice, named ``Q<id>C<n>`` with ``n`` starting at 1 and values in
        {0, 1}. A ``"mono_choice"`` question yields a single column ``Q<id>``
        with values in ``0 .. num_choices - 1``. Entries with any other type
        are ignored.
    num_rows : int, default 50
        Number of rows to generate.
    seed : int or None, default None
        When given, the values come from a private ``random.Random(seed)`` so
        the output is reproducible and the global ``random`` state is left
        untouched. When ``None`` the global ``random`` module is used, as
        before.

    Returns
    -------
    pandas.DataFrame
        ``num_rows`` rows, one integer column per generated answer column.
    """
    # Both objects expose randint(); the unseeded path keeps the original
    # draw order from the global generator.
    rng = random if seed is None else random.Random(seed)

    # Column name -> inclusive (low, high) range of admissible values.
    bounds = {}
    for question_id, spec in questions.items():
        if spec['type'] == "multi_choice":
            for choice in range(1, spec['num_choices'] + 1):
                bounds[f"Q{question_id}C{choice}"] = (0, 1)
        elif spec['type'] == "mono_choice":
            bounds[f"Q{question_id}"] = (0, spec['num_choices'] - 1)

    data = {
        column: [rng.randint(low, high) for _ in range(num_rows)]
        for column, (low, high) in bounds.items()
    }
    return pd.DataFrame(data)
        

In [7]:
# Try the generator on a small sample and display the resulting frame.
generate_fake_data(questions, num_rows = 20)


Unnamed: 0,Q1C1,Q1C2,Q1C3,Q1C4,Q2
0,1,0,0,1,0
1,1,0,0,0,2
2,1,1,1,0,0
3,0,1,1,0,0
4,0,0,0,0,3
5,0,1,0,1,2
6,0,0,0,0,3
7,1,1,0,1,2
8,0,1,1,1,3
9,1,1,1,1,3


In [8]:
# Load the OkCupid profiles CSV from the mounted Drive and list its columns
# to see which fields are available for cleaning.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/okcupid_profiles.csv')
print(df.columns)

Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9'],
      dtype='object')


In [9]:
def clean_okcupid_columns(df, new_column: str):
    """Return ``df`` with every column label prefixed by ``new_column``.

    Each label is converted to ``str``, its spaces are replaced with
    underscores, and the result is prepended with ``new_column`` and an
    underscore (e.g. ``"a little extra"`` -> ``"body_a_little_extra"``).
    The input frame is not modified; a renamed frame is returned.
    """
    prefixed = {
        label: f"{new_column}_{str(label).replace(' ', '_')}"
        for label in df.columns
    }
    return df.rename(columns=prefixed)

def clean_okcupid_questions_openended(df_open_ended, min_length=50):
    """Left-pack the usable free-text answers of every row.

    For each row, the cells that are strings of at least ``min_length``
    characters are kept in their original left-to-right order and moved to the
    leftmost output columns; the remaining positions are filled with ``None``.
    Missing values of any kind (``NaN``, ``None``, ...) and other non-string
    cells are discarded instead of being passed to ``len()``.

    Parameters
    ----------
    df_open_ended : pandas.DataFrame
        Frame whose cells hold free-text answers or missing values.
    min_length : int, default 50
        Minimum number of characters an answer needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Same index and same number of columns as the input, with columns
        labelled ``0 .. n_columns - 1``. The width no longer depends on how
        many answers the fullest row happens to contain.
    """
    n_columns = df_open_ended.shape[1]
    packed_rows = []
    # index=False / name=None yields plain tuples holding only the cell values.
    for row in df_open_ended.itertuples(index=False, name=None):
        kept = [
            cell for cell in row
            if isinstance(cell, str) and len(cell) >= min_length
        ]
        # Pad explicitly so every row has the full width.
        kept.extend([None] * (n_columns - len(kept)))
        packed_rows.append(kept)
    return pd.DataFrame(
        packed_rows, index=df_open_ended.index, columns=range(n_columns)
    )

In [10]:
def clean_okcupid_dataset(df):
    """Turn the raw OkCupid frame into one-hot answers plus packed essays.

    The result contains, side by side:

    * one ``body_*`` indicator column per value found in ``df['body_type']``,
    * one ``smokes_*`` indicator column per value found in ``df['smokes']``,
    * the ``open_ended_*`` columns produced by
      ``clean_okcupid_questions_openended`` from ``essay0`` .. ``essay9``.

    Rows are kept only when they have a body type, a smoking answer and at
    least one retained essay. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``body_type``, ``smokes`` and
        ``essay0`` .. ``essay9``.

    Returns
    -------
    pandas.DataFrame
        The filtered, column-wise concatenation described above; surviving
        rows keep their original index labels.
    """
    essay_columns = [f"essay{i}" for i in range(10)]

    # One-hot encode the categorical answers; a missing answer gives an
    # all-zero row, which is what the filter below relies on.
    df_body = clean_okcupid_columns(pd.get_dummies(df['body_type']), "body")
    df_smokes = clean_okcupid_columns(pd.get_dummies(df['smokes']), "smokes")

    # Move essays over due to Null spaces in the df.
    df_open_ended = clean_okcupid_questions_openended(df[essay_columns])
    df_open_ended = clean_okcupid_columns(df_open_ended, "open_ended")
    # Re-attach the caller's index so the column-wise concat pairs rows by
    # position even when `df` does not use a default 0..n-1 index.
    df_open_ended.index = df.index

    combined = pd.concat([df_body, df_smokes, df_open_ended], axis=1)

    # Derive the filters from the columns that actually exist rather than from
    # hard-coded name lists, so an absent category cannot raise a KeyError.
    has_body = df_body.any(axis=1)
    has_smokes = df_smokes.any(axis=1)
    has_text = df_open_ended.notna().any(axis=1)
    return combined[has_body & has_smokes & has_text]
# Run the cleaning pipeline on the raw profiles and list the resulting columns.
cleaned_df = clean_okcupid_dataset(df)
print(cleaned_df.columns)

Index(['body_a_little_extra', 'body_athletic', 'body_average', 'body_curvy',
       'body_fit', 'body_full_figured', 'body_jacked', 'body_overweight',
       'body_rather_not_say', 'body_skinny', 'body_thin', 'body_used_up',
       'smokes_no', 'smokes_sometimes', 'smokes_trying_to_quit',
       'smokes_when_drinking', 'smokes_yes', 'open_ended_0', 'open_ended_1',
       'open_ended_2', 'open_ended_3', 'open_ended_4', 'open_ended_5',
       'open_ended_6', 'open_ended_7', 'open_ended_8', 'open_ended_9'],
      dtype='object')


In [11]:
# Generate one row of fake survey answers per cleaned profile row.
fake_data = generate_fake_data(questions, num_rows = cleaned_df.shape[0])

In [12]:
# Put the fake answers next to the cleaned profiles. Both indexes are reset so
# the rows pair up by position rather than by the original index labels.
# NOTE(review): this rebinds `df`, which until now held the raw CSV; re-running
# the clean_okcupid_dataset(df) cell after this one would receive this frame.
df = pd.concat([fake_data.reset_index(drop=True),cleaned_df.reset_index(drop=True)], axis=1)

In [13]:
# Keep only the packed essay columns.
# NOTE(review): this discards the Q*, body_* and smokes_* columns that were
# concatenated into `df` just above — confirm that is intended.
df = df[['open_ended_0', 'open_ended_1',
       'open_ended_2', 'open_ended_3', 'open_ended_4', 'open_ended_5',
       'open_ended_6', 'open_ended_7', 'open_ended_8', 'open_ended_9']]

In [14]:
# Display the essay-only frame.
df

Unnamed: 0,open_ended_0,open_ended_1,open_ended_2,open_ended_3,open_ended_4,open_ended_5,open_ended_6,open_ended_7,open_ended_8,open_ended_9
0,about me: i would love to think that i was so...,currently working as an international agent fo...,making people laugh. ranting about a good salt...,"the way i look. i am a six foot half asian, ha...","books: absurdistan, the republic, of mice and ...",trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...,,
1,i am a chef: this is what that means. 1. i am ...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories. my b...,i am very open and will share just about anyth...,,,,
2,"i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,okay this is where the cultural matrix gets so...,movement conversation creation contemplation t...,viewing. listening. dancing. talking. drinking...,"when i was five years old, i was known as ""the...","you are bright, open, intense, silly, ironic, ...",
3,playing synthesizers and organizing books acco...,"bataille, celine, beckett. . . lynch, jarmusch...",,,,,,,,
4,hey how's it going? currently vague on the pro...,creating imagery to look at: http://bagsbrown....,"music: bands, rappers, musicians at the moment...",,,,,,,
...,...,...,...,...,...,...,...,...,...,...
47750,a cautious photo for a cautious dude (i'm havi...,mourning the loss (they broke in ecuador i thi...,conscientiousness. adding -ness to words that ...,probably that i have no hair (unless i'm weari...,i get anxious when people ask about favorites....,"water. air. food. clothes (if its cold, though...",was in class friday nights this past semester....,i wish my sexual preferences were clearer to m...,,
47751,i'm nick. i never know what to write about mys...,currently finishing school for film production...,"filmmaking, photography, graphic design, web d...","movies: hook (the greatest adventure ever!), g...",iphone contact lenses headphones camera tv rem...,i do most of my thinking on the bus to/from wo...,when i was 18 i got a tattoo of waldo somewher...,,,
47752,"hello! i enjoy traveling, watching movies, and...","i'm a civil engineer, who enjoys helping the c...",- looking at things objectively - getting thin...,i'm quiet until i get used to the environment ...,"last book: ""game change"". movies: bourne serie...",- iphone - friends and family - internet - bay...,,,,
47753,"""all i have in this world are my balls and my ...","following my dreams... ""you got a dream... you...",it used to be the hair until i mowed it off bu...,where to begin musically: right now i listen t...,"music, family, friends, a basketball, hoop, so...",what i would do on any other day. everydays a ...,i like walking around in other people's house ...,,,
