In [2]:
import re
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

# first overview of the dataset:
df = pd.read_csv('/content/profiles.csv')
print(df.info())
print(df.head(3))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   body_type    54650 non-null  object 
 2   diet         35551 non-null  object 
 3   drinks       56961 non-null  object 
 4   drugs        45866 non-null  object 
 5   education    53318 non-null  object 
 6   essay0       54458 non-null  object 
 7   essay1       52374 non-null  object 
 8   essay2       50308 non-null  object 
 9   essay3       48470 non-null  object 
 10  essay4       49409 non-null  object 
 11  essay5       49096 non-null  object 
 12  essay6       46175 non-null  object 
 13  essay7       47495 non-null  object 
 14  essay8       40721 non-null  object 
 15  essay9       47343 non-null  object 
 16  ethnicity    54266 non-null  object 
 17  height       59943 non-null  float64
 18  income       59946 non-null  int64  
 19  job 

In [3]:
# clean data: split 'pets' into 'cats' and 'dogs', and assign oridnal values based on
# whether they like and have cats|dogs (=2), like but don't have cats|dogs (=1), or dislike cats|dogs (=0)

df.dropna(subset=['pets'], inplace=True)
df['cats'] = df['pets'].apply(lambda x: 2 if 'has cats' in x
                              else (1 if 'likes cats' in x
                                    else (0 if 'dislikes cats' in x
                                          else np.nan)))
df['dogs'] = df['pets'].apply(lambda x: 2 if 'has dogs' in x
                              else (1 if 'likes dogs' in x
                                    else (0 if 'dislikes dogs' in x
                                          else np.nan)))

In [4]:
# clean data: split 'offspring' into 'has_offspring' (present) and 'want_offspring' (future), and assign oridnal values based on
# whether they have more than one kid (=2), one kid (=1), or no kids (=0)
# and whether they want kids (=2), might want kinds (=1), or don't want kids (=0)

df.dropna(subset=['offspring'], inplace=True)
df['has_offspring'] = df['offspring'].apply(lambda x: 2 if 'has kids' in x
                              else (1 if 'has a kid' in x
                                    else (0 if 'doesn&rsquo;t have' in x
                                          else np.nan)))
df['want_offspring'] = df['offspring'].apply(lambda x: 2 if 'wants' in x
                              else (1 if 'might want' in x
                                    else (0 if 'doesn&rsquo;t want' in x
                                          else np.nan)))

In [5]:
# clean data: for 'ethnicity', assign users with more than one group of ethnic group selected as 'multi-ethnic'
# check how many users are assigned to 'multi-ethnic'

df.dropna(subset=['ethnicity'], inplace=True)
df['ethn'] = ['multi-ethnic' if ',' in x else x for x in df['ethnicity']]

len(df[df['ethn'] =='multi-ethnic'])/df['ethn'].count()

0.1344271901552886

In [6]:
# clean data: for 'speaks', prioritize language mentioned (over 3000 lang mentioned!) based on top 10 lang spoken fluently,
# and assign boolean values based on whether they speak (=1) or don't speak (=0) the language at all

df.dropna(subset=['speaks'], inplace=True) # drop na
df['speaks_lang_only'] = df['speaks'].replace(' \((fluently|okay|poorly)\)', '', regex=True)
print('number of lanugages mentioned: ', df['speaks_lang_only'].nunique())

df = df.join(df['speaks'].str.split(',', expand=True).add_prefix('speaks_split')) # split into columns
speaks_stack = df[['speaks_split0', 'speaks_split1', 'speaks_split2', 'speaks_split3', 'speaks_split4']].stack() # stack them into a series

speaks_fluently = [x if '(fluently)' in x else None for x in speaks_stack] # filter for 'fluently' only
speaks_fluently = pd.Series(speaks_fluently) # turn list into series
print(speaks_fluently.value_counts()[:10]) # create new features of top 10 languages that are spoken fluently

top_10_lang = ['english', 'spanish', 'chinese', 'french', 'c++', 'russian', 'german', 'hindi', 'italian']

for lang in top_10_lang:
    df[lang] = df['speaks'].apply(lambda x: 1 if lang in x else 0)

number of lanugages mentioned:  1633
english (fluently)     9399
 spanish (fluently)     902
 chinese (fluently)     286
 french (fluently)      267
 c++ (fluently)         208
 german (fluently)      150
 russian (fluently)     134
 other (fluently)       121
 english (fluently)      80
 italian (fluently)      79
Name: count, dtype: int64


In [7]:
# clean data: for 'education' split school (eg, 'high school') from users' status in education (ie, 'working on', 'graduated', 'dropped'),
# and assign boolean values based on whether they're currently in education ('working on' =1) or not in school('graduated' and 'dropped' =0)
# assign ordinal values based on years of education in 'yrs_edu'

df.dropna(subset=['education'], inplace=True)
edu_split = df.education.str.split(' ')
df['edu'] = edu_split.str[-2:].str.join(' ')

df['edu'] = ['college/university' if 'college/university' in x else x for x in df['edu']]

edu_conditions = [
    df['education'].str.contains('working on'),
    df['education'].str.contains('graduated'),
    df['education'].str.contains('dropped')
]
edu_values = [1, 0, 0]

df['in_edu'] = np.select(edu_conditions, edu_values)

yrs_edu_map = {
    'high school': 1,
    'two-year college': 2,
    'college/university': 3,
    'masters program ': 4,
    'ph.d program': 5,
    'law school': 5,
    'med school': 5
}

df['yrs_edu'] = df['edu'].map(yrs_edu_map)

In [8]:
# clean data: for 'religion', 'sign', and 'diet', split the groups (eg, 'atheism', 'scorpio', 'vegan'),
# and how seriously they take the religion|sign|diet, and assign ordinal values.

# clean data: 'religion'
religion_split = df.religion.str.split(' ')
df['relig'] = religion_split.str.get(0)

relig_conditions = [
    df['religion'].str.contains('laughing', na=False),
    df['religion'].str.contains('not too', na=False),
    df['religion'].str.contains('somewhat', na=False),
    df['religion'].str.contains('very', na=False)
]
relig_values = [0, 1, 2, 3]

df['relig_serious'] = np.select(relig_conditions, relig_values, default='')

# clean data: 'sign'
df.dropna(subset=['sign'], inplace=True)

sign_split = df.sign.str.split(" ")
df['sign_zodiac'] = sign_split.str.get(0)

sign_conditions = [
    df['sign'].str.contains('but'),
    df['sign'].str.contains('fun'),
    df['sign'].str.contains('a lot')
]
sign_values = [0, 1, 2]

df['sign_serious'] = np.select(sign_conditions, sign_values, default='')

# clean data: 'diet'
df.dropna(subset=['diet'], inplace=True)
diet_split = df.diet.str.split(' ')
df['diet_food'] = diet_split.str[-1]

diet_conditions = [
    df['diet'].str.contains('mostly'),
    df['diet'].str.contains('strictly')
]
diet_values = [0, 1]

df['diet_serious'] = np.select(diet_conditions, diet_values, default='')

In [9]:
# assign ordinal values to 'drinks', 'drugs', and 'smokes'

drink_map = {
    'not at all': 0,
    'rarely': 1,
    'socially': 2,
    'often': 3,
    'very often': 4,
    'desperately': 5
    }
drug_map = {
    'never': 0,
    'sometimes': 1,
    'often': 2
    }
smoke_map = {
    'no': 0,
    'sometimes': 1,
    'when drinking': 1,
    'trying to quit': 1,
    'yes': 2
    }

df['drinks'] = df['drinks'].map(drink_map)
df['drugs'] = df['drugs'].map(drug_map)
df['smokes'] = df['smokes'].map(smoke_map)

In [10]:
# clean data: for 'job' use OneHotEncoder for efficiency

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')
ohejob = ohe.fit_transform(df[['job']])
df = pd.concat([df, ohejob], axis=1).drop(columns = ['job'])

In [11]:
# clean data: sex f = 0, m = 1

#df['sex'] = df['sex'].apply(lambda x: 0 if x == "f" else 1)
label_encoder = LabelEncoder()
label_encoder.fit(df['sex'])
df['sex'] = label_encoder.transform(df['sex'])

In [12]:
# clean data: clean essays by removing glaring regex from columns essay0 ~ essay9
# combine all essay columns into one 'essay' column

to_replace = ['\n', '<br />', "\'", '&amp;', '<i>', '</i>']
df['essay'] = ''

for i in range(10):
  col = f'essay{i}' # f-string or 'formatted string' which allows me to embed expression inside curly braces
  for t in to_replace:
    df[col] = df[col].str.replace(t, ' ')
  df['essay'] = df['essay'] + df[col]

df.dropna(subset=['essay'], inplace=True)

In [13]:
# predict gender: vectorize words, train the Naive Bayes classifier, and predict gender

counter = CountVectorizer(stop_words='english', binary=True, min_df=0.03)
X = counter.fit_transform(df['essay'])
y = df['sex']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', round(accuracy, 3))

Accuracy: 0.735


In [14]:
# which words did the classifier pick up for female vs male

word = counter.get_feature_names_out()
log_prob = classifier.feature_log_prob_

prob = np.exp(log_prob)
prob = np.round(prob, 5)
word_prob = list(zip(prob[0], word))
word_prob = sorted(word_prob, reverse=True)

print('word probability for female:', word_prob[:20])
print('word probability for male:', word_prob[-20:])

word probability for female: [(0.00701, 'love'), (0.00693, 'like'), (0.00667, 'music'), (0.00637, 'friends'), (0.00618, 'good'), (0.00614, 'food'), (0.00591, 'people'), (0.00589, 'life'), (0.00583, 'time'), (0.00557, 'movies'), (0.00538, 'things'), (0.00538, 'just'), (0.00498, 'new'), (0.00496, 'don'), (0.00485, 'books'), (0.00478, 'really'), (0.00469, 'know'), (0.00446, 'work'), (0.00446, 'want'), (0.0044, 'family')]
word probability for male: [(0.00021, 'gotta'), (0.00021, 'cars'), (0.00021, 'bicycle'), (0.0002, 'summary'), (0.0002, 'north'), (0.00019, 'report'), (0.00019, 'fix'), (0.00019, 'father'), (0.00018, 'seven'), (0.00018, 'math'), (0.00018, 'inception'), (0.00018, 'gotten'), (0.00018, 'basketball'), (0.00016, 'snowboarding'), (0.00015, 'simpsons'), (0.00015, 'fixing'), (0.00015, 'dj'), (0.00015, 'bbq'), (0.0001, 'archer'), (6e-05, 'computers')]


In [15]:
# test out the classifier using new input

new = ['Moderately nerdy guy (age 30) who writes stories, translates songs, and overshares cat pictures. Currently living in New York City but leaning against settling down here permanently, and likes the idea of living somewhere family-friendly (that a suburb). By nerdy, I mean that I tend to get into phases where I get weirdly excited about things. To take a few random recent examples: Which countries have the best and most cost-effective transit systems, and urbanism and city planning more generally. Computability theory, or alternate computation models, like what its implications would look like for a world that had regular time travel. Acrobatics. The history of clothes making. Making homemade liquor. Building this took a surprisingly long time Cats. Elaborate improvised constructs. Also, glacier climbing What a world would look like if time travel was possible but constrained by computability limits, except that cats could effectively get around them by having weak preferences, and the whole thing ran on a hybrid Miyazaki/Moomins/FF7 aesthetic. And so on. By family-oriented, I mean, well I like happy families. And I like kids, and really want kids, and like the kind of family/tribal structures where people of different ages actually interact. I am slightly trad in some ways, by which I mean something like enjoying some traditions and traditional attitudes but not actually being particularly religious. So I like Friday night family dinners (with the candles and the nice tablecloth), and I like casual gender norms (In that, say, I prefer the sound of my wife my spouse), but do regular prayers or really keep kosher (except I eat pork). For stories I wrote this one about first contact, or this one about bike lanes. For song translations, I mostly do Israeli folk songs (like this one, or this one). Life/Relationship Goals Most of all, I want a family. I want a good happy partnership relationship, and I want a bunch of kids. probably practical constraints on how many kids I could practically have or handle, but in principle, a fan of the idea of having lots of kids. I think I would be a good dad. I currently live in New York and work doing vague finance and programming stuff. I like it here for now, but I want to settle down permanently, or raise kids here. Where I want to end up is a hard question  for having kids, want to be somewhere clean and pedestrian-friendly, where the kids can walk safely by themselves to get around or play outside or whatnot. Sometimes I think about going back to Israel, but there are other places that meet those conditions. (Financially, I do have more than enough to support a family, including if I was married to someone who wanted to be a homemaker or work nonprofit or whatever, or to take a few years off for that matter). My ideal relationship involves sharing random ideas that get us excited off each other, cooking or going on hikes together, randomly messaging each other cat pictures, developing a pool of shared references and frames that becomes a little like a shared language, spiderman kisses (that is, where one or both of the people doing the kissing are hanging upside-down), and eventually raising a bunch of kids together. Also, occasional backpacking trips where we wind up cooking improvised pancakes in random abandoned cabins. You? If the above part sounds appealing, you should try to contact me! You can email me find me on facebook, whatever. See? Very welcoming I like the idea of social proof   if we know people in common, you can talk to me through them, or talk to them about whether they think would work. Properties common in people I have good relationships with: Getting delighted by things. Probably the biggest single factor, although it can be hard to describe. If you’re the sort of person who would build the food pyramid I mentioned above, or give names to random wildlife, or stop to look in random windows because you’re curious what the people inside live like, or like to think about the logistics of feeding a continent of sentient hamsters, we are likely to get along and you should give me a call. Compatible life goals. This one is pretty obvious, I guess. Ability to resolve conflicts. I’m not always great at immediately noticing when something is bothering someone, but I do listen when people talk to me about things. It helps a lot in a relationship, if  someone who can bring up things that bother you, and we can talk it through and solve it. Having initiative, and dreams, and like to do things.They  have to be shared interests   I just like people who have interests they get excited about.Finally, if you do want to contact me but aren’t sure how, ideas for conversation starters. What would the world look like if it had time-travelling cats? If you had the choice to be turned into a group of hamsters, how many hamsters would it take for you to prefer being the group of hamsters to being human? What’s something you deeply care about, or something that would make your life or your neighborhood or the world better? If you could be in any sort of place imaginable right now, where would you be? Any random thing you’ve been thinking about lately Just ‘Hi’ is also good!']
test = counter.transform(new)
print(classifier.predict(test))
print(classifier.predict_proba(test))

[1]
[[0.41563752 0.58436248]]
