This notebook sets up the workflow for the various functions we have implemented. It shows an example of how we clustered using Nonnegative Matrix Factorization. We manually inspect the output of NMF to determine the best number of clusters for each group

In [1]:
import pickle
import warnings

from utils.hash import make
from utils.calculate_pmi_features import *
from utils.clean_up import *
from utils.categorize_demographics import *
from utils.reduce_dimensions import run_kmeans
from utils.nonnegative_matrix_factorization import nmf_inspect, nmf_labels
warnings.filterwarnings('ignore')

Getting the data, cleaning it, and categorizing demographic data

In [2]:
df = get_data()

In [3]:
essay_list = ['essay0','essay4','essay5']
df_clean = clean_up(df, essay_list)


In [4]:
df_clean.fillna('', inplace=True)

In [5]:
df.columns.values

array(['username', 'age', 'body_type', 'diet', 'drinks', 'drugs',
       'education', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9', 'ethnicity',
       'height', 'income', 'job', 'last_online', 'location', 'offspring',
       'orientation', 'pets', 'religion', 'sex', 'sign', 'smokes',
       'speaks', 'status', 'TotalEssays'], dtype=object)

In [6]:
df_clean['religion'] = df_clean['religion'].apply(religion_categories)
df_clean['job'] = df_clean['job'].apply(job_categories)
df_clean['drugs'] = df_clean['drugs'].apply(drug_categories)
df_clean['diet'] = df_clean['diet'].apply(diet_categories)
df_clean['body_type'] = df_clean['body_type'].apply(body_categories)
df_clean['drinks'] = df_clean['drinks'].apply(drink_categories)
df_clean['sign'] = df_clean['sign'].apply(sign_categories)
df_clean['ethnicity'] = df_clean['ethnicity'].apply(ethnicity_categories)
df_clean['pets'] = df_clean['pets'].apply(pets_categories)
df_clean['speaks'] = df_clean['speaks'].apply(language_categories)

Splitting the dataframe by gender, running clustering separately on each.

In [7]:
df_male = df_clean[df_clean['sex'] == 'm']

In [8]:
df_female = df_clean[df_clean['sex'] == 'f']

In [9]:
count_matrix_m, tfidf_matrix_m, vocab_m = col_to_data_matrix(df_male, 'essay0') #save out

In [10]:
count_matrix_f, tfidf_matrix_f, vocab_f = col_to_data_matrix(df_female, 'essay0')

In [11]:
vocab_m

{'friends': 625,
 'europe': 563,
 'focus': 605,
 'artistic': 315,
 ', easy': 50,
 'earth': 534,
 'animals': 305,
 'sure': 1317,
 'starting': 1292,
 'outside': 1068,
 ', enjoy': 52,
 'cold': 425,
 ". i'm pretty": 191,
 'hiking': 733,
 'ambitious': 302,
 'stay': 1295,
 'box': 372,
 'makes': 944,
 '8': 267,
 'longer': 908,
 'thought': 1356,
 'highly': 731,
 'restaurants ,': 1174,
 'school': 1208,
 'hands': 699,
 ', say': 126,
 'crazy': 464,
 'enjoy going': 557,
 'loyal ,': 938,
 'knows': 843,
 'known': 842,
 ', cooking ,': 44,
 'meet people': 961,
 ', cooking': 43,
 ', reading ,': 120,
 'south': 1273,
 ', movies ,': 103,
 'doing .': 514,
 'mission': 975,
 'language': 847,
 '!': 0,
 '. really': 225,
 'include': 795,
 'traveling': 1375,
 'rock climbing': 1183,
 'optimistic': 1059,
 'general': 644,
 "'": 14,
 'definitely': 496,
 'graduate': 680,
 'fullest': 630,
 'cultures': 471,
 'nights': 1031,
 'sounds': 1272,
 'yeah ,': 1489,
 'situations': 1253,
 'just moved': 826,
 'laid ,': 846,
 'kic

In [73]:
nmf_inspect(tfidf_matrix_m, )

TypeError: nmf_inspect() missing 1 required positional argument: 'feature_names'

In [None]:
nmf_inspect(tfidf_matrix_f)