In [14]:
import matplotlib.pyplot as plt
import re
import pandas as pd
import numpy as np
import random
import math
from collections import Counter
import sys

#--------------------------------------------- LOADING DATA ---------------------------------------------------------
def csv_to_df(filename):
    ''' Function: csv_to_df
        Parameters: filename (string path, or anything else pandas.read_csv
                    accepts as its first argument)
        Returns: dataframe built by pandas.read_csv with its default options
    '''
    return pd.read_csv(filename)

def okcupid_regex(df):
    ''' Function: okcupid_regex
        Parameters: df (dataframe)
        Returns: the same dataframe object, with HTML-like tags removed and
                 newlines replaced by spaces in every text column. Columns
                 are overwritten in place; non-text columns are untouched.
    '''
    for col in df:
        col_dtype = df.dtypes[col]
        # The builtin `object` is used here because the `np.object` alias was
        # removed from NumPy (1.24+) and raises AttributeError there.
        # is_string_dtype additionally covers pandas' dedicated string dtypes.
        if col_dtype == object or pd.api.types.is_string_dtype(col_dtype):
            # Drop anything shaped like <...>.
            df[col] = df[col].str.replace(r'<[^>]+>', '', regex=True)
            # A newline is a literal character, so no regex is needed.
            df[col] = df[col].str.replace('\n', ' ', regex=False)

    return df

#--------------------------------------DISTANCE FUNCTIONS------------------------------------------------------------
#EUCLIDEAN DISTANCE
def distance_decider(random_person_list,potential_match_list,counter_dict,col):
    '''
    Function: distance_decider
    Parameters: random_person_list (list of the sampled user's values), potential_match_list (list of a
                candidate user's values), counter_dict (word totals for the column, forwarded to the
                text distance), col (integer position of the column being compared)
    Returns: the contribution of this column to the match score. Text columns return the value of
                nonnumeric_distance negated and multiplied by 10; every other column returns
                the value of euclidean.
    Note: the column dtype is looked up on the module-level dataframe `df`, which is not a parameter
                of this function and must exist before it is called.
    '''
    col_dtype = df.dtypes[df.columns[col]]
    is_text_column = col_dtype == str or col_dtype == object

    if not is_text_column:
        return 0 + euclidean(random_person_list,potential_match_list,col)

    # nonnumeric_distance is a similarity, so it is negated: the caller
    # accumulates these values and treats the smallest total as the best match.
    similarity = nonnumeric_distance(random_person_list,potential_match_list,counter_dict,col)
    return -similarity*10
    
def euclidean(random_person_list,potential_match_list,col):
    ''' Function: euclidean
        Parameters: random_person_list (list of the sampled user's values), potential_match_list
                (list of a candidate user's values), col (integer position of the numeric column
                to compare)
        Returns: float in [0, 1]: the square root of the squared difference after that squared
                difference has been divided by 10 repeatedly until it is no larger than 1
        Note: because of the repeated division the result is NOT monotonic in the raw
                difference (a difference of 1 yields 1.0, a difference of 2 yields about 0.63).
                NOTE(review): confirm this rescaling is the intended normalisation.
    '''
    squared_diff = (random_person_list[col] - potential_match_list[col]) ** 2

    # Shrink by powers of ten until the value fits in [0, 1].
    while squared_diff > 1:
        squared_diff = squared_diff / 10

    # The original also contained a bare `y/10000` expression here; its result
    # was discarded, so it has been removed without changing the output.
    return math.sqrt(squared_diff)

def nonnumeric_distance(random_person_list,potential_match_list,counter_dict,col):
    '''
    Function: nonnumeric_distance
    Parameters: random_person_list (list of the sampled user's values), potential_match_list (list of a
                candidate user's values), counter_dict (mapping of word -> total count for the column;
                must contain every cleaned word of the candidate's text), col (integer position of the
                text column to compare)
    Returns: float, the value compute_cosine gives for the sampled user's raw word counts against the
                candidate's word counts, where each candidate count has been divided by the square
                root of that word's total in counter_dict
    '''
    person_counts = Counter(clean_text(random_person_list[col]))
    candidate_counts = Counter(clean_text(potential_match_list[col]))

    # Only the candidate's side is down-weighted by overall word frequency;
    # the sampled user's counts are passed through unchanged.
    weighted_candidate = {
        word: count / math.sqrt(counter_dict[word])
        for word, count in candidate_counts.items()
    }
    return compute_cosine(person_counts,weighted_candidate)


#COSINE SIMILARITY

# Words ignored when comparing texts (duplicates from the original list removed).
STOPWORDS = ["a", "an", "and", "the", "to", "i", "if", "of", "that", "it",
             "is", "im", "has", "was", "his", "ive", "at", "in", "your", "its",
             "for", "this", "am", "not", "be", "my", "me", "who", "with", "all",
             "or", "are", "you"]

# Built once so clean_text does not rescan the list or re-resolve the pattern
# for every word.
_STOPWORD_SET = frozenset(STOPWORDS)
_PUNCTUATION = re.compile(r"[^\w\s]")

def clean_text(words):
    ''' Function: clean_text
        Parameters: words (string)
        Returns: list of the whitespace-separated words of the string, with
                 punctuation stripped, lowercased, and with stopwords and
                 empty leftovers (tokens that were only punctuation) removed.
                 A non-string input (e.g. None or NaN) returns an empty list.
    '''
    if not isinstance(words, str):
        return []

    # Punctuation is stripped before lowercasing, so "I'm" becomes "im",
    # which is then matched against the stopword list.
    stripped = (_PUNCTUATION.sub('', raw).lower() for raw in words.split())
    return [word for word in stripped if word and word not in _STOPWORD_SET]

def dot(v1, v2):
    ''' Function: dot
        Parameters: v1, v2 (indexable sequences of ints/floats; v2 must be
                    at least as long as v1)
        Returns: sum of v1[i] * v2[i] over every position i of v1
    '''
    return sum(left * v2[position] for position, left in enumerate(v1))

def compute_cosine(wc1, wc2):
    ''' Function compute_cosine
        Parameters: wc1, wc2 (dictionaries mapping word -> count or weight)
        Returns: 1 when neither dictionary has any word; otherwise a float,
                 the dot product of the two word vectors divided by the
                 square of the number of distinct words across both
                 dictionaries
        Note: the divisor is the vocabulary size, not the vectors' Euclidean
              norms, so this is a similarity score rather than a textbook
              cosine. The original author chose the size as a proxy because
              the entries may already be weighted before they arrive here.
    '''
    # Every distinct word from either side; a missing word counts as 0.
    vocabulary = set([*wc1, *wc2])
    size = len(vocabulary)

    overlap = sum(wc1.get(word, 0) * wc2.get(word, 0) for word in vocabulary)

    if size * size == 0:
        return 1
    return overlap / (size * size)


#-------------------------------------------- COMPUTING BEST MATCHES ------------------------------------------------
def find_best_match(df,RANDOM_PERSON,KEY,INPUT):
    ''' Function: find_best_match
        Parameters: df (dataframe), RANDOM_PERSON (series of the sampled user), KEY (integer row
        position of the sampled user in df), INPUT (string: a column name, or 'all' to score
        every column except the one at position 20)
        Returns: best and worst match to the sampled user, as two lists of row values. The
        sampled user is never returned as either result.
        Raises: KeyError if INPUT is neither 'all' nor a column of df; ValueError if df holds
        no row other than the sampled user.
    '''

    # One accumulated score per row; lower means more similar.
    matches = [0]*len(df)

    if INPUT == 'all':
        # Position 20 is deliberately left out of the total.
        # NOTE(review): in the driver's category list index 20 is 'last_online' --
        # presumably that is why it is skipped; confirm against the real column order.
        columns_to_score = [col for col in range(len(df.columns)) if col != 20]
    else:
        columns_to_score = [df.columns.get_loc(INPUT)]

    for col in columns_to_score:
        print("LOADING DATA: ",df.columns[col])
        matches = measuring_similarity(RANDOM_PERSON,df,matches,col,KEY)

    # measuring_similarity adds 10000 to the sampled user's own slot on every
    # pass, so taking max() over the whole list always selected the sampled
    # user as the "worst match". Rank only the other rows.
    candidates = [row for row in range(len(matches)) if row != KEY]
    if not candidates:
        raise ValueError("need at least one profile besides the sampled user")

    # min/max with a key return the first extreme, matching list.index().
    best_row = min(candidates, key=matches.__getitem__)
    worst_row = max(candidates, key=matches.__getitem__)

    best_match = df.iloc[best_row,:]
    worst_match = df.iloc[worst_row,:]
    return best_match.tolist(), worst_match.tolist()

def measuring_similarity(random_person,df,matches,col,KEY):
    ''' Function: measuring_similarity
        Parameters: random_person (series of the sampled user), df (dataframe), matches (list with
                    one running score per row of df), col (int, position of the column to score),
                    KEY (int, row position of the sampled user)
        Returns: matches, the same list object, after adding this column's distance between the
                    sampled user and every other row. The sampled user's own slot gets 10000 added
                    instead of a distance.
    '''
    col_dtype = df.dtypes[df.columns[col]]
    if col_dtype == str or col_dtype == object:
        # Text columns need the column-wide word totals for weighting.
        counter_dict = total_word_count(df,col)
    else:
        counter_dict = {}

    # Loop-invariant: convert the sampled user once rather than once per row.
    person_values = random_person.tolist()

    for i in range(len(df)):
        if i == KEY:
            matches[i] += 10000 #some large number
            continue
        # `i` is a row position (it also indexes `matches`), so the row is
        # fetched positionally with iloc; label-based loc only agreed with it
        # when the index happened to be 0..n-1.
        matches[i] += distance_decider(person_values,list(df.iloc[i]),counter_dict,col)

    return matches


#------------------------------------ COUNTER-DICTIONARY-TYPE FUNCTIONS ------------------------------------------

def matching_words(random_person,best_match):
    ''' Function: matching_words
        Parameters: random_person, best_match (integer-indexable collections of profile values)
        Returns: list of the cleaned words that both users used, gathered separately for each of
                    positions 6 through 15 (the ten essay columns, per the original docstring).
                    Within a position each shared word appears once, in the order it first occurs
                    in best_match's text; a word shared in several positions appears once per position.
    '''
    shared = []
    for position in range(6,16):
        person_vocab = set(clean_text(random_person[position]))
        # dict.fromkeys keeps first-occurrence order while dropping repeats.
        candidate_vocab = dict.fromkeys(clean_text(best_match[position]))
        shared.extend(word for word in candidate_vocab if word in person_vocab)
    return shared

def total_word_count(df,col):
    ''' Function: total_word_count
        Parameters: df (dataframe), col (int, position of a text column)
        Returns: plain dictionary mapping every cleaned word found in that column, across all
                    rows, to the total number of times it was used
    '''
    totals = {}
    for text in df.iloc[:, col]:
        for word in clean_text(text):
            totals[word] = totals.get(word, 0) + 1
    return totals
#-------------------------------------------- CREATING RANDOM USER ------------------------------------------------
def create_random(df):
    ''' Function: create_random
        Parameters: df (dataframe with at least one row)
        Returns: a series holding one randomly chosen row's values, indexed by column name,
                    together with that row's integer position
    '''
    row = random.randint(0, len(df) - 1)
    profile = {name: df.iloc[row, position] for position, name in enumerate(df)}
    return pd.Series(profile), row

#-------------------------------------------- PRINTING STATISTICS ---------------------------------------------------
def print_report(BEST_MATCH,RANDOM_PERSON):
    ''' Function: print_report
        Parameters: BEST_MATCH (list or series of the best match's values), RANDOM_PERSON (list or
                    series of the sampled user's values); both ordered like the columns of the
                    module-level dataframe `df`
        Returns: nothing; prints each column's value for both users, then the words they share
                    according to matching_words
        Note: column names come from the module-level dataframe `df`, which is not a parameter
                    of this function and must exist before it is called.
    '''
    # The driver passes RANDOM_PERSON as a Series indexed by column name.
    # Integer lookups on such a Series are label lookups in current pandas
    # (the positional fallback was deprecated and then removed), so both
    # inputs are flattened to plain lists and indexed by position.
    person_values = list(RANDOM_PERSON)
    match_values = list(BEST_MATCH)

    for position, col in enumerate(df):
        print('------------------------- ',col,' -------------------------')
        print('***RANDOMLY SAMPLED HUMAN***')
        print(person_values[position])
        print('***BEST MATCH***')
        print(match_values[position])
        print('\n')
    print('------------------- MATCHING WORDS -------------------------')
    print(matching_words(person_values,match_values))

In [25]:
#-------------------------------------------- DRIVER CODE ---------------------------------------------------------
if __name__ == "__main__":
    # Load the profiles, keep only complete rows, and renumber them 0..n-1.
    # `df` must stay a module-level name: distance_decider and print_report
    # read it as a global.
    okcupid = 'profiles.csv.zip'
    df = csv_to_df(okcupid)
    df = df.dropna()
    df = df.reset_index(drop=True)
    df = okcupid_regex(df)

    RANDOM_PERSON,KEY = create_random(df)
    cats = ['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
            'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
            'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
            'last_online', 'location', 'offspring', 'orientation', 'pets',
            'religion', 'sex', 'sign', 'smokes', 'speaks', 'status']
    for cat in cats:
        print(cat)
    INPUT = input("Give me a name of a category. See a list of categories above. For all categories, write 'all' (without quotes): ")

    # Validate the choice up front instead of wrapping the whole run in a bare
    # `except:`, which reported every failure (including genuine bugs and
    # Ctrl-C) as "Invalid input." and hid the traceback.
    if INPUT != 'all' and INPUT not in df.columns:
        print('Invalid input.')
    else:
        BEST_MATCH, WORST_MATCH = find_best_match(df,RANDOM_PERSON,KEY,INPUT)
        print_report(BEST_MATCH,RANDOM_PERSON)

age
body_type
diet
drinks
drugs
education
essay0
essay1
essay2
essay3
essay4
essay5
essay6
essay7
essay8
essay9
ethnicity
height
income
job
last_online
location
offspring
orientation
pets
religion
sex
sign
smokes
speaks
status
Give me a name of a category. See a list of categories above. For all categories, write 'all' (without quotes): essay0
LOADING DATA:  essay0
-------------------------  age  -------------------------
***RANDOMLY SAMPLED HUMAN***
28
***BEST MATCH***
21


-------------------------  body_type  -------------------------
***RANDOMLY SAMPLED HUMAN***
athletic
***BEST MATCH***
rather not say


-------------------------  diet  -------------------------
***RANDOMLY SAMPLED HUMAN***
mostly anything
***BEST MATCH***
mostly anything


-------------------------  drinks  -------------------------
***RANDOMLY SAMPLED HUMAN***
socially
***BEST MATCH***
not at all


-------------------------  drugs  -------------------------
***RANDOMLY SAMPLED HUMAN***
never
***BEST MATCH***
neve