In [6]:
import matplotlib.pyplot as plt
import re
import pandas as pd
import numpy as np
import random
import math
from collections import Counter

def csv_to_df(filename):
    ''' Function: csv_to_df
        Parameters: filename (string), header (list)
        Returns: dataframe containing information from
                the file and headers from input
    '''

    df = pd.read_csv(filename) #,names = header)
    return df


def remove_tags(text):
    TAG_RE = re.compile(r'<[^>]+>')
    return TAG_RE.sub('', text)

def okcupid_regex(df):
    for col in df:
        if df.dtypes[col] == np.object:
            df[col] = df[col].str.replace(r'<[^>]+>','',regex=True)
            df[col] = df[col].str.replace('\n','',regex=True)

    return df

if __name__ == "__main__":
    okcupid = 'profiles.csv.zip'
    df = csv_to_df(okcupid)
    df = df.dropna()
    df = df.reset_index(drop=True)
    df = okcupid_regex(df)

In [8]:
#FIND THE CLOSEST USER USING A DISTANCE MEASUREMENT

'''
def euclidean(our_robot_list,potential_match_list):
    
    #Function: euclidean
    #Parameters: training point dataframe, testing point dataframe
    #Returns: euclidean distance between these two points
    
    
    dist = 0
    for i in range(len(our_robot_list)):
        if our_robot_list[i] == potential_match_list[i]:
            continue
        if type(our_robot_list[i]) == str:
            robot_clean = clean_text(new_row[i])
            robot_dict = Counter(robot_clean)
            potential_clean = clean_text(potential_match_list[i])
            potential_dict = Counter(potential_clean)
            dist += compute_cosine(robot_dict,potential_dict)
        else: 
            dist += (our_robot_list[i] - potential_match_list[i]) ** 2
    return math.sqrt(dist)
'''

def euclidean_test(our_robot_list,potential_match_list, counter_dict):
    '''
    Function: euclidean
    Parameters: training point dataframe, testing point dataframe
    Returns: euclidean distance between these two points
    
    '''
    dist = 0
    i = 6
    if our_robot_list[i] == potential_match_list[i]:
        return 0
    if type(our_robot_list[i]) == str:
        robot_clean = clean_text(our_robot_list[i])
        robot_dict = Counter(robot_clean)
        potential_clean = clean_text(potential_match_list[i])
        potential_dict = Counter(potential_clean)
        for x in potential_dict:
            potential_dict[x] /= math.sqrt(counter_dict[x])
        dist += compute_cosine(robot_dict,potential_dict)
    else: 
        #dist += (our_robot_list[i] - potential_match_list[i]) ** 2
        dist += 0
    return math.sqrt(dist)
    
STOPWORDS =["a", "an", "and", "the", "to", "i", "if", "of", "that", "it",
            "is", "im", "has", "was", "his", "ive", "at", "in", "your", "its",
            "for", "this"]

def clean_text(words):
    ''' Function: clean_text
        Parameters: a string
        Returns: the string split into a list, but lowercased and 
                 with punctuation removed,
                 and stopwords removed
    '''
    clean_words = []
    for word in words.split():
        word = re.sub("[^\w\s]", '', word)
        word = word.lower()
        if word not in STOPWORDS:
            clean_words.append(word)
    return clean_words

def mag(vec):
    ''' Function: mag
        Parameters: a vector (list of ints/floats)
        Returns: the magnitude of the vector
    '''
    mags = [num ** 2 for num in vec]
    return sum(mags) ** 0.5

def dot(v1, v2):
    ''' Function: dot
        Parameters: two vectors (list of ints/floats)
        Returns: the dot product of the vectors
    '''
    dots = [v1[i] * v2[i] for i in range(len(v1))]
    return sum(dots)

def compute_cosine(wc1, wc2):
    ''' Function compute_cosine
        Parameters: two dictionaries, with wordcounts
        Returns: a float, the cosine similarity measure
    '''
     
    # Start with all the words in both dictionaries, de-duped
    all_words = set(list(wc1.keys()) + list(wc2.keys()))

    # Make the vectors: 0 if they've never said the word,
    # the wordcount from given dictionary otherwise
    vec1 = {word : (wc1[word] if word in wc1.keys() else 0) 
            for word in all_words}
    vec2 = {word : (wc2[word] if word in wc2.keys() else 0)
            for word in all_words}

    #mag1 = mag(vec1.values())
    #mag2 = mag(vec2.values())
    mag1 = len(vec1.values()) # This is better proxy for vector "size" since vector entries are being weighted before being sent into this function
    mag2 = len(vec2.values())
    dot_prod = dot(list(vec1.values()), list(vec2.values()))
    if (mag1* mag2) == 0:
        return 1
    return dot_prod / (mag1 * mag2)


In [9]:
#MOST ESSAY/PERSONALITY SIMILARITIES

def total_word_count(df):
    counter_dict = {}
    for i in range(len(df)):
        potential_clean = clean_text(df.iloc[i, 6])
        potential_dict = Counter(potential_clean)
        for x in potential_dict:
            if (x not in counter_dict):
                counter_dict[x] = 0
            counter_dict[x] += potential_dict[x]
            
    return counter_dict

def most_similar(random_person,df,counter_dict):
    most_similar = 0
    for i in range(len(df)):
        x = euclidean_test(random_person.tolist(),list(df.loc[i]), counter_dict)
        if x > most_similar:
            most_similar = x
            key = i
    
    best_match = df.iloc[key,:]
    return best_match.tolist()

def matching_words(random_person,best_match):
    i = 6
    random_clean = clean_text(random_person[i])
    random_dict = Counter(random_clean)
    bestmatch_clean = clean_text(best_match[i])
    bestmatch_dict = Counter(bestmatch_clean)

    matching_words = [x for x in bestmatch_dict if x in random_dict]
    return matching_words

In [19]:
#Creating Series for a Randomly Sampled User

new_dict = {}
i = 0
random_person = random.randint(0,len(df) - 1)
for col in df:
    new_dict[col] = df.iloc[random_person,i]
    i += 1

RANDOM_PERSON = pd.Series(new_dict)

In [20]:
counter_dict = total_word_count(df)
BEST_MATCH = most_similar(RANDOM_PERSON,df,counter_dict)
MATCHING_WORDS = matching_words(RANDOM_PERSON,BEST_MATCH)

print('***RANDOMLY SAMPLED HUMAN***')
print(RANDOM_PERSON[6])
print('\n')
print('***BEST MATCH***')
print(BEST_MATCH[6])
print('\n')
print('***WORDS IN COMMON***')
print(MATCHING_WORDS)
print('-------------------------')

***RANDOMLY SAMPLED HUMAN***
i'm a mix of girly girl and tomboy. i have a fascination withstilettos, dresses, and old british novels (bronte, hardy, dickens)but also love whiskey, my jeans and a good giants game. i golf,sail and am learning to paraglide, but also know how to sew andmake a wicked eclair.i'm always interested in learning/seeing/doing new things and amtrying to strike the right balance between being fun andadventurous with being responsible and dependable. i am openlywarm-hearted and affectionate and have been accused of being a"softie". but i can be quite fiery, so you've got to like a girlwith moxie, honesty, sharp wit and a strong sense of self.family is very important to me. my sibs and parents impress me moreand more each day and i'm incredibly grateful for having suchamazing people in my life. i enjoy both my job and the independenceit affords, but also know that the truly important things in lifeall come down to family and good friends. in many ways, i'm atradition