IMPORTS

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

In [None]:
# Read the pre-cleaned table; every later cell starts from this frame.
# An 'Unnamed: 0' column (presumably the previous index written out with the CSV)
# is not dropped here: the explicit column selection further down leaves it out.
df = pd.read_csv(filepath_or_buffer='cleaned.csv')

In [None]:
# Preview the first rows and list the available columns.
# A notebook cell only renders its last expression automatically, so the preview
# is passed to display() explicitly; a bare `df.head()` on a non-final line is
# evaluated and then thrown away.
display(df.head())
df.columns

In [None]:
# Keep only the columns used as features for the similarity / PCA analysis.
# .copy() makes `df` an independent frame rather than a selection of the loaded
# table, so assigning a new column to it later does not raise pandas'
# SettingWithCopyWarning.
df = df[['age', 'body_type', 'diet', 'drinks', 'drugs', 'height', 'orientation', 'sex', 'smokes', 'status', 'offspring_status', 'sign_extracted']].copy()

# Untouched snapshot of the feature columns; cells that need the features
# without any added label column start again from this frame.
df_ORIG = df.copy()

In [None]:
# Simulate a user query: treat the first row as the user's input and score every
# row of the table against it with cosine similarity.
#
# Only that single row is compared with the table, which yields a (1, n) result.
# Calling cosine_similarity on the whole frame would build the full n x n matrix
# just to read its first row.
#
# The features are read from df_ORIG rather than df: a later cell adds a string
# 'labels' column to df, after which df can no longer be fed to
# cosine_similarity if this cell is re-run.
labels = cosine_similarity(df_ORIG.iloc[[0]], df_ORIG)[0]
labels

In [None]:
# Cosine-similarity cut-off used when turning scores into labels: a score below
# this value is labelled 'not similar', anything else 'similar'.
SIMILAR_THRESHOLD = 0.8

In [None]:
# Turn each cosine score into a two-class label using SIMILAR_THRESHOLD.
# Note that `labels` is rebound here from numeric scores to strings.
labels = [
    "similar" if not (score < SIMILAR_THRESHOLD) else 'not similar'
    for score in labels
]
labels

In [None]:
# PCA: project the feature columns onto their first two principal components.
#
# * random_state is fixed so the projection is reproducible even when
#   scikit-learn selects its randomized SVD solver.
# * The features come from df_ORIG (the unlabelled snapshot), so this cell can be
#   re-run after a 'labels' column has been added to df.
# * The result keeps the source index, so the later column-wise concat with the
#   label column lines rows up correctly whatever the index looks like.
pca = PCA(n_components=2, random_state=0)
principalComponents = pca.fit_transform(df_ORIG)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2']
             , index = df_ORIG.index)

In [None]:
# Attach the similarity labels to the feature frame as a new 'labels' column.
# NOTE: this mutates df in place, so from here on df is no longer just the
# selected feature columns; df_ORIG still holds the unlabelled version.
df['labels'] = labels
df.head()

In [None]:
# Put the label column next to the two PCA coordinates (column-wise concat,
# rows matched on the index) to get the frame used for plotting.
finalDf = pd.concat(
    objs=[principalDf, df.loc[:, ['labels']]],
    axis='columns',
)

In [None]:
# Scatter plot of the 2-component PCA projection, one colour per similarity label.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

targets = ['similar', 'not similar']
colors = ['r', 'b']
for target, color in zip(targets, colors):
    # Boolean mask selecting the rows that carry the current label
    indicesToKeep = finalDf['labels'] == target
    xs = finalDf.loc[indicesToKeep, 'principal component 1']
    ys = finalDf.loc[indicesToKeep, 'principal component 2']
    ax.scatter(xs, ys, c=color, s=50)

# Legend entries follow the order in which the scatters were drawn
ax.legend(targets)
ax.grid()

In [None]:
# One PCA scatter per similarity level, all for the same reference person.
SIMILARITY_LEVELS=[1, 0.99, 0.9, 0.8, 0.5]
START_PERSON=1  # positional row index of the reference person

# ---- work that does not depend on the level: done once, not once per level ----

# Cosine similarity of the reference row against every row. Comparing just that
# one row gives a (1, n) result instead of the full n x n matrix.
scores = cosine_similarity(df_ORIG.iloc[[START_PERSON]], df_ORIG)[0]

# 2-component PCA of the unlabelled features. random_state keeps the projection
# reproducible if scikit-learn picks its randomized solver; the source index is
# kept so the concat below matches rows correctly.
pca = PCA(n_components=2, random_state=0)
principalComponents = pca.fit_transform(df_ORIG)
principalDf = pd.DataFrame(data = principalComponents
            , columns = ['principal component 1', 'principal component 2']
            , index = df_ORIG.index)

# PCA coordinates of the reference person (highlighted in every plot)
pca1x = principalDf.iloc[START_PERSON]['principal component 1']
pca2y = principalDf.iloc[START_PERSON]['principal component 2']

targets = ['similar', 'not similar']
colors = ['r', 'b']

# Working copy that receives the label column; df_ORIG itself stays untouched.
df = df_ORIG.copy()

for level in SIMILARITY_LEVELS:
    # Label each row from its cosine score and the current similarity level
    labels = ['not similar' if x < level else "similar" for x in scores]

    # Append labels / target to the PCA coordinates
    df['labels'] = labels
    df_labelled = pd.concat([principalDf, df[['labels']]], axis = 1)

    # Plotting
    fig = plt.figure(figsize = (3,3))
    ax = fig.add_subplot(1,1,1)
    ax.set_xlabel('Principal Component 1', fontsize = 15)
    ax.set_ylabel('Principal Component 2', fontsize = 15)
    ax.set_title('Similarity Score for level: {}'.format(level), fontsize = 20)
    for target, color in zip(targets,colors):
        indicesToKeep = df_labelled['labels'] == target
        ax.scatter(df_labelled.loc[indicesToKeep, 'principal component 1']
                , df_labelled.loc[indicesToKeep, 'principal component 2']
                , c = color
                , s = 50)
    # Reference person drawn last so the marker sits on top of the scatter
    ax.plot(pca1x,pca2y,'go', label='person')
    ax.legend(['similar', 'not similar', 'person'])

    ax.grid()

In [None]:
# For a run of reference persons, save one PCA scatter per similarity level.
SIMILARITY_LEVELS=[1, 0.99, 0.9, 0.8, 0.75, 0.5]
START_PERSON=0  # positional row index of the first reference person
N_PERSONS = 50  # number of consecutive rows, starting at START_PERSON, to plot

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('plots', exist_ok=True)

# ---- work that depends on neither person nor level: done once ----

# 2-component PCA of the unlabelled features. random_state keeps the projection
# reproducible if scikit-learn picks its randomized solver; the source index is
# kept so the concat below matches rows correctly.
pca = PCA(n_components=2, random_state=0)
principalComponents = pca.fit_transform(df_ORIG)
principalDf = pd.DataFrame(data = principalComponents
            , columns = ['principal component 1', 'principal component 2']
            , index = df_ORIG.index)

targets = ['similar', 'not similar']
colors = ['r', 'b']

# Working copy that receives the label column; df_ORIG itself stays untouched.
df = df_ORIG.copy()

# Never run past the end of the table when it has fewer rows than requested.
last_person = min(START_PERSON + N_PERSONS, len(df_ORIG))

for person in range(START_PERSON, last_person):
    # Cosine similarity of this person's row against every row, computed once per
    # person (it does not depend on the level). Comparing a single row gives a
    # (1, n) result instead of the full n x n matrix.
    scores = cosine_similarity(df_ORIG.iloc[[person]], df_ORIG)[0]

    # PCA coordinates of the reference person (highlighted in every plot)
    pca1x = principalDf.iloc[person]['principal component 1']
    pca2y = principalDf.iloc[person]['principal component 2']

    for level in SIMILARITY_LEVELS:
        # Label each row from its cosine score and the current similarity level
        labels = ['not similar' if x < level else "similar" for x in scores]

        # Append labels / target to the PCA coordinates
        df['labels'] = labels
        df_labelled = pd.concat([principalDf, df[['labels']]], axis = 1)

        # Plotting
        fig = plt.figure(figsize = (5,5))
        ax = fig.add_subplot(1,1,1)
        ax.set_xlabel('Principal Component 1', fontsize = 15)
        ax.set_ylabel('Principal Component 2', fontsize = 15)
        ax.set_title('Similarity Score for level: {}'.format(level), fontsize = 20)
        for target, color in zip(targets,colors):
            indicesToKeep = df_labelled['labels'] == target
            ax.scatter(df_labelled.loc[indicesToKeep, 'principal component 1']
                    , df_labelled.loc[indicesToKeep, 'principal component 2']
                    , c = color
                    , s = 50)
        ax.plot(pca1x,pca2y,'go', label='person')
        ax.legend(['similar', 'not similar', 'person'])
        # Grid must be switched on before saving, otherwise the PNG lacks it.
        ax.grid()
        fig.savefig('plots/similarity-plot-person-{}-level-{}.png'.format(person, level))
        # Release the figure: this loop can create hundreds of them. The plots
        # are kept as PNG files; closed figures are not rendered inline.
        plt.close(fig)