In [1]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Sign in from Colab, then give PyDrive the application-default credentials.
# This only needs to run once per notebook session.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Google Drive file IDs of the mentee and mentor files.
# (A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz)
mentees = '1RSs3hAKgtXYfAc5EKBJOS2hzLf3PokIU'
mentors = '1K6B-ELJhb16ZLRLcN0leEd_ivQqm-Mg1'

# Mentee file: create the Drive handle and save its content locally.
mentees_downloaded = drive.CreateFile({'id': mentees})
mentees_downloaded.GetContentFile('Cohort 3 Data - Mentees.csv')

# Mentor file: same two steps.
mentors_downloaded = drive.CreateFile({'id': mentors})
mentors_downloaded.GetContentFile('Cohort 3 Data - Mentors.csv')

In [2]:
import pandas as pd
import numpy as np
# Load both downloaded files; 'Id' is read as text rather than inferred.
mentees = pd.read_csv(
    'Cohort 3 Data - Mentees.csv',
    dtype={'Id': str},
)
mentors = pd.read_csv(
    'Cohort 3 Data - Mentors.csv',
    dtype={'Id': str},
)
#Clean DF
def same_merge(x):
    """Join the non-null entries of ``x`` into one comma-separated string.

    ``x`` must support ``notnull()``, boolean-mask indexing and
    ``astype(str)`` (a pandas Series does). Null entries are dropped, the
    remaining ones are converted to ``str`` and joined with ','. If every
    entry is null the result is the empty string.
    """
    non_null = x[x.notnull()]
    as_text = non_null.astype(str)
    return ','.join(as_text)

# Merge columns that share a name into one string column (non-null values
# joined with ','; missing values become ''). This is built one column name at
# a time instead of with DataFrame.groupby(level=0, axis=1): grouping along
# axis=1 is deprecated since pandas 2.1. Names are visited in sorted order,
# which is the column order the grouped version produced.
mentees = pd.DataFrame(
    {
        name: mentees.loc[:, mentees.columns == name].apply(same_merge, axis=1)
        for name in sorted(mentees.columns.unique())
    },
    index=mentees.index,
)

# Keep only the mentee questions used further down. filter(items=...) silently
# skips any label that is not present in the frame.
mentees_flitered = mentees.filter(items=["Id",
                 "What's your email?",
                 'What role are you looking for help in?',
                 "Which best describes the position you're in?",
                 'How many years of professional experience do you have in your current role?',
                 'What is the main goal you want help with from your mentor?',
                 'How many years of experience do you have as a founder?',
                 'What industry are you looking for help with?',
                 'What company stage do you want your mentor to be in?',
                 'What timezone are you in?',
                 'Have you been matched before?',
                 "Tell us why you want to be a part of Lenny's Mentorship Program!"
                ])

# Same selection for the mentor side. Note that both the 'Which goals ...' and
# the 'What goals ...' columns are kept.
mentors_flitered = mentors.filter(items=["Id",
                 "What's your email?",
                 'Which functional role do you want your mentee to be in?',
                 "Which best describes the position you're in?",
                 'How many years of professional experience do you have in the role you want to be a mentor for?',
                 'Which goals can you help your mentee with?',
                 'How many years of experience do you have as a founder?',
                 'What industry can you help your mentee with?',
                 'What company stage do you want your mentee to be in?',
                 'What timezone are you in?',
                 'Have you been matched before?',
                 'What goals can you help your mentee with?'
                ])

# Print the full (unfiltered) column lists as a sanity check on the names above.
print(mentors.columns.values)
print(mentees.columns.values)
#print(mentors_flitered.columns.values)
#display(mentees_flitered)
#display(mentors_flitered)


['Id' "What's your email?" "What's your LinkedIn or Twitter?"
 'Which functional role do you want your mentee to be in?'
 "Which best describes the position you're in?"
 'How many years of professional experience do you have in the role you want to be a mentor for?'
 'Which goals can you help your mentee with?'
 'How many years of experience do you have as a founder?'
 'What industry can you help your mentee with?'
 'What company stage do you want your mentee to be in?'
 'What timezone are you in?' 'Have you been matched before?'
 'Do you have a scheduling link?'
 'What goals can you help your mentee with?']
['Do you have a scheduling link?' 'Have you been matched before?'
 'How many years of experience do you have as a founder?'
 'How many years of professional experience do you have in your current role?'
 'Id' 'Prioritized'
 "Tell us why you want to be a part of Lenny's Mentorship Program!"
 'What company stage do you want your mentor to be in?'
 'What industry are you looking for h

In [3]:
def clean_multiselect(x):
    """Split a comma-separated multi-select answer into a list of options.

    Input:  a string such as ``'B2B, Consumer'``. Any non-string value
            (for example NaN for an unanswered question) counts as no answer.
    Output: the options with surrounding whitespace stripped. Blank tokens
            (from an empty string, a trailing comma or ``',,'``) are dropped,
            so an empty answer yields ``[]`` rather than ``['']``.
    """
    if not isinstance(x, str):
        return []
    # Drop blank tokens: '' would otherwise behave like a real option and two
    # answers could "share" it merely because both contain a stray comma.
    return [token.strip() for token in x.split(',') if token.strip()]


In [4]:
#Input Dataframe and multi-select field to Binarize
from sklearn.preprocessing import MultiLabelBinarizer
def MultiLableBinarize_df(input_frame, collumn_name):
    """Append one 0/1 indicator column per option of a multi-select column.

    Parameters
    ----------
    input_frame : pandas.DataFrame
        Frame holding the answers.
    collumn_name : str
        Name of the column with comma-separated multi-select answers; each
        value is parsed with ``clean_multiselect``.

    Returns
    -------
    pandas.DataFrame
        ``input_frame`` with the indicator columns concatenated on the right.
        The input frame itself is not modified.
    """
    nested_list = [clean_multiselect(answer) for answer in input_frame[collumn_name].to_list()]
    mlb = MultiLabelBinarizer()
    # Reuse the caller's index: with a default RangeIndex here, concat(axis=1)
    # would align on labels and misplace rows for any frame that has been
    # filtered, sorted or otherwise re-indexed.
    mlb_df = pd.DataFrame(
        mlb.fit_transform(nested_list),
        columns=mlb.classes_,
        index=input_frame.index,
    )
    return pd.concat([input_frame, mlb_df], axis=1)


In [5]:
class multiSelect:
    """Container for one multi-select answer.

    ``data`` holds the selected options:
      * a string is parsed with ``clean_multiselect`` into a list of options;
      * a list is stored as a shallow copy;
      * anything else (e.g. NaN for a missing answer) is stored unchanged.
    """
    def __init__(self, data = ['empty']):
        if isinstance(data, str):
            self.data = clean_multiselect(data)
        elif isinstance(data, list):
            # Copy so instances never share one list object. Without this,
            # every default-constructed instance would alias the single
            # ['empty'] default and a mutation through one would leak into all.
            self.data = list(data)
        else:
            self.data = data

    def __repr__(self):
        # Display exactly like the underlying value.
        return repr(self.data)

In [6]:
class distanceEstimator:
    """Scores a mentor/mentee pair from their answers; lower means closer.

    ``mentor_mentee_question_mapping`` is a list of dicts, each with the keys
    ``'mentee_question'``, ``'mentor_question'``, ``'question_type'`` and
    ``'question_weight'``. Only entries whose ``'question_type'`` is
    ``'multi-select'`` contribute to the score; other types are ignored.
    """
    def __init__(self, mentor_mentee_question_mapping = None):
        # Default to a fresh list per instance; a literal [] default would be
        # one shared object stored on every instance created without arguments.
        if mentor_mentee_question_mapping is None:
            mentor_mentee_question_mapping = []
        self.mentor_mentee_question_mapping = mentor_mentee_question_mapping

    def multiSelectDistance(self,row,mentee_selection,mentor_selection):
        """Return -10 for every mentee option that the mentor also selected.

        ``row`` is accepted for interface compatibility but not used.
        Returns 0 unless both selections are lists.
        """
        if not (isinstance(mentee_selection, list) and isinstance(mentor_selection, list)):
            return 0
        shared = sum(1 for selection in mentee_selection if selection in mentor_selection)
        return -10 * shared

    def estimateDistance(self, row):
        """Return the distance score for one combined mentor/mentee row.

        ``row`` is indexed by question text and each looked-up value must
        expose a ``.data`` attribute. The score starts at 1000 and each
        multi-select mapping adds its (non-positive) overlap score times its
        ``'question_weight'``.
        """
        distance_score = 1000
        for mapping in self.mentor_mentee_question_mapping:
            if mapping['question_type'] != 'multi-select':
                continue
            mentee_selection = row[mapping['mentee_question']].data
            mentor_selection = row[mapping['mentor_question']].data
            overlap = self.multiSelectDistance(row, mentee_selection, mentor_selection)
            distance_score = distance_score + overlap * mapping['question_weight']
        return distance_score

In [7]:
# Show the column names kept on each side: mentees first, then mentors.
print(mentees_flitered.columns.values, mentors_flitered.columns.values, sep='\n')

['Id' "What's your email?" 'What role are you looking for help in?'
 "Which best describes the position you're in?"
 'How many years of professional experience do you have in your current role?'
 'What is the main goal you want help with from your mentor?'
 'How many years of experience do you have as a founder?'
 'What industry are you looking for help with?'
 'What company stage do you want your mentor to be in?'
 'What timezone are you in?' 'Have you been matched before?'
 "Tell us why you want to be a part of Lenny's Mentorship Program!"]
['Id' "What's your email?"
 'Which functional role do you want your mentee to be in?'
 "Which best describes the position you're in?"
 'How many years of professional experience do you have in the role you want to be a mentor for?'
 'Which goals can you help your mentee with?'
 'How many years of experience do you have as a founder?'
 'What industry can you help your mentee with?'
 'What company stage do you want your mentee to be in?'
 'What time

In [8]:
# Pairs of questions that are compared between a mentee and a mentor.
# 'question_weight' scales how much an overlap on that pair counts.
mentor_mentee_question_mapping = [{'mentee_question':'What industry are you looking for help with?',
                                   'mentor_question':'What industry can you help your mentee with?',
                                   'question_type': 'multi-select',
                                   'question_weight': 4,},
                                  {'mentee_question':'What is the main goal you want help with from your mentor?',
                                   'mentor_question':'Which goals can you help your mentee with?',
                                   'question_type': 'multi-select',
                                   'question_weight': 5,},
                                  {'mentee_question':'What company stage do you want your mentor to be in?',
                                   'mentor_question':'What company stage do you want your mentee to be in?',
                                   'question_type': 'multi-select',
                                   'question_weight': 1,}
                                  ]


def _as_multiselect(value):
    """Wrap a raw answer in multiSelect; leave an already wrapped value as is.

    The guard keeps this cell safe to re-run: wrapping a multiSelect in a
    second multiSelect would make ``.data`` a non-list, and the overlap score
    for that question would then silently be 0 for every pair.
    """
    return value if isinstance(value, multiSelect) else multiSelect(value)


# Convert the raw answers of every multi-select question, on both sides.
for mapping in mentor_mentee_question_mapping:
    if mapping['question_type'] == 'multi-select':
        mentees_flitered[mapping['mentee_question']] = mentees_flitered[mapping['mentee_question']].apply(_as_multiselect)
        mentors_flitered[mapping['mentor_question']] = mentors_flitered[mapping['mentor_question']].apply(_as_multiselect)


# Every mentor paired with every mentee; column names present on both sides
# get the '-mentor' / '-mentee' suffix.
combined = mentors_flitered.join(mentees_flitered,how='cross',lsuffix='-mentor',rsuffix='-mentee')

In [9]:
# Score every mentor/mentee pair, then order the pairs by ascending distance
# score (lowest score first).
dE = distanceEstimator(mentor_mentee_question_mapping)
combined = (
    combined
    .assign(distance_score=lambda pairs: pairs.apply(dE.estimateDistance, axis=1))
    .sort_values(by='distance_score')
)

In [11]:
# Greedy one-to-one matching. `combined` is walked in its current row order
# and a pair is accepted when neither its mentor nor its mentee has been
# matched yet and the two Ids differ (nobody is matched to themselves).
matched_mentors = {}   # mentor Id -> number of accepted matches (0 or 1)
matched_mentees = {}   # mentee Id -> number of accepted matches (0 or 1)
matched_list = []
match_columns = ['Id-mentor', 'Id-mentee', 'distance_score']

# Only the three columns needed for matching are iterated; building a full
# row Series per pair (iterrows) is wasted work on the wide cross product.
for mentor_id, mentee_id, score in combined[match_columns].itertuples(index=False, name=None):
    matched_mentors.setdefault(mentor_id, 0)
    matched_mentees.setdefault(mentee_id, 0)
    if matched_mentors[mentor_id] >= 1:
        continue
    if matched_mentees[mentee_id] >= 1:
        continue
    if mentee_id == mentor_id:
        # Same Id on both sides: skip the self-match.
        continue
    matched_mentors[mentor_id] += 1
    matched_mentees[mentee_id] += 1
    matched_list.append({'Id-mentor': mentor_id, 'Id-mentee': mentee_id, 'distance_score': score})

# Explicit columns keep the joins below valid even when no pair was matched;
# an empty list would otherwise give a frame without an 'Id-mentor' column.
results = pd.DataFrame(matched_list, columns=match_columns)

# Attach the mentor answers, then the mentee answers. Column names that occur
# on both sides are suffixed '-mentor' / '-mentee' by the second join.
reuslts_wide = results.join(mentors_flitered.set_index('Id'),on = 'Id-mentor', rsuffix='-mentor').join(mentees_flitered.set_index('Id'),on = 'Id-mentee',lsuffix='-mentor', rsuffix='-mentee')

reuslts_wide.to_csv('matched.csv', index=False)
#print(matched_mentors)
#print(matched_mentees)

skipped, matching to self
skipped, matching to self
