In [2]:
# Upper bound on how many mentees the matching loop may assign to one mentor.
max_mentees_per_mentor = 2

# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# NOTE(review): `google.colab` is imported above, so this cell presumably only
# runs inside a Colab runtime - confirm before running it anywhere else.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz

# File IDs whose names point at a "cohort3" data set. They are not referenced
# again in the visible cells; presumably kept for reference - TODO confirm.
mentees_cohort3 = '1RSs3hAKgtXYfAc5EKBJOS2hzLf3PokIU'
mentors_cohort3 = '1K6B-ELJhb16ZLRLcN0leEd_ivQqm-Mg1'


# File IDs actually used below.
# NOTE: `mentees` / `mentors` hold ID strings here; the load cell later rebinds
# the same names to DataFrames, so these strings are only valid in this cell.
mentees = '1fCyuLTE1oMjnyyDTGdnZ9wsPFWxNRptl'

mentors = '1VKLd97IGP821CLzx09kK1ajFETgi-adf'


# Create PyDrive file handles for the two IDs.
mentees_downloaded = drive.CreateFile({'id': mentees})
mentors_downloaded = drive.CreateFile({'id': mentors})

# Save each file under a fixed local name; the load cell reads these paths.
mentees_downloaded.GetContentFile('Mentees.csv')
mentors_downloaded.GetContentFile('Mentors.csv')

In [3]:
# Load Data
# Reads the two CSV files saved by the download step. This rebinds `mentees`
# and `mentors` (previously Drive file-ID strings) to DataFrames.
import pandas as pd
import numpy as np  # not referenced in the visible cells
mentees = pd.read_csv('Mentees.csv')
mentors = pd.read_csv('Mentors.csv')


In [4]:
#Clean DF
# Helper for collapsing several same-named columns into one value per row.
def same_merge(x):
    """Join the non-null entries of ``x`` into one comma-separated string.

    Parameters
    ----------
    x : pandas.Series
        Values to merge; null entries are skipped.

    Returns
    -------
    str
        The remaining values converted to ``str`` and joined with ``','``.
        An all-null or empty input gives ``''``.
    """
    present = x[x.notnull()]
    as_text = present.astype(str)
    return ','.join(as_text)

#define new DataFrame that merges columns with same names together
#mentees = mentees.groupby(level=0, axis=1).apply(lambda x: x.apply(same_merge, axis=1))

# Show the column labels of both raw sheets, mentees first, one array per line.
print(mentees.columns.values, mentors.columns.values, sep='\n')



['First Name' 'Last Name' 'Email' 'Details' 'Participation Commitment'
 'Slack Display Name' 'LinkedIn Profile' 'Scheduling Link' 'Timezone'
 'In-Person Meeting Location' 'Years of Experience' 'Roles' 'Industry'
 'Company Stage' 'Topics' 'Most Important Attribute' 'Open Answer'
 'Previous Matches' 'Peer Group Program Interest' 'Coda Account' 'Cohorts'
 'Apply mentor' 'Mentor App Also Submitted' 'Created by' 'Created on'
 'Modified by' 'Modified on' 'Offset' 'Content' 'Test content']
['First Name' 'Last Name' 'Email' 'Detail' 'Participation Commitment'
 'Slack Display Name' 'LinkedIn Profile' 'Scheduling Link' 'Timezones'
 'Offset' 'In-Person Meeting Location' 'Years of Experience' 'Roles'
 'Industry' 'Company Stage' 'Topics' 'Most Important Attribute'
 'Open Answer' 'Previous Matches' 'Peer Group Program Interest'
 'Coda Account' 'Cohort' 'Full Name' 'Created By' 'Created on'
 'Modified by' 'Modified on' 'Test content']


In [5]:
# Columns kept from both sheets. The lists are identical except for the
# timezone column, which is 'Timezone' for mentees and 'Timezones' for mentors.
_shared_columns = [
    'In-Person Meeting Location',
    'Years of Experience',
    'Roles',
    'Industry',
    'Company Stage',
    'Topics',
    'Most Important Attribute',
    'Created on',
]

# Column order follows the item list: Email, timezone column, then the rest.
mentees_flitered = mentees.filter(items=['Email', 'Timezone', *_shared_columns])

mentors_flitered = mentors.filter(items=['Email', 'Timezones', *_shared_columns])
#print(mentors_flitered.columns.values)
#display(mentees_flitered)
#display(mentors_flitered)

In [6]:
# Input: comma-separated string of values (anything that is not a str counts as "no answer")
# Output: list of values with surrounding whitespace stripped and empty entries dropped
def clean_multiselect(x):
    """Split a comma-separated multi-select answer into a list of clean options.

    Parameters
    ----------
    x : object
        The raw cell value. Only ``str`` values are parsed; every other value
        (for example ``None`` or a float NaN) is treated as no answer.

    Returns
    -------
    list of str
        Options in their original order, whitespace-stripped. Empty options
        (from ``''``, ``'a,,b'`` or a trailing comma) are dropped so that a
        blank entry can never be counted as an option shared by two people.
    """
    if not isinstance(x, str):
        return []
    stripped = map(str.strip, x.split(','))
    return [option for option in stripped if option]


In [7]:
#Input Dataframe and multi-select field to Binarize
from sklearn.preprocessing import MultiLabelBinarizer
def MultiLableBinarize_df(input_frame, collumn_name):
    """Append one 0/1 indicator column per option of a multi-select column.

    Parameters
    ----------
    input_frame : pandas.DataFrame
        Frame holding the multi-select column.
    collumn_name : str
        Name of the column whose cells are comma-separated option strings.

    Returns
    -------
    pandas.DataFrame
        ``input_frame`` with the indicator columns concatenated on the right.
        The input frame itself is not modified.
    """
    nested_list = list(map(clean_multiselect, input_frame[collumn_name].to_list()))
    mlb = MultiLabelBinarizer()
    # Reuse the caller's index: a default RangeIndex would make the column-wise
    # concat align on labels and misplace rows (NaN-padded) whenever
    # input_frame is filtered, sorted or otherwise not indexed 0..n-1.
    mlb_df = pd.DataFrame(
        mlb.fit_transform(nested_list),
        columns=mlb.classes_,
        index=input_frame.index,
    )
    bigger = pd.concat([input_frame, mlb_df], axis=1)
    return bigger


In [8]:
class multiSelect:
    """Wrapper around the options chosen for one multi-select answer.

    Wrapping the list in an object lets a whole answer be stored in a single
    DataFrame cell. ``data`` holds the parsed options.
    """

    # Marks "no argument given". A sentinel is used instead of None so that an
    # explicitly passed None is still stored unchanged, as before.
    _UNSET = object()

    def __init__(self, data=_UNSET):
        """Store ``data``, parsing it first when it is a string.

        Parameters
        ----------
        data : str, list or other, optional
            A comma-separated string is split with ``clean_multiselect``. Any
            other value is stored as-is. When omitted, ``['empty']`` is used.
        """
        if data is self._UNSET:
            # Build a fresh list per instance; a list literal in the signature
            # would be shared (and mutable) across every default instance.
            data = ['empty']
        if isinstance(data, str):
            self.data = clean_multiselect(data)
        else:
            self.data = data

    def __repr__(self):
        """Show the wrapped value's own repr."""
        return repr(self.data)

In [9]:
class distanceEstimator:
    """Score a mentor/mentee pair from their multi-select answers.

    Every pair starts at ``BASE_SCORE``; each option the two share on a mapped
    question lowers the score by ``MATCH_STEP`` times that question's weight,
    so a lower score means more overlap.

    Each mapping entry is a dict with the keys ``'mentee_question'``,
    ``'mentor_question'``, ``'question_type'`` and ``'question_weight'``.
    """

    # Score of a pair before any shared option is rewarded.
    BASE_SCORE = 1000
    # Unweighted amount subtracted for each shared option.
    MATCH_STEP = 10

    def __init__(self, mentor_mentee_question_mapping=None):
        """Remember the question mapping used for scoring.

        Parameters
        ----------
        mentor_mentee_question_mapping : list of dict, optional
            Mapping entries as described on the class. Defaults to a new empty
            list per instance (a ``[]`` literal in the signature would be
            shared by all instances).
        """
        if mentor_mentee_question_mapping is None:
            mentor_mentee_question_mapping = []
        self.mentor_mentee_question_mapping = mentor_mentee_question_mapping

    def multiSelectDistance(self, row, mentee_selection, mentor_selection):
        """Score the overlap between two option lists.

        Parameters
        ----------
        row : object
            Unused; kept so existing callers keep working.
        mentee_selection, mentor_selection : list or other
            Options chosen by each side. If either is not a list there is
            nothing to compare and the result is ``(0, [])``.

        Returns
        -------
        tuple (int, list)
            ``-MATCH_STEP`` per mentee option also present in the mentor's
            list, and those options in mentee order.
        """
        matched = []
        if isinstance(mentee_selection, list) and isinstance(mentor_selection, list):
            matched = [option for option in mentee_selection if option in mentor_selection]
        return -self.MATCH_STEP * len(matched), matched

    def _estimateDistance(self, row):
        """Compute the score and the shared options for one combined row.

        ``row`` must hold, for each mapped question, an object with a ``data``
        attribute under the mentee and mentor column names.

        Returns
        -------
        tuple (number, multiSelect)
            The weighted score and all shared options across the questions.
        """
        matched = []
        distance_score = self.BASE_SCORE
        for mapping in self.mentor_mentee_question_mapping:
            mentee_question = mapping['mentee_question']
            mentor_question = mapping['mentor_question']
            if mentee_question == mentor_question:
                # Identically named columns are looked up under their
                # '-mentee' / '-mentor' suffixed names in the combined row.
                shared_name = mentee_question
                mentee_question = shared_name + '-mentee'
                mentor_question = shared_name + '-mentor'
            if mapping['question_type'] == 'multi-select':
                mentee_selection = row[mentee_question].data
                mentor_selection = row[mentor_question].data

                question_score, question_matched = self.multiSelectDistance(
                    row, mentee_selection, mentor_selection)

                distance_score = distance_score + question_score * mapping['question_weight']
                matched = matched + question_matched
        return distance_score, multiSelect(matched)

    def estimateDistance(self, row):
        """Return only the score for ``row`` (lower means more overlap)."""
        distance_score, _matched = self._estimateDistance(row)
        return distance_score

    def matched(self, row):
        """Return only the shared options for ``row``, wrapped in multiSelect."""
        _distance_score, matched = self._estimateDistance(row)
        return matched

In [10]:
# Show which columns survived the filter step, mentees first, one array per line.
print(mentees_flitered.columns.values, mentors_flitered.columns.values, sep='\n')

['Email' 'Timezone' 'In-Person Meeting Location' 'Years of Experience'
 'Roles' 'Industry' 'Company Stage' 'Topics' 'Most Important Attribute'
 'Created on']
['Email' 'Timezones' 'In-Person Meeting Location' 'Years of Experience'
 'Roles' 'Industry' 'Company Stage' 'Topics' 'Most Important Attribute'
 'Created on']


In [11]:
# One entry per question used for scoring: the column name on each sheet, how
# the answers are compared, and how strongly a shared option counts.
mentor_mentee_question_mapping = [{'mentee_question':'Timezone',
                                   'mentor_question':'Timezones',
                                   'question_type': 'multi-select',
                                   'question_weight': 2,},
                                  {'mentee_question':'In-Person Meeting Location',
                                   'mentor_question':'In-Person Meeting Location',
                                   'question_type': 'multi-select',
                                   'question_weight': 1,},
                                  {'mentee_question':'Roles',
                                   'mentor_question':'Roles',
                                   'question_type': 'multi-select',
                                   'question_weight': 8,},
                                  {'mentee_question':'Industry',
                                   'mentor_question':'Industry',
                                   'question_type': 'multi-select',
                                   'question_weight': 6,},
                                  {'mentee_question':'Company Stage',
                                   'mentor_question':'Company Stage',
                                   'question_type': 'multi-select',
                                   'question_weight': 5,},
                                  {'mentee_question':'Topics',
                                   'mentor_question':'Topics',
                                   'question_type': 'multi-select',
                                   'question_weight': 7,}
                                  ]


def _as_multiselect(value):
    """Wrap a raw cell in multiSelect; leave an already wrapped cell alone.

    Without this guard, running the cell a second time would wrap multiSelect
    objects inside multiSelect objects. Their ``data`` would then no longer be
    a list, and every pair would silently keep the base score.
    """
    return value if isinstance(value, multiSelect) else multiSelect(value)


# Replace the raw answer strings of every multi-select question with
# multiSelect objects (this overwrites the columns of the filtered frames).
for mapping in mentor_mentee_question_mapping:
    if mapping['question_type'] == 'multi-select':
        mentee_column = mapping['mentee_question']
        mentor_column = mapping['mentor_question']
        mentees_flitered[mentee_column] = mentees_flitered[mentee_column].apply(_as_multiselect)
        mentors_flitered[mentor_column] = mentors_flitered[mentor_column].apply(_as_multiselect)


# Every mentor paired with every mentee. Columns present on both sides get the
# '-mentor' / '-mentee' suffix; uniquely named ones keep their name.
combined = mentors_flitered.join(mentees_flitered,how='cross',lsuffix='-mentor',rsuffix='-mentee')

In [12]:
#display(combined)

In [13]:
# Score every mentor/mentee pair and record which options they share.
# Each column is filled by its own row-wise pass over `combined`.
dE = distanceEstimator(mentor_mentee_question_mapping)
combined['distance_score'] = combined.apply(dE.estimateDistance, axis=1)
combined['matched_criteria'] = combined.apply(dE.matched, axis=1)

# Ascending order: the lowest scores (most overlap) come first.
combined = combined.sort_values(by='distance_score')
#display(combined)

In [14]:
matched_mentors = {}  # mentor email -> number of mentees assigned so far
matched_mentees = {}  # mentee email -> number of mentors assigned so far
matched_list = []     # one dict per accepted pair, in acceptance order

mentee_id = 'Email-mentee'
mentor_id = 'Email-mentor'

# Greedy assignment: walk the pairs in their current order (sorted by
# distance_score in the scoring cell) and accept a pair when the mentor still
# has capacity, the mentee is still unmatched, and the two are different people.
for index, row in combined.iterrows():
    mentor_email = row[mentor_id]
    mentee_email = row[mentee_id]
    # Register everyone that is seen, even if they end up with no match.
    matched_mentors.setdefault(mentor_email, 0)
    matched_mentees.setdefault(mentee_email, 0)
    if matched_mentors[mentor_email] >= max_mentees_per_mentor:
        continue
    if matched_mentees[mentee_email] >= 1:
        continue
    if mentee_email == mentor_email:
        # Same email on both sheets: never match a person with themselves.
        continue
    matched_mentors[mentor_email] += 1
    matched_mentees[mentee_email] += 1
    matched_list.append({mentor_id: mentor_email,
                         mentee_id: mentee_email,
                         'distance_score': row['distance_score'],
                         'matched': str(row['matched_criteria'])})

# Name the columns explicitly so the frame has them even when no pair was
# accepted; otherwise the joins below fail with KeyError on an empty result.
results = pd.DataFrame(matched_list,
                       columns=[mentor_id, mentee_id, 'distance_score', 'matched'])

# Wide version: add each side's filtered questionnaire columns next to the
# pair, looked up by email.
reuslts_wide = (
    results
    .join(mentors_flitered.set_index('Email'), on=mentor_id, rsuffix='-mentor')
    .join(mentees_flitered.set_index('Email'), on=mentee_id, lsuffix='-mentor', rsuffix='-mentee')
)

results.to_csv('matched.csv', index=False)

reuslts_wide.to_csv('matched_wide.csv', index=False)
#print(matched_mentors)
#print(matched_mentees)