In [46]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download each source sheet by its Google Drive file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
#
# The IDs get their own names: `mentees` / `mentors` are bound to DataFrames
# when the CSVs are loaded, and reusing those names for the ID strings meant
# that re-running this cell silently replaced the DataFrames with strings.
MENTEES_FILE_ID = '1RSs3hAKgtXYfAc5EKBJOS2hzLf3PokIU'
MENTORS_FILE_ID = '1K6B-ELJhb16ZLRLcN0leEd_ivQqm-Mg1'

mentees_downloaded = drive.CreateFile({'id': MENTEES_FILE_ID})
mentors_downloaded = drive.CreateFile({'id': MENTORS_FILE_ID})

# The file names below are the ones the loading cell passes to pd.read_csv.
mentees_downloaded.GetContentFile('Cohort 3 Data - Mentees.csv')
mentors_downloaded.GetContentFile('Cohort 3 Data - Mentors.csv')

In [47]:
import pandas as pd
import numpy as np
# Load the raw survey exports.
mentees = pd.read_csv('Cohort 3 Data - Mentees.csv')
mentors = pd.read_csv('Cohort 3 Data - Mentors.csv')

# Keep only the survey columns used for matching and for the final report.
# DataFrame.filter(items=...) keeps the labels that exist in the frame and
# ignores the rest, so a misspelled question is dropped without an error.
# Each label is listed once: the "main goal" question used to appear twice,
# which can duplicate the column depending on the pandas version.
mentees_flitered = mentees.filter(items=[
    "Id",
    "What's your email?",
    'What role are you looking for help in?',
    "Which best describes the position you're in?",
    'How many years of professional experience do you have in your current role?',
    'What is the main goal you want help with from your mentor?',
    'How many years of experience do you have as a founder?',
    'What industry are you looking for help with?',
    'What company stage do you want your mentor to be in?',
    'What timezone are you in?',
    'Have you been matched before?',
    "Tell us why you want to be a part of Lenny's Mentorship Program!",
])

# NOTE(review): the goals question is listed under two spellings
# ("Which goals ..." and "What goals ..."); whichever exists in the sheet is
# kept. TODO confirm the real header and drop the other spelling.
mentors_flitered = mentors.filter(items=[
    "Id",
    "What's your email?",
    'Which functional role do you want your mentee to be in?',
    "Which best describes the position you're in?",
    'How many years of professional experience do you have in the role you want to be a mentor for?',
    'Which goals can you help your mentee with?',
    'How many years of experience do you have as a founder?',
    'What industry can you help your mentee with?',
    'What company stage do you want your mentee to be in?',
    'What timezone are you in?',
    'Have you been matched before?',
    'What goals can you help your mentee with?',
])


In [48]:
# Input: comma-separated string of values
# Output: list of the values with surrounding whitespace stripped off
def clean_multiselect(x):
    """Split a comma-separated string into a list of trimmed, non-empty values.

    Args:
        x: The raw cell value. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as "no selection".

    Returns:
        A list of the comma-separated tokens with leading/trailing whitespace
        removed. Tokens that are empty after stripping (from ``""``, ``"a,,b"``
        or a trailing comma) are dropped so they are never treated as a value.
        Non-string input yields ``[]``.
    """
    if not isinstance(x, str):
        return []
    stripped_tokens = (token.strip() for token in x.split(','))
    # Drop empty tokens: otherwise '' would count as a selectable value.
    return [token for token in stripped_tokens if token]


In [49]:
#Input Dataframe and multi-select field to Binarize
from sklearn.preprocessing import MultiLabelBinarizer
def MultiLableBinarize_df(input_frame, collumn_name):
    """Append one 0/1 indicator column per distinct value of a multi-select column.

    Args:
        input_frame: DataFrame containing the multi-select column.
        collumn_name: Name of the column holding comma-separated selections.

    Returns:
        A new DataFrame: ``input_frame`` with the indicator columns produced by
        ``MultiLabelBinarizer`` concatenated on the right. ``input_frame``
        itself is not modified.
    """
    nested_list = [clean_multiselect(value) for value in input_frame[collumn_name].to_list()]
    mlb = MultiLabelBinarizer()
    # Reuse the input's index: pd.concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign rows (and add NaN rows) whenever
    # input_frame is not indexed 0..n-1, e.g. after filtering or sorting.
    mlb_df = pd.DataFrame(
        mlb.fit_transform(nested_list),
        columns=mlb.classes_,
        index=input_frame.index,
    )
    return pd.concat([input_frame, mlb_df], axis=1)


In [50]:
class multiSelect:
    """Wrapper around the list of values chosen in a multi-select answer.

    Attributes are documented inline; the repr is the repr of the wrapped data.
    """

    # Sentinel meaning "no argument given". A sentinel replaces the former
    # mutable default list, which was shared by every default-constructed
    # instance, so mutating one instance's data leaked into all the others.
    _UNSET = object()

    def __init__(self, data=_UNSET):
        """Store the selection.

        Args:
            data: If omitted, the instance gets its own fresh ``['empty']``
                list. A ``str`` is parsed with ``clean_multiselect``. Any other
                value (a list, ``None``, NaN, ...) is stored unchanged.
        """
        if data is multiSelect._UNSET:
            self.data = ['empty']
        elif isinstance(data, str):
            self.data = clean_multiselect(data)
        else:
            self.data = data

    def __repr__(self):
        """Return the repr of the wrapped data."""
        return repr(self.data)

In [51]:
class distanceEstimator:
    """Scores a mentor/mentee pair; a lower score means a better match."""

    def __init__(self, base_score=1000, match_bonus=50):
        """Configure the scoring.

        Args:
            base_score: Score given to a pair with nothing in common.
            match_bonus: Amount subtracted from the score for each industry
                the mentor offers that the mentee also asked for.
        """
        self.base_score = base_score
        self.match_bonus = match_bonus

    def estimateDistance(self, row):
        """Return the distance score for one mentor/mentee pair.

        Args:
            row: Mapping-like object (for example a DataFrame row) holding the
                two industry columns; each value must expose a ``.data``
                attribute.

        Returns:
            ``base_score`` minus ``match_bonus`` for every entry of the
            mentor's industry list that also appears in the mentee's list. If
            either ``.data`` is not a list, ``base_score`` is returned as is.
        """
        mentee = row['What industry are you looking for help with?'].data
        mentor = row['What industry can you help your mentee with?'].data

        distance_score = self.base_score
        # Only score when both sides are real lists; anything else is treated
        # as "no overlap" and keeps the base score.
        if isinstance(mentee, list) and isinstance(mentor, list):
            shared = sum(1 for industry in mentor if industry in mentee)
            distance_score -= self.match_bonus * shared
        return distance_score

In [52]:
def _as_multiselect(value):
    """Wrap value in multiSelect unless it already is one."""
    return value if isinstance(value, multiSelect) else multiSelect(value)


# Replace each raw industry answer with a multiSelect so the scorer can read
# the parsed list from `.data`. Already-wrapped values are left untouched, so
# re-running this cell is a no-op; previously a second run nested the wrappers,
# `.data` stopped being a list, and every pair fell back to the base score.
mentees_flitered['What industry are you looking for help with?'] = mentees_flitered['What industry are you looking for help with?'].apply(_as_multiselect)
mentors_flitered['What industry can you help your mentee with?'] = mentors_flitered['What industry can you help your mentee with?'].apply(_as_multiselect)

# Every mentor paired with every mentee; column names present in both frames
# get the '-mentor' / '-mentee' suffix.
combined = mentors_flitered.join(mentees_flitered, how='cross', lsuffix='-mentor', rsuffix='-mentee')

In [53]:
# Score every candidate pair row by row, then order the pairs by ascending
# distance_score so the lowest-distance pairs come first.
dE = distanceEstimator()
combined['distance_score'] = combined.apply(dE.estimateDistance, axis=1)
combined = combined.sort_values(by='distance_score')

In [54]:
# Greedy one-to-one matching: walk the candidate pairs in the order of
# `combined` and accept a pair only if neither side has been matched yet.
matched_mentors = {}  # mentor Id -> number of mentees assigned so far
matched_mentees = {}  # mentee Id -> number of mentors assigned so far
matched_list = []
pair_columns = ['Id-mentor', 'Id-mentee', 'distance_score']

# Iterate only the three needed columns instead of DataFrame.iterrows(), which
# builds a full Series for every row of the cross join.
for mentor_id, mentee_id, score in zip(*(combined[name] for name in pair_columns)):
    # Every Id seen is recorded, even if it never gets matched (count stays 0).
    matched_mentors.setdefault(mentor_id, 0)
    matched_mentees.setdefault(mentee_id, 0)
    if matched_mentors[mentor_id] >= 1:
        continue
    if matched_mentees[mentee_id] >= 1:
        continue
    # Skip pairs where both sides carry the same Id.
    if mentee_id == mentor_id:
        continue
    matched_mentors[mentor_id] += 1
    matched_mentees[mentee_id] += 1
    matched_list.append({'Id-mentor': mentor_id, 'Id-mentee': mentee_id, 'distance_score': score})

# Explicit columns keep the join keys present even when no pair was accepted;
# without them an empty result has no 'Id-mentor' column and the join below
# raises KeyError.
results = pd.DataFrame(matched_list, columns=pair_columns)

# Attach the mentor's and then the mentee's survey answers to each match.
# Column names shared by both sheets get '-mentor' / '-mentee' suffixes on the
# second join.
reuslts_wide = (
    results
    .join(mentors_flitered.set_index('Id'), on='Id-mentor', rsuffix='-mentor')
    .join(mentees_flitered.set_index('Id'), on='Id-mentee', lsuffix='-mentor', rsuffix='-mentee')
)

reuslts_wide.to_csv('matched.csv', index=False)