In [1]:
import pandas as pd
import sys

In [2]:
def normalize(data_point: float, data_list: list) -> float:
    ''' Normalize the data to be within the rang of 0.0-1.0
        Round answer to 2 digits
    '''
    if not isinstance(data_point, (int, float)):
        sys.exit(f"Function '{perform_matching.__name__}': Data point/value was not passed.")
    elif not isinstance(data_list, (list)):
        sys.exit(f"Function '{perform_matching.__name__}': Data list of values was not passed.")
    else:
        return format((data_point-min(data_list)) / (max(data_list)-min(data_list)), '0.3f')

In [3]:
def unique_topcs_methods(methods_list: list, topics_list: list):
    
    ### Try to collect all unique entries for methods and topics from reviewers
    #print(f'All possible topics: {set(reviewer_topic_list)}')
    
    methods = []
    #methods = set(methods_list)
    uniq_methods = ', '.join(set(methods_list)).split(', ') 
    #print(f'All possible methods: {set(uniq_methods)}\n')

    topics = []
    #topics = set(topics_list)
    uniq_topics = ', '.join(set(topics_list)).split(', ') 
    #print(f'All possible topics: {set(uniq_topics)}\n')

    counter=0
    #print(topics_list)
    for topic in uniq_topics:
        for t in topics_list:
            if topic == t:
                counter += 1
        #print(f'{topic}: {counter}')
    revi_methods = pd.DataFrame(data={'Methods': sorted(list(set(uniq_methods)))})
    revi_methods.to_csv('reviewer_methods.csv', sep=';', index=False)
    revi_topics = pd.DataFrame(data={'Methods': sorted(list(set(uniq_topics)))})
    revi_topics.to_csv('reviewer_topics.csv', sep=';', index=False)

In [4]:
def filter_top(entrants: pd.DataFrame, matching_df: pd.DataFrame) -> pd.DataFrame:
    ''' Select the top 3 matchings for each candidate and reviewer and return the results.'''
    
    if not isinstance(entrants, pd.core.frame.DataFrame) or not isinstance(matching_df, pd.core.frame.DataFrame):
        sys.exit(f"Function '{filter_top.__name__}': Either 'entrants' or 'matching_df' "
                  "was not passed as a Pandas dataframe.")

    reviewer_collector = {}
    candidate_collector = {}
    perfect_score_list = []

    ## TODO: check to see if I can remove the second condition below (and...)

    ## len(entrants) is the same as entrants.shape[0]

    ## assign starting value of zero
    for candidate in entrants.itertuples():
        if not candidate[1] in candidate_collector and len(candidate_collector) < len(entrants):
            candidate_collector[candidate[1]] = 0
            #print(candidate_collector)

    # key part: must operate on matching_df that has been sorted by total_score, and then limit it
    for reviewer in matching_df.itertuples():
        if not reviewer[1] in reviewer_collector and len(reviewer_collector) < len(entrants):
            reviewer_collector[reviewer[1]] = 0
            #print(reviewer_collector)

    for index, matching_pair in matching_df.iterrows():
        revi = matching_pair['reviewer']
        revi_email = matching_pair['reviewer_email']
        revi_topics = matching_pair['reviewer_topics']
        revi_methods = matching_pair['reviewer_methods']
        candi = matching_pair['candidate']
        candi_email = matching_pair['candidate_email']
        candi_topics = matching_pair['candidate_topics']
        candi_methods = matching_pair['candidate_methods']
        itimized_score = matching_pair['itimized_score_topics_methods_history']
        total_score = matching_pair['total_score']
        
        if revi in reviewer_collector and candi in candidate_collector \
                                      and reviewer_collector[revi] != 3 \
                                      and candidate_collector[candi] != 3:
            perfect_score_list.append((revi, candi, itimized_score, total_score))
            reviewer_collector[revi] += 1
            candidate_collector[candi] += 1
            #print(reviewer_collector)

    final_df = pd.DataFrame(data={'Reviewer Name': [i[0] for i in perfect_score_list],
                                  'Candidate Name': [i[1] for i in perfect_score_list],
                                  "Itimized score [topics, methods, reviewer's history]": [i[2] for i in perfect_score_list],
                                  'total_score': [i[3] for i in perfect_score_list]})

    # Could sort by "Reviewer Name" or "Candidate Name"
    final_df = final_df.sort_values(by=['Candidate Name', 'total_score'], ascending=[True, False])
    #final_df = final_df.sort_values(by=['total_score', 'Candidate Name'], ascending=[False, False])

    #print(perfect_score_list)
    return final_df

In [5]:
def scoring(reviewer: pd.Series, candidate: pd.Series, reviewer_history: list) -> int:
    ''' Score the matching between reviewer and candidate based on:
        1) topics,
        2) methods, and
        3) history of recent reviews done by reviewer.
    '''

    if not isinstance(reviewer, pd.core.series.Series) or not isinstance(candidate, pd.core.series.Series):
        sys.exit(f"Function '{scoring.__name__}': Either 'reviewer' or 'candidate' "
                  "was not passed as a Pandas dataframe.")
    elif not isinstance(reviewer_history, list):
        sys.exit(f"Function '{perform_matching.__name__}': History of reviewers' reviews not passed.")

    score_topics = 0
    score_methods = 0
    score_history = 0
    score_total = 0

    reviewer_topics = reviewer['Topics'].split(sep=', ')
    candidate_topics = candidate['Topics'].split(sep=', ')
    reviewer_methods = reviewer['Methods'].split(sep=', ')
    candidate_methods = candidate['Methods'].split(sep=', ')
    
    for topic in reviewer_topics:
        if topic.lower() in candidate_topics:
            score_topics += 1

    for methodology in reviewer_methods:
        if methodology in candidate_methods:
            score_methods += 1

    reviewer_history_reversed = list(reversed(reviewer_history))
    
    for session in reviewer_history_reversed:
        #print(session, reviewer_history_reversed.index(session),
        #      reviewer_history_reversed.index(session)+1, (reviewer_history_reversed.index(session)+1)/4)
        factor = (reviewer_history_reversed.index(session)+1)/len(reviewer_history_reversed)
        if str(reviewer[session]).lower() != 'reviewed':
             score_history += (1 - factor)
             #print(session, factor, score_history)

    return score_topics, score_methods, score_history

In [6]:
## function annotation
def perform_matching(reviewers: pd.DataFrame, entrants: pd.DataFrame,
                     reviewer_history: list) -> pd.DataFrame:
    '''
    1. Creates lists from dataframes
    2. Calls scoring function
    3. Sorts the top 3 scoreed matches for each pairing
    '''

    if not isinstance(reviewers, pd.core.frame.DataFrame) or not isinstance(entrants, pd.core.frame.DataFrame):
        sys.exit(f"Function '{perform_matching.__name__}': Either 'reviewers' or 'entrants' "
                 "was not passed as a Pandas dataframe.")
    elif not isinstance(reviewer_history, list):
        sys.exit(f"Function '{perform_matching.__name__}': History of reviewers' reviews not passed.")

    reviewer_name_list = []
    reviewer_email_list = []
    reviewer_topic_list = []
    reviewer_method_list = []

    candidate_name_list = []
    candidate_email_list = []
    candidate_topic_list = []
    candidate_method_list = []

    match_scoring_list = []

    score_all_list = []
    score_total_list = []

    # Basic idea here is that each of the X reviewers is listed with each of the Y candidate, and then scored

    for i in range(len(reviewers)):  # number_reviewers_rows; reviewers.shape[0]
        for j in range(len(entrants)):  # number_entrants_rows; entrants.shape[0]
            reviewer_name_list.append(reviewers.loc[i].Name)
            reviewer_email_list.append(reviewers.loc[i].Email)
            reviewer_topic_list.append(reviewers.loc[i].Topics)
            reviewer_method_list.append(reviewers.loc[i].Methods)

            candidate_name_list.append(entrants.loc[j].Name)
            candidate_email_list.append(entrants.loc[j].Email)
            candidate_topic_list.append(entrants.loc[j].Topics)
            candidate_method_list.append(entrants.loc[j].Methods)

            #print(reviewer_topic_list)
            ## scoring is done here
            score_topics, score_methods, score_history = scoring(reviewer=reviewers.loc[i], candidate=entrants.loc[j],
                                             reviewer_history=reviewer_history)
            score_all = [score_topics, score_methods, score_history]
            score_all_list.append(score_all)

            score_total = score_topics + score_methods + score_history
            score_total_list.append(score_total)
            #match_scoring_list.append(scoring(reviewer=reviewers.loc[i], candidate=entrants.loc[j]))
    #print(score_all_list)
    ##############
    ## Normalizing attempt (history = 2)
    top = []
    sub = []
    hist = []
    
    for sublist in (score_all_list):
        top.append(sublist[0])
        sub.append(sublist[1])
        hist.append(sublist[2])
    score_topic_normalized = [normalize(x, top) for x in top]
    score_subject_normalized = [normalize(x, sub) for x in sub]
    score_history_normalized = [normalize(x, hist) for x in hist]

    score_all_list =[]
    score_all_list.extend([list(a) for a in zip(score_topic_normalized,
                                                       score_subject_normalized,
                                                       score_history_normalized)])
    #print(score_all_list)

    score_total_list =[]
    score_total_list =[float(a)+float(b)+float(c) for a,b,c in zip(score_topic_normalized,
                                                       score_subject_normalized,
                                                       score_history_normalized)]
    #matching_df['itimized_score_topics_methods_history'] = hist_normalized
    ##############
    
    results = {'reviewer': reviewer_name_list,
               'reviewer_email': reviewer_email_list,
               'reviewer_topics': reviewer_topic_list,
               'reviewer_methods': reviewer_method_list,
               'candidate': candidate_name_list,
               'candidate_email': candidate_email_list,
               'candidate_topics': candidate_topic_list,
               'candidate_methods': candidate_method_list,
               'itimized_score_topics_methods_history': score_all_list,
               'total_score': score_total_list}
               #'scoring (topics, methods, history)': match_scoring_list}
    #print(results)

    matching_df = pd.DataFrame(data=results)

    #unique_topcs_methods(reviewer_method_list, reviewer_topic_list)
    unique_topcs_methods(results["candidate_topics"], results["reviewer_topics"])
    
    # sort to make picking the top pairs (based on score) easier

    #matching_df = matching_df.sort_values(by=['candidate', 'scoring (total)'], ascending=[True, False])
    matching_df = matching_df.sort_values(by=['total_score'], ascending=[False])

    return matching_df

In [8]:
if __name__ == "__main__":
    ''' This programs optimizes the matching between reviewers and entrants.

        Input:
            1) reviewers CSV file (; seperated)
            2) entrants CSV file (; seperated)
        Output:
            1) CSV formatted file of matchings (; seperated), including itimized
            and total matching scores.
            2) prints to screen the suggested best matchings

        Structure of CSV files:
            entrants:
                Name;Email;Methods;Topics
            reviewers:
                Name;Email;Methods;Topics;2019S;2019F;2020S;2020F;20??s...

        Methods and Topics can be a several entries seperated by a comma.

        For the reviewers, it is assumed that the last 4 columns are the histories of their reviews.
            If they have reviewed, then the word 'reviewed' must be provided in the cell.

        Limitations:
            1. Each candidate is assigned 3 reviewers.
            2. Each reviewer is assigned 3 entrants.
            3. Only the last four history entries of the reviewer are considered in the score

        Contact:
            Daniel Jiang, Robert Bitterling and Karl N. Kirschner*
            University of Applied Sciences Bonn-Rhein-Sieg
            Grantham-Allee 20
            53757 Sankt Augustin - Germany

            Email: k.n.kirschner@gmail.com
            
        Contribution:
            Concept: Kirschner
            Initial coding and structure: Jiang and Bitterling
            Final coding: Kirschner
    '''

    # TODO: add variables for number of reviews to be done
 
    while True:
        try:
            reviewers_data = None
            reviewers_data = pd.read_csv('reviewers.csv', sep=';')
            entrants_data = pd.read_csv('entrants.csv', sep=';')
        except FileNotFoundError:
            print('Input files for reviewer or entrants was not found.')
        else:
            print("""Computing best candidate-reviewer matching based on 1) topics, 2) methodology 
                  and 3) the recent review history of the possible reviewers.


                  NOTE: By default, it is assumed that the last four columns of
                  the reviewer's CSV file should contains the history of the
                  reviews last four sessions (e.g. 2019S; 2019F; 2020S; 2020F). 
                  If this is not the case, please change the following line of
                  code to reflect how many columns to use:
                  'reviewer_history = list(reviewers_data.iloc[:,-4:])'.\n""")
            print('KNK', entrants_data)
            reviewer_history = list(reviewers_data.iloc[:,-4:])

            print(f'Total reviewers: {len(reviewers_data)}; Total entrants: {len(entrants_data)}')

            matchings = perform_matching(reviewers=reviewers_data, entrants=entrants_data,
                                         reviewer_history=reviewer_history)

            top_matches = filter_top(entrants=entrants_data, matching_df=matchings)
            display(top_matches)

            matchings.to_csv('matching_results.csv', sep=';', index=False)
            break

Computing best candidate-reviewer matching based on 1) topics, 2) methodology 
                  and 3) the recent review history of the possible reviewers.


                  NOTE: By default, it is assumed that the last four columns of
                  the reviewer's CSV file should contains the history of the
                  reviews last four sessions (e.g. 2019S; 2019F; 2020S; 2020F). 
                  If this is not the case, please change the following line of
                  code to reflect how many columns to use:
                  'reviewer_history = list(reviewers_data.iloc[:,-4:])'.

KNK                      Name            Email                           Methods  \
0     Zoowee Blubberworth  zb@fakemail.com                   Mathematics, md   
1       Flufffy Gloomkins  fg@fakemail.com                    md, Statistics   
2         Buritt Noseface  bn@fakemail.com                        ff, Python   
3  Peaberry Wigglewhistle  pw@fakemail.com                bioinform

AttributeError: 'Series' object has no attribute 'Methods'