In [1]:
import pandas as pd

In [32]:
def check_for_duplicates(input_df: pd.DataFrame, column_names: list):
    ''' Check a dataframe for duplicate rows based on specified columns.

        Input:
            input_df - input DataFrame
            column_names- specfied column for searching

        Library Dependencies:
            1) Pandas
    '''
    
    duplicates = input_df[input_df.duplicated(subset=column_names, keep=False)]
    
    print('The following rows are duplicated within the dataframe, \n'\
          'which should be adjusted by hand (output CSV file):')
    display(duplicates)
    print()

In [2]:
def normalize(data_point: float, data_list: list) -> float:
    ''' Normalize the data to be within the rang of 0.0 -- 1.0
        Round answer to 2 digits
    '''

    if not isinstance(data_point, (int, float)):
        raise TypeError(f"Function '{normalize.__name__}': data_point object is not an int or float.")
    elif not isinstance(data_list, (list)):
        raise TypeError(f"Function '{normalize.__name__}': data_list object is not alist.")
    else:
        return format((data_point-min(data_list)) / (max(data_list)-min(data_list)), '0.3f')

In [3]:
def scoring(reviewer: pd.Series, entrant: pd.Series, use_reviewer_hist: bool) -> tuple:
    ''' Score the matching between a one reviewer and one entrant based on:
        1) topics,
        2) methods and
        3) history of recent reviews done by reviewer if specified as 'yes'.

        Input:
            reviewer - topics, methods, and history of previous reviews rounds
            entrant - topics, methods
            use_reviewer_hist - use the history of reviews

        Return:
            A tuble - (score_topics, score_methods, score_history)

        Library Dependencies:
            1) Pandas
    '''

    if not isinstance(reviewer, pd.core.series.Series):
        raise TypeError(f"Function '{scoring.__name__}': reviewer object is not a Pandas dataframe.")
    elif not isinstance(entrant, pd.core.series.Series):
        raise TypeError(f"Function '{scoring.__name__}': entrant object is not a Pandas dataframe.")

    score_topics = 0
    score_methods = 0
    score_total = 0

    reviewer_topics = reviewer['Topics'].split(sep=', ')
    reviewer_methods = reviewer['Methods'].split(sep=', ')

    entrant_topics = entrant['Topics'].split(sep=', ')    
    entrant_methods = entrant['Methods'].split(sep=', ')

    for topic in reviewer_topics:
        if topic.lower() in entrant_topics:
            score_topics += 1

    for methodology in reviewer_methods:
        if methodology in entrant_methods:
            score_methods += 1

    if use_reviewer_hist == True:
        score_history = 0
        reviewer_history = reviewer.filter(regex='Round').values.tolist()

        # original index position and reversed list values
        for orig_index, session in reversed(list(enumerate(reviewer_history))):    
            factor = (orig_index+1)/len(reviewer_history) ## scale factor for reducing history score

            if session.lower() != 'yes':
                 score_history += (1 - factor)
    else:
        score_history = 0

    return score_topics, score_methods, score_history

In [4]:
def extract_rows(input_df: pd.DataFrame, column_name: str, filter_str: str):
    ''' Extract DataFrame rows based on a specified column having a specified sting.
        This is done by using booleans.

        Input:
            input_df - input DataFrame
            column_name - specfied column for searching
            filter_str - specified filter

        Return:
            filtered_df - a filtered DataFrame

        Library Dependencies:
            1) Pandas
    '''
    
    if not isinstance(input_df, pd.core.frame.DataFrame):
        raise TypeError(f"Function '{extract_rows.__name__}': reviewer object is not a Pandas dataframe.")
    elif not isinstance(column_name, str):
        raise TypeError(f"Function '{extract_rows.__name__}': column_name object is not a str.")
    elif not isinstance(filter_str, str):
        raise TypeError(f"Function '{extract_rows.__name__}': filter_str object is not a str.")

    filter_mask = []

    for index, row in input_df.iterrows():
        if row[column_name] == filter_str:
            filter_mask.append(True)
        else:
            filter_mask.append(False)

    filtered_df = input_df[filter_mask]

    ## a shorthand Pythonic approach, but less readable:
    # filter_mask = input_df['reviewer'] == filter_str\

    return filtered_df

In [5]:
def filter_top_matching(matchings: pd.DataFrame,
                        reviewer_responsibility: int=4,
                        entrant_needed_reviews: int=3) -> pd.DataFrame:
    ''' Filter the top matchings for each entrant and reviewer.

        Input:
            matchings: matching data that contains the following
                reviewer
                reviewer_email
                reviewer_topics
                reviewer_methods
                entrant
                entrant_email
                entrant_topics
                entrant_methods
                itemized_score
                total_score

        Calls:
            extract_rows function

        Return:
            Resulting filtered data that contains the following
                Reviewer Name
                Entrant Name
                Itemized Score [topics, methods, reviewer's history]
                Total Score

        Library Dependencies:
            1) Pandas
    '''

    if not isinstance(matchings, pd.core.frame.DataFrame):
        raise TypeError(f"Function '{filter_top_matchings.__name__}': " \
                         "matchings object is not a Pandas' dataframe.")

    reviewer_counter = {}
    entrant_counter = {}
    perfect_score_list = []

    entrants = matchings["entrant"].unique().tolist()
    reviewers = matchings["reviewer"].unique().tolist()

    ## assign starting participating count value to zero
    for entrant_name in entrants:
        entrant_counter[entrant_name] = 0
    for reviewer_name in reviewers:
        reviewer_counter[reviewer_name] = 0

    for index, row in matchings.iterrows():
        filtered_df = extract_rows(input_df=matchings,
                                   column_name='reviewer',
                                   filter_str=row['reviewer'])

        filtered_df = filtered_df.head(reviewer_responsibility) ## extract top matching results

        for filtered_index, filtered_row in filtered_df.iterrows():
            ## important condition statement
            if filtered_row['reviewer'] in reviewer_counter \
                    and filtered_row['entrant'] in entrant_counter \
                    and reviewer_counter[filtered_row['reviewer']] != reviewer_responsibility \
                    and entrant_counter[filtered_row['entrant']] != entrant_needed_reviews:

                perfect_score_list.append((filtered_row['reviewer'],
                                           filtered_row['entrant'],
                                           filtered_row['itemized_score'],
                                           filtered_row['total_score']))

                entrant_counter[filtered_row['entrant']] += 1
                reviewer_counter[filtered_row['reviewer']] += 1

                ## remove DataFrame rows for specific entrants and reviewers once counters are filled
                if entrant_counter[filtered_row['entrant']] == entrant_needed_reviews:
                    matchings = matchings[matchings["entrant"].str.contains(filtered_row['entrant']) == False]
                if reviewer_counter[filtered_row['reviewer']] == reviewer_responsibility:
                    matchings = matchings[matchings["reviewer"].str.contains(filtered_row['reviewer']) == False]

    final_df = pd.DataFrame(data={'Reviewer Name': [item[0] for item in perfect_score_list],
                                  'Entrant Name':  [item[1] for item in perfect_score_list],
                                  "Itemized Score [topics, methods, reviewer's history]":\
                                                   [item[2] for item in perfect_score_list],
                                  'Total Score':   [item[3] for item in perfect_score_list]})

    return final_df

In [6]:
def perform_matching(reviewers: pd.DataFrame, entrants: pd.DataFrame,
                     use_reviewer_hist: bool=False) -> pd.DataFrame:
    ''' Perform a scored matching between all entrants and reviewers.

        Input:
            reviewers: dataframe containing the following reviewers information
                name, email, methods, topic and history of reviews (if provided)
            entrants: dataframe containing the following entrants information
                name, email, methods, topic
            use_reviewer_hist: specify to use or not use a reveiwer's history in scoring

        Calls:
            normalize function
            scoring function

        Return:
            matching_df - a Pandas dataframes that contains the follow information
               reviewer - names
               reviewer_email
               reviewer_topics
               reviewer_methods
               entrant - names
               entrant_email
               entrant_topics
               entrant_methods
               itemized_score - [topic, method, history]
               total_score - sum of the itemized scores

        Library Dependencies:
            1) Pandas
    '''

    if not isinstance(reviewers, pd.core.frame.DataFrame):
        raise TypeError(f"Function '{perform_matching.__name__}': reviewers object is not a Pandas' dataframe.")
    elif not isinstance(entrants, pd.core.frame.DataFrame):
        raise TypeError(f"Function '{perform_matching.__name__}': entrants object is not a Pandas' dataframe.")

    reviewer_name_list = []
    reviewer_email_list = []
    reviewer_topic_list = []
    reviewer_method_list = []

    entrant_name_list = []
    entrant_email_list = []
    entrant_topic_list = []
    entrant_method_list = []
        
    score_all_list =[]
    score_total_list =[]

    for i in range(len(reviewers)):
        for j in range(len(entrants)):
            reviewer_name_list.append(reviewers.loc[i].Name)
            reviewer_email_list.append(reviewers.loc[i].Email)
            reviewer_topic_list.append(reviewers.loc[i].Topics)
            reviewer_method_list.append(reviewers.loc[i].Methods)

            entrant_name_list.append(entrants.loc[j].Name)
            entrant_email_list.append(entrants.loc[j].Email)
            entrant_topic_list.append(entrants.loc[j].Topics)
            entrant_method_list.append(entrants.loc[j].Methods)

            score_topics, score_methods, score_history = scoring(reviewer=reviewers.loc[i],
                                                                 entrant=entrants.loc[j],
                                                                 use_reviewer_hist=use_reviewer_hist)

            score_all = [score_topics, score_methods, score_history]
            score_all_list.append(score_all)

            score_total = score_topics + score_methods + score_history
            score_total_list.append(score_total)

    score_all_topic = []
    score_all_method = []
    score_all_history = []

    for sublist in (score_all_list):
        score_all_topic.append(sublist[0])
        score_all_method.append(sublist[1])
        score_all_history.append(sublist[2])

    score_topic_normalized = [normalize(value, score_all_topic) for value in score_all_topic]
    score_method_normalized = [normalize(value, score_all_method) for value in score_all_method]
    
    for item in score_all_history:
        if (item > 0):
            score_history_normalized = [normalize(value, score_all_history) for value in score_all_history]
            break
        else:
            score_history_normalized = score_all_history

    score_itemize_list = []
    score_itemize_list.extend([list(value) for value in zip(score_topic_normalized,
                                                            score_method_normalized,
                                                            score_history_normalized)])

    score_final_list = [float(a) + float(b) + float(c) for a, b, c in zip(score_topic_normalized,
                                                                          score_method_normalized,
                                                                          score_history_normalized)]

    results = {'reviewer': reviewer_name_list,
               'reviewer_email': reviewer_email_list,
               'reviewer_topics': reviewer_topic_list,
               'reviewer_methods': reviewer_method_list,
               'entrant': entrant_name_list,
               'entrant_email': entrant_email_list,
               'entrant_topics': entrant_topic_list,
               'entrant_methods': entrant_method_list,
               'itemized_score': score_itemize_list,
               'total_score': score_final_list}

    matching_df = pd.DataFrame(data=results)

    return matching_df

In [7]:
if __name__ == "__main__":
    ''' This programs optimizes the matching between reviewers and entrants,
        based on their respective methodology and topics. One can also include
        the reviewer's most recent past history of performing/not performing
        review (i.e., done to reduce their burden over time.)

        Input and Flags:
            1) reviewers CSV file (; seperated)
            2) entrants CSV file (; seperated)
            3) use_history - 'True' or 'False'
            4) reviewer_responsibility - number of reviews to be done by each reviewer
            5) entrant_needed_reviews - number of reviews need for each entrant

        Output:
            1) CSV formatted file of matchings (; seperated), including itemized
               and total matching scores.
            2) prints to screen the suggested best matchings

        Library Dependencies:
            1) Pandas

        Structure of input CSV files:
            entrants.csv:
                Name;Email;Methods;Topics
            reviewers.csv:
                Name;Email;Methods;Topics;Round 2019S;Round 2019F; ...

        Methods and Topics can be a several entries seperated by a comma.

        For the reviewers, it is assumed that the last columns are the histories of
            review participation (i.e., 'yes' or 'no'). These are provided in a
            column whose header name include the word 'Round' (see example).

        Contact:
            Karl N. Kirschner
            Department of Computer Science
            University of Applied Sciences Bonn-Rhein-Sieg
            Grantham-Allee 20
            53757 Sankt Augustin - Germany

            Email: k.n.kirschner _at_ gmail.com

        Contribution:
            Concept: Kirschner
            Coding and structure: Jiang and Bitterling (initial development); Kirschner
    '''

    use_history = False
    reviewer_responsibility = 5 
    entrant_needed_reviews = 3

    pd.set_option('display.max_rows', 100)
    pd.set_option('display.min_rows', 100)

    try:
        reviewers_data = pd.read_csv('reviewers.csv', sep=';')
        entrants_data = pd.read_csv('entrants.csv', sep=';')
    except FileNotFoundError:
        print('Input files for reviewer or entrants was not found.')
    else:
        print("""Computing best entrant-reviewer matching based on thier
                 1) topics, 2) methodology and optionally 3) the recent
                 review history of the reviewers.\n""")

        print(f'Total reviewers: {len(reviewers_data)}; Total entrants: {len(entrants_data)}')
        
        matchings = perform_matching(reviewers=reviewers_data,
                                     entrants=entrants_data,
                                     use_reviewer_hist=use_history)

        # Operate on matchings that has a) been grouped and summed and b) sorted by reviewer_score
        matchings['reviewer_score'] = matchings.groupby(by='reviewer')['total_score'].transform('sum')
        matchings.sort_values(by=['reviewer_score', 'total_score', 'reviewer'],
                              ascending=[False, False, True], inplace=True)

        top_matches = filter_top_matching(matchings=matchings,
                                          reviewer_responsibility=reviewer_responsibility,
                                          entrant_needed_reviews=entrant_needed_reviews)

        display(top_matches)

        top_matches.to_csv('matching_results.csv', sep=';', index=False)

Computing best entrant-reviewer matching based on thier
                 1) topics, 2) methodology and optionally 3) the recent
                 review history of the reviewers.

Total reviewers: 15; Total entrants: 24


Unnamed: 0,Reviewer Name,Entrant Name,"Itemized Score [topics, methods, reviewer's history]",Total Score
0,"Persson, Kristin Aslaug",Banerjee,"[0.667, 1.000, 0]",1.667
1,"Persson, Kristin Aslaug",King,"[0.333, 1.000, 0]",1.333
2,"Persson, Kristin Aslaug",Klem,"[0.333, 1.000, 0]",1.333
3,"Persson, Kristin Aslaug",Ojha,"[0.333, 1.000, 0]",1.333
4,"Persson, Kristin Aslaug",Achar,"[0.667, 0.500, 0]",1.167
5,"Tabor, Danny",Banerjee,"[1.000, 0.500, 0]",1.5
6,"Tabor, Danny",Klem,"[0.333, 1.000, 0]",1.333
7,"Tabor, Danny",Ojha,"[0.333, 1.000, 0]",1.333
8,"Tabor, Danny",Achar,"[0.667, 0.500, 0]",1.167
9,"Tabor, Danny",Dutta,"[0.000, 1.000, 0]",1.0


In [8]:
top_matches.sort_values(by=['Entrant Name', 'Total Score'], ascending=[True, False], inplace=True)
display(top_matches)

Unnamed: 0,Reviewer Name,Entrant Name,"Itemized Score [topics, methods, reviewer's history]",Total Score
4,"Persson, Kristin Aslaug",Achar,"[0.667, 0.500, 0]",1.167
8,"Tabor, Danny",Achar,"[0.667, 0.500, 0]",1.167
17,"Zheng, Jie",Achar,"[0.333, 0.500, 0]",0.833
14,"Miao, Yinglong",Bali,"[0.333, 0.500, 0]",0.833
59,"Dutt, Meenakshi",Bali,"[0.000, 0.000, 0]",0.0
61,"Shakib, Farnaz A.",Bali,"[0.000, 0.000, 0]",0.0
0,"Persson, Kristin Aslaug",Banerjee,"[0.667, 1.000, 0]",1.667
5,"Tabor, Danny",Banerjee,"[1.000, 0.500, 0]",1.5
18,"Zheng, Jie",Banerjee,"[0.333, 0.500, 0]",0.833
40,"Aikens, Christine",Casetti,"[0.333, 1.000, 0]",1.333


In [33]:
check_for_duplicates(input_df=top_matches, column_names=['Reviewer Name', 'Entrant Name'])

The following rows are duplicated within the dataframe, 
which should be adjusted by hand (output CSV file):


Unnamed: 0,Reviewer Name,Entrant Name,"Itemized Score [topics, methods, reviewer's history]",Total Score
66,"Lambrecht, Daniel S.",Gusev,"[0.000, 0.000, 0]",0.0
69,"Lambrecht, Daniel S.",Gusev,"[0.000, 0.000, 0]",0.0
70,"King, Rolin",Xia,"[0.000, 0.000, 0]",0.0
71,"King, Rolin",Xia,"[0.000, 0.000, 0]",0.0



