In [None]:
#!pip install pandas openai torch scikit-learn dvc dvc-s3
#!pip install openpyxl retry python-dotenv



In [None]:

import math
from typing import Tuple, List, Text, Dict


In [None]:
def top_n_size(x, y, z=None):
    """
    Decide how many top-scoring sentences per relation go into the final score.

    Balanced approach towards all relations: each considered relation
    contributes 20% of its sentence count (rounded up), and the minimum of
    those values is returned.

    Input -
    x - count of supplier relation sentences
    y - count of customer relation sentences
    z - count of other relation sentences (optional)

    A falsy z (None or 0) means only x and y are considered.

    Raises AssertionError if any considered count is not positive.
    """
    # Only include z when the caller supplied a truthy count for it.
    counts = (x, y, z) if z else (x, y)
    assert all(count > 0 for count in counts)

    # 20% of each count, rounded up; the smallest share is the result.
    return min(math.ceil(count * 0.2) for count in counts)

def top_n_size_new(x, y):
    """
    Decide the top-n size from the supplier and customer counts only.

    Input -
    x - count of supplier relation sentences
    y - count of customer relation sentences

    - The count of other relations is ignored.
    - This approach slightly favors the relation that occurs more often.

    Raises AssertionError if either count is not positive.
    """
    assert x > 0 and y > 0

    if abs(x - y) != 1:
        # Equal counts, or a gap larger than one: half of the larger count
        # (rounded up), which favors the relation with more sentences.
        return math.ceil(max(x, y) * 0.5)

    # The counts differ by exactly one: use the smaller count so both
    # relations compete on nearly all of their sentences.
    return min(x, y)


def log_sum_top_n(scores, top_n_size):
    """
    Logarithmic sum of top_n scores.

    Computes ``mean(scores) * (1 + ln(sum of the top_n_size highest scores))``.

    Input -
    scores - sequence of sentence-level scores for one relation
    top_n_size - how many of the highest scores feed the logarithmic term

    Returns -
    The aggregated score, or 0.0 when ``scores`` is empty (there is nothing
    to aggregate, so no division by zero is attempted).

    Raises -
    ValueError if the sum of the selected top scores is not positive, since
    the logarithm is undefined there (for example when ``top_n_size`` is 0).

    Note: when the top-n sum is below 1/e (about 0.368) the factor
    ``1 + ln(sum)`` is negative, so the returned score is negative.
    """
    if len(scores) == 0:
        return 0.0

    avg_score = sum(scores) / len(scores)

    # Highest scores first, then keep only the requested number of them.
    top_n_conf = sum(sorted(scores, reverse=True)[:top_n_size])
    if top_n_conf <= 0:
        raise ValueError(
            f"sum of the top {top_n_size} scores must be positive, got {top_n_conf}"
        )

    return avg_score * (1 + math.log(top_n_conf))
    
    



In [None]:
### Debug -- ignore below code

# Sanity checks for top_n_size: (arguments, expected top-n size)
for size_args, expected_size in [((10, 10), 2), ((10, 10, 5), 1), ((10, 5, 5), 1)]:
    assert top_n_size(*size_args) == expected_size

# Print top_n_size_new for a range of (x, y) counts, one "x y  - result" line each
count_pairs = [
    (1, 1), (1, 2), (2, 3), (3, 1),
    (4, 1), (4, 2), (4, 3),
    (5, 1), (5, 2), (5, 3),
    (6, 1), (6, 2), (6, 3), (6, 4),
    (7, 2), (7, 3), (7, 4),
    (8, 2), (8, 4), (8, 6),
    (10, 3), (10, 5), (10, 7),
]
for x_count, y_count in count_pairs:
    print(f"{x_count} {y_count}  - {top_n_size_new(x_count, y_count)}")



1 1  - 1
1 2  - 1
2 3  - 2
3 1  - 2
4 1  - 2
4 2  - 2
4 3  - 3
5 1  - 3
5 2  - 3
5 3  - 3
6 1  - 3
6 2  - 3
6 3  - 3
6 4  - 3
7 2  - 4
7 3  - 4
7 4  - 4
8 2  - 4
8 4  - 4
8 6  - 4
10 3  - 5
10 5  - 5
10 7  - 5


In [None]:

def agg_relation_score(company_relation_score: Dict, top_n_approach: str):
    """
    Aggregate the sentence scores of each relation type into a single score.

    Input:
    company_relation_score: dict that may hold the keys "supplier_scores",
        "customer_scores" and "other_scores", each a list of scores.
        A missing or empty list means the relation has no scores.
    top_n_approach: "old" or "new"
        "old" - top_n_size function
        "new" - top_n_size_new function
        Only consulted when two or more relations have scores.

    Returns:
    Dict {"supplier": .., "customer": .., "other": ..} with the aggregated
    score of each relation type. Relations without scores keep the value 0.

    Raises:
    ValueError if two or more relations have scores and top_n_approach is
    neither "old" nor "new".
    """
    label_scores = {"supplier": 0, "customer": 0, "other": 0}

    # Order matters: customer, supplier, other. The "new" approach only looks
    # at the first two relations that have scores, so "other" is ignored
    # whenever both customer and supplier scores exist.
    candidates = (
        ("customer", company_relation_score.get("customer_scores", [])),
        ("supplier", company_relation_score.get("supplier_scores", [])),
        ("other", company_relation_score.get("other_scores", [])),
    )
    present = [(label, scores) for label, scores in candidates if scores]

    # no scores for any relation
    if not present:
        return label_scores

    # only one relation has scores: every one of its scores is used
    if len(present) == 1:
        label, scores = present[0]
        label_scores[label] = log_sum_top_n(scores, len(scores))
        return label_scores

    # two or more relations have scores: share one top-n size between them
    counts = [len(scores) for _, scores in present]
    if top_n_approach == "old":
        n = top_n_size(*counts)
    elif top_n_approach == "new":
        n = top_n_size_new(*counts[:2])
    else:
        # Previously `n` stayed unbound here and the function failed later
        # with an UnboundLocalError.
        raise ValueError(
            f"top_n_approach must be 'old' or 'new', got {top_n_approach!r}"
        )

    for label, scores in present:
        label_scores[label] = log_sum_top_n(scores, n)

    return label_scores
    

In [None]:
def get_winning_relation(company_relation_score: Dict, top_n_approach: str  ):
    """
    Aggregate the relation scores and name the winning relation.

    Input:
    ------
    company_relation_score: Dict with a list of scores for each relation type.
    {"customer_scores": [..],  "supplier_scores": [..], "other_scores": [..]}

    top_n_approach: "old" or "new"
    "old" - top_n_size function
    "new" - top_n_size_new function

    Return:
    -------
    Dict with the aggregated score of each relation type plus the key
    "winning_relation". If more than one relation type shares the highest
    score, the winning relation is "supplier".
    """
    # Dict[relation_key, aggregated_score_for_relation]
    relation_scores = agg_relation_score(company_relation_score, top_n_approach)

    # Relation types that reach the highest aggregated score
    best_score = max(relation_scores.values())
    leaders = [
        relation for relation, score in relation_scores.items() if score == best_score
    ]

    # NOTE(review): a tie resolves to "supplier" even when "supplier" is not
    # one of the tied relations -- confirm this is the intended business rule.
    relation_scores["winning_relation"] = "supplier" if len(leaders) > 1 else leaders[0]

    return relation_scores


    

In [None]:
# Tests for get_winning_relation ("old" approach), driven by a table of cases.
#
# Each case: (supplier_scores, customer_scores, other_scores,
#             shared_top_n, expected_winning_relation)
# shared_top_n=False -> every relation is expected to use all of its scores
# shared_top_n=True  -> every relation is expected to use top_n_size(...) of
#                       the three counts
winning_relation_cases = [
    ([0.7], [], [], False, "supplier"),                      # test case 1
    ([0.7, 0.8], [], [], False, "supplier"),                 # test case 2
    ([0.7], [0.8], [], False, "customer"),                   # test case 3
    ([0.7], [0.8], [0.9, 0.6], True, "other"),               # test case 4
    ([0.7, 0.8, 0.6], [0.8], [0.9, 0.6], True, "other"),     # test case 5
]

for case_supplier, case_customer, case_other, shared_top_n, expected_winner in winning_relation_cases:
    case_scores = {"supplier": case_supplier, "customer": case_customer, "other": case_other}

    if shared_top_n:
        case_top_n = top_n_size(len(case_supplier), len(case_customer), len(case_other))

    # Relations without scores are expected to stay at 0
    expected_result = {
        label: (log_sum_top_n(scores, case_top_n if shared_top_n else len(scores)) if scores else 0)
        for label, scores in case_scores.items()
    }
    expected_result["winning_relation"] = expected_winner

    actual_result = get_winning_relation(
        {"supplier_scores": case_supplier, "customer_scores": case_customer,
         "other_scores": case_other},
        top_n_approach="old",
    )
    assert actual_result == expected_result



In [55]:
# Compare the "old" (top_n_size) and "new" (top_n_size_new) approaches on the
# same score lists

supplier_scores = [0.97]
customer_scores = [0.9, 0.94, 0.8]
other_scores = [0.8, 0.6, 0.7, 0.6, 0.7]

# `result` keeps the value of the last ("new") run once the loop finishes
for approach in ("old", "new"):
    result = get_winning_relation(
        {"supplier_scores": supplier_scores, "customer_scores": customer_scores,
         "other_scores": other_scores},
        top_n_approach=approach,
    )
    print(f"{approach} approach \n {result}")


old approach 
 {'supplier': 0.9404545687398327, 'customer': 0.8255496447280829, 'other': 0.5282623851063374, 'winning_relation': 'supplier'}
new approach 
 {'supplier': 0.9404545687398327, 'customer': 1.4165937030263867, 'other': 0.9557162735135517, 'winning_relation': 'customer'}


In [54]:
# Display the dict currently bound to `result`. In file order its last
# assignment is the "new"-approach call in the comparison cell above.
# NOTE(review): the execution counts are out of order (this cell is In [54],
# the comparison cell In [55]) -- re-run top to bottom to confirm the output.
result

{'supplier': 0.9404545687398327,
 'customer': 1.4165937030263867,
 'other': 0.9557162735135517,
 'winning_relation': 'customer'}

In [38]:
# Call get_winning_relation once per top_n_approach: "old" first, then "new".
# `result` is overwritten on each pass, so it ends up holding the "new" result.
for approach in ("old", "new"):
    result = get_winning_relation(
        {"supplier_scores": supplier_scores, "customer_scores": customer_scores,
         "other_scores": other_scores},
        top_n_approach=approach,
    )
