In [1]:
from pymongo import MongoClient
import certifi
import os
from dotenv import load_dotenv
from utils.courses import find_courses_by_name, retrieve_lessons_text, get_google_translations_from_txt, encode_course_by_lesson, cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables (via python-dotenv) and read the two settings
# this notebook needs. Both lookups use os.environ[...], so a missing key
# raises KeyError right here instead of failing later in the notebook.
load_dotenv()
MONGO_URI = os.environ["MONGO_DB_URI"]
PATH = os.environ["DIR_PATH"]

In [3]:
# Create the MongoDB client, using certifi's CA bundle for TLS, and keep
# handles to the "lesson" and "tutorial" collections of the "gcfglobal" db.
ca = certifi.where()
cluster = MongoClient(host=MONGO_URI, tlsCAFile=ca)
db = cluster["gcfglobal"]
collection_tutorials = db["tutorial"]
collection_lessons = db["lesson"]

# Get the Courses Content

In [4]:
# Look up the tutorials of the "excel-2016" course, then fetch the text of
# their lessons.
lessons = retrieve_lessons_text(
    collection_lessons,
    find_courses_by_name(collection_tutorials, "excel-2016"),
)

Found 2 tutorials for course excel-2016 



In [5]:
# Read the Google translations stored as .txt files under the project's
# translate directory (the trailing slash is part of the path passed on).
lessons_google = get_google_translations_from_txt(
    f"{PATH}/analisis_traducciones/data/translate/",
    "excel-2016",
    lessons,
)

In [6]:
# Encode the course lesson by lesson with the
# "multi-qa-distilbert-cos-v1" model, passing the Google translations along.
encodings = encode_course_by_lesson(
    lessons,
    "excel-2016",
    "multi-qa-distilbert-cos-v1",
    lessons_google,
)

In [7]:
def index_of_most_similar(base_seq, seq, i):
    """Locate the entry of ``base_seq`` that best matches ``seq[i]``.

    Every element of ``base_seq`` is scored against ``seq[i]`` with
    ``cosine_similarity`` and the position of the highest score is returned.
    If several entries share the highest score, the first one wins.

    Args:
        base_seq: candidate elements to compare against.
        seq: sequence that holds the query element.
        i: index into ``seq`` of the query element.

    Returns:
        The index in ``base_seq`` of the highest-scoring element.

    Raises:
        IndexError: if ``i`` is not a valid index of ``seq``.
        ValueError: if ``base_seq`` is empty (there is nothing to pick from).
    """
    query = seq[i]
    scores = [cosine_similarity(query, candidate) for candidate in base_seq]
    best_score = max(scores)
    return scores.index(best_score)

def compute_swapping_order(base, to_order, acc=None):
    """Greedily match every element of ``to_order`` to an element of ``base``.

    For each position ``p`` of ``to_order`` the index of the most similar
    element of ``base`` (according to ``index_of_most_similar``) is looked up.
    The result is a list of ``(p, target)`` tuples: moving the element at
    position ``p`` (1st item) to position ``target`` (2nd item) pairs it with
    its best match in ``base``.

    Each element is matched independently of the others, so the same target
    index can show up more than once; the result is therefore not guaranteed
    to be a 1:1 correspondence.

    ``base`` and ``to_order`` don't need to be the same length.

    Args:
        base: reference sequence the elements are matched against.
        to_order: sequence whose elements get a target position.
        acc: optional target indices already computed for the first
            ``len(acc)`` elements of ``to_order``; matching resumes from
            position ``len(acc)``. The argument itself is never modified.

    Returns:
        A list of ``(current_position, target_position)`` tuples, one per
        element of ``to_order``.

    Raises:
        ValueError: if ``acc`` holds more entries than ``to_order``.
    """
    # Copy so neither the caller's list nor a shared default is ever mutated.
    targets = [] if acc is None else list(acc)
    if len(targets) > len(to_order):
        raise ValueError("acc cannot be longer than to_order")

    # Plain loop instead of recursion: the depth used to grow with
    # len(to_order) and could exceed the interpreter's recursion limit.
    for position in range(len(targets), len(to_order)):
        targets.append(index_of_most_similar(base, to_order, position))

    return list(zip(range(len(to_order)), targets))

In [8]:
# For every 'es' lesson encoding, show the index of its best-matching 'pt'
# lesson encoding as (es_position, pt_position) pairs.
print(
    compute_swapping_order(
        encodings['excel-2016']['pt'],
        encodings['excel-2016']['es'],
    )
)

[(0, 20), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (15, 15), (16, 16), (17, 17), (18, 20), (19, 19), (20, 20), (21, 21), (22, 22), (23, 29), (24, 24), (25, 25), (26, 26), (27, 27), (28, 28), (29, 29), (30, 30), (31, 31), (32, 4)]
