In [1]:
import pickle
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig
from tqdm.auto import tqdm
from annoy import AnnoyIndex

class MeanPooling(torch.nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(
            -1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class Embedder(torch.nn.Module):
    def __init__(self, path, max_len=512) -> None:
        super().__init__()
        self.path = path
        self.max_len = max_len

        self.model = AutoModel.from_pretrained(self.path)
        self.tokenizer = AutoTokenizer.from_pretrained(self.path)
        self.config = AutoConfig.from_pretrained(
            self.path, output_hidden_states=True)
        self.pool = MeanPooling()

    def get_embedding(self, inputs):
        encoded_input = self.tokenizer(
            inputs, padding=True, truncation=True, max_length=self.max_len, return_tensors='pt')
        encoded_input = encoded_input.to(next(self.parameters()).device)

        with torch.no_grad():
            model_output = self.model(**encoded_input)
            sentence_embeddings = self.pool(
                model_output[0], encoded_input['attention_mask'])

        return sentence_embeddings.cpu().numpy()

In [2]:
coursera_df = pd.read_csv("data/Coursera.csv")
coursera_df.drop_duplicates(subset=["Course Name"], inplace=True)
print(coursera_df.shape)
coursera_df.head()

(3416, 7)


Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...


In [16]:
name2link = {}
for id, row in coursera_df.iterrows():
    name2link[row["Course Name"]] = row["Course URL"]

id2link = {}
for id, row in coursera_df.iterrows():
    id2link[id] = row["Course URL"]

In [4]:
model_name = "sentence-transformers/LaBSE"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embedder = Embedder(model_name)
embedder = embedder.to(device)

name2vector = {}
id2name = {}

t = AnnoyIndex(768, 'angular')

for id, row in tqdm(coursera_df.iterrows()):
    vector = list(embedder.get_embedding([row["Course Description"]])[0])
    name2vector[row["Course Name"]] = vector
    id2name[id] = row["Course Name"]
    t.add_item(id, vector)

0it [00:00, ?it/s]

In [15]:
import json

with open("data/id2name.json", "w") as file:
    json.dump(id2name, file)

In [17]:
with open("data/id2link.json", "w") as file:
    json.dump(id2link, file)

In [6]:
t.build(16) # trees
t.save('data/AnnoyIndex_16.ann')

True

In [7]:
u = AnnoyIndex(768, 'angular')
u.load('data/AnnoyIndex_16.ann') # super fast, will just mmap the file

True

In [18]:
text = "Дата-сайентист (он же Data Scientist, специалист по Data Science) может найти себе работу в любой сфере: от розничной торговли до астрофизики. Потому что именно он — настоящий повелитель больших данных. Вместе с автором кейсов для курса по Data Science Глебом Синяковым разбираемся, почему в современном мире всем так нужны дата-сайентисты."
tmp = u.get_nns_by_vector(list(embedder.get_embedding([text])[0]), 3)
tmp

[3217, 2355, 334]

In [10]:
for el in tmp[0]:
    print(id2name[el])

Introduction to Business Analytics with R
A Crash Course in Data Science
Predictive Analytics and Data Mining
Introduction to Business Analytics: Communicating with Data
Applying Data Analytics in Marketing


In [5]:
# stepik_df = pd.read_excel("data/stepik.xlsx")
# stepik_df.head()