In [1]:
import pandas as pd
import pkuseg
import csv
import torch
import itertools
import numpy as np
from transformers import BertTokenizer, BertModel


# Sample data

In [2]:
# Load the raw answers and draw a small, reproducible random sample of the
# 'content' column to keep the downstream O(n^2) pair generation tractable.
data = pd.read_csv('answer.csv', delimiter=',')
sample_portion = 0.01  # fraction of the usable rows to keep
sample_seed = 42       # fixed seed so Restart & Run All reproduces the same sample
# Drop missing answers up front: a NaN is a float, not text, and cannot be
# passed to the word segmenter in the next step.
contents = data['content'].dropna()
sample_size = int(sample_portion * len(contents))
data_sample = contents.sample(n=sample_size, random_state=sample_seed)

# Word segmentation (keep only words from the medical vocabulary)

In [3]:
# Segmenter built from pkuseg's 'medicine' model.
seg = pkuseg.pkuseg(model_name='medicine')
# data = pd.read_csv('sampled_answer.csv', delimiter=',', header=None)[0]
data = data_sample
# Allowed words: first column of the tab-separated, header-less word list.
vocabulary = set(pd.read_csv('all_med_words.csv', delimiter='\t', header=None)[0])

# Cut every text into words, drop the words that are not in the vocabulary,
# and re-join the survivors with single spaces (one output string per text).
segmented_answer = []
for text in data:
    kept_words = (word for word in seg.cut(text) if word in vocabulary)
    segmented_answer.append(' '.join(kept_words))

# Persist the result as a single-column CSV without index or header.
segment_df = pd.DataFrame(segmented_answer)
segment_df.to_csv('segmented_answer.csv', index=False, header=False)

# Generate sentence pairs and label pairs

In [4]:
# Keep only the non-empty segmented sentences (texts whose words were all
# filtered out became empty strings) and hold them as a plain list.
data = list(filter(len, segment_df[0]))

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, ignoring masked positions.

    Args:
        model_output: Indexable model result whose element 0 holds the token
            embeddings (presumably shaped (batch, seq_len, hidden) -- confirm
            against the model in use).
        attention_mask: Tensor with one fewer dimension than the token
            embeddings; non-zero entries mark the tokens to include.

    Returns:
        Tensor of mask-weighted sums over dimension 1 divided by the mask
        totals. The divisor is clamped to at least 1e-9, so a fully masked
        row yields zeros instead of a division by zero.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight each token.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_counts

# Load the tokenizer and model from the HuggingFace Hub.
tokenizer = BertTokenizer.from_pretrained('shibing624/text2vec-base-chinese')
model = BertModel.from_pretrained('shibing624/text2vec-base-chinese')
model.eval()  # inference only: make sure dropout is disabled

# Encode in mini-batches so the whole sample is never padded and pushed through
# the model at once; the pooling is mask-aware, so per-batch padding does not
# change the resulting sentence vectors.
batch_size = 64
# Inputs longer than the model's position limit would crash the forward pass,
# so truncate every sentence to that limit.
max_length = model.config.max_position_embeddings

embedding_batches = []
with torch.no_grad():
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        # Tokenize sentences
        encoded_input = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )
        # Compute token embeddings, then pool them into one vector per sentence
        model_output = model(**encoded_input)
        embedding_batches.append(mean_pooling(model_output, encoded_input['attention_mask']))

# One row per entry of `data`, in the same order.
sentence_embeddings = torch.cat(embedding_batches)

In [5]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two norms.
    No guard is applied for zero-length vectors.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# Build every ordered sentence pair (i, j), including i == j, and label it with
# its cosine-similarity bucket: floor(similarity / 0.2).
length = len(sentence_embeddings)

# Normalise each embedding once so a single matrix-vector product per row
# yields all cosine similarities, instead of recomputing both norms for every
# one of the n^2 pairs in Python.
embedding_matrix = np.asarray(sentence_embeddings)
row_norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
# Guard against a zero-length vector; such a row gets similarity 0 everywhere.
unit_embeddings = embedding_matrix / np.maximum(row_norms, 1e-12)

output = []
for i in range(length):
    if i % 1000 == 0:
        print(i)
    # Floating-point rounding can push a similarity marginally outside [-1, 1];
    # without clipping, near-identical pairs would land in bucket 5 and be
    # dropped later by the `== 4` selection.
    similarities = np.clip(unit_embeddings @ unit_embeddings[i], -1.0, 1.0)
    buckets = np.floor_divide(similarities, 0.2).astype(int)
    output.extend((data[i], data[j], int(buckets[j])) for j in range(length))



0
1000
2000


# Select suitable pairs and shrink the data size

In [6]:
# Columns: 0 = first sentence, 1 = second sentence, 2 = similarity bucket.
labeled_df = pd.DataFrame(output)

# Keep only clear-cut pairs: very similar (bucket >= 4, i.e. cosine >= 0.8)
# or dissimilar (bucket < 1, i.e. cosine < 0.2). `.copy()` gives an
# independent frame so the label assignment below does not write into a slice.
is_similar = labeled_df[2] >= 4
is_dissimilar = labeled_df[2] < 1
labeled_df = labeled_df[is_similar | is_dissimilar].copy()

# Binary label: 1 for similar, 0 for dissimilar. A plain `// 4` would turn
# negative buckets (negative cosine) into -1 / -2 instead of 0.
labeled_df[2] = (labeled_df[2] >= 4).astype(int)

# Shrink to a fixed-size, reproducible sample. Draw with replacement only when
# there are fewer candidate pairs than requested, to avoid needless duplicates.
n_pairs = 24000
sample_seed = 42
labeled_df = labeled_df.sample(
    n=n_pairs,
    replace=len(labeled_df) < n_pairs,
    random_state=sample_seed,
)
labeled_df.to_csv('cnn_data.csv', index=False)