In [None]:
%%capture
!pip install transformers

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from collections import Counter

In [None]:
import nltk
# Fetch the NLTK resources used by the preprocessing cells below.
nltk.download('stopwords')  # English stop-word list (stopwords.words)
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')    # Open Multilingual WordNet package
nltk.download('punkt')      # tokenizer models used by word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Shared lemmatizer; `lemma` is a shorthand for its bound lemmatize method.
lemmatizer = WordNetLemmatizer()
lemma = lemmatizer.lemmatize
# English stop words, each passed through the lemmatizer.
my_stop_words = list(map(lemma, stopwords.words('english')))

def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` deleted."""
    # Third argument of str.maketrans lists the characters to drop.
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table)

def tokenize(text):
    """Split `text` into lemmatized tokens with stop words removed.

    Punctuation is stripped first, the result is tokenized with
    `word_tokenize`, and every surviving token is lemmatized.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        Lemmatized tokens that are not stop words.
    """
    kept = []
    for token in word_tokenize(remove_punctuation(text)):
        lemmatized = lemma(token)
        # `my_stop_words` holds *lemmatized* stop words, so the raw token alone
        # is not enough: e.g. the WordNet lemmatizer turns "was" into "wa", and
        # testing only "was" against the list would let the stop word through.
        # Checking both forms keeps everything the raw check already removed.
        if token in my_stop_words or lemmatized in my_stop_words:
            continue
        kept.append(lemmatized)
    return kept

def naive_terms(texts, n=3):
    """Find the word n-grams (lengths 1..n) occurring in each text.

    Parameters
    ----------
    texts : iterable of str
        Documents to analyse; each one is tokenized with `tokenize`.
    n : int, default 3
        Upper bound of the n-gram range.

    Returns
    -------
    The result of `CountVectorizer.inverse_transform` on the fitted count
    matrix: one collection of terms per input document.
    """
    print("Term finding started.")
    vectorizer = CountVectorizer(
        tokenizer=tokenize,
        # A custom tokenizer makes the default token_pattern unused; passing
        # None states that explicitly and avoids scikit-learn's warning.
        token_pattern=None,
        strip_accents='ascii',
        ngram_range=(1, n),
    )
    X = vectorizer.fit_transform(texts)
    terms = vectorizer.inverse_transform(X)
    print("Term finding finished.")
    return terms

In [None]:
# Read the skills table and pull the 'ALL' column out as a plain Python list.
data = pd.read_csv('skills.csv')
skills_texts = data.loc[:, 'ALL'].tolist()

In [None]:
# Flatten the per-document term collections into one list of terms.
terms = [term for doc_terms in naive_terms(skills_texts) for term in doc_terms]
# Tally how often each term appears in the flattened list.
my_counter = Counter(terms)
# Counter.most_common already orders by descending count; keep the top 800 terms.
most_common = [term for term, _count in my_counter.most_common(800)]

Term finding started.




Term finding finished.


In [None]:
# Preview only the head of the list; dumping all 800 terms floods the notebook output.
most_common[:50]

In [None]:
import pickle
import numpy as np
import os
from transformers import AutoTokenizer, T5EncoderModel
import torch
import torch.nn.functional as F
from tqdm import tqdm
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, weighted by the mask.

    `model_output[0]` is taken as the token embeddings; positions whose
    attention-mask value is 0 contribute nothing to the average.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask to the embeddings' shape so it can weight them.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * mask).sum(dim=1)
    # Clamp the divisor so a fully masked row does not divide by zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / mask_total

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def embedd_bert(text, st_model='t5-large', batch_size=32):
    """Embed a list of strings with a T5 encoder and mean pooling.

    Parameters
    ----------
    text : list of str
        Strings to embed; processed in slices of `batch_size`.
    st_model : str, default 't5-large'
        Checkpoint name passed to `AutoTokenizer` / `T5EncoderModel`.
    batch_size : int, default 32
        Number of strings encoded per forward pass.

    Returns
    -------
    list of list of float
        One L2-normalized embedding per input string, in input order.
    """
    tokenizer = AutoTokenizer.from_pretrained(st_model)
    model = T5EncoderModel.from_pretrained(st_model).to(device)
    model.eval()  # inference only: make sure dropout is disabled

    word_embeddings = []
    batch_starts = range(0, len(text), batch_size)
    for i in tqdm(batch_starts, desc="Embedding for  Size: " + str(len(text))):
        encoded_input = tokenizer(
            text[i:i + batch_size],
            return_tensors='pt',
            padding=True,
            truncation=True,
        ).to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
            word_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
            word_embedding = F.normalize(word_embedding, p=2, dim=1)
        # extend() appends in place; `list = list + batch` re-copied the whole
        # accumulator on every batch.
        word_embeddings.extend(word_embedding.tolist())
    return word_embeddings

In [None]:
# Embed the selected terms; the result is aligned one-to-one with `most_common`.
word_embeddings = embedd_bert(text=most_common)

Embedding for  Size: 800: 100%|██████████| 25/25 [01:21<00:00,  3.26s/it]


In [None]:
# One row per term: the keyword string next to its embedding vector.
keyword_embeddings = pd.DataFrame(
    {
        'Keyword': most_common,
        'embedding': word_embeddings,
    }
)

In [None]:
from nltk.cluster import KMeansClusterer
import nltk

def clustering(data, NUM_CLUSTERS = 15, repeats=25, seed=None):
    """Cluster the rows of `data` by their embeddings using NLTK k-means.

    The vectors in `data['embedding']` are clustered with cosine distance.
    Two columns are added to `data` itself (the frame is modified in place
    and also returned): 'cluster', the assigned cluster index, and
    'centroid', the mean vector of that cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an 'embedding' column of equal-length vectors.
    NUM_CLUSTERS : int, default 15
        Number of clusters to form.
    repeats : int, default 25
        Number of randomised k-means runs performed by the clusterer.
    seed : int or None, default None
        When given, seeds the clusterer's random generator so the result is
        reproducible; None keeps the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with 'cluster' and 'centroid' columns set.
    """
    import random  # local import: only needed to build the optional seeded generator

    X = np.array(data['embedding'].tolist())

    rng = random.Random(seed) if seed is not None else None
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
        repeats=repeats, rng=rng, avoid_empty_clusters=True)

    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
    # Fetch the centroids once instead of calling means() for every row.
    means = kclusterer.means()

    data['cluster'] = pd.Series(assigned_clusters, index=data.index)
    data['centroid'] = data['cluster'].apply(lambda x: means[x])

    return data

In [None]:
# Split the keywords into two clusters. clustering() adds its columns to
# keyword_embeddings in place and returns that same DataFrame.
keyword_clusters = clustering(keyword_embeddings, NUM_CLUSTERS=2)