#### This file includes functions to:
   - Detect potential clusterings that may have new tags
   - Generate candidate tags for these clusterings
   - Linguistic feature embedding(dependecy parsing)

(They work for English and French.)

#### Import the following packages. You can run python -m spacy download fr_core_news_sm to downlowad if you do not have one.

In [14]:
from collections import defaultdict
import spacy
import nltk
from nltk.util import ngrams
import numpy as np
nlp_en = spacy.load("en_core_web_sm")
nlp_fr = spacy.load("fr_core_news_sm")

In [12]:
def detect_new_tag_clustering(cluster,threshold=0.6):
    """
    This function determine whether the input cluster needs a new tag or not. 
    If percentage of the dominant tag is over shreshold hundredths, this cluster does not need a new tag.
    If percentage of the dominant tag is below shreshold hundredths, this cluster needs a new tag.
    
    Parameters:
    -----------
        cluster: a list of tags in the cluster
    
    Return:
    -----------
        boolean: yes-this cluster needs a new tag;no-this cluster does not need a new tag
    """
    counter_dict = defaultdict(int)
    for tag in cluster:
        counter_dict[tag]+=1
    total = sum(list(counter_dict.values()))
    tag_count_pairs = sorted(list(counter_dict.items()),key = lambda x: x[1],reverse=True)
    if tag_count_pairs[0][1]/total>threshold:
        return True
    else:
        return False

True

#### Example:

In [14]:
cluster1 = ["Tag1","Tag2","Tag1","Tag2","Tag1","Tag1","Tag1","Tag1","Tag1"]
detect_new_tag_clustering(cluster1)
cluster2 = ["Tag5","Tag2","Tag1","Tag2","Tag1","Tag7","Tag1","Tag5","Tag1"]
detect_new_tag_clustering(cluster2)

False

In [62]:
def generate_tag_candidates(comments, approach="NER", langauge="English", top=10):
    """
    This function takes a list of comments/indices and generate a list of potential tags.
    
    Parameters:
    -----------
        comments: list of str
        approach: define the approach to generate new tags. The default one is "NER". You can also use "unigram", "bigram". 
        langauge: English or French
        top: return top 10(defalut) common entities
    Return:
    -----------
        candidates: list of str
    """

    if langauge=="English":
        nlp = nlp_en
    elif langauge=="French":
        nlp = nlp_fr
    
    for comment in comments:
        doc = nlp(comment)
        if approach=="NER":
            entity_counter_dict = defaultdict(int)
            for ent in doc.ents:
                entity_counter_dict[ent.text] += 1
            entity_count_pairs = sorted(list(entity_counter_dict.items()),key = lambda x: x[1],reverse=True)
            return [ent for (ent, count) in entity_count_pairs[:top]]
        if approach=="bigram" or approach=="trigram":
            n_gram_counter_dict = defaultdict(int)
            tokens = [token.lemma_ for token in doc]
            n=2 if approach=="bigram" else 3
            n_grams = ngrams(tokens, n)
            for n_gram in n_grams:
                n_gram_counter_dict[n_gram] += 1
            n_gram_count_pairs = sorted(list(n_gram_counter_dict.items()),key = lambda x: x[1],reverse=True)
            return [n_gram for (n_gram, count) in n_gram_count_pairs[:top]]
            
            

In [64]:
comments = ["How to enter Canada from U.S?", "Where is Canada?"]
generate_tag_candidates(comments, approach="trigram", langauge="English")

[('how', 'to', 'enter'),
 ('to', 'enter', 'Canada'),
 ('enter', 'Canada', 'from'),
 ('Canada', 'from', 'U.S'),
 ('from', 'U.S', '?')]

cited from [here](https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db)

In [11]:
embeddings_dict = {}
with open("../data/glove.6B/glove.6B.100d.txt", 'r', encoding="utf-8") as f:
    
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [7]:
v = [[1,2,2],[100,0,50]]
np.array(v).mean(axis=0)

array([50.5,  1. , 26. ])

In [12]:
def pred_embedding(pred_list, dim=100):
    """
    This function maps a list of pred to a 1x100 200 or 300 vector(the default one is 100d)
    Parameters:
    ------------
        pred_list: list of str
    Return:
    ------------
        emb: 1xdim array
    """
    vectors = []
    for pred in pred_list:
        if pred in embeddings_dict:
            vectors.append(embeddings_dict[pred])
        else:
            vectors.append([0]*dim)
    return np.array(vectors).mean(axis=0) # get mean     

In [13]:
pred_embedding(["use","get"], dim=100)

array([-0.11049001,  0.37868   ,  0.13138498, -0.34656549, -0.31117502,
        0.10618401, -0.35618   ,  0.26565   ,  0.1083475 ,  0.0336    ,
       -0.10457   , -0.02349998,  0.086239  ,  0.31344   ,  0.30633998,
       -0.24361001,  0.46379   ,  0.422775  , -0.49794   ,  0.307405  ,
        0.1629845 ,  0.07167001,  0.181355  , -0.15389001, -0.0874075 ,
        0.024791  , -0.22953   , -0.63475   ,  0.23915249, -0.08699501,
        0.012312  ,  0.960665  , -0.49604   ,  0.163051  ,  0.43383002,
        0.392805  , -0.34657502,  0.130058  ,  0.205599  ,  0.1603395 ,
       -0.262295  , -0.37798   , -0.27833   , -0.67448497, -0.34548002,
        0.052679  , -0.206057  , -0.51545   ,  0.133735  , -1.0046    ,
       -0.06104501,  0.410275  , -0.21164551,  1.121715  ,  0.0728007 ,
       -2.2982502 ,  0.217875  , -0.15031   ,  1.9992499 , -0.05432   ,
       -0.066075  ,  0.822565  , -0.261895  ,  0.2889545 ,  0.98931503,
       -0.11211199,  0.593485  ,  0.068685  , -0.01613   , -0.51