In [4]:
%run "__imports__.ipynb"


'''
Based on the Google News model and word2vec approach, 
every word is associated to a vector with num_features entries.
The vector associated to the sentence is the average of values for all the words in the
sentence that are also contained in the language model
'''
def avg_feature_vector (sentence, model, num_features, index2word_set):
    """Return the mean vector of the in-vocabulary words of ``sentence``.

    The sentence is split on whitespace. Only tokens contained in
    ``index2word_set`` are looked up in ``model``; every other token is
    ignored. When no token is in the vocabulary the result is the all-zero
    float32 vector of length ``num_features``.
    """
    known_tokens = [token for token in sentence.split() if token in index2word_set]

    # Sum the vectors one by one, starting from a float32 zero vector.
    total = np.zeros((num_features, ), dtype='float32')
    for token in known_tokens:
        total = np.add(total, model[token])

    if not known_tokens:
        return total
    return np.divide(total, len(known_tokens))


'''
Build the set of keywords, based on the metadata records in GeoCatalogue
Get all the records, save all the keywords and remove duplicates
'''
def build_keywords_set (article_keywords, keywords_set):
    """Append the new, lower-cased keywords of one record to ``keywords_set``.

    Parameters
    ----------
    article_keywords : iterable of str
        Keyword entries of a record. An entry may bundle several keywords
        separated by ``', '``. Empty or ``None`` entries are skipped, as
        ``build_keywords_dict`` does.
    keywords_set : list of str
        Keywords collected so far. It is extended in place; first-seen order
        is preserved and no duplicates are added.

    Returns
    -------
    list of str
        The same ``keywords_set`` object that was passed in.
    """
    # Membership is tested against a set so each lookup is O(1) instead of
    # scanning the whole list for every keyword.
    seen = set(keywords_set)
    for entry in article_keywords:
        if not entry:
            continue
        # Splitting an entry without ', ' yields the entry itself, so one code
        # path covers both the single- and the multi-keyword case.
        for part in entry.split(', '):
            keyword = part.lower()
            if keyword not in seen:
                seen.add(keyword)
                keywords_set.append(keyword)

    return keywords_set

'''
Build a dictionary of keywords, based on the metadata records in GeoCatalogue
Get all the records, save all the keywords and see how often they appear
'''
def build_keywords_dict (article_keywords, keywords_dict):
    """Count the lower-cased keywords of one record into ``keywords_dict``.

    Empty or ``None`` entries are skipped. An entry that bundles several
    keywords separated by ``', '`` contributes one count per keyword. The
    dictionary is updated in place and also returned.
    """
    for entry in article_keywords:
        if not entry:
            continue
        # An entry without ', ' splits into just itself, so no separate
        # branch is needed for single keywords.
        for part in entry.split(', '):
            name = part.lower()
            keywords_dict[name] = keywords_dict.get(name, 0) + 1
    return keywords_dict

'''
Set the language model, by default it is Google News dataset and it is used
to get similarities between words and sentences
'''
def init_language_model (language_model=LANGUAGE_MODEL):
    """Load binary word2vec vectors and return them with their vocabulary.

    Prints the path being loaded, then returns a ``(model, index2word_set)``
    tuple where ``index2word_set`` is the set of words known to the model.
    """
    print(language_model)
    word_vectors = KeyedVectors.load_word2vec_format(language_model, binary=True)
    vocabulary = set(word_vectors.wv.index2word)

    return word_vectors, vocabulary


'''
For all the entries in the list of the keywords, save only the ones similar to
the search query
'''
def get_best_keywords_from_list (keywords_set, sent1, similarity_index):
    """Return the keywords whose cosine similarity to ``sent1`` exceeds a threshold.

    Each keyword is turned into a vector with ``avg_feature_vector`` using the
    module-level ``model``, ``NUM_FEATURES`` and ``index2word_set``. Keywords
    whose vector is all zeros are skipped. The selected keywords are printed
    and returned in their original order.
    """
    selected = []

    for candidate in keywords_set:
        candidate_vec = avg_feature_vector(candidate, model, num_features=NUM_FEATURES,
                                           index2word_set=index2word_set)
        # An all-zero vector carries no direction, so no cosine similarity
        # can be computed for it.
        if not np.any(candidate_vec):
            continue
        similarity = 1 - spatial.distance.cosine(sent1, candidate_vec)
        if similarity > similarity_index:
            selected.append(candidate)

    print("best keywords for similarity %f: " %similarity_index)
    print(selected)
    return selected


'''
Auxiliary function to get corresponding parts of speech tags between NLP libraries
'''
def get_wordnet_pos(word):
    """Map the POS tag of ``word`` to the constant that lemmatize() accepts.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects the WordNet category. Any tag that does not
    start with J, N, V or R falls back to ``wordnet.NOUN``.
    """
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()

    wordnet_by_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if first_letter in wordnet_by_letter:
        return wordnet_by_letter[first_letter]
    return wordnet.NOUN


'''
This function will remove all stop words and punctuation in the text and return
a preprocessed variation of the initial description
In order to keep the essential words in the description, the following were removed:
the stopwords, the non-literal characters, short common words and all the words were
transformed to their basic form
'''  
def prepareDescription(text, keepwords="", abbrevations=""):
    """Reduce a free-text description to its distinct, meaningful lemmas.

    Steps: replace every non-ASCII-letter character with a space, tokenize,
    lower-case, lemmatize (POS chosen by ``get_wordnet_pos``), drop English
    stop words and duplicates, then keep a word only if it is longer than
    three characters and accepted by ``DICT.check``, or if it is found in
    ``keepwords`` or ``abbrevations``.

    Parameters
    ----------
    text : object
        Description to clean; it is converted with ``str()`` first.
    keepwords, abbrevations : container of str, optional
        Words kept regardless of length or dictionary check. NOTE: with a
        plain string (such as the default ``""``) the ``in`` test is a
        substring test, not a whole-word test.

    Returns
    -------
    str
        The surviving words joined by single spaces, in first-occurrence order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", str(text))

    # Lower-cased tokens, de-duplicated up front (order preserved) so that the
    # costly POS tagging + lemmatization runs once per distinct token. The
    # lemma depends only on the token itself, so the final result is unchanged.
    tokens = [token.lower() for token in word_tokenize(letters_only)]
    distinct_tokens = list(dict.fromkeys(tokens))

    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in distinct_tokens]

    # A set gives O(1) stop-word lookups instead of scanning a list per word.
    stop_words = set(stopwords.words('english'))
    main_words = [word for word in dict.fromkeys(lemmas) if word not in stop_words]

    # Remove short / unknown words, but keep the special words.
    return ' '.join([word for word in main_words
                     if (len(word) > 3 and DICT.check(word)) or word in keepwords or word in abbrevations])

'''
Based on the training done by the network, one or multiple labels should be applied
on metadata record descriptions that have no keywords attached to them

In this case, the starting probability is 0.5, meaning that the algorithm will return
all labels with probability higher than 0.5
If there are none, the function can be called with a smaller probability parameter
until at least one label is obtained
'''
def predict_label(abstract, probability=0.5):
    """Ask the fastText command-line tool for the labels of one description.

    The command ``<FASTTEXT_PATH><FASTTEXT_COMMAND> predict <model>.bin - -1
    <probability>`` is run with the description supplied on its standard
    input.

    Parameters
    ----------
    abstract : str
        One description, typically a line read from a file (a trailing
        newline is tolerated).
    probability : float, optional
        Threshold passed to fastText as its last argument.

    Returns
    -------
    str
        What the tool printed (standard output and standard error combined),
        without the final newline. An empty string means no label was printed.
    """
    # Collapse every run of whitespace (including the trailing newline) to a
    # single space: this is what the previous "echo <words>" pipeline fed to
    # fastText, minus echo's flag handling ("-n", "-e") and the child process
    # that was never waited on.
    payload = " ".join(abstract.split()) + "\n"

    command = (FASTTEXT_PATH + FASTTEXT_COMMAND + " predict " + FASTTEXT_PATH + MODEL_NAME + ".bin" + " - -1 " +
               str(float(probability)))
    print(command)

    # stderr stays merged into stdout, as before, so callers see the same text.
    completed = subprocess.run(command.split(), input=payload.encode("utf-8"),
                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    out_label = completed.stdout.decode("utf-8")
    # Strip only a real trailing newline instead of blindly dropping one char.
    if out_label.endswith("\n"):
        out_label = out_label[:-1]
    return out_label


'''
In order to build training data, metadata descriptions are obtained from the GeoCatalogue
and the best keywords are selected from the entire list
The best keywords are words similar to the search query, that can be found in the metadata
'''
def build_training_test_database(records, best_keywords):
    """Write the fastText training file and the test file from metadata records.

    For every record the abstract has its newlines replaced by spaces (the
    record is updated in place) and is simplified with ``prepareDescription``.
    The simplified abstract is always written to ``TEST_FILE``. It is also
    written to ``TRAINING_FILE``, prefixed by one ``__label__<keyword>`` per
    matching keyword (spaces in keywords become ``-``), when at least one of
    the record's keywords is in ``best_keywords``.

    Parameters
    ----------
    records : mapping
        Maps a record id to an object with ``subjects`` (iterable of keyword
        entries, possibly several keywords separated by ``', '``) and
        ``abstract`` (str) attributes.
    best_keywords : iterable of str
        Lower-cased keywords that may be used as labels.
    """
    # One-off conversion so each keyword check is O(1).
    wanted = set(best_keywords)

    # The context manager closes both files even if a record fails halfway.
    with open(TRAINING_FILE, 'w') as training_file, open(TEST_FILE, 'w') as test_file:
        for rec in records:
            record = records[rec]

            # Collect the record's keywords that are also "best" keywords.
            article_best_keywords = []
            for entry in record.subjects:
                if not entry:
                    # Empty / None keyword entries carry no label.
                    continue
                # An entry without ', ' splits into just itself.
                for single_key in entry.split(', '):
                    keyword = single_key.lower()
                    if keyword in wanted:
                        article_best_keywords.append(keyword)

            # Remove the new lines; each output line must hold one abstract.
            record.abstract = record.abstract.replace('\n', ' ')
            # Simplify once and reuse for both files.
            description = prepareDescription(record.abstract)

            if article_best_keywords:
                # Labels may not contain spaces, so they are replaced with '-'.
                labels = ''.join('__label__%s ' % key.replace(' ', '-')
                                 for key in article_best_keywords)
                training_file.write(labels + description + '\n')

            test_file.write(description + '\n')


'''
Run the classifier
At this point, it is called through a bash command and it can be tested for various
combinations of parameters
'''
def train_fasttext_classifer(training_file=TRAINING_FILE, lr=0.5, epoch=25, wordNgrams=2):
    """Train the fastText supervised classifier through its command-line tool.

    First runs ``rm MODEL_NAME``, then
    ``<FASTTEXT_PATH><FASTTEXT_COMMAND> supervised -input <training_file>
    -output <FASTTEXT_PATH><MODEL_NAME> -lr .. -epoch .. -wordNgrams ..``.
    The training command is printed before it is executed.

    Parameters
    ----------
    training_file : str, optional
        Path of the labelled training file.
    lr : float, optional
        Value passed as ``-lr`` (default 0.5, the previously hard-coded value).
    epoch : int, optional
        Value passed as ``-epoch`` (default 25).
    wordNgrams : int, optional
        Value passed as ``-wordNgrams`` (default 2).
    """
    # Remove the previous model; failures of "rm" (e.g. nothing to delete)
    # are not checked.
    bashCommand = ('rm ' + MODEL_NAME)
    process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
    process.communicate()

    bashCommand = (FASTTEXT_PATH + FASTTEXT_COMMAND + ' supervised -input ' +
                training_file + ' -output ' + FASTTEXT_PATH + MODEL_NAME + ' -lr ' + str(lr) +
                ' -epoch ' + str(epoch) + ' -wordNgrams ' + str(wordNgrams))
    print(bashCommand)
    process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
    # Wait for training to finish; the captured output is not used.
    process.communicate()


'''
After the classifier is trained, it is run on the records that have no keywords associated
in order to provide the most similar entries.
The classifier tries to get the most probable matches, starting from a threshold of 0.5 and then
it decreases the probability by 0.1 until it gets a result.
If no result is obtained, the search query is used as the label for that abstract
'''
def run_classifier_on_data(count, test_file=TEST_FILE):
    """Label every line of ``test_file`` and write the result to a new file.

    Each line is first classified with ``predict_label`` at its default
    threshold. If that yields no label the thresholds 0.4, 0.3, 0.2 and 0.1
    are tried in turn. If none of them produces a label, the search query
    (spaces replaced by ``-``) is used as the label. Every output line is
    ``<labels> <original line>`` and goes to ``LABELED_FILE + str(count)``.

    Parameters
    ----------
    count : int or str
        Suffix appended to ``LABELED_FILE`` to form the output file name.
    test_file : str, optional
        File with one description per line.
    """
    # Explicit thresholds instead of repeatedly subtracting 0.1 from a float:
    # 0.4 - 4 * 0.1 leaves a tiny positive remainder in floating point, which
    # used to trigger one extra prediction at a threshold of practically zero
    # and made the search-query fallback unreachable.
    fallback_thresholds = (0.4, 0.3, 0.2, 0.1)

    myfile = LABELED_FILE + str(count)
    print("fisierul de deschis e %s" %myfile)

    # Both files are closed even when a prediction raises.
    with open(myfile, "w") as fp, open(test_file) as f:
        # Get all the entries in the test file and try to get labels for them.
        for abstract in f:
            out_label = predict_label(abstract)
            if out_label == '':
                for prob in fallback_thresholds:
                    out_label = predict_label(abstract, prob)
                    if out_label != '':
                        break
                else:
                    # No threshold produced a label: use the search query.
                    out_label = "__label__" + SEARCH_QUERY.replace(" ", "-")
            fp.write(out_label + " " + abstract)


'''
Show how many entries are in each category
In case there are few entries, they will be printed in order
to get an example over how the classifier runs.
All the final entries can be found in LABELED_FILE
'''
def show_results(title, count, labeled_file=LABELED_FILE):
    """Plot how many labelled lines fall under each label.

    Reads ``labeled_file + str(count)``, extracts the leading ``__label__``
    prefixes of every line, counts the lines per label and draws an annotated
    bar chart titled ``"sim" + str(title)``. Lines that do not match the label
    pattern are reported with ``match is None`` and skipped.
    """
    pattern = r'(__label__(\w|-)+\s*)+(\w\s)+'
    texts_by_label = {}

    with open(labeled_file + str(count)) as f:
        for line in f.readlines():
            match = re.search(pattern, line)
            if match is None:
                print("match is None")
                continue

            labels = match.group(0)
            # Everything after the matched label prefix.
            text = line.split(labels)[1]
            for label in re.split('__label__', labels):
                if label == '':
                    continue
                # The label name ends at the first space.
                mylabel = label.split(' ')[0]
                texts_by_label.setdefault(mylabel, []).append(text)

    print("text_dict len %d" %len(texts_by_label))
    # Only the number of lines per label is plotted.
    counts = {label: len(texts) for label, texts in texts_by_label.items()}

    df = pandas.DataFrame.from_dict(counts, orient='index')
    ax = df.plot(kind='bar', title="sim"+str(title))
    # Write each bar's height just above it.
    for bar in ax.patches:
        ax.annotate(str(bar.get_height()), (bar.get_x() * 1.005, bar.get_height() * 1.005))


def calculate_wcss(data):
    """Fit KMeans for 2 to 20 clusters and return the inertia of each fit.

    The returned list has 19 entries; entry ``i`` belongs to ``i + 2``
    clusters.
    """
    inertias = []
    for cluster_count in range(2, 21):
        estimator = KMeans(n_clusters=cluster_count)
        estimator.fit(X=data)
        inertias.append(estimator.inertia_)

    return inertias

def optimal_number_of_clusters(wcss):
    """Pick the "elbow" of a within-cluster-sum-of-squares curve.

    ``wcss[i]`` is taken to be the value for ``i + 2`` clusters. A straight
    line is drawn from the first to the last point of the curve and the
    number of clusters whose point lies farthest from that line is returned
    (the first one on ties).

    Parameters
    ----------
    wcss : sequence of float
        Curve values, one per cluster count starting at 2. Must not be empty
        (an empty sequence raises ``IndexError``).

    Returns
    -------
    int
        The selected number of clusters, between 2 and ``len(wcss) + 1``.
    """
    first_k = 2
    # The last point sits at the cluster count that matches the list length
    # (20 for the usual 19 values) instead of a hard-coded 20, so shorter or
    # longer curves are measured against the correct line.
    last_k = first_k + len(wcss) - 1
    x1, y1 = first_k, wcss[0]
    x2, y2 = last_k, wcss[-1]

    denominator = sqrt((y2 - y1)**2 + (x2 - x1)**2)
    if denominator == 0:
        # Single point: there is no line to measure against.
        return first_k

    # Perpendicular distance of every point (x0, y0) to the line.
    distances = [
        abs((y2-y1)*x0 - (x2-x1)*y0 + x2*y1 - y2*x1) / denominator
        for x0, y0 in enumerate(wcss, start=first_k)
    ]

    return distances.index(max(distances)) + first_k

def sent_vectorizer(sent, model):
    """Return the average of the model vectors of the words in ``sent``.

    Parameters
    ----------
    sent : iterable of str
        The words of one sentence.
    model : mapping
        Maps a word to its vector; words that raise ``KeyError`` are skipped.

    Returns
    -------
    numpy.ndarray
        The mean vector of the known words, or an empty array when none of
        the words is known.
    """
    known_vectors = []
    for w in sent:
        try:
            known_vectors.append(model[w])
        except KeyError:
            # Out-of-vocabulary word: it does not take part in the average.
            continue

    if not known_vectors:
        # Explicit empty result instead of relying on "empty array / 0".
        return np.asarray([])

    total = known_vectors[0]
    for vector in known_vectors[1:]:
        total = np.add(total, vector)

    return np.asarray(total) / len(known_vectors)

def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized texts, keeping selected parts of speech.

    Every text (a list of words) is joined with spaces and parsed with
    ``nlp``. A token's lemma is kept when the lemma is in the module-level
    ``keepwords`` or when the token's part of speech is in ``allowed_postags``.
    Tag reference: https://spacy.io/api/annotation

    NOTE: a second function named ``lemmatization`` is defined further down in
    this file and replaces this one once the file has been executed.
    """
    lemmatized_texts = []
    for sent in texts:
        parsed = nlp(" ".join(sent))
        kept = []
        for token in parsed:
            if token.lemma_ not in keepwords and token.pos_ not in allowed_postags:
                continue
            kept.append(token.lemma_)
        lemmatized_texts.append(kept)

    return lemmatized_texts

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize every document with ``simple_preprocess`` and drop the stop words.

    Uses the module-level ``stop_words``. Returns one list of tokens per
    document.
    """
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([token for token in tokens if token not in stop_words])
    return cleaned

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``."""
    with_bigrams = []
    for doc in texts:
        with_bigrams.append(bigram_mod[doc])
    return with_bigrams

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``."""
    with_trigrams = []
    for doc in texts:
        bigram_doc = bigram_mod[doc]
        with_trigrams.append(trigram_mod[bigram_doc])
    return with_trigrams

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Return the lemmas of every token of every text.

    Each text (a list of words) is joined with spaces and parsed with the
    module-level ``nlp``. ``allowed_postags`` is accepted but currently not
    applied: all tokens are kept.
    Tag reference: https://spacy.io/api/annotation
    """
    lemmatized_texts = []
    for sent in texts:
        parsed = nlp(" ".join(sent))
        lemmas = []
        for token in parsed:
            lemmas.append(token.lemma_)
        lemmatized_texts.append(lemmas)
    return lemmatized_texts

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaMallet model is trained for every topic count in
    ``range(start, limit, step)`` using the module-level ``mallet_path``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between two tried numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        print('Calculating {}-topic model'.format(num_topics))
        # Use the dictionary that was passed in rather than a global
        # ``id2word``, so model and coherence score share the same vocabulary.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values


def format_topics_sentences(texts, ldamodel, corpus):
    """Build a table with the dominant topic of every document.

    Parameters
    ----------
    texts : sequence
        Original texts, appended as the last column of the result.
    ldamodel : topic model
        ``ldamodel[corpus]`` must yield, per document, ``(topic, weight)``
        pairs; ``ldamodel.show_topic(topic)`` must yield ``(word, weight)``
        pairs.
    corpus : object
        Corpus handed to ``ldamodel``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to four
        decimals) and ``Topic_Keywords`` (comma-separated words), followed by
        the column holding ``texts``.
    """
    # Rows are gathered in a plain list and turned into a DataFrame once;
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas 2.x.
    rows = []
    for topic_weights in ldamodel[corpus]:
        ranked = sorted(topic_weights, key=lambda pair: pair[1], reverse=True)
        if not ranked:
            # Document without any topic: contributes no row.
            continue
        # Highest weight first => dominant topic.
        topic_num, prop_topic = ranked[0]
        topic_keywords = ", ".join([word for word, prop in ldamodel.show_topic(topic_num)])
        rows.append([int(topic_num), round(prop_topic, 4), topic_keywords])

    sent_topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


def replace_abbrevations (text, abbreviations) :
    """Expand every standalone abbreviation found in ``text``.

    Parameters
    ----------
    text : str
        Text to process; abbreviations are searched in lower case.
    abbreviations : dict of str to str
        Maps an abbreviation to the words that replace it.

    Returns
    -------
    str
        The text with each occurrence replaced, except occurrences that have
        a letter directly before or after them (those are part of a word).
    """
    for abbr, expansion in abbreviations.items():
        needle = abbr.lower()
        if not needle:
            # An empty abbreviation would match at every position.
            continue

        index = text.find(needle)
        while index != -1:
            end = index + len(needle)
            # Is this part of a word? A letter right before or right after
            # the match means it must be left alone.
            letter_before = index != 0 and text[index - 1].isalpha()
            letter_after = end < len(text) and text[end].isalpha()

            if letter_before or letter_after:
                index = text.find(needle, index + 1)
            else:
                text = text[:index] + expansion + text[end:]
                # Continue after the inserted words: an expansion that itself
                # contains the abbreviation would otherwise be expanded again
                # and again without end.
                index = text.find(needle, index + len(expansion))

    return text


def wmd(q1, q2):
    """Return ``model.wmdistance`` between two texts.

    Both inputs are converted to strings, lower-cased, split on whitespace and
    stripped of English stop words before the distance is computed with the
    module-level ``model``.
    """
    first_tokens = str(q1).lower().split()
    second_tokens = str(q2).lower().split()
    english_stops = stopwords.words('english')

    first_kept = []
    for token in first_tokens:
        if token not in english_stops:
            first_kept.append(token)

    second_kept = []
    for token in second_tokens:
        if token not in english_stops:
            second_kept.append(token)

    return model.wmdistance(first_kept, second_kept)
    
def norm_wmd(q1, q2):
    """Return ``norm_model.wmdistance`` between two texts.

    Same preprocessing as ``wmd`` (string conversion, lower-casing, whitespace
    split, English stop-word removal), but the distance comes from the
    module-level ``norm_model``.
    """
    first_tokens = str(q1).lower().split()
    second_tokens = str(q2).lower().split()
    english_stops = stopwords.words('english')

    first_kept = []
    for token in first_tokens:
        if token not in english_stops:
            first_kept.append(token)

    second_kept = []
    for token in second_tokens:
        if token not in english_stops:
            second_kept.append(token)

    return norm_model.wmdistance(first_kept, second_kept)
    
def sent2vec(s):
    """Return the unit-length sum of the word vectors of a sentence.

    The sentence is converted to a string, lower-cased and tokenized with
    ``word_tokenize``. Tokens that are in the module-level ``stop_words`` or
    are not purely alphabetic are dropped. The remaining words are looked up
    in the module-level ``model``, and words missing from it are skipped.

    The summed vector is divided by its Euclidean norm. NOTE: when no word
    has a vector the sum is zero and the division yields ``nan``.
    """
    words = word_tokenize(str(s).lower())
    words = [w for w in words if w not in stop_words and w.isalpha()]

    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:
            # Word unknown to the model: skip it. Other errors are no longer
            # hidden by a bare ``except``.
            continue

    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())


"""
All the abbreviations related to energy in the text will be replaced by their corresponding words,
so it would be easier to compute the distances
"""

def replace_abbrevations (text, abbreviations):
    """Expand every standalone abbreviation found in ``text``.

    Parameters
    ----------
    text : str
        Text to process; abbreviations are searched in lower case.
    abbreviations : dict of str to str
        Maps an abbreviation to the words that replace it.

    Returns
    -------
    str
        The text with each occurrence replaced, except occurrences that have
        a letter directly before or after them (those are part of a word).
    """
    for abbr, expansion in abbreviations.items():
        needle = abbr.lower()
        if not needle:
            # An empty abbreviation would match at every position.
            continue

        index = text.find(needle)
        while index != -1:
            end = index + len(needle)
            # Is this part of a word? A letter right before or right after
            # the match means it must be left alone.
            letter_before = index != 0 and text[index - 1].isalpha()
            letter_after = end < len(text) and text[end].isalpha()

            if letter_before or letter_after:
                index = text.find(needle, index + 1)
            else:
                text = text[:index] + expansion + text[end:]
                # Continue after the inserted words: an expansion that itself
                # contains the abbreviation would otherwise be expanded again
                # and again without end.
                index = text.find(needle, index + len(expansion))

    return text