# Phrasebank from the [Elsevier OA CC-BY corpus](https://researchcollaborations.elsevier.com/en/datasets/elsevier-oa-cc-by-corpus)

This notebook extracts the most common phrases (frequent n-grams) from the training data.

- E.g. phrasebank_pdf: generate an academic phrasebank from a popular [scientific writing guidebook](http://www.phrasebank.manchester.ac.uk/) or a high-level scientific journal.
- E.g. phrasebank_elsevier: generate an academic phrasebank from the [Elsevier OA CC-BY corpus](https://huggingface.co/datasets/orieg/elsevier-oa-cc-by).


## Workflows

In [2]:
### Step 1: Load the data
from datasets import load_dataset

# Fetch the Elsevier OA CC-BY corpus from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True allows the dataset repository's own
# loading script to execute locally — only keep it for sources you trust.
dataset = load_dataset(
    "orieg/elsevier-oa-cc-by",
    trust_remote_code=True,
)

In [1]:
import nltk
from nltk.util import ngrams

nltk.download('punkt')
from collections import Counter, defaultdict

def generate_ngrams_nltk(tokens_generator, n):
    """Count n-grams of a single size over a stream of token chunks.

    Args:
        tokens_generator: Iterable yielding token sequences (one per chunk).
        n: Size of the n-grams to count.

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of tokens) to the number
        of times it was seen. N-grams are built within a chunk only; they
        never span two chunks.
    """
    totals = Counter()

    for chunk in tokens_generator:
        # Tally the chunk's n-grams one by one
        for gram in ngrams(chunk, n):
            totals[gram] += 1

    return totals

def generate_multiple_ngrams_nltk(tokens_generator, n_values):
    """Count n-grams of several sizes in a single pass over the token chunks.

    Args:
        tokens_generator: Iterable yielding token chunks. Each chunk may be
            any iterable of tokens, including a one-shot iterator.
        n_values: Iterable of n-gram sizes to count (may itself be a
            one-shot iterator).

    Returns:
        A ``defaultdict(Counter)`` mapping each size ``n`` to a ``Counter``
        of n-gram tuples. N-grams are built within a chunk only; they never
        span two chunks.
    """
    # Materialise the sizes once: a one-shot iterator would otherwise be
    # exhausted after the first chunk and later chunks would go uncounted.
    sizes = tuple(n_values)
    ngram_freqs = defaultdict(Counter)

    for tokens in tokens_generator:
        # Each chunk is traversed once per size, so it must be re-iterable.
        if not isinstance(tokens, (list, tuple)):
            tokens = tuple(tokens)
        for n in sizes:
            ngram_freqs[n].update(ngrams(tokens, n))

    return ngram_freqs


def smart_join(tokens: "list | tuple"):
    """Join word/punctuation tokens back into a readable phrase.

    Tokens are separated by a single space, except that no space is put
    before closing punctuation (``, . ; : ! ? ) ] }``) or before an English
    clitic such as ``'s`` / ``n't``, and no space is put after an opening
    bracket. Empty tokens are ignored.

    Args:
        tokens: Sequence of string tokens (e.g. one n-gram tuple).

    Returns:
        The joined phrase, or ``""`` when ``tokens`` is empty or ``None``.
    """
    if not tokens:
        return ""

    no_space_before = ',.;:!?)]}'
    no_space_after = '([{'
    # Contraction pieces that a tokenizer splits off and that attach to the previous word
    clitics = {"'s", "'re", "'ve", "'ll", "'d", "'m", "n't"}

    phrase = ""
    for token in tokens:
        if not token:
            # Nothing to append; also avoids indexing into an empty string
            continue
        if not phrase:
            phrase = token
        elif (
            token[0] in no_space_before
            or token in clitics
            or phrase[-1] in no_space_after
        ):
            # Attach directly: "however" + "," -> "however,"  /  "(" + "Fig" -> "(Fig"
            phrase += token
        else:
            phrase += ' ' + token

    return phrase

def flatten(nested_list):
    """Lazily yield the leaves of an arbitrarily nested list, depth-first.

    Only ``list`` instances are descended into; every other item (strings,
    tuples, numbers, ...) is yielded unchanged.
    """
    # Stack of iterators: the top one is the (sub)list currently being walked
    pending = [iter(nested_list)]
    while pending:
        try:
            item = next(pending[-1])
        except StopIteration:
            # Current level is finished; resume the enclosing one
            pending.pop()
            continue
        if isinstance(item, list):
            pending.append(iter(item))
        else:
            yield item

def filter_frequent_ngrams(ngram_counts, min_frequency):
    """Return the frequent n-grams as joined phrase strings.

    An n-gram is kept when it occurs at least ``min_frequency`` times and
    both its first and last tokens are alphanumeric, so phrases never start
    or end with punctuation.

    Args:
        ngram_counts: Mapping of n-gram tuple -> occurrence count.
        min_frequency: Minimum count required to keep an n-gram.

    Returns:
        List of phrases (via ``smart_join``) in the mapping's iteration order.
    """
    def is_wanted(ngram, count):
        return count >= min_frequency and ngram[0].isalnum() and ngram[-1].isalnum()

    return [
        smart_join(ngram)
        for ngram, count in ngram_counts.items()
        if is_wanted(ngram, count)
    ]

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
dataset['train']

Dataset({
    features: ['title', 'abstract', 'subjareas', 'keywords', 'asjc', 'body_text', 'author_highlights'],
    num_rows: 32072
})

In [4]:
import json
from nltk.tokenize import word_tokenize

# Text fields of every article that feed the phrasebank
keys = ['title', 'abstract', 'author_highlights','body_text']

# Stream the tokenized text to a JSON-lines file: one JSON array of tokens per line.
# Writing to disk keeps the tokens out of memory for the n-gram counting step.
with open('temp_tokens.json', 'w', encoding='utf-8') as f:
    for key in keys:
        # flatten() is lazy, so iterate it directly rather than copying the
        # whole column into a second list first.
        for text in flatten(dataset['train'][key]):
            if not text:
                # Missing/empty field: nothing to tokenize
                continue
            tokens = word_tokenize(text)
            json.dump(tokens, f)
            f.write('\n')  # newline terminates this record

In [2]:
import json

def tokens_generator(file_path, chunk_size=10000):
    """Yield tokens from a JSON-lines token file in chunks.

    Each non-blank line of the file must be a JSON array of tokens. Lines
    are concatenated until at least ``chunk_size`` tokens are collected, so
    a chunk always ends on a line boundary and may exceed ``chunk_size``.

    Args:
        file_path: Path of the JSON-lines file to read.
        chunk_size: Minimum number of tokens per yielded chunk (the final
            chunk may be shorter).

    Yields:
        Lists of tokens.
    """
    tokens = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Blank line (e.g. trailing newline): json.loads would raise on it
                continue
            tokens.extend(json.loads(line))
            if len(tokens) >= chunk_size:
                yield tokens
                tokens = []
    # Flush the remainder once the file has been closed
    if tokens:
        yield tokens

# n-gram sizes to extract, each mapped to the minimum number of occurrences
# a phrase needs in the corpus to be kept.
# To add 4- or 5-grams, extend this mapping (e.g. 4: 1000, 5: 500) and add the
# matching filter_frequent_ngrams call below.
MIN_FREQUENCY_BY_N = {2: 4000, 3: 1500}
n_values = list(MIN_FREQUENCY_BY_N)

# Create a generator for the tokens
tokens_gen = tokens_generator('temp_tokens.json')

# Count every requested n-gram size in a single pass over the token file
ngram_freqs = generate_multiple_ngrams_nltk(tokens_gen, n_values)

# Filter the frequent n-grams.
# phrases_2 is displayed in the following cell, so it must be defined here
# for the notebook to survive Restart Kernel -> Run All.
phrases_2 = filter_frequent_ngrams(ngram_freqs[2], MIN_FREQUENCY_BY_N[2])
phrases_3 = filter_frequent_ngrams(ngram_freqs[3], MIN_FREQUENCY_BY_N[3])

# TODO: exclude n-grams that contain figure/table references, author names,
# bare numbers or units. Candidate exclusion list (not applied yet):
#check_list = ['Table','table','Fig','Figs','fig','figs','Figure','figure', 'Appendix','Kim','Liu','Wang','Zhang','Li',
#               '0','1','2','3','4','5','6','7','8','9','10','20','40',
#               'm','nm','mm','cm','km','kg','g','s','h','min','sec','day','week','month','year','C','F','K','μl','ml']
#not any(ele in ngram for ele in check_list)

# Write the 3-gram phrases to a Markdown file, one per line, sorted so the
# output is deterministic and easy to scan.
with open('../elsevier_phrasebank.md', 'w', encoding='utf-8') as file:
    for line in sorted(phrases_3):
        file.write(line + '\n')

: 

In [3]:
phrases_2

['models and',
 'effect of',
 'animal models',
 'developed in',
 'in a',
 'case for',
 'response in',
 'in the',
 'on the',
 'properties of',
 'models to',
 'to predict',
 'analysis of',
 'using the',
 'evaluation of',
 'of new',
 'study of',
 'levels of',
 'association with',
 'with the',
 'of the',
 'and functional',
 'opportunities for',
 'along the',
 'to achieve',
 'data collection',
 'presentation of',
 'performance of',
 'of two',
 'climate change',
 'in urban',
 'land use',
 'study in',
 'The effect',
 'with a',
 'of research',
 'role of',
 'dynamics of',
 'efficacy of',
 'of a',
 'model for',
 'The data',
 'data of',
 'if the',
 'decision making',
 'across the',
 'South Africa',
 'used in',
 'effect in',
 'to a',
 'in South',
 'case of',
 'expression of',
 'the risk',
 'risk of',
 'of large',
 'outside the',
 'of both',
 'around the',
 'the world',
 'from a',
 'genes in',
 'data on',
 'development and',
 'and gas',
 'in human',
 'work in',
 'in vitro',
 'in vivo',
 'of each',
