In [None]:
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import bigrams, word_tokenize, FreqDist
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. Recent NLTK
#     releases (3.9+) load 'punkt_tab' instead of the older pickled 'punkt'
#     models, so both are requested to keep the notebook runnable across versions.
#   - 'stopwords': the word lists behind stopwords.words('english').
# The downloads are kept after the imports so all dependencies are visible first.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [57]:
dataset_file_path = os.path.join('..','checked_dataset', 'dataset_negatives.txt')

# Every record is one line of " | "-separated fields. Fields 1-4 are read as
# microorganism, stress, plant and sentence; field 0 is not used in this cell.
with open(dataset_file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

sentences = []
microorganisms = []
stresses = []
plants = []
for line_number, line in enumerate(lines, start=1):
    # A blank line carries no record; skip it instead of failing on the field lookup.
    if not line.strip():
        continue

    # Split once per line rather than re-splitting for every field.
    fields = line.split(" | ")
    if len(fields) < 5:
        # Report which line is malformed instead of surfacing a bare IndexError.
        raise ValueError(
            "{}, line {}: expected at least 5 ' | '-separated fields, found {}".format(
                dataset_file_path, line_number, len(fields)
            )
        )
    microorganism, stress, plant, sentence = fields[1:5]

    # Entity mentions may span several words; each word is stored separately so
    # that single tokens can later be matched against these lists.
    microorganisms.extend(microorganism.split(" "))
    stresses.extend(stress.split(" "))
    plants.extend(plant.split(" "))

    sentences.append(sentence)

# One long string holding every sentence, used as the tokenizer input downstream.
fulltext = " ".join(sentences)

In [58]:
entities = microorganisms + plants + stresses
spelling_errors = ["piechaudi", "microccocus", "yunnanense"]
# NOTE(review): "et. al" contains a space, so it presumably never equals a single
# token produced by word_tokenize -- confirm whether it is still needed.
personalized_stopwords = [",", ".", "(", ")", ";", ":", "reference", "-ref-", "figure", "stress", "plant", "plants", "bacteria", "pgpr", "virus", "viruses", "genera", "strain", "strains", "halophyte", "bacterial", "fungal", "fungi", "species", "mm", "ml", "nacl", "et. al", "also", "like", "sp.", "spp."]
stop_words = set(stopwords.words('english'))
fulltext_tokens = word_tokenize(fulltext)

# Set-based lookups: `entities` holds one item per entity word per dataset line,
# so testing every token against the list would be a linear scan each time.
entity_lookup = set(entities)
# All exclusion lists that are compared against the lower-cased token.
excluded_lowercase = stop_words.union(personalized_stopwords, spelling_errors)

# Keep a token only if it
#   - is longer than 3 characters,
#   - is not an English / personalized stop word or a known misspelling
#     (case-insensitive comparison),
#   - is not an entity word (case-sensitive comparison, as the dataset spells it),
#   - contains no digit.
filtered_fulltext = [
    w for w in fulltext_tokens
    if len(w) > 3
    and w.lower().strip() not in excluded_lowercase
    and w.strip() not in entity_lookup
    and not any(ch.isdigit() for ch in w.strip())
]

In [None]:
# Fold case so that differently capitalised spellings of a word share one count.
final_tokens = list(map(str.lower, filtered_fulltext))

# Tally the tokens; the 15 most frequent ones are shown as the cell output.
frequency = FreqDist(final_tokens)
frequency.most_common(15)

In [None]:
# Plot the counts of the 15 most frequent tokens (same cut-off as most_common(15)).
frequency.plot(15)

In [None]:
## Count adjacent token pairs and keep the 20 most frequent ones.
## Pairs come from the already-filtered token stream, so the two words of a
## bigram may have been separated by removed tokens in the original text.
ngrams = bigrams(final_tokens)
ngram_fd = FreqDist(ngrams).most_common(20)

## Rank once by descending count; both mappings below reuse this ordering
ranked_pairs = sorted(ngram_fd, key=lambda item: item[1], reverse=True)

## (word1, word2) -> count
ngram_sorted = dict(ranked_pairs)

## "word1_word2" -> count, used as the axis labels
ngram_joined = {'_'.join(pair): count for pair, count in ranked_pairs}

## Pandas series keeps the ranking and exposes labels/values for plotting
ngram_freqdist = pd.Series(ngram_joined)

## Horizontal bars so the bigram labels stay readable
fig, ax = plt.subplots(figsize=(4,4))
bar_plot = sns.barplot(x=ngram_freqdist.values, y=ngram_freqdist.index, orient='h', ax=ax)

## Title and axis labels set on the axes the bars were drawn on
ax.set_title('Frequency Distribution')
ax.set_xlabel("Counts")
ax.set_ylabel("Bigram Combinations")
plt.show()