In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/My Drive/Thesis/Data

/content/drive/My Drive/Thesis/Data


In [None]:
pip install cupy-cuda12x



# **Reading and Cleaning the Data**

In [None]:
#Data for model

import inflect
import pandas as pd

def process_sentences_to_dataframe(input_file):
    """Read raw sentences and renumber them as "<ordinal> family: <text>".

    Every line is stripped of surrounding whitespace and double quotes. A line
    that contains a colon keeps only the text after its first colon and gets a
    fresh prefix built from a running counter ("1st family:", "2nd family:",
    ...). Lines without a colon are skipped and do not advance the counter.

    Args:
        input_file: Path of the text file to read, one sentence per line.

    Returns:
        pandas.DataFrame with a single 'Sentence' column, one row per kept line.
    """
    p = inflect.engine()
    data = []  # Processed sentences, in file order

    # Decode explicitly as UTF-8: the sentences contain non-ASCII characters
    # (e.g. the typographic apostrophe in "Queen’s"), and relying on the
    # platform default encoding would garble them on non-UTF-8 systems.
    with open(input_file, 'r', encoding='utf-8') as file:
        family_number = 1
        for line in file:
            line = line.strip().strip('"')
            _, colon, after_colon = line.partition(':')
            if not colon:
                # No "<prefix>:" part, so this is not a sentence line.
                continue
            ordinal = p.ordinal(family_number)
            data.append(f"{ordinal.capitalize()} family: {after_colon.strip()}")
            family_number += 1

    return pd.DataFrame(data, columns=['Sentence'])

# Usage
input_file_path = 'Notannotated700sentence.txt'  # Path to the file containing the original sentences
df = process_sentences_to_dataframe(input_file_path)

In [None]:
df

Unnamed: 0,Processed Sentence
0,1st family: a healthy queen observed workers a...
1,2nd family: queenright with an extensive brood...
2,3rd family: Queen’s presence confirmed strong ...
3,4th family: Queen is laying well the worker po...
4,5th family: Queen is healthy drone population ...
...,...
699,"700th family: queen presence weak, workers exh..."
700,"701st family: queen observed lethargic, worker..."
701,"702nd family: queenless situation confirmed, w..."
702,"703rd family: queenright uncertain, worker act..."


In [None]:
# Data for inference
import pandas as pd

def read_txt_to_df(file_path, column_name='Sentence'):
    """Load a text file into a one-column DataFrame, one row per line.

    Args:
        file_path: Path of the text file to read.
        column_name: Name of the single column in the returned frame.

    Returns:
        pandas.DataFrame whose only column holds each line of the file with
        leading and trailing whitespace removed. Blank lines are kept as empty
        strings, so row positions match line positions in the file.
    """
    # Decode explicitly as UTF-8 so non-ASCII characters in the sentences do
    # not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file]

    return pd.DataFrame(lines, columns=[column_name])

file_path = 'Inferrence_data.txt'
inferrence_data = read_txt_to_df(file_path)

In [None]:
# Keep only the rows from position 705 onward. Taking an explicit copy makes
# the result an independent DataFrame, so adding columns to it afterwards does
# not raise pandas' SettingWithCopyWarning.
# NOTE: re-running this cell slices the already-sliced frame again.
inferrence_data = inferrence_data.iloc[705:].copy()

In [None]:
inferrence_data

Unnamed: 0,Sentence
705,"702th family: queenright uncertain, workers ar..."
706,"703th family: queenright uncertain, workers ex..."
707,"704th family: queen presence weak, workers are..."
708,"705th family: queen is missing, worker activit..."
709,"706th family: queen observed lethargic, worker..."
...,...
905,"902th family: vibrant queen, worker population..."
906,"903th family: queenless situation, workers exh..."
907,"904th family: vibrant queen, workers are activ..."
908,"905th family: vibrant queen, vibrant worker ac..."


In [None]:
def find_duplicate_rows(df, column_name='Sentence'):
    """Return the rows whose text after the first colon occurs more than once.

    The comparison key is the part of each value after its first ':' with
    surrounding whitespace removed. Values without a colon get the key None.
    The input frame is not modified.

    Args:
        df: DataFrame holding the sentences.
        column_name: Name of the column that contains the sentences.

    Returns:
        DataFrame with only `column_name`, containing every row (all
        occurrences, original index preserved) whose key is duplicated.
    """
    # Build the key as a standalone Series instead of writing a helper column
    # into the caller's frame.
    text_after_colon = df[column_name].apply(
        lambda x: x.split(':', 1)[1].strip() if ':' in x else None
    )
    duplicate_mask = text_after_colon.duplicated(keep=False)
    return df.loc[duplicate_mask, [column_name]]

In [None]:
duplicate_rows = find_duplicate_rows(df)

In [None]:
duplicate_rows_infer = find_duplicate_rows(inferrence_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Text After Colon'] = df[column_name].apply(lambda x: x.split(':', 1)[1].strip() if ':' in x else None)


In [None]:
duplicate_rows

Unnamed: 0,Sentence
111,112th family: Queen aging workers steady limit...
114,115th family: Queen missing workers sluggish f...
115,116th family: Queen visible workers active dro...
116,117th family: Queen productive workers industr...
117,118th family: Queen declining workers maintain...
...,...
542,"543rd family: potential queenless situation, w..."
590,"591st family: potential queenless situation, w..."
606,"607th family: questionable queenright status, ..."
625,"626th family: queenright uncertain, worker pop..."


In [None]:
duplicate_rows_infer

Unnamed: 0,Sentence
705,"702th family: queenright uncertain, workers ar..."
784,"781th family: queenright uncertain, workers ar..."


In [None]:
#Remove duplicate
def remove_duplicates(df, column_name='Sentence'):
    """Drop rows whose text after the first colon repeats an earlier row.

    The comparison key is the part of each value after its first ':' with
    surrounding whitespace removed. Values without a colon all share the empty
    key, so only the first such row is kept. The input frame is not modified.

    Args:
        df: DataFrame holding the sentences.
        column_name: Name of the column that contains the sentences.

    Returns:
        A new DataFrame with the first occurrence of every key, original index
        preserved, and without any 'Text After Colon' helper column.
    """
    # Build the key as a standalone Series instead of writing a helper column
    # into the caller's frame (which also triggered SettingWithCopyWarning when
    # the caller passed a slice).
    text_after_colon = df[column_name].apply(
        lambda x: x.split(':', 1)[1].strip() if ':' in x else ''
    )
    first_occurrence_mask = ~text_after_colon.duplicated(keep='first')
    # The result never carries the helper column, even if an earlier step left
    # one behind on the input.
    return df.loc[first_occurrence_mask].drop(columns=['Text After Colon'], errors='ignore')

In [None]:
df = remove_duplicates(df)

In [None]:
df

Unnamed: 0,Sentence
0,1st family: a healthy queen observed workers a...
1,2nd family: queenright with an extensive brood...
2,3rd family: Queen’s presence confirmed strong ...
3,4th family: Queen is laying well the worker po...
4,5th family: Queen is healthy drone population ...
...,...
699,"700th family: queen presence weak, workers exh..."
700,"701st family: queen observed lethargic, worker..."
701,"702nd family: queenless situation confirmed, w..."
702,"703rd family: queenright uncertain, worker act..."


In [None]:
inferrence_data = remove_duplicates(inferrence_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Text After Colon'] = df[column_name].apply(lambda x: x.split(':', 1)[1].strip() if ':' in x else '')


In [None]:
inferrence_data

Unnamed: 0,Sentence
705,"702th family: queenright uncertain, workers ar..."
706,"703th family: queenright uncertain, workers ex..."
707,"704th family: queen presence weak, workers are..."
708,"705th family: queen is missing, worker activit..."
709,"706th family: queen observed lethargic, worker..."
...,...
905,"902th family: vibrant queen, worker population..."
906,"903th family: queenless situation, workers exh..."
907,"904th family: vibrant queen, workers are activ..."
908,"905th family: vibrant queen, vibrant worker ac..."


In [None]:
inferrence_data.to_csv("InferrenceData.csv")

In [None]:
df.to_csv("NotAnnotatedDataCleaned.csv")

# **Processing the annotated data with spaCy**

In [None]:
%cd /content/drive/My Drive/Thesis/Data/Docanno

/content/drive/My Drive/Thesis/Data/Docanno


In [None]:
 import spacy

 !python -m spacy download en_core_web_lg

 nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import json

# Path to the JSONL file containing the data
jsonl_file_path = '/content/Data.jsonl'

# This will store the formatted training data
training_data = {
    'classes': [
        'BeeProduct',
        'Drone',
        'EggCount',
        'Forage',
        'HealthStatus',
        'HoneyFrames',
        'Queen',
        'Worker'],
    'annotations': []
}

# Function to process the JSONL file
def process_jsonl_file(file_path, annotations=None):
    """Load annotations from a JSONL file into a list of training examples.

    Each non-blank line must be a JSON object with a 'text' string and a
    'label' list of [start, end, label] triples. Every line becomes a dict
    {'text': ..., 'entities': [(start, end, LABEL), ...]} with the label
    upper-cased.

    Args:
        file_path: Path of the JSONL file to read.
        annotations: List to append the examples to. Defaults to the
            module-level ``training_data['annotations']``.

    Returns:
        The list the examples were appended to.
    """
    if annotations is None:
        annotations = training_data['annotations']

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not JSON documents.
                continue
            data = json.loads(line)
            entities = [
                (start, end, label.upper())
                for start, end, label in data['label']
            ]
            annotations.append({'text': data['text'], 'entities': entities})

    return annotations

# Process the file
process_jsonl_file(jsonl_file_path)

# Optionally, print the first few annotations to verify
print(training_data['annotations'][:2])  # Adjust slice as needed to see more or fewer examples


[{'text': '3rd family: Queen’s presence confirmed, strong brood development, a significant amount of propolis, bees actively foraging for nectar, and all ten honey frames being filled.', 'entities': [(12, 20, 'QUEEN'), (20, 38, 'HEALTHSTATUS'), (40, 52, 'EGGCOUNT'), (68, 86, 'HEALTHSTATUS'), (90, 98, 'BEEPRODUCT'), (105, 122, 'HEALTHSTATUS'), (127, 133, 'BEEPRODUCT'), (143, 158, 'HONEYFRAMES'), (166, 172, 'HEALTHSTATUS')]}, {'text': '4th family: Queen is laying well, the worker population is high, honey and beeswax levels are sufficient, and there is steady nectar collection with seven honey frames completed.', 'entities': [(12, 18, 'QUEEN'), (18, 32, 'HEALTHSTATUS'), (38, 44, 'WORKER'), (45, 63, 'HEALTHSTATUS'), (65, 71, 'BEEPRODUCT'), (75, 83, 'BEEPRODUCT'), (83, 104, 'HEALTHSTATUS'), (126, 132, 'FORAGE'), (148, 167, 'HONEYFRAMES')]}]


In [None]:
training_data

In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # load a new spacy model
doc_bin = DocBin() # create a DocBin object

In [None]:
from spacy.util import filter_spans

# Start from an empty DocBin so that re-running this cell does not add every
# document a second time to a container filled by a previous run.
doc_bin = DocBin()

for training_example in tqdm(training_data['annotations']):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # "contract" snaps the character offsets inward to token boundaries;
        # char_span returns None when no whole token lies inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was dropped so it can be corrected;
            # tqdm.write keeps the message from breaking the progress bar.
            tqdm.write(
                f"Skipping entity {label} [{start}:{end}] {text[start:end]!r} in: {text!r}"
            )
        else:
            ents.append(span)
    # doc.ents cannot hold overlapping spans; filter_spans resolves overlaps.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

doc_bin.to_disk("Annotated_training_data.spacy")  # save the docbin object

100%|██████████| 99/99 [00:00<00:00, 3408.30it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity





In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
# Partial spaCy training config for a pipeline with a single "ner" component.
# It is written to base_config.cfg in the current working directory; the
# train/dev paths are left null here.
base_config = """
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all other default settings.
[paths]
train = null
dev = null

[system]
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["ner"]
batch_size = 1000

[components]

[components.ner]
factory = "ner"
"""

# Write the base config to a file
with open('base_config.cfg', 'w') as file:
    file.write(base_config)


In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# NOTE(review): --paths.train and --paths.dev point at the same file, so the
# ENTS_F / ENTS_P / ENTS_R columns are computed on the data the model was
# trained on and are not held-out scores.
!python -m spacy train config.cfg --output ./Spacy_model_output --paths.train ./Annotated_training_data.spacy --paths.dev ./Annotated_training_data.spacy

[38;5;4mℹ Saving to output directory: Spacy_model_output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  --------  ------  ------  ------  ------
  0       0     69.47    0.00    0.00    0.00    0.00
  4     200   4590.96   93.20   93.55   92.85    0.93
  9     400   1254.10   95.20   95.81   94.60    0.95
 16     600    731.24   99.12   98.75   99.50    0.99
 24     800    487.29   99.38   98.88   99.87    0.99
 34    1000    435.37   99.56   99.25   99.87    1.00
 47    1200    318.71   99.87   99.75  100.00    1.00
 62    1400    296.15   99.87   99.75  100.00    1.00
 81    1600    226.48   99.87   99.87   99.87    1.00
104    1800    203.78   99.94   99.87  100.00    1.00
132    2000    164.48   99.94  100.00   99.87    1.00
166    2200    187.68   99.94  100.00   99.87    1.00
206    2400    175.24   99.94   99.87

In [None]:
%cd /content/drive/My Drive/Thesis/Data/Docanno/Spacy_model_output

/content/drive/My Drive/Thesis/Data/Docanno/Spacy_model_output


In [None]:
nlp_ner = spacy.load("model-best")

In [None]:
%cd /content/drive/My Drive/Thesis/Data

/content/drive/My Drive/Thesis/Data


In [None]:
import spacy
import json
import pandas as pd

input_file_path = 'NotAnnotatedDataCleaned.csv'
output_file_path = '700annonatedNERdata.json'

ner_df = pd.read_csv(input_file_path)


# Run the trained NER pipeline over every cleaned sentence and collect, per
# sentence, the stripped text plus one dict (text, start, end, label) for each
# predicted entity.
# NOTE: `doc` and `entities` are module-level names; after the loop `doc` stays
# bound to the last processed sentence.
results = []
for sentence in ner_df['Sentence']:
    doc = nlp_ner(sentence.strip())
    entities = [{'text': ent.text, 'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_} for ent in doc.ents]
    results.append({'sentence': sentence.strip(), 'entities': entities})

#with open(output_file_path, 'w') as outfile:
    #json.dump(results, outfile, ensure_ascii=False, indent=4)




In [None]:
print(len(results))
print(results[:2])

558
[{'sentence': '1st family: a healthy queen observed workers are active abundant nectar and pollen collected with a full count of eggs and six honey frames filled.', 'entities': [{'text': 'a', 'start': 12, 'end': 13, 'label': 'HEALTHSTATUS'}, {'text': 'healthy', 'start': 14, 'end': 21, 'label': 'HEALTHSTATUS'}, {'text': 'queen', 'start': 22, 'end': 27, 'label': 'QUEEN'}, {'text': 'workers', 'start': 37, 'end': 44, 'label': 'WORKER'}, {'text': 'are active', 'start': 45, 'end': 55, 'label': 'HEALTHSTATUS'}, {'text': 'nectar', 'start': 65, 'end': 71, 'label': 'FORAGE'}, {'text': 'pollen', 'start': 76, 'end': 82, 'label': 'FORAGE'}, {'text': 'full count of eggs', 'start': 100, 'end': 118, 'label': 'EGGCOUNT'}, {'text': 'six honey frames', 'start': 123, 'end': 139, 'label': 'HONEYFRAMES'}]}, {'sentence': '2nd family: queenright with an extensive brood pattern workers exhibit high foraging behavior honey stores are high and seven honey frames are heavy with product.', 'entities': [{'text'

In [None]:
def extract_entities(text):
    """Run the trained NER pipeline on `text`.

    Returns:
        A list of (start_char, end_char, label) tuples, one per entity the
        pipeline found, in the order the pipeline reports them.
    """
    return [
        (span.start_char, span.end_char, span.label_)
        for span in nlp_ner(text).ents
    ]

# Convert the NER results into the {'text', 'entities': [(start, end, label)]}
# layout. `results` already stores the entities the pipeline predicted for
# exactly these stripped sentences, so they are reused here instead of running
# the model over the whole corpus a second time.
predictions = [
    {
        'text': item["sentence"].strip(),
        'entities': [
            (ent['start'], ent['end'], ent['label'])
            for ent in item['entities']
        ],
    }
    for item in results
]

print(predictions[:2])

[{'text': '1st family: a healthy queen observed workers are active abundant nectar and pollen collected with a full count of eggs and six honey frames filled.', 'entities': [(12, 13, 'HEALTHSTATUS'), (14, 21, 'HEALTHSTATUS'), (22, 27, 'QUEEN'), (37, 44, 'WORKER'), (45, 55, 'HEALTHSTATUS'), (65, 71, 'FORAGE'), (76, 82, 'FORAGE'), (100, 118, 'EGGCOUNT'), (123, 139, 'HONEYFRAMES')]}, {'text': '2nd family: queenright with an extensive brood pattern workers exhibit high foraging behavior honey stores are high and seven honey frames are heavy with product.', 'entities': [(12, 22, 'QUEEN'), (55, 62, 'WORKER'), (76, 84, 'HEALTHSTATUS'), (94, 99, 'BEEPRODUCT'), (107, 119, 'HEALTHSTATUS'), (120, 138, 'HONEYFRAMES'), (149, 162, 'HEALTHSTATUS')]}]


In [None]:
print(predictions[:2])

[{'text': '1st family: a healthy queen observed workers are active abundant nectar and pollen collected with a full count of eggs and six honey frames filled.', 'entities': [(12, 13, 'HEALTHSTATUS'), (14, 21, 'HEALTHSTATUS'), (22, 27, 'QUEEN'), (37, 44, 'WORKER'), (45, 55, 'HEALTHSTATUS'), (65, 71, 'FORAGE'), (76, 82, 'FORAGE'), (100, 118, 'EGGCOUNT'), (123, 139, 'HONEYFRAMES')]}, {'text': '2nd family: queenright with an extensive brood pattern workers exhibit high foraging behavior honey stores are high and seven honey frames are heavy with product.', 'entities': [(12, 22, 'QUEEN'), (55, 62, 'WORKER'), (76, 84, 'HEALTHSTATUS'), (94, 99, 'BEEPRODUCT'), (107, 119, 'HEALTHSTATUS'), (120, 138, 'HONEYFRAMES'), (149, 162, 'HEALTHSTATUS')]}]


In [None]:
len(predictions)

558

In [None]:
#annotated_data = predictions + training_data['annotations']

In [None]:
#len(annotated_data)

657

In [None]:
# Colour per entity label, passed to displaCy's entity visualiser through the
# "colors" option.
colors = {
  'Queen': '#FCDC00',
  'Worker': '#009CE0',
  'BeeProduct': '#0062B1',
  'EggCount': '#AEA1FF',
  'Forage': '#7B64FF',
  'HealthStatus': '#68CCCA',
  'HoneyFrames': '#FA28FF',
  'Drone': '#A4DD00'
}
options = {"colors": colors}
# NOTE(review): `doc` is not created in this cell; this renders whatever
# document an earlier cell left bound to that name.
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)

In [None]:
import spacy
# The visualiser module is `displacy`; `from spacy import display` raises
# ImportError.
from spacy import displacy

# Render one example sentence with the stock small English pipeline, using the
# colour options defined for the entity visualiser.
nlp = spacy.load("en_core_web_sm")
doc = nlp("""10th family: Queen’s health is excellent,
'worker bees are efficiently foraging,
high beeswax production and the egg laying at peak,
 with seven honey frames sealed.""")

displacy.render(doc, style="ent", options=options, jupyter=True)

In [None]:
# Visualise every prediction: rebuild a tokenised Doc from the stored text,
# attach the predicted entity spans and hand it to displaCy.
for pred in predictions:
    doc = nlp.make_doc(pred["text"])
    # char_span yields None when the offsets do not line up with token
    # boundaries; such spans are left out.
    candidate_spans = (
        doc.char_span(start, end, label=label)
        for start, end, label in pred["entities"]
    )
    doc.ents = [span for span in candidate_spans if span is not None]
    spacy.displacy.render(doc, style="ent", options=options, jupyter=True)

In [None]:
%cd /content/drive/My Drive/Thesis/Data/Docanno

/content/drive/My Drive/Thesis/Data/Docanno


In [None]:
# Save the predictions as pretty-printed JSON in the current working directory.
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open('AnnotatedNerData.json', 'w', encoding='utf-8') as f:
    json.dump(predictions, f, ensure_ascii=False, indent=4)

In [None]:
len(predictions)

558