# Annotation extraction

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import sklearn
import matplotlib.pyplot as plt
import re

# Method for normalizing text

In [2]:
def normalize_tweet(text):
    """
    Lowercases a tweet and strips mentions, links, and punctuation.

    Hashtag words are kept; only the leading '#' is dropped together with
    the rest of the punctuation. Runs of whitespace are collapsed to a
    single space and leading/trailing whitespace is removed.

    :param text: Text to be cleaned
    :return: Lowercased text with mentions, urls, and punctuation removed
    """
    processed_text = text.lower()
    # Drop whole tokens that begin with a mention or link prefix. The t.co
    # alternative is anchored at a word boundary and requires "co/", so an
    # ordinary word ending in "t" followed by a period (e.g. "great.news")
    # is no longer truncated.
    processed_text = re.sub(r"(?:\@|http?\://|https?\://|www|\bt\.co/)\S+", "", processed_text)
    # Turn separators into spaces so the words on either side stay apart.
    processed_text = re.sub(r"(?:\.|,|\?|-)", " ", processed_text)
    # Remove prefixes left standing alone (the first pass needs at least one
    # trailing non-space character to match).
    processed_text = re.sub(r"(?:\@|http?\://|https?\://|www|\.com)", "", processed_text)
    # Strip everything that is neither a word character nor whitespace
    # (underscores count as word characters and are kept).
    processed_text = re.sub(r'[^\w\s]', '', processed_text)
    # Collapse whitespace runs and trim both ends.
    processed_text = " ".join(processed_text.split())
    return processed_text

# Method for extracting the abstract from a spot

In [3]:
def extract_spot(text):
    """
    Collects every span of text found between the markers "abstract" and
    "title" and joins them into one string.

    Each span is stripped of surrounding whitespace and followed by ". ".
    Scanning stops as soon as either marker can no longer be found, so an
    "abstract" with no later "title" contributes nothing.

    :param text: String to scan for abstract ... title spans
    :return: The concatenated spans, or "" when no complete span exists
    """
    start_marker = "abstract"
    end_marker = "title"
    pieces = []

    cursor = 0
    while True:
        start = text.find(start_marker, cursor)
        if start == -1:
            # no further opening marker
            break

        # the closing marker is searched for from the opening marker onwards
        stop = text.find(end_marker, start)
        if stop == -1:
            # opening marker without a closing one: nothing more to extract
            break

        span = text[start + len(start_marker):stop].strip()
        pieces.append(span + ". ")

        # resume scanning just past the closing marker
        cursor = stop + len(end_marker)

    return "".join(pieces)

In [4]:
# Read the three spot splits, in train / test / val order.
train, test, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/tag-me-train-spots.csv",
        "data/tag-me-test-spots.csv",
        "data/tag-me-val-spots.csv",
    )
)

In [5]:
# Pull the 'annotations' column out of each split and cast every element
# to a string, giving one Series per dataframe.
train_annotations, test_annotations, val_annotations = (
    frame['annotations'].astype(str)
    for frame in (train, test, val)
)

In [6]:
# Normalize every annotation string in each split.
# Series.map applies the function element by element, so this no longer
# relies on the index labels being exactly 0..n-1 the way the per-element
# `series[i] = ...` assignment did.
train_annotations = train_annotations.map(normalize_tweet)
test_annotations = test_annotations.map(normalize_tweet)
val_annotations = val_annotations.map(normalize_tweet)

In [7]:
# Show where each 'abstract' marker sits in the first training annotation,
# then report how many there are.
abstract_matches = list(re.finditer('abstract', train_annotations[0]))
for m in abstract_matches:
    print(m.start(), m.end())
count = len(abstract_matches)
print("Count: " + str(count))

144 152
534 542
1358 1366
2021 2029
Count: 4


In [8]:
# Total number of 'abstract' markers across the train, test and val splits.
count = sum(
    1
    for annotations in (train_annotations, test_annotations, val_annotations)
    for string in annotations
    for _ in re.finditer('abstract', string)
)
print("Count: " + str(count) + " abstract sentences")

Count: 121179 abstract sentences


In [9]:
# Replace each annotation string with the abstract text extracted from it.
# Series.map applies the function element by element, so this no longer
# relies on the index labels being exactly 0..n-1.
# NOTE: the same names are rebound, so running this cell a second time
# applies extract_spot to its own output.
train_annotations = train_annotations.map(extract_spot)
test_annotations = test_annotations.map(extract_spot)
val_annotations = val_annotations.map(extract_spot)

In [10]:
# Write each split's annotation Series to its own CSV file.
for annotations, out_path in (
    (train_annotations, 'data/annotations_train.csv'),
    (test_annotations, 'data/annotations_test.csv'),
    (val_annotations, 'data/annotations_val.csv'),
):
    annotations.to_csv(out_path)