In [8]:
import codecs
import glob
import os
import re
import string

import numpy as np
import pandas as pd

In [2]:
def clean(text):
    """
    Normalise raw text into lower-case tokens separated by single spaces.

    Steps, in order: lower-case the text; drop every line that starts with an
    http(s) URL; turn straight and curly double quotes into a stand-alone '"'
    token; turn parentheses, en dashes and hyphens into spaces; delete every
    character that is not a-z, '!', '#', '?', '"' or a space; pad '?', '#'
    and '!' so they become separate tokens; collapse whitespace.

    NOTE(review): newlines and tabs are not in the retained set, so they are
    deleted rather than turned into spaces and the words on either side are
    joined together.
    NOTE(review): `clean` is defined again further down in this notebook; a
    later definition replaces this one as soon as its cell is run.
    """
    lowered = text.lower()
    # The pattern is anchored with ^ (MULTILINE), so only URLs that open a line are removed.
    no_url_lines = re.sub(r'^https?:\/\/.*[\r\n]*', '', lowered, flags=re.MULTILINE)
    quotes_split = re.sub('[“"”]', ' " ', no_url_lines)
    brackets_gone = re.sub('[()–-]', ' ', quotes_split)
    filtered = re.sub('[^abcdefghijklmnopqrstuvwxyz!#?" ]', '', brackets_gone)
    for mark, padded in (('?', ' ? '), ('#', ' # '), ('!', ' ! ')):
        filtered = filtered.replace(mark, padded)
    return ' '.join(filtered.split())

In [88]:
# Locations of the input data, given as relative paths.
# Check that the path to the datasets folder is correct; if not, adjust these
# variables accordingly.
train_folder = "datasets/train-articles"
dev_folder = "datasets/dev-articles"
train_labels_file = "datasets/train-task2-TC.labels"
dev_template_labels_file = "datasets/dev-task-TC-template.out"
# Output file name; nothing in the cells shown here writes to it.
task_TC_output_file = "baseline-output-TC.txt"

def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """
    Read the articles stored in the files of <folder_name> whose names match
    <file_pattern>.

    Parameters:
        folder_name: directory that contains the article files.
        file_pattern: glob pattern selecting the files to read.

    Returns:
        A dictionary mapping article id -> full text of the file. The id is
        the part of the file name before the first "." with its first seven
        characters removed (presumably an "article" prefix -- confirm against
        the dataset's file names).
    """
    articles = {}
    for filename in sorted(glob.glob(os.path.join(folder_name, file_pattern))):
        article_id = os.path.basename(filename).split(".")[0][7:]
        # newline="" disables newline translation, so "\r\n" sequences are kept
        # and character offsets into the text match the file contents.
        with open(filename, "r", encoding="utf8", newline="") as f:
            articles[article_id] = f.read()
    return articles


def read_predictions_from_file(filename):
    """
    Reader for the gold file and the template output file.

    Every non-blank line must hold four tab-separated fields: article id,
    label (or ? in the case of a template file), begin of the fragment and
    end of the fragment. Blank lines are skipped.

    Returns:
        Four parallel lists, in this order: article ids, span starts,
        span ends, labels. All values are returned as the strings read from
        the file; offsets are not converted to int here.

    Raises:
        ValueError: if a non-blank line does not split into exactly four
        fields.
    """
    articles_id, span_starts, span_ends, gold_labels = ([], [], [], [])
    with open(filename, "r", encoding="utf8") as f:
        for row in f:
            row = row.rstrip()
            if not row:
                # A blank line (e.g. at the end of the file) carries no annotation.
                continue
            article_id, gold_label, span_start, span_end = row.split("\t")
            articles_id.append(article_id)
            gold_labels.append(gold_label)
            span_starts.append(span_start)
            span_ends.append(span_end)
    return articles_id, span_starts, span_ends, gold_labels

def compute_features_b(articles, span_starts, span_ends):
    """
    Build the feature matrix for the spans: a single column holding the
    length (end minus start) of each span.

    `articles` is accepted but not used. Starts and ends are converted with
    int(), so numeric strings are fine. The result has shape (n_spans, 1).
    """
    lengths = [int(end) - int(start) for start, end in zip(span_starts, span_ends)]
    return np.array(lengths).reshape(-1, 1)

def clean(text):
    """
    Normalise raw text into lower-case tokens separated by single spaces,
    keeping full stops as tokens of their own.

    The text is lower-cased and then passed through a fixed sequence of
    regular-expression substitutions: lines starting with an http(s) URL are
    dropped, double quotes (straight or curly) become a stand-alone '"',
    parentheses, en dashes and hyphens become spaces, every character other
    than a-z, '!', '#', '?', '"', '.' or a space is deleted, and each '.' is
    padded with spaces. Finally '?', '#' and '!' are padded as well and
    whitespace is collapsed.

    NOTE(review): newlines and tabs are deleted by the character filter, not
    replaced by spaces, so words on adjacent lines are joined together.
    NOTE(review): `clean` is defined more than once in this notebook; a later
    definition replaces this one as soon as its cell is run.
    """
    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        (r'^https?:\/\/.*[\r\n]*', '', re.MULTILINE),
        ('[“"”]', ' " ', 0),
        ('[()–-]', ' ', 0),
        ('[^abcdefghijklmnopqrstuvwxyz!#?". ]', '', 0),
        ('[.]', ' . ', 0),
    )
    result = text.lower()
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)
    for mark, padded in (('?', ' ? '), ('#', ' # '), ('!', ' ! ')):
        result = result.replace(mark, padded)
    tokens = result.split()
    return ' '.join(tokens)

def read_span(id,span,dev=False):
    """
    Return the text of one span of an article.

    `id` is the key of the article, `span` is indexable with the start offset
    at position 0 and the end offset at position 1. The article is looked up
    in the module-level `dev_articles` dictionary when `dev` is true and in
    `articles` otherwise.
    """
    source = dev_articles if dev else articles
    text = source[id]
    return text[span[0]:span[1]]

In [190]:
# Training data: article texts from the train folder, plus the article ids,
# span offsets and gold labels read from the train labels file.
articles = read_articles_from_file_list(train_folder)
(ref_articles_id,
 ref_span_starts,
 ref_span_ends,
 train_gold_labels) = read_predictions_from_file(train_labels_file)
print(
    "Loaded %d annotations from %d articles"
    % (len(ref_span_starts), len(set(ref_articles_id)))
)

# Dev data: same layout, read from the dev folder and the dev template file.
dev_articles = read_articles_from_file_list(dev_folder)
(dev_article_ids,
 dev_span_starts,
 dev_span_ends,
 dev_labels) = read_predictions_from_file(dev_template_labels_file)
print(
    "Loaded %d annotations from %d articles"
    % (len(dev_span_starts), len(set(dev_article_ids)))
)

# Empty frames; their columns are filled in further down in this cell.
Train = pd.DataFrame()
Dev = pd.DataFrame()
def find_sent(id,start,end,dev=False):
    """
    Return the stretch of article text that surrounds the span [start, end).

    The snippet begins at the closest newline or full stop before `start`
    (or at position 0 when there is none) and stops just before the closest
    newline or full stop at or after `end` (or at the end of the article
    when there is none).

    Parameters:
        id: key of the article (the name shadows the builtin but is kept so
            existing callers keep working).
        start, end: character offsets of the span inside the article.
        dev: look the article up in `dev_articles` instead of `articles`.

    Returns:
        The selected substring of the article.
    """
    text = (dev_articles if dev else articles)[id]

    # rfind returns -1 when nothing is found; the extra 0 clamps that case to
    # the beginning of the article.
    # NOTE(review): the delimiter found before `start` is itself part of the
    # returned text, whereas the delimiter after `end` is excluded.
    begin = max(text.rfind('\n', 0, start), text.rfind('.', 0, start), 0)

    # find also returns -1 on failure. Only real hits may take part in min(),
    # otherwise a missing delimiter would win and the slice would end at -1.
    hits = [pos for pos in (text.find('\n', end), text.find('.', end)) if pos != -1]
    finish = min(hits) if hits else len(text)
    return text[begin:finish]
# Train frame: one row per annotated span.
Train['ID'] = ref_articles_id
Train['Start'] = [int(offset) for offset in ref_span_starts]
Train['End'] = [int(offset) for offset in ref_span_ends]
Train['Span'] = Train['End'] - Train['Start']  # span length in characters
Train['Sentence'] = [
    articles[doc][lo:hi]
    for lo, hi, doc in zip(Train['Start'], Train['End'], Train['ID'])
]
Train['Title'] = Train['ID'].apply(lambda doc: articles[doc].split('\n')[0])  # first line of the article
Train['Target'] = train_gold_labels
Train['Paragraph'] = [
    find_sent(doc, lo, hi)
    for doc, lo, hi in zip(Train['ID'], Train['Start'], Train['End'])
]
# The raw offsets were only needed to derive the columns above.
del Train['Start'], Train['End']

# Dev frame: same columns, built from the dev articles and the template labels.
Dev['ID'] = dev_article_ids
Dev['Start'] = [int(offset) for offset in dev_span_starts]
Dev['End'] = [int(offset) for offset in dev_span_ends]
Dev['Span'] = Dev['End'] - Dev['Start']
Dev['Sentence'] = [
    dev_articles[doc][lo:hi]
    for lo, hi, doc in zip(Dev['Start'], Dev['End'], Dev['ID'])
]
Dev['Title'] = Dev['ID'].apply(lambda doc: dev_articles[doc].split('\n')[0])
Dev['Target'] = dev_labels
Dev['Paragraph'] = [
    find_sent(doc, lo, hi, dev=True)
    for doc, lo, hi in zip(Dev['ID'], Dev['Start'], Dev['End'])
]
del Dev['Start'], Dev['End']

Loaded 6129 annotations from 357 articles
Loaded 1063 annotations from 74 articles


In [191]:
# Matches every character that is NOT an ASCII letter, digit, punctuation mark
# or space. re.escape keeps characters such as "\", "]" and "^" literal inside
# the character class; unescaped, the backslash from string.punctuation would
# act as an escape for the "]" that follows it and would itself be filtered out.
_DISALLOWED_CHARS = re.compile(
    '[^{} ]'.format(re.escape(string.ascii_letters + string.digits + string.punctuation))
)


def clean(text):
    """
    Strip link-like words and non-ASCII characters from `text`.

    The text is split on whitespace and every word containing 'www', 'http'
    or '.com' (case-sensitive) is dropped. The remaining words are joined
    with single spaces, and every character that is not an ASCII letter,
    digit, punctuation mark or space is deleted.

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string (empty if nothing is left).
    """
    kept_words = [
        word
        for word in text.split()
        if 'www' not in word and 'http' not in word and '.com' not in word
    ]
    return _DISALLOWED_CHARS.sub('', ' '.join(kept_words))

In [192]:
# Clean the free-text columns of BOTH frames so that the exported Train and Dev
# data go through identical preprocessing. `clean` is passed directly; wrapping
# it in a lambda adds nothing.
Train['Paragraph'] = Train['Paragraph'].apply(clean)
Train['Title'] = Train['Title'].apply(clean)
Train['Sentence'] = Train['Sentence'].apply(clean)
Dev['Paragraph'] = Dev['Paragraph'].apply(clean)
Dev['Title'] = Dev['Title'].apply(clean)
Dev['Sentence'] = Dev['Sentence'].apply(clean)

In [187]:
# Scratch check of the character filter on a sample string that ends in a
# double quote; the recorded output shows the string coming back unchanged.
# NOTE(review): leftover debugging cell -- it rebinds the global name `final`
# and is not needed by the rest of the notebook.
final='Defeat Jihad"'
final=re.sub('[^{} ]'.format(string.ascii_letters+string.digits+string.punctuation), '', final)
final

'Defeat Jihad"'

In [195]:
# Write both frames to CSV without the article id column and without the index.
Train.drop(columns=['ID']).to_csv('Train.csv', index=False)
Dev.drop(columns=['ID']).to_csv('Dev.csv', index=False)