In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
import glob
import os.path
import numpy as np
import sys
import codecs
import json
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.externals import joblib



In [0]:
train_folder = "./drive/My Drive/datasets/train-articles" # check that the path to the datasets folder is correct, 
dev_folder = "./drive/My Drive/datasets/dev-articles"     # if not adjust these variables accordingly
train_labels_file = "./drive/My Drive/datasets/train-task2-TC.labels"
dev_template_labels_file = "./drive/My Drive/datasets/dev-task-TC-template.out"
task_TC_output_file = "TFIDF_LR.txt"

def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """
    Read articles from files matching patterns <file_pattern> from  
    the directory <folder_name>. 
    The content of the article is saved in the dictionary whose key
    is the id of the article (extracted from the file name).
    Each element of <sentence_list> is one line of the article.
    """
    file_list = glob.glob(os.path.join(folder_name, file_pattern))
    articles = {}
    article_id_list, sentence_id_list, sentence_list = ([], [], [])
    for filename in sorted(file_list):
        article_id = os.path.basename(filename).split(".")[0][7:]
        with codecs.open(filename, "r", encoding="utf8") as f:
            articles[article_id] = f.read()
    return articles


def read_predictions_from_file(filename):
    """
    Reader for the gold file and the template output file. 
    Return values are four arrays with article ids, labels 
    (or ? in the case of a template file), begin of a fragment, 
    end of a fragment. 
    """
    articles_id, span_starts, span_ends, gold_labels = ([], [], [], [])
    with open(filename, "r") as f:
        for row in f.readlines():
            article_id, gold_label, span_start, span_end = row.rstrip().split("\t")
            articles_id.append(article_id)
            gold_labels.append(gold_label)
            span_starts.append(span_start)
            span_ends.append(span_end)
    return articles_id, span_starts, span_ends, gold_labels

def compute_features_b(articles, span_starts, span_ends):
    # only one feature, the length of the span
    return np.array([ int(sp_ends)-int(sp_starts) for sp_starts, sp_ends in zip(span_starts, span_ends) ]).reshape(-1, 1)

def clean(text):
    text=text.lower()
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text=re.sub('[“"”]',' " ',text)
    retain='[^abcdefghijklmnopqrstuvwxyz!#?". ]'
    text=re.sub('[()–-]',' ',text)
    text=re.sub(retain,'',text)
    text=re.sub('[.]',' . ',text)
    text=text.replace('?',' ? ')
    text=text.replace('#',' # ')
    text=text.replace('!',' ! ')
    return ' '.join(text.split())

# loading articles' content from *.txt files in the train folder
articles = read_articles_from_file_list(train_folder)
dev_articles = read_articles_from_file_list(dev_folder)

def read_span(id,span,dev=False):
    if dev:
        return dev_articles[id][span[0]:span[1]]
    else:
        return articles[id][span[0]:span[1]]

In [6]:
# loading gold labels, articles ids and sentence ids from files *.task-TC.labels in the train labels folder 
ref_articles_id, ref_span_starts, ref_span_ends, train_gold_labels = read_predictions_from_file(train_labels_file)
print("Loaded %d annotations from %d articles" % (len(ref_span_starts), len(set(ref_articles_id))))

# reading data from the development set
dev_article_ids, dev_span_starts, dev_span_ends, dev_labels = read_predictions_from_file(dev_template_labels_file)

Loaded 6129 annotations from 357 articles


In [0]:
in_data=[(id, [int(sps),int(spe)])for id, sps, spe in zip(ref_articles_id, ref_span_starts, ref_span_ends)]
df=pd.DataFrame(in_data,columns=['ID','Span'])
df['Sentence']=[read_span(id,span) for id,span in zip(df['ID'].tolist(),df['Span'].tolist())]
df['Sentence']=df['Sentence'].apply(lambda x : clean(x))
df['Span']=df['Span'].apply(lambda x : x[1]-x[0])
df['Target']=train_gold_labels
df=df.drop_duplicates()

In [0]:
dev_data=[(id, [int(sps),int(spe)])for id, sps, spe in zip(dev_article_ids, dev_span_starts, dev_span_ends)]
df_dev=pd.DataFrame(dev_data,columns=['ID','Span'])
df_dev['Sentence']=[read_span(id,span,dev=True) for id,span in zip(df_dev['ID'].tolist(),df_dev['Span'].tolist())]
df_dev['Sentence']=df_dev['Sentence'].apply(lambda x : clean(x))
df_dev['Span']=df_dev['Span'].apply(lambda x : x[1]-x[0])

In [0]:
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

In [0]:
df=df.drop(df[df['Target']=='Loaded_Language'].sample(frac = 0.075, random_state=1234).index)
df=df.drop(df[df['Target']=='Name_Calling,Labeling'].sample(frac = 0.025, random_state=1234).index)

In [4]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,ngram_range=(2,6),stop_words='english',analyzer='char')
tfidf_w = TfidfVectorizer(sublinear_tf=True, min_df=6,ngram_range=(1,3),stop_words='english',analyzer='word')
features = np.append(tfidf.fit_transform(df.Sentence).toarray(),df['Span'].values.reshape(-1,1),axis=1).astype('float16')
features = np.append(tfidf_w.fit_transform(df.Sentence).toarray(),features,axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(features,df['Target'],stratify=df['Target'],test_size=0.2,random_state=1234)

NameError: name 'TfidfVectorizer' is not defined

In [230]:
features.shape

(5599, 37772)

In [3]:
model_lr=LogisticRegression(penalty='l2', class_weight='balanced', solver="liblinear", max_iter=500)
model_lr.fit(X_train,Y_train)
pred_lr=model_lr.predict(X_test)

NameError: name 'LogisticRegression' is not defined

In [2]:
cm=confusion_matrix(Y_test,pred_lr)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.figure(figsize = (10,8))
sns.heatmap(cm,annot=True)

NameError: name 'confusion_matrix' is not defined

In [233]:
cf_rep=classification_report(Y_test,pred_lr)
print(cf_rep)
sum([float(rep.split()[3]) for rep in cf_rep.split('\n')[2:16]])/14

                                    precision    recall  f1-score   support

               Appeal_to_Authority       0.20      0.21      0.20        29
          Appeal_to_fear-prejudice       0.33      0.32      0.33        59
    Bandwagon,Reductio_ad_hitlerum       0.31      0.36      0.33        14
           Black-and-White_Fallacy       0.29      0.29      0.29        21
         Causal_Oversimplification       0.28      0.26      0.27        42
                             Doubt       0.50      0.54      0.51        99
         Exaggeration,Minimisation       0.48      0.41      0.44        92
                       Flag-Waving       0.49      0.66      0.56        44
                   Loaded_Language       0.64      0.78      0.70       384
             Name_Calling,Labeling       0.67      0.49      0.57       196
                        Repetition       0.45      0.29      0.35        80
                           Slogans       0.39      0.38      0.38        24
       Thou

0.39214285714285707

In [222]:
DF=pd.concat([df,df_dev])
features = tfidf.fit_transform(DF.Sentence).toarray().astype('float16')
features_w = tfidf.fit_transform(DF.Sentence).toarray().astype('float16')
features = np.append(features,DF['Span'].values.reshape(-1,1),axis=1)
features= np.append(features,features_w,axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [0]:
Train_X, Train_Y = features[:len(df)], DF[:len(df)]['Target']
Test=features[len(df):]

In [0]:
model_lr=LogisticRegression(penalty='l2', class_weight='balanced', solver="liblinear", max_iter=500)
model_lr.fit(Train_X, Train_Y)
pred_lr=model_lr.predict(Test)

In [226]:
##### writing predictions to file
with open(task_TC_output_file, "w") as fout:
    for article_id, prediction, span_start, span_end in zip(dev_article_ids, pred_lr, dev_span_starts, dev_span_ends):
        fout.write("%s\t%s\t%s\t%s\n" % (article_id, prediction, span_start, span_end))
print("Predictions written to file " + task_TC_output_file)

Predictions written to file TFIDF_LR.txt


In [0]:
from google.colab import files
files.download('./TFIDF_LR.txt')