In [1]:
import xml.etree.ElementTree as etree
from glob import glob
import json
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
def parse_tree(fname):
    '''
    fname: filename
    return: {date: [title, text]} after parsing the xml file
    '''
    tree = etree.parse(fname)
    root = tree.getroot()
    subject = []
    for idx, wrt in enumerate(root.findall("WRITING")):
        title = wrt.find('TITLE').text
        if title != None:
            title = " ".join(title.replace('\n',' ').strip().split())
        else:
            title = ""
        
        date = wrt.find('DATE').text
        if date != None:
            date = " ".join(date.replace('\n',' ').strip().split())
        else:
            date = ""
        
        text = wrt.find('TEXT').text
        if text != None:
            text = " ".join(text.replace('\n',' ').strip().split())
        else:
            text = ""
        
        if date=="" and title=="" and text=="":
            continue
        
        subject.append([date, title, text])
        

    return subject

In [3]:
corpus = []
from glob import glob
fpaths = sorted(glob("train/data/*.xml"), key=lambda x: int(x[18:-4]))
for fp in fpaths:
#     print(fp[18:-4])
    corpus.append(parse_tree(fp))
#     break

In [4]:
len(corpus)

340

In [5]:
testset = []
from glob import glob
# print(glob("test/DATA/*.xml")[0])
fpaths = sorted(glob("test/DATA/*.xml"), key=lambda x: int(x[17:-4]))
for fp in fpaths:
#     print(fp)
    testset.append(parse_tree(fp))

    
file = open("test/T1_erisk_golden_truth.txt", "r")
lines = file.readlines()
file.close()

gt_list_test = []
for line in lines:
    gt = line.split(" ")
    gt[1] = int(gt[1])
    gt_list_test.append(gt)
#     print(gt)
gt_list_test = sorted(gt_list_test, key=lambda x: int(x[0][7:]))
Y_test = [elem[1] for elem in gt_list_test]

In [6]:
with open("corpus.json", 'w+') as f:
    json.dump(corpus, f)

In [7]:
file = open("train/golden_truth.txt", "r")
lines = file.readlines()
file.close()

gt_list = []
for line in lines:
    gt = line.split(" ")
    gt[1] = int(gt[1])
    gt_list.append(gt)
#     print(gt)

gt_list = sorted(gt_list, key=lambda x: int(x[0][7:]))

In [8]:
gt = [elem[1] for elem in gt_list]

# Evaluation Metrics

In [9]:
sum(gt)/len(gt)

0.12058823529411765

In [10]:
import copy
X_test = copy.deepcopy(testset)
for subj in X_test:
    for doc in subj:
        del doc[0]

In [11]:
def evaluate(X_test, Y_test, model, c_fp=0.12, c_fn=1, c_tp=1, o = 0):
    '''
    X_test: [[title, text]]; size = total number of entries in all subjects; nested array of subject and entries
    Y_test: [0/1]; size = # of subjects
    model: 
    return: sum of average score for each subject
    '''
    erde = []
    for (subject, y) in tqdm(zip(X_test, Y_test), total = len(Y_test)):
        score = 0
        for idx, doc in enumerate(subject):
            token = [doc[0]+' '+doc[1]]
            pred = model.predict(token)
            if y == 0: # negative
                if pred == 0:
                    score += c_fp
                else:
                    score += c_fn
            else: # positive
                if pred == 0:
                    pass
                else:
                    score += c_tp * (1 - 1/(1+np.exp(idx - o)))
#         print(score)
        erde.append(score)
    return erde

        
            
        
    

# feature extraction

In [12]:
# newline-seperated document with title + text
new_corpus = []
for subj in corpus:
    tmp = ""
    for doc in subj:
#         print(doc)
        tmp += (doc[1]+" "+doc[2]).replace("\'","")
    new_corpus.append(tmp)

In [13]:
X_test_feature = []
for subj in testset:
    tmp = ""
    for doc in subj:
#         print(doc)
        tmp += (doc[1]+" "+doc[2]).replace("\'","")
    X_test_feature.append(tmp)

In [16]:
nums = [len(subj) for subj in X_test]

## Naive Bayes

In [192]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words="english", ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(new_corpus, gt)

In [236]:
predicted_nb = text_clf.predict(X_test_feature)
np.mean(predicted_nb == Y_test)

0.7541371158392435

In [None]:
erde_nb = evaluate(X_test, Y_test, text_clf)

 12%|█▏        | 51/423 [04:42<48:52,  7.88s/it]  

## SVM

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words="english", ngram_range=(1,2))),
                      ('tfidf', TfidfTransformer()),
                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, random_state=42)),
])
_ = text_clf_svm.fit(new_corpus, gt)
predicted_svm = text_clf_svm.predict(X_test_feature)
np.mean(predicted_svm == Y_test)

0.7754137115839244

In [None]:
erde_svm = evaluate(X_test, Y_test, text_clf_svm)

 12%|█▏        | 52/423 [03:09<29:37,  4.79s/it]

## Grid Search

In [198]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf-svm__alpha': (1e-2, 1e-3),
}
gs_clf = GridSearchCV(text_clf_svm, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(new_corpus, gt)
predicted_grid = gs_clf.predict(X_test_feature)
np.mean(predicted_grid == Y_test)

0.8250591016548463

In [None]:
erde_gs = evaluate(X_test, Y_test, gs_clf)

## GradientBoost


In [14]:
from sklearn.ensemble import GradientBoostingClassifier
text_clf_gbbost = Pipeline([('vect', CountVectorizer(stop_words="english", ngram_range=(1,2))),
                      ('tfidf', TfidfTransformer()),
                      ('clf-gboost', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0))
])
_ = text_clf_gbbost.fit(new_corpus, gt)
predicted_xgboost = text_clf_gbbost.predict(X_test_feature)
np.mean(predicted_xgboost == Y_test)

0.7966903073286052

In [15]:
erde_gb = evaluate(X_test, Y_test, text_clf_gbbost)

  score += c_tp * (1 - 1/(1+np.exp(idx - o)))
100%|██████████| 423/423 [1:37:31<00:00, 13.83s/it]  


In [19]:
def rank_and_save(erde_gs, fpaths, output_name):
    scores = np.array([v/nums[idx] for idx, v in enumerate(erde_gs)])
    ranking_result = []
    for value, file in zip(scores, fpaths):
        ranking_result.append((value, file))
    ranking_result.sort(reverse=True)
    with open(output_name, "w", encoding="utf-8") as f:
        for v, p in ranking_result:
            f.write(f"{v} {p}\n")
rank_and_save(erde_gb, fpaths, "GradientBoost.txt")

## Adaboost

In [200]:
from sklearn.ensemble import AdaBoostClassifier
text_clf_ada = Pipeline([
                      ('vect', CountVectorizer(stop_words="english", ngram_range=(1,2))),
                      ('tfidf', TfidfTransformer()),
                      ('clf-ada', AdaBoostClassifier(n_estimators=100)),
])
_ = text_clf_ada.fit(new_corpus, gt)
predicted_adaboost = text_clf_ada.predict(X_test_feature)
np.mean(predicted_adaboost == Y_test)

0.8652482269503546

In [None]:
erde_ada = evaluate(X_test, Y_test, text_clf_ada)