# Task1 Thesis Tagging Data Analysis

In [1]:
import os
import pickle
import numpy as np
import pandas
import nltk
nltk.download('punkt')
from tabulate import tabulate

[nltk_data] Downloading package punkt to
[nltk_data]     /home/student/05/b05505004/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the preprocessed training set. Judging by the indexing below, it is a
# sequence of articles, each a sequence of per-sentence records.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
data_path = os.path.join("data", "train_processed.pkl")
with open(data_path, mode="rb") as pkl_file:
    datas = pickle.load(pkl_file)

# Number of articles in the corpus.
data_len = len(datas)

# Show the first sentence record of the first article as a sanity check.
first_article = datas[0]
print(first_article[0])

{'number': 'T00001_S001', 'sentence': 'Modern cyber security operations collect an enormous amount of logging and alerting data.', 'label': [0]}


In [3]:
# Single pass over every sentence of every article, collecting:
#   total_len / sentence_cnt / avg_len  - token totals and mean tokens per sentence
#   max_sentence_len                    - longest sentence, in tokens
#   max_sentence_per_article            - longest article, in sentences
#   label_cnt / label_percent           - occurrences of each label and their share
#   correlation                         - symmetric count of label pairs on one sentence
#   label_position                      - label occurrences per sentence index in the article
NUM_LABELS = 6          # labels are integer indices 0..5
MIN_POSITION_ROWS = 25  # the position table always has at least this many rows

total_len = 0
sentence_cnt = 0
max_sentence_len = 0
# Measured up front so the position table below can be sized to fit the data.
max_sentence_per_article = max((len(article) for article in datas), default=0)

label_cnt = np.zeros(NUM_LABELS)
correlation = np.zeros((NUM_LABELS, NUM_LABELS))
# A fixed 25-row table raises IndexError as soon as an article has more than
# 25 sentences; grow it when needed while keeping 25 rows as the minimum so
# the table shape is unchanged for shorter corpora.
label_position = np.zeros(
    (max(MIN_POSITION_ROWS, max_sentence_per_article), NUM_LABELS)
)

for article in datas:
    for order, data in enumerate(article):
        tokens = nltk.word_tokenize(data["sentence"])
        cnt = len(tokens)
        total_len += cnt
        max_sentence_len = max(max_sentence_len, cnt)

        labels = data["label"]
        for l in labels:
            label_cnt[l] += 1
            label_position[order][l] += 1

        # Every unordered pair of distinct labels present on this sentence is
        # counted once and mirrored, so the matrix stays symmetric with a
        # zero diagonal. Membership is tested once per label, not once per pair.
        present = [k for k in range(NUM_LABELS) if k in labels]
        for pos, i in enumerate(present):
            for j in present[pos + 1:]:
                correlation[i][j] += 1
                correlation[j][i] += 1
        sentence_cnt += 1

# Guard the averages so an empty corpus yields zeros instead of a
# ZeroDivisionError / NaN array.
avg_len = total_len / sentence_cnt if sentence_cnt else 0.0
label_total = np.sum(label_cnt)
label_percent = label_cnt / label_total if label_total else np.zeros(NUM_LABELS)

## Some Statistics

In [4]:
# Headline corpus statistics, one per output line.
summary_lines = [
    "Article count: {}".format(data_len),
    "Sentence count: {}".format(sentence_cnt),
    "Average sentence length: {}".format(avg_len),
    "Max sentence length: {}".format(max_sentence_len),
    "Average sentence per article: {}".format(sentence_cnt / data_len),
    "Max sentence per article: {}".format(max_sentence_per_article),
]
print("\n".join(summary_lines))

Article count: 7000
Sentence count: 46867
Average sentence length: 25.507563957582093
Max sentence length: 206
Average sentence per article: 6.695285714285714
Max sentence per article: 24


## Occurrence Count and Share of Each Label

In [5]:
# Column captions, one per label index; the later table cells reuse this list.
header = [
    "BACKGROUND",
    "OBJECTIVES",
    "METHODS",
    "RESULTS",
    "CONCLUSIONS",
    "OTHERS",
]
# First row: raw occurrences per label; second row: share of all label occurrences.
label_rows = [label_cnt, label_percent]
print(tabulate(label_rows, headers=header, tablefmt='orgtbl'))

|   BACKGROUND |   OBJECTIVES |      METHODS |      RESULTS |   CONCLUSIONS |     OTHERS |
|--------------+--------------+--------------+--------------+---------------+------------|
| 13353        |  9329        | 13655        | 11772        |  5313         | 901        |
|     0.245807 |     0.171732 |     0.251367 |     0.216704 |     0.0978039 |   0.016586 |


## Co-occurrence Counts for Label Pairs
兩個 label 同時出現的次數統計

In [6]:
# Prefix each row of the co-occurrence matrix with its label name so the
# printed table has row captions as well as column headers.
correlation_row = []
for row_name, pair_counts in zip(header, correlation.tolist()):
    correlation_row.append([row_name] + pair_counts)
print(tabulate(correlation_row, headers=header, tablefmt='orgtbl'))

|             |   BACKGROUND |   OBJECTIVES |   METHODS |   RESULTS |   CONCLUSIONS |   OTHERS |
|-------------+--------------+--------------+-----------+-----------+---------------+----------|
| BACKGROUND  |            0 |         1034 |       349 |       179 |           115 |        0 |
| OBJECTIVES  |         1034 |            0 |      1572 |       563 |           299 |        0 |
| METHODS     |          349 |         1572 |         0 |      1500 |           417 |        0 |
| RESULTS     |          179 |          563 |      1500 |         0 |          2372 |        0 |
| CONCLUSIONS |          115 |          299 |       417 |      2372 |             0 |        0 |
| OTHERS      |            0 |            0 |         0 |         0 |             0 |        0 |


## Label Counts by Sentence Position within an Article
文章段落中第 n 個句子出現特定 label 的次數統計

In [7]:
# Prefix each row with its zero-based sentence index within the article; the
# remaining columns are the per-label counts at that position.
label_position_row = []
for sentence_idx, position_counts in enumerate(label_position.tolist()):
    label_position_row.append([sentence_idx] + position_counts)
print(tabulate(label_position_row, headers=header, tablefmt='orgtbl'))

|    |   BACKGROUND |   OBJECTIVES |   METHODS |   RESULTS |   CONCLUSIONS |   OTHERS |
|----+--------------+--------------+-----------+-----------+---------------+----------|
|  0 |         5114 |         1908 |       346 |       265 |           134 |       62 |
|  1 |         3825 |         1783 |      1432 |       538 |           185 |       84 |
|  2 |         2163 |         1965 |      2292 |      1029 |           302 |       96 |
|  3 |         1147 |         1504 |      2652 |      1615 |           525 |       90 |
|  4 |          546 |          979 |      2433 |      1927 |           807 |      128 |
|  5 |          283 |          554 |      1851 |      1912 |           821 |      113 |
|  6 |          123 |          296 |      1228 |      1638 |           805 |       98 |
|  7 |           67 |          168 |       710 |      1185 |           618 |       77 |
|  8 |           35 |           93 |       371 |       760 |           453 |       53 |
|  9 |           16 |           