In [128]:
import numpy as np
from collections import Counter
from collections import defaultdict
from nltk.tag.perceptron import PerceptronTagger
import nltk

In [130]:
# NLTK perceptron POS tagger, created once here and used by read_sentences_with_pos_tags.
tagger = PerceptronTagger()
# Tagset name handed to the tagging call; the tags shown in the outputs below
# (NOUN, ADP, DET, ...) are in this coarse 'universal' scheme.
tagset = 'universal'

In [136]:
def read_sentences_with_pos_tags(path):
    """Read a space-separated chunking file into sentences of (word, pos, chunk) tuples.

    Every non-blank line that does not start with '#' is treated as a token
    line and split on single spaces: the first field, lower-cased, is the word
    and the third field is the chunk label.  The POS tag is not read from the
    file; it is predicted for the lower-cased word with the notebook-level
    ``tagger`` and ``tagset``.  Each word is tagged on its own (a one-word
    list is passed to the tagger), so no sentence context is used.

    Any other line (blank, or starting with '#') closes the sentence that is
    currently being built.  A sentence still open at end of file is kept too,
    so a file that does not end with a blank line no longer loses its last
    sentence.

    Args:
        path: Path of the UTF-8 text file to read.

    Returns:
        A list of sentences; each sentence is a non-empty list of
        ``(word, pos_tag, chunk)`` tuples.

    Raises:
        IndexError: If a token line has fewer than three space-separated fields.
    """
    sentences_with_pos_tags = []
    current_sentence = []

    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip('\n')
            # NOTE(review): a token line whose word itself is '#' is skipped here
            # and also ends the current sentence -- confirm that this is intended
            # for the data being read.
            if line.strip() and not line.startswith('#'):
                fields = line.split(' ')
                word = fields[0].lower()
                # Private NLTK helper; called with a single-word list, so the
                # predicted tag does not depend on neighbouring words.
                xpos = nltk.tag._pos_tag([word], tagset, tagger, lang='eng')[0][1]
                chunk = fields[2]
                current_sentence.append((word, xpos, chunk))
            elif current_sentence:
                sentences_with_pos_tags.append(current_sentence)
                current_sentence = []

    # Flush the last sentence when the file does not end with a separator line.
    if current_sentence:
        sentences_with_pos_tags.append(current_sentence)

    return sentences_with_pos_tags

In [137]:
# Location of train.txt, relative to the notebook's working directory.
path = "../../data/chunking/train.txt"
# Parse it into a list of sentences, each a list of (word, POS tag, chunk label) tuples.
sentences = read_sentences_with_pos_tags(path)

In [152]:
# Peek at the first five parsed sentences.
sentences[:5]

[[('confidence', 'NOUN', 'B-NP'),
  ('in', 'ADP', 'B-PP'),
  ('the', 'DET', 'B-NP'),
  ('pound', 'NOUN', 'I-NP'),
  ('is', 'VERB', 'B-VP'),
  ('widely', 'ADV', 'I-VP'),
  ('expected', 'VERB', 'I-VP'),
  ('to', 'PRT', 'I-VP'),
  ('take', 'VERB', 'I-VP'),
  ('another', 'DET', 'B-NP'),
  ('sharp', 'ADJ', 'I-NP'),
  ('dive', 'NOUN', 'I-NP'),
  ('if', 'ADP', 'B-SBAR'),
  ('trade', 'NOUN', 'B-NP'),
  ('figures', 'NOUN', 'I-NP'),
  ('for', 'ADP', 'B-PP'),
  ('september', 'NOUN', 'B-NP'),
  (',', '.', 'O'),
  ('due', 'ADJ', 'B-ADJP'),
  ('for', 'ADP', 'B-PP'),
  ('release', 'NOUN', 'B-NP'),
  ('tomorrow', 'NOUN', 'B-NP'),
  (',', '.', 'O'),
  ('fail', 'NOUN', 'B-VP'),
  ('to', 'PRT', 'I-VP'),
  ('show', 'NOUN', 'I-VP'),
  ('a', 'DET', 'B-NP'),
  ('substantial', 'ADJ', 'I-NP'),
  ('improvement', 'NOUN', 'I-NP'),
  ('from', 'ADP', 'B-PP'),
  ('july', 'NOUN', 'B-NP'),
  ('and', 'CONJ', 'I-NP'),
  ('august', 'NOUN', 'I-NP'),
  ("'s", 'PRT', 'B-NP'),
  ('near-record', 'NOUN', 'I-NP'),
  ('deficits'

In [139]:
def replace_low_frequency_words(sentences_with_pos_tags, filter_count=1):
    """Mask rare words and their POS tags with the placeholder 'UNK'.

    Word frequencies are counted over all sentences.  Every token whose word
    occurs fewer than ``filter_count`` times has BOTH its word and its POS tag
    replaced by 'UNK'; the chunk label is always kept.  With the default
    ``filter_count=1`` nothing is replaced, because every word occurs at
    least once.

    Args:
        sentences_with_pos_tags: List of sentences, each a list of
            ``(word, pos_tag, chunk)`` tuples.
        filter_count: Minimum number of occurrences a word needs to be kept.

    Returns:
        A new list of sentences with the same shape as the input.
    """
    frequency = Counter()
    for tokens in sentences_with_pos_tags:
        frequency.update(word for word, _pos, _chunk in tokens)

    def mask_if_rare(token):
        # Keep frequent tokens as they are; rare ones lose word and tag.
        word, xpos, chunk = token
        if frequency[word] < filter_count:
            return ('UNK', 'UNK', chunk)
        return (word, xpos, chunk)

    return [
        [mask_if_rare(token) for token in tokens]
        for tokens in sentences_with_pos_tags
    ]

In [153]:
# Replace every word seen fewer than 3 times (together with its POS tag) by 'UNK',
# then show the first three sentences of the result.
filtered_sentences = replace_low_frequency_words(sentences, filter_count=3)
filtered_sentences[:3]

[[('confidence', 'NOUN', 'B-NP'),
  ('in', 'ADP', 'B-PP'),
  ('the', 'DET', 'B-NP'),
  ('pound', 'NOUN', 'I-NP'),
  ('is', 'VERB', 'B-VP'),
  ('widely', 'ADV', 'I-VP'),
  ('expected', 'VERB', 'I-VP'),
  ('to', 'PRT', 'I-VP'),
  ('take', 'VERB', 'I-VP'),
  ('another', 'DET', 'B-NP'),
  ('sharp', 'ADJ', 'I-NP'),
  ('dive', 'NOUN', 'I-NP'),
  ('if', 'ADP', 'B-SBAR'),
  ('trade', 'NOUN', 'B-NP'),
  ('figures', 'NOUN', 'I-NP'),
  ('for', 'ADP', 'B-PP'),
  ('september', 'NOUN', 'B-NP'),
  (',', '.', 'O'),
  ('due', 'ADJ', 'B-ADJP'),
  ('for', 'ADP', 'B-PP'),
  ('release', 'NOUN', 'B-NP'),
  ('tomorrow', 'NOUN', 'B-NP'),
  (',', '.', 'O'),
  ('fail', 'NOUN', 'B-VP'),
  ('to', 'PRT', 'I-VP'),
  ('show', 'NOUN', 'I-VP'),
  ('a', 'DET', 'B-NP'),
  ('substantial', 'ADJ', 'I-NP'),
  ('improvement', 'NOUN', 'I-NP'),
  ('from', 'ADP', 'B-PP'),
  ('july', 'NOUN', 'B-NP'),
  ('and', 'CONJ', 'I-NP'),
  ('august', 'NOUN', 'I-NP'),
  ("'s", 'PRT', 'B-NP'),
  ('UNK', 'UNK', 'I-NP'),
  ('UNK', 'UNK', 'I-NP

In [154]:
def create_vocab_index(sentences_with_pos_tags):
    """Assign consecutive integer ids to the distinct words and POS tags.

    Ids are handed out in order of first appearance and start at 1, so
    index 0 is never assigned by this function.

    Args:
        sentences_with_pos_tags: List of sentences, each a list of
            ``(word, pos_tag, chunk)`` tuples.

    Returns:
        A tuple ``(word_to_index, xpos_to_index)`` of plain dicts.
    """
    def number_in_order(items, first_id):
        # Give each unseen item the next free id; repeated items keep theirs.
        ids = {}
        for item in items:
            if item not in ids:
                ids[item] = len(ids) + first_id
        return ids

    words = [word for tokens in sentences_with_pos_tags for word, _, _ in tokens]
    tags = [xpos for tokens in sentences_with_pos_tags for _, xpos, _ in tokens]

    word_ids = number_in_order(words, 1)
    tag_ids = number_in_order(tags, 1)
    return word_ids, tag_ids

In [155]:
# Build the word and POS-tag id tables from the filtered sentences (ids start at 1).
word_to_index, xpos_to_index =create_vocab_index(filtered_sentences)

In [156]:
# Number of distinct words and distinct POS tags that received an id.
print(len(word_to_index), len(xpos_to_index))

6325 13


In [157]:
def convert_to_indexes(filtered_sentences_tags, word_to_index, xpos_to_index):
    """Translate sentences into parallel sequences of tag ids and word ids.

    Args:
        filtered_sentences_tags: List of sentences, each a list of
            ``(word, pos_tag, chunk)`` tuples.
        word_to_index: Mapping from word to integer id.
        xpos_to_index: Mapping from POS tag to integer id.

    Returns:
        A tuple ``(hidden_states, observations)``: two lists with one entry
        per sentence, holding the POS-tag ids and the word ids respectively.

    Raises:
        KeyError: If a word or tag is missing from its mapping.
    """
    # Per sentence, the tag ids are looked up first and the word ids second.
    encoded = [
        (
            [xpos_to_index[tag] for _, tag, _ in tokens],
            [word_to_index[word] for word, _, _ in tokens],
        )
        for tokens in filtered_sentences_tags
    ]

    hidden_states = [tag_ids for tag_ids, _ in encoded]
    observations = [word_ids for _, word_ids in encoded]
    return hidden_states, observations

In [164]:
# hidden_states holds POS-tag ids per sentence, observations holds word ids per sentence.
hidden_states, observations = convert_to_indexes(filtered_sentences, word_to_index, xpos_to_index)

In [165]:
# Print the POS-tag ids, then the word ids, of each of the first five sentences.
for row in range(min(5, len(observations))):
    for id_sequences in (hidden_states, observations):
        print(f"[{', '.join(str(value) for value in id_sequences[row])}]")
    print('-----------------------------')

[1, 2, 3, 1, 4, 5, 4, 6, 4, 3, 7, 1, 2, 1, 1, 2, 1, 8, 7, 2, 1, 1, 8, 1, 6, 1, 3, 7, 1, 2, 1, 9, 1, 6, 10, 10, 8]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 16, 20, 21, 18, 22, 8, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 32, 33]
-----------------------------
[1, 2, 3, 1, 1, 1, 6, 4, 1, 6, 3, 1, 7, 1, 4, 4, 6, 1, 3, 10, 2, 1, 2, 3, 1, 1, 8]
[34, 35, 3, 36, 37, 38, 31, 39, 40, 8, 24, 41, 42, 43, 44, 45, 8, 46, 24, 32, 2, 47, 48, 3, 49, 50, 33]
-----------------------------
[9, 1, 10, 4, 1, 2, 1, 4, 4, 4, 2, 3, 1, 6, 1, 6, 1, 3, 7, 1, 1, 2, 11, 1, 1, 1, 7, 1, 8]
[51, 52, 32, 53, 54, 16, 47, 44, 55, 56, 57, 3, 34, 31, 58, 8, 59, 60, 61, 43, 62, 2, 63, 64, 65, 66, 67, 68, 33]
-----------------------------
[3, 4, 4, 3, 1, 2, 3, 1, 4, 4, 6, 1, 1, 1, 6, 12, 1, 2, 11, 7, 12, 1, 1, 6, 1, 3, 1, 8, 1, 9, 7, 1, 1, 1, 4, 8]
[69, 44, 70, 3, 71, 35, 3, 72, 73, 74, 8, 75, 76, 77, 8, 78, 79, 27, 80, 81, 82, 79, 83, 8, 84, 3, 4, 18, 85, 29, 86, 87, 88, 52, 89, 33]
----------------

In [147]:
def add_noise_to_states(hidden_states, number_states, flip_prob=0.5, rng=None):
    """Return a copy of the state sequences with some states randomly replaced.

    Each state is independently replaced, with probability ``flip_prob``, by
    a state drawn uniformly from ``1..number_states`` excluding the state
    itself.  The input sequences are not modified.

    Args:
        hidden_states: List of sequences of integer states in
            ``1..number_states``.
        number_states: Number of distinct states; valid states are
            ``1..number_states`` inclusive.
        flip_prob: Probability of replacing any single state.
        rng: Optional random source exposing ``random()`` and ``choice()``,
            e.g. ``np.random.default_rng(seed)`` or ``np.random.RandomState(seed)``.
            Pass one to make the noise reproducible.  When omitted, NumPy's
            global ``np.random`` state is used, as before.

    Returns:
        A list of lists with the same shape as ``hidden_states``.  Replaced
        entries are plain Python ``int`` values; untouched entries are the
        original objects.

    Raises:
        ValueError: If a state selected for replacement is not in
            ``1..number_states``, or if there is no other state to switch to
            (``number_states`` < 2).
    """
    if rng is None:
        rng = np.random  # global legacy generator; honours np.random.seed

    all_states = list(range(1, number_states + 1))
    # state -> list of the other states; built once per distinct state instead
    # of once per token.
    alternatives = {}

    noisy_hidden_states = []
    for sequence in hidden_states:
        noisy_sequence = []
        for state in sequence:
            if rng.random() < flip_prob:
                if state not in alternatives:
                    if state not in all_states:
                        raise ValueError(
                            f"state {state!r} is outside the valid range 1..{number_states}"
                        )
                    others = [candidate for candidate in all_states if candidate != state]
                    if not others:
                        raise ValueError(
                            "cannot flip a state when number_states is less than 2"
                        )
                    alternatives[state] = others
                # choice() returns a NumPy integer; cast it so the sequence
                # does not mix Python and NumPy integer types.
                noisy_sequence.append(int(rng.choice(alternatives[state])))
            else:
                noisy_sequence.append(state)
        noisy_hidden_states.append(noisy_sequence)
    return noisy_hidden_states

In [166]:
# Probability with which each true POS-tag id is replaced by a different random id.
noise_level = 0.8
# NOTE(review): no random seed is set in the visible cells before this call,
# so the noisy states will differ from run to run.
noisy_hidden_states = add_noise_to_states(hidden_states, len(xpos_to_index), flip_prob=noise_level)

In [167]:
# Prepend index 0 to every sequence as a start marker.  Word and tag ids are
# assigned from 1 upwards, so a leading 0 can only be this marker; checking for
# it makes the cell safe to re-run (a second run no longer adds another 0).
for sequences in (hidden_states, noisy_hidden_states, observations):
    for sequence in sequences:
        if not sequence or sequence[0] != 0:
            sequence.insert(0, 0)

In [168]:
# For each of the first five sentences print, in this order: the true tag ids,
# the noisy tag ids and the word ids.
for row in range(min(5, len(observations))):
    for id_sequences in (hidden_states, noisy_hidden_states, observations):
        print(f"[{', '.join(str(value) for value in id_sequences[row])}]")
    print('-----------------------------')

[0, 1, 2, 3, 1, 4, 5, 4, 6, 4, 3, 7, 1, 2, 1, 1, 2, 1, 8, 7, 2, 1, 1, 8, 1, 6, 1, 3, 7, 1, 2, 1, 9, 1, 6, 10, 10, 8]
[0, 6, 1, 5, 6, 9, 12, 13, 11, 13, 9, 13, 7, 12, 4, 4, 12, 2, 9, 7, 6, 10, 4, 8, 9, 1, 8, 13, 11, 2, 1, 8, 4, 1, 6, 7, 3, 8]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 16, 20, 21, 18, 22, 8, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 32, 33]
-----------------------------
[0, 1, 2, 3, 1, 1, 1, 6, 4, 1, 6, 3, 1, 7, 1, 4, 4, 6, 1, 3, 10, 2, 1, 2, 3, 1, 1, 8]
[0, 3, 3, 11, 10, 7, 11, 8, 7, 3, 10, 7, 1, 1, 10, 10, 4, 8, 5, 2, 7, 5, 4, 1, 9, 2, 9, 4]
[0, 34, 35, 3, 36, 37, 38, 31, 39, 40, 8, 24, 41, 42, 43, 44, 45, 8, 46, 24, 32, 2, 47, 48, 3, 49, 50, 33]
-----------------------------
[0, 9, 1, 10, 4, 1, 2, 1, 4, 4, 4, 2, 3, 1, 6, 1, 6, 1, 3, 7, 1, 1, 2, 11, 1, 1, 1, 7, 1, 8]
[0, 9, 7, 10, 1, 7, 9, 5, 13, 4, 5, 9, 1, 1, 2, 7, 4, 3, 3, 8, 10, 8, 10, 1, 1, 8, 7, 11, 11, 9]
[0, 51, 52, 32, 53, 54, 16, 47, 44, 55, 56, 57, 3, 34, 31, 58, 8, 59, 60, 61, 43, 

In [169]:
# Output archive; the noise level is embedded in the file name.
file_path = f"../../data/chunking_synthetic_dataset(noise-{noise_level}).npz"
# dtype=object keeps each sentence as its own Python list inside the array
# (the sentences printed above have different lengths).
obs_object = np.array(observations, dtype=object)
hid_object = np.array(hidden_states, dtype=object)
noisy_hid_object = np.array(noisy_hidden_states, dtype=object)
# The two counts are the vocabulary sizes plus one, which makes room for the
# extra index 0 that was prepended to every sequence.
np.savez(file_path, num_states_specific=len(xpos_to_index) + 1, num_obs=len(word_to_index) + 1, observation=obs_object, real_hidden_specific=hid_object, noisy_hidden_specific=noisy_hid_object, noisy_level=noise_level)

In [170]:
# Load hidden-state sequences from a result archive; nothing in the visible cells
# writes this file, so it presumably comes from a separate run.
# NOTE(review): the file name says noise-0.9 while noise_level above is 0.8 --
# confirm this is the intended result file.
data_path = f"../../result/chunking-noise-0.9_iter-20_timestamp-0226_150117_state.npz"
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load archives
# from a trusted source.
loaded_npz = np.load(data_path, allow_pickle=True)
sampled_hidden_states = loaded_npz['hidden_state']
print(sampled_hidden_states[:5])

[array([ 0,  5,  2,  3, 13,  2,  8,  8,  2,  3,  3,  1,  2,  3,  1, 11,  2,
        11,  2, 11,  2,  3,  8,  2,  8,  2,  3,  2,  3, 13,  2, 11,  2, 11,
         2,  3,  1,  2])
 array([ 0,  5,  2,  3,  3,  3,  1,  2,  3,  1,  2,  3,  1,  3,  1,  8,  2,
         2,  3,  3,  1,  2,  5,  2,  3,  1, 13,  2])
 array([ 0,  5,  3,  1,  1,  8,  2, 11,  8,  2,  5,  2,  3,  1,  8,  8,  2,
         3,  3,  3,  1,  8,  2,  3,  3, 13,  2,  3, 11,  2])
 array([ 0,  3,  8,  2,  3, 13,  2,  3, 13,  2,  8,  2,  3, 13,  8,  2,  1,
         8,  2,  3,  1,  8,  2,  8,  2,  3,  3, 13,  2,  6,  2,  3,  1,  1,
        11,  8,  2])
 array([ 0,  5,  3,  1,  2, 11,  2,  3, 13,  8,  8,  2,  3,  3,  2,  3,  8,
         2,  8, 11,  2,  3,  8,  2,  3,  1, 13,  2, 11,  8,  8,  2])       ]


In [171]:
# Number of state sequences in the loaded archive.
len(sampled_hidden_states)

8971

In [172]:
def create_tsv_from_data(data, file_path, pos=True, hidden_states=None):
    """Write sentences to a tab-separated text file, one token per line.

    With ``pos=True`` every line is ``word<TAB>state<TAB>chunk``; with
    ``pos=False`` it is ``word<TAB>chunk``.  Each sentence is followed by an
    empty line.  The POS tag stored in ``data`` is never written.

    Args:
        data: List of sentences, each a list of ``(word, pos_tag, chunk)``
            tuples.
        file_path: Destination path; an existing file is overwritten.
        pos: Whether to write the state column.
        hidden_states: State sequences used for the state column, indexed as
            ``hidden_states[sentence_index][token_index + 1]``.  When omitted
            and ``pos`` is true, the notebook-level ``sampled_hidden_states``
            is used, which is what this function always read before.

    Raises:
        NameError: If ``pos`` is true, ``hidden_states`` is omitted and no
            ``sampled_hidden_states`` variable exists.
        IndexError: If a state sequence is shorter than its sentence plus one.
    """
    if pos and hidden_states is None:
        # Backward-compatible fallback for existing calls that rely on the global.
        hidden_states = sampled_hidden_states

    with open(file_path, 'w', encoding='utf-8') as file:
        for index, sentence in enumerate(data):
            for i, (word, _, chunk) in enumerate(sentence):
                if pos:
                    # i + 1: each state sequence is expected to carry one leading
                    # entry (the 0 start marker) before the per-token states.
                    file.write(f"{word}\t{hidden_states[index][i + 1]}\t{chunk}\n")
                else:
                    file.write(f"{word}\t{chunk}\n")
            file.write("\n")

In [173]:
# Write two training files from the unfiltered `sentences`: the first call uses the
# default pos=True (word, sampled hidden state, chunk); the second writes word and chunk only.
# NOTE(review): both targets are absolute paths on one user's machine and must be
# edited before this cell can run anywhere else.
create_tsv_from_data(sentences, "/Users/binglunli/Desktop/CRF++-0.58/train_file_sampled.txt")
create_tsv_from_data(sentences, "/Users/binglunli/Desktop/CRF++-0.58/train_file_controlled.txt", pos=False)

In [73]:
# Parse test.txt and write it as a word/chunk file (no state column).
# NOTE(review): this path starts with "../" whereas the training path above uses
# "../../" -- confirm which one matches the notebook's location.
text_sentences = read_sentences_with_pos_tags("../data/chunking/test.txt")
# NOTE(review): absolute, user-specific output path.
create_tsv_from_data(text_sentences, "/Users/binglunli/Desktop/CRF++-0.58/experiment/test_controlled.txt", pos=False)

In [104]:
def test_output_accuracy(output_path, truth_col, prediction_col):
    truth = []
    prediction = []
    with open(output_path, 'r', encoding='utf-8') as file: 
        for line in file: 
            line = line.replace('\n', '')       
            if line.strip():
                fields = line.split('\t')
                # print(fields)
                truth.append(fields[truth_col])
                prediction.append(fields[prediction_col])
            
    return np.array(truth), np.array(prediction)

In [205]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
def compute_score(true_values, predicted_values):
    """Score predicted labels against the gold labels.

    Args:
        true_values: Gold labels.
        predicted_values: Predicted labels, aligned with ``true_values``.

    Returns:
        A tuple ``(accuracy, precision, f1)``; precision and F1 are computed
        with ``average='weighted'``.
    """
    weighted = {'average': 'weighted'}
    return (
        accuracy_score(true_values, predicted_values),
        precision_score(true_values, predicted_values, **weighted),
        f1_score(true_values, predicted_values, **weighted),
    )

In [206]:
# Score the "controlled" run: gold label in column 1, prediction in column 2.
# NOTE(review): absolute, user-specific path.
output_path = "/Users/binglunli/Desktop/CRF++-0.58/experiment/output/output_controlled.txt"
truth, prediction = test_output_accuracy(output_path, 1, 2)
compute_score(truth, prediction)

  _warn_prf(average, modifier, msg_start, len(result))


(0.9277540852087995, 0.9261090960207328, 0.9261159430304706)

In [207]:
# Score the "real" run: gold label in column 2, prediction in column 3.
# NOTE(review): absolute, user-specific path.
output_path = "/Users/binglunli/Desktop/CRF++-0.58/experiment/output/output_real.txt"
truth, prediction = test_output_accuracy(output_path, 2, 3)
compute_score(truth, prediction)

  _warn_prf(average, modifier, msg_start, len(result))


(0.9599383667180277, 0.959634323086229, 0.9597060730112831)

In [208]:
# Score the "sampled" run: gold label in column 2, prediction in column 3.
# NOTE(review): absolute, user-specific path.
output_path = "/Users/binglunli/Desktop/CRF++-0.58/experiment/output/output_sampled.txt"
truth, prediction = test_output_accuracy(output_path, 2, 3)
compute_score(truth, prediction)

  _warn_prf(average, modifier, msg_start, len(result))


(0.9482689391613247, 0.9469701566552355, 0.9469474964927541)