In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the train / dev / test CSV files from the working directory.
train_data, val_data, test_data = map(
    pd.read_csv,
    ("./sst_train.csv", "./sst_dev.csv", "./sst_test.csv"),
)

In [None]:
# Training vocabulary: every distinct token found in the 'text' column, where a
# token is a piece obtained by splitting on single spaces and stripping it.
word_set = {
    piece.strip()
    for sentence in train_data['text']
    for piece in sentence.split(' ')
}

In [None]:
# Report the size of the training vocabulary.
unique_token_count = len(word_set)
print("Number of unique tokens: " + str(unique_token_count))

Number of unique tokens: 13822


In [None]:
def get_freq_dict(dataset, vocab=None):
    """
    Count, for every row, how many tokens of its 'text' value are out of vocabulary.

    Tokens are obtained by splitting the text on single spaces and stripping
    each piece; every occurrence of an unknown token is counted (repeats are
    not de-duplicated).

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame with a 'text' column holding strings.
    vocab : container of str, optional
        Known tokens. When omitted, the notebook-level ``word_set`` is used,
        which keeps existing one-argument calls working unchanged.

    Returns
    -------
    dict
        Maps each row's index label to its number of unknown tokens.
    """
    # Explicit None check so an intentionally empty vocabulary is honoured.
    known_tokens = word_set if vocab is None else vocab

    word_freq = {}
    # Only the 'text' column is needed, so iterate over that Series directly
    # rather than materialising every full row.
    for index, text_string in dataset['text'].items():
        words = [x.strip() for x in text_string.split(' ')]
        word_freq[index] = sum(1 for w in words if w not in known_tokens)

    return word_freq

In [None]:
# Unknown-token counts per row for the validation and test splits.
val_word_freq, test_word_freq = map(get_freq_dict, (val_data, test_data))

In [None]:
def sort_dict_by_value(d):
    """
    Return the (key, value) pairs of ``d`` as a list ordered by ascending value.

    Pairs with equal values keep the order in which they appear in ``d``.
    """
    pairs = list(d.items())
    pairs.sort(key=lambda pair: pair[1])
    return pairs

In [None]:
# Order both splits' (row index, unknown-token count) pairs by ascending count.
sorted_val_word_freq, sorted_test_word_freq = (
    sort_dict_by_value(freq) for freq in (val_word_freq, test_word_freq)
)

In [None]:
# Probe the sorted validation list. Only the last expression of a cell is
# echoed, so the length below is evaluated but not displayed.
len(sorted_val_word_freq)
probe_entry = sorted_val_word_freq[857]
probe_entry[1]  # unknown-token count of the entry at position 857

5

In [None]:
# Validation rows with the fewest unknown tokens w.r.t. the training set.
# sorted_val_word_freq is ordered by ascending unknown-token count, so its first
# 250 entries are the lowest-count rows.
# NOTE(review): these are "zero unknown token" rows only if at least 250
# validation rows have a count of 0 -- confirm.
few_unk_labels = [label for label, _count in sorted_val_word_freq[:250]]

# Select every row in one indexing call instead of growing a frame with
# pd.concat once per row. The keys of the frequency dict are index labels
# (they come from iterating the frame), so look them up with .loc.
val_few_unk = val_data.loc[few_unk_labels]

# Keep only the columns at positions 1..3 (drops the first column), then save.
val_few_unk = val_few_unk.iloc[:, 1:4]
val_few_unk.to_csv('val_few_unk.csv')

In [None]:
# Validation rows with the most unknown tokens w.r.t. the training set.
# sorted_val_word_freq is ordered by ascending unknown-token count; positions
# 857..871 are taken as the high-count tail (the recorded probe of position 857
# shows a count of 5, hence "at least 5 unknown words").
many_unk_labels = [label for label, _count in sorted_val_word_freq[857:872]]

# Select every row in one indexing call instead of growing a frame with
# pd.concat once per row. The keys of the frequency dict are index labels
# (they come from iterating the frame), so look them up with .loc.
val_many_unk = val_data.loc[many_unk_labels]

# Keep only the columns at positions 1..3 (drops the first column), then save.
val_many_unk = val_many_unk.iloc[:, 1:4]
val_many_unk.to_csv('val_many_unk.csv')

In [None]:
# Display the test split's (row index, unknown-token count) pairs, ordered by
# ascending count.
# NOTE(review): this echoes the whole list; showing a slice would keep the
# recorded output short.
sorted_test_word_freq

[(0, 0),
 (1, 0),
 (2, 0),
 (5, 0),
 (6, 0),
 (7, 0),
 (8, 0),
 (10, 0),
 (11, 0),
 (12, 0),
 (14, 0),
 (15, 0),
 (17, 0),
 (19, 0),
 (21, 0),
 (22, 0),
 (27, 0),
 (28, 0),
 (30, 0),
 (31, 0),
 (34, 0),
 (35, 0),
 (37, 0),
 (38, 0),
 (39, 0),
 (43, 0),
 (44, 0),
 (45, 0),
 (49, 0),
 (50, 0),
 (52, 0),
 (55, 0),
 (56, 0),
 (59, 0),
 (61, 0),
 (64, 0),
 (65, 0),
 (69, 0),
 (72, 0),
 (74, 0),
 (78, 0),
 (82, 0),
 (83, 0),
 (85, 0),
 (87, 0),
 (89, 0),
 (93, 0),
 (96, 0),
 (97, 0),
 (99, 0),
 (100, 0),
 (104, 0),
 (105, 0),
 (106, 0),
 (112, 0),
 (113, 0),
 (114, 0),
 (115, 0),
 (116, 0),
 (121, 0),
 (122, 0),
 (123, 0),
 (125, 0),
 (126, 0),
 (127, 0),
 (130, 0),
 (133, 0),
 (137, 0),
 (138, 0),
 (145, 0),
 (146, 0),
 (151, 0),
 (154, 0),
 (155, 0),
 (157, 0),
 (158, 0),
 (160, 0),
 (161, 0),
 (163, 0),
 (166, 0),
 (167, 0),
 (170, 0),
 (171, 0),
 (173, 0),
 (174, 0),
 (176, 0),
 (183, 0),
 (185, 0),
 (188, 0),
 (190, 0),
 (191, 0),
 (193, 0),
 (195, 0),
 (197, 0),
 (199, 0),
 (203, 0),
 

In [None]:
# Sizes of both sorted lists, then the unknown-token counts found at two probe
# positions of the test list (840 and 1797).
val_entry_total = len(sorted_val_word_freq)
print(val_entry_total)
test_entry_total = len(sorted_test_word_freq)
print(test_entry_total)
count_at_840 = sorted_test_word_freq[840][1]
print(count_at_840)
count_at_1797 = sorted_test_word_freq[1797][1]
print(count_at_1797)

872
1821
0
5


In [None]:
# Test rows with the fewest unknown tokens w.r.t. the training set.
# sorted_test_word_freq is ordered by ascending unknown-token count, so its
# first 250 entries are the lowest-count rows (the recorded probe of position
# 840 shows a count of 0, which implies these 250 all have zero unknown tokens).
few_unk_labels = [label for label, _count in sorted_test_word_freq[:250]]

# Select every row in one indexing call instead of growing a frame with
# pd.concat once per row. The keys of the frequency dict are index labels
# (they come from iterating the frame), so look them up with .loc.
test_few_unk = test_data.loc[few_unk_labels]

# Keep only the columns at positions 1..3 (drops the first column), then save.
test_few_unk = test_few_unk.iloc[:, 1:4]
test_few_unk.to_csv('test_few_unk.csv')

In [None]:
# Test rows with the most unknown tokens w.r.t. the training set.
# sorted_test_word_freq is ordered by ascending unknown-token count; positions
# 1780..1820 are taken as the high-count tail.
# NOTE(review): the recorded probe only shows a count of 5 at position 1797;
# whether position 1780 already has "at least 5 unknown words" is not
# established here -- confirm the start position.
many_unk_labels = [label for label, _count in sorted_test_word_freq[1780:1821]]

# Select every row in one indexing call instead of growing a frame with
# pd.concat once per row. The keys of the frequency dict are index labels
# (they come from iterating the frame), so look them up with .loc.
test_many_unk = test_data.loc[many_unk_labels]

# Keep only the columns at positions 1..3 (drops the first column), then save.
test_many_unk = test_many_unk.iloc[:, 1:4]
test_many_unk.to_csv('test_many_unk.csv')