In [39]:
import pandas as pd

In [40]:
# All three dataset splits live in the same directory and differ only in the
# split name embedded in the file name.
_SPLIT_PATH_TEMPLATE = './data/dataset_conll/all.sentence.{}.txt'

FILENAME_DEV = _SPLIT_PATH_TEMPLATE.format('dev')
FILENAME_TRAIN = _SPLIT_PATH_TEMPLATE.format('train')
FILENAME_TEST = _SPLIT_PATH_TEMPLATE.format('test')

# Order in which the files are read and concatenated: dev, train, test.
FILES = [FILENAME_DEV, FILENAME_TRAIN, FILENAME_TEST]


In [41]:
def _split_sentence_and_label(line):
    """Split one dataset line into a ``(sentence, label)`` tuple.

    The label is the last whitespace-separated token of the line; the
    sentence is every preceding token re-joined with single spaces (so it is
    the empty string when the line holds a single token).

    The caller must not pass a blank line: there would be no label token.
    """
    tokens = line.split()
    return ' '.join(tokens[:-1]), tokens[-1]


dataframes = []
for filename in FILES:
    with open(filename, 'r') as f:
        # Skip blank / whitespace-only lines (e.g. a trailing newline at the
        # end of the file): they have no tokens, so taking the last token as
        # the label would raise IndexError.
        labeled_data = [
            _split_sentence_and_label(line) for line in f if line.strip()
        ]

    # One frame per input file, with a fixed two-column layout.
    dataframes.append(pd.DataFrame(labeled_data, columns=['sentence', 'label']))

# Merge all files into a single frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it every file restarts at 0 and the merged frame
# carries duplicate index labels.
df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Preview the first five rows of the merged dataset.
df.head(n=5)

In [None]:
# Number of sentences per label, as a one-column frame indexed by label.
label_counts = df['label'].value_counts()
labels = label_counts.to_frame()
labels.head(n=5)

In [None]:
# Bar chart of the number of sentences carrying each label.
import matplotlib.pyplot as plt
labels.plot.bar()
plt.show()


In [None]:
# Rows whose label is null (NaN / None).
label_is_null = df['label'].isna()
missing_labels = df.loc[label_is_null]
missing_labels.head(n=5)

In [None]:
# Rows without a usable sentence. The 'sentence' column is built with
# ' '.join(...), which always yields a str, so a line that held only a label
# shows up as an empty string rather than NaN. Check for both: null values and
# empty / whitespace-only text.
sentence_is_null = df['sentence'].isnull()
sentence_is_blank = df['sentence'].astype(str).str.strip().eq('')
missing_sentences = df[sentence_is_null | sentence_is_blank]
missing_sentences.head()


In [47]:
# Split the data into train (80%), test (10%) and validation (10%).
from sklearn.model_selection import train_test_split

# First cut: keep 80% for training and set 20% aside as a hold-out pool.
train, holdout = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# Second cut: divide the hold-out pool evenly into test and validation.
# The result must NOT be unpacked into `train`: doing so replaces the 80%
# training set with half of the hold-out pool and leaves `test` as the whole
# pool, so every train/validation row would also appear in the test set.
test, validation = train_test_split(holdout, test_size=0.5, random_state=42, shuffle=True)

# The three splits are disjoint and together cover the whole dataset.
assert len(train) + len(validation) + len(test) == len(df)

In [None]:
# Per-split label counts, used to compare label distributions across splits.
tr_c, va_c, te_c = (
    split['label'].value_counts() for split in (train, validation, test)
)

# Convert counts into fractions of each split so that splits of different
# sizes can be compared directly.
tr_ratio, va_ratio, te_ratio = (
    counts.div(counts.sum()) for counts in (tr_c, va_c, te_c)
)

In [None]:
# One column per split, one row per label, holding that label's share of the split.
df_ratios = pd.DataFrame({'Test': te_ratio, 'Train': tr_ratio, 'Validation': va_ratio})

# Grouped bar chart: for every label, the three splits side by side.
ax = df_ratios.plot.bar(figsize=(10, 6), colormap='Set2')
ax.set_title('Label ratios')
ax.set_ylabel('Ratio')
ax.set_xlabel('Labels')
ax.legend(title='Dataset')
plt.show()

In [49]:
import os

# Create the output directory if needed. exist_ok=True makes this a single
# call that is safe to re-run and has no gap between "check" and "create";
# it also fails right here if './output' exists but is a regular file.
os.makedirs('./output', exist_ok=True)

# Write each split to its own CSV; the row index is dropped because it carries
# no information beyond the row's position in the source data.
train.to_csv('./output/train.csv', index=False)
validation.to_csv('./output/validation.csv', index=False)
test.to_csv('./output/test.csv', index=False)