In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os


In [2]:
# Read the crawled dataset from disk; the trailing expression displays (rows, columns).
csv_path = './vnexpress_crawled.csv'
df = pd.read_csv(csv_path)
df.shape

(33311, 5)

In [3]:
# Build one two-column frame per text field, each pairing that field with its
# Category label.
title_df, description_df, content_df = (
    df[['Category', text_column]]
    for text_column in ('Title', 'Description', 'Content')
)

## Split the data into train, test and validate


In [4]:
# Split the data into train and test (8:2 by default)
def train_test_split_data(data, test_size=0.2, random_state=42):
    """Shuffle ``data`` and split it into a train part and a test part.

    Args:
        data: The collection to split; it is passed straight to
            ``train_test_split``.
        test_size: Share of ``data`` placed in the test part. The default of
            0.2 gives the 8:2 ratio.
        random_state: Seed handed to ``train_test_split`` so repeated runs
            use the same shuffle.

    Returns:
        A ``(train_data, test_data)`` tuple.
    """
    train_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state, shuffle=True
    )
    return train_data, test_data

# Split the train data into train and validate (9:1 by default)
def train_validate_split_data(train_data, validate_size=0.1, random_state=42):
    """Shuffle ``train_data`` and split off a validation part.

    Args:
        train_data: The training collection to split; it is passed straight
            to ``train_test_split``.
        validate_size: Share of ``train_data`` placed in the validation part.
            The default of 0.1 gives the 9:1 ratio.
        random_state: Seed handed to ``train_test_split`` so repeated runs
            use the same shuffle.

    Returns:
        A ``(train_data, validate_data)`` tuple, where ``train_data`` is the
        portion that remains after the validation part is removed.
    """
    remaining_data, validate_data = train_test_split(
        train_data, test_size=validate_size, random_state=random_state, shuffle=True
    )
    return remaining_data, validate_data

In [5]:
# First pass: hold out the test portion of every field-specific frame.
# NOTE(review): presumably the fixed seed inside the helpers keeps the three
# fields row-aligned across splits -- confirm before changing either helper.
(title_train, title_test), (description_train, description_test), (content_train, content_test) = (
    train_test_split_data(frame)
    for frame in (title_df, description_df, content_df)
)

# Second pass: carve the validation portion out of each remaining training frame.
(title_train, title_validate), (description_train, description_validate), (content_train, content_validate) = (
    train_validate_split_data(train_frame)
    for train_frame in (title_train, description_train, content_train)
)


## Create the directory structure


In [7]:
def create_directory(output_dir, subdir, split_dir):
    """Ensure ``output_dir/subdir/split_dir`` exists and return that path.

    Missing parent directories are created as well. Calling this for a
    directory that already exists is a no-op.
    """
    target = os.path.join(output_dir, subdir, split_dir)
    if not os.path.isdir(target):
        # exist_ok is kept so a directory that appears in the meantime is tolerated.
        os.makedirs(target, exist_ok=True)
    return target

In [8]:
# Create <output_dir>/<field>/<split> for every field/split combination and
# remember each created path.
path_sets = set()

output_dir = './'

# The loop variables are the only definitions of subdir/split_dir needed here;
# the former placeholder assignments were overwritten before ever being read.
for subdir in ['title', 'description', 'content']:
    for split_dir in ['train', 'test', 'validate']:
        dir_path = create_directory(output_dir, subdir, split_dir)
        path_sets.add(dir_path)

## Write the split data to text files


In [9]:
def _single_line_strings(column):
    """Return the column's values as strings with line breaks collapsed to one space."""
    return column.astype(str).str.replace(r'[\r\n]+', ' ', regex=True)


def write_data_to_file(category_file, input_file, data):
    """Write the labels and the text of ``data`` to two line-aligned files.

    Line ``i`` of ``category_file`` holds the ``Category`` of row ``i`` and
    line ``i`` of ``input_file`` holds that row's text. Records are joined
    with ``'\\n'`` and no trailing newline is written.

    Args:
        category_file: Path of the file that receives the ``Category`` values.
        input_file: Path of the file that receives the text values.
        data: DataFrame with a ``Category`` column and one of ``Title``,
            ``Description`` or ``Content`` (checked in that order).

    Raises:
        ValueError: If ``data`` has none of the three text columns. This is
            raised before either file is opened.
    """
    text_column = next(
        (name for name in ('Title', 'Description', 'Content') if name in data.columns),
        None,
    )
    if text_column is None:
        # Previously this case silently produced an empty input file next to a
        # populated category file.
        raise ValueError(
            "data must contain one of the columns 'Title', 'Description' or 'Content'"
        )

    # Embedded line breaks would split one record over several lines and shift
    # every later line out of step with the other file, so they are flattened.
    with open(category_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(_single_line_strings(data['Category'])))
    with open(input_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(_single_line_strings(data[text_column])))

# Path to the root folder
output_dir = './'

# Every split of every text field, listed in the order the files are written.
split_frames = {
    'title': {
        'validate': title_validate,
        'test': title_test,
        'train': title_train,
    },
    'description': {
        'validate': description_validate,
        'test': description_test,
        'train': description_train,
    },
    'content': {
        'validate': content_validate,
        'test': content_test,
        'train': content_train,
    },
}

# Write <output_dir>/<field>/<split>/category.txt and input.txt for each split.
# os.path.join is used instead of string concatenation so the paths remain
# correct when output_dir has no trailing separator.
for field_name, frames_by_split in split_frames.items():
    for split_name, split_frame in frames_by_split.items():
        split_path = os.path.join(output_dir, field_name, split_name)
        write_data_to_file(
            os.path.join(split_path, 'category.txt'),
            os.path.join(split_path, 'input.txt'),
            split_frame,
        )
