# Split data and save into train, val, test

The following file structure is needed for each language pair:

```
es-ca
    -> es-ca.es
    -> es-ca.ca
    
ca-it
    -> ca-it.it
    -> ca-it.ca
    
es-it
    -> es-it.it
    -> es-it.es
```

In [None]:
import os
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [None]:
# Sources for datasets
#!wget https://archive.org/download/ParaCrawl-v8.1-0000/es-ca.txt.gz             # Paracrawl
#!wget https://opus.nlpl.eu/download.php?f=MultiCCAligned/v1/moses/es-it.txt.zip # MultiCCAligned
#!wget https://opus.nlpl.eu/download.php?f=WikiMatrix/v1/moses/ca-it.txt.zip     # WikiMatrix

In [None]:
def split_data(data, train_size, val_size, test_size):
  """Split a sequence into contiguous train, validation, and test slices.

  The split is positional: no shuffling is performed. The first
  ``train_size`` fraction of ``data`` becomes the training slice, the next
  ``val_size`` fraction the validation slice, and everything that remains
  becomes the test slice (so rounding leftovers end up in the test slice).

  Args:
    data: Sliceable sequence (e.g. a list of lines) to split.
    train_size: Fraction of ``data`` assigned to the training slice.
    val_size: Fraction of ``data`` assigned to the validation slice.
    test_size: Fraction of ``data`` assigned to the test slice. It is only
      used to validate that the fractions add up to 1; the test slice itself
      is whatever is left after the training and validation slices.

  Returns:
    A ``(train_data, val_data, test_data)`` tuple of slices of ``data``.

  Raises:
    ValueError: If the three fractions do not add up to 1.
  """
  import math

  # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 evaluate to
  # 0.9999999999999999 in binary floating point and would fail an exact check.
  if not math.isclose(train_size + val_size + test_size, 1.0):
    raise ValueError("Train, validation, and test sizes must add up to 1.")

  total = len(data)
  train_mark = int(total * train_size)
  val_mark = train_mark + int(total * val_size)

  train_data = data[:train_mark]
  val_data = data[train_mark:val_mark]
  test_data = data[val_mark:]

  return train_data, val_data, test_data

def save_data(data, data_folder_name, filename):
  """Tokenize every line of ``data`` and write the result to a text file.

  Each line is passed through NLTK's ``word_tokenize`` and its tokens are
  written space-separated, one input line per output line.

  Args:
    data: Iterable of strings (one sentence/line each).
    data_folder_name: Folder that receives the file; it is created if it
      does not exist yet.
    filename: Name of the file to write inside ``data_folder_name``. An
      existing file with that name is overwritten.
  """
  # Create the output folder up front so a fresh checkout does not fail with
  # FileNotFoundError. An empty folder name means "current directory".
  if data_folder_name:
    os.makedirs(data_folder_name, exist_ok=True)

  # Write UTF-8 explicitly: the corpora contain accented characters and the
  # platform default encoding is not guaranteed to be able to represent them.
  out_path = os.path.join(data_folder_name, filename)
  with open(out_path, mode="w", encoding="utf-8") as f:
    for line in data:
      f.write(" ".join(word_tokenize(line)) + "\n")

In [None]:
# source = Catalan (ca)
# pivot  = Spanish (es)
# target = Italian (it)

In [None]:
# Load the source-pivot corpus. Every line of the file is expected to hold a
# Spanish sentence and its Catalan counterpart separated by a single tab.
es_data = []
ca_data = []

# Read as UTF-8 explicitly (the text contains accented characters) and iterate
# over the file handle directly so the whole corpus is not materialized a
# second time by readlines().
with open('../es-ca.txt', encoding="utf-8") as f:
  count = 0
  for count, line in enumerate(f, start=1):
    # ca_line keeps the trailing newline of the input line.
    es_line, ca_line = line.split('\t')
    es_data.append(es_line)
    ca_data.append(ca_line)
    # Progress report every million lines.
    if count % 1000000 == 0:
      print(f"Reading line {count}")

In [None]:
# Load the pivot-target (Spanish-Italian) and source-target (Catalan-Italian)
# corpora. The two files of each pair are read line by line and are presumably
# line-aligned (line i of one file translates line i of the other).
# All files are read as UTF-8 explicitly because they contain accented text.

# Italian data
with open("/mnt/c/Work/CSE599g1/project/es-it/es-it.it", encoding="utf-8") as f:
  # Must be assigned to tgt_data (not a misspelled name), otherwise the
  # Italian splits written later are empty.
  tgt_data = f.readlines()

# Spanish data (parallel to Italian data)
with open("/mnt/c/Work/CSE599g1/project/es-it/es-it.es", encoding="utf-8") as f:
  pvt_tgt_data = f.readlines()

# Catalan data (parallel to Italian data)
with open("/mnt/c/Work/CSE599g1/project/ca-it/ca-it.ca", encoding="utf-8") as f:
  src_tgt_data = f.readlines()

# Italian data (parallel to Catalan data)
with open("/mnt/c/Work/CSE599g1/project/ca-it/ca-it.it", encoding="utf-8") as f:
  tgt_src_data = f.readlines()

In [None]:
# Partition every corpus 80% / 10% / 10% into train, validation, and test.
# Both sides of a language pair go through the same split so they stay aligned.

# Catalan-Spanish (source-pivot) pair
src_train, src_val, src_test = split_data(
    ca_data, train_size=0.8, val_size=0.1, test_size=0.1)
pvt_src_train, pvt_src_val, pvt_src_test = split_data(
    es_data, train_size=0.8, val_size=0.1, test_size=0.1)

# Spanish-Italian (pivot-target) pair
tgt_train, tgt_val, tgt_test = split_data(
    tgt_data, train_size=0.8, val_size=0.1, test_size=0.1)
pvt_tgt_train, pvt_tgt_val, pvt_tgt_test = split_data(
    pvt_tgt_data, train_size=0.8, val_size=0.1, test_size=0.1)

# Catalan-Italian (source-target) pair
src_tgt_train, src_tgt_val, src_tgt_test = split_data(
    src_tgt_data, train_size=0.8, val_size=0.1, test_size=0.1)
tgt_src_train, tgt_src_val, tgt_src_test = split_data(
    tgt_src_data, train_size=0.8, val_size=0.1, test_size=0.1)

In [None]:
# Write every split to disk. Each entry is (output folder, file name, lines);
# the entries are processed in order, one save_data call per entry.
_outputs = [
  # Catalan side of the Catalan-Spanish pair
  ("src_pvt_data", "src_train.txt", src_train),
  ("src_pvt_data", "src_val.txt", src_val),
  ("src_pvt_data", "src_test.txt", src_test),
  # Spanish side of the Catalan-Spanish pair
  ("src_pvt_data", "pvt_src_train.txt", pvt_src_train),
  ("src_pvt_data", "pvt_src_val.txt", pvt_src_val),
  ("src_pvt_data", "pvt_src_test.txt", pvt_src_test),
  # Italian side of the Spanish-Italian pair
  ("pvt_tgt_data", "tgt_train.txt", tgt_train),
  ("pvt_tgt_data", "tgt_val.txt", tgt_val),
  ("pvt_tgt_data", "tgt_test.txt", tgt_test),
  # Spanish side of the Spanish-Italian pair
  ("pvt_tgt_data", "pvt_tgt_train.txt", pvt_tgt_train),
  ("pvt_tgt_data", "pvt_tgt_val.txt", pvt_tgt_val),
  ("pvt_tgt_data", "pvt_tgt_test.txt", pvt_tgt_test),
  # Catalan side of the Catalan-Italian pair
  ("src_tgt_data", "src_tgt_train.txt", src_tgt_train),
  ("src_tgt_data", "src_tgt_val.txt", src_tgt_val),
  ("src_tgt_data", "src_tgt_test.txt", src_tgt_test),
  # Italian side of the Catalan-Italian pair
  ("src_tgt_data", "tgt_src_train.txt", tgt_src_train),
  ("src_tgt_data", "tgt_src_val.txt", tgt_src_val),
  ("src_tgt_data", "tgt_src_test.txt", tgt_src_test),
]

for _folder, _filename, _lines in _outputs:
  save_data(_lines, _folder, _filename)