# Tokenization of Translated Sentences
This notebook tokenizes translated sentences using spaCy.
For information on spaCy models and the languages they cover, see https://spacy.io/models

In [73]:
# Install the Italian spaCy pipeline that this notebook loads with spacy.load("it_core_news_md").
! spacy download it_core_news_md

Collecting it-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_md-3.7.0/it_core_news_md-3.7.0-py3-none-any.whl (42.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: it-core-news-md
Successfully installed it-core-news-md-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_md')


In [50]:
import os, csv
import pandas as pd
import spacy

In [89]:
# Target language of the translated sentences; it is part of the data
# directory name and of both file names below.
language = "Italian"

# Project folder and the per-language data directory beneath it.
RESOURCE = 'Sense-Projection'
DATA = f'{RESOURCE}/data/{language}'

# TRANS_FILE is the input (translated sentences); TOKENS_FILE is the token
# table this notebook writes.
TRANS_FILE = os.path.join(DATA, f'trans-sentences-{language}.tsv')
TOKENS_FILE = os.path.join(DATA, f'tokens-{language}.tsv')


# DATA

In [90]:
# Read the tab-separated translations; QUOTE_NONE makes the parser treat quote
# characters as ordinary text instead of field delimiters.
trans_df = pd.read_csv(TRANS_FILE, sep='\t', quoting=csv.QUOTE_NONE)

# The first column is used as the sentence ID when building token IDs later on.
sids = trans_df.iloc[:, 0].tolist()
trans_df.head()

['d001.s002', 'd001.s003', 'd001.s004', 'd001.s005', 'd001.s006', 'd001.s007', 'd001.s008', 'd001.s009', 'd001.s010', 'd001.s011', 'd001.s012', 'd001.s013', 'd001.s014', 'd001.s015', 'd001.s016', 'd001.s017', 'd001.s018', 'd001.s019', 'd001.s020', 'd001.s021', 'd001.s022', 'd001.s023', 'd001.s024', 'd001.s025', 'd001.s026', 'd001.s027', 'd001.s028', 'd001.s029', 'd001.s030', 'd001.s031', 'd001.s032', 'd001.s033', 'd001.s034', 'd001.s035', 'd001.s036', 'd001.s037', 'd002.s001', 'd002.s002', 'd002.s003', 'd002.s004', 'd002.s005', 'd002.s006', 'd002.s007', 'd002.s008', 'd002.s009', 'd002.s010', 'd002.s011', 'd002.s012', 'd002.s013', 'd002.s014', 'd002.s015', 'd002.s016', 'd002.s017', 'd002.s018', 'd002.s019', 'd002.s020', 'd002.s021', 'd002.s022', 'd002.s023', 'd002.s024', 'd002.s025', 'd002.s026', 'd002.s027', 'd002.s028', 'd002.s029', 'd002.s030', 'd002.s031', 'd002.s032', 'd002.s033', 'd002.s034', 'd002.s035', 'd002.s036', 'd002.s037', 'd002.s038', 'd002.s039', 'd002.s040', 'd002.s041'

In [91]:
# The second column holds the sentence text that gets tokenized.
tgt_sents = trans_df.iloc[:, 1].tolist()

# TOKENIZATION

In [92]:
# spaCy pipeline for each supported target language. Selecting the pipeline
# from `language` keeps the tokenizer in step with the data files chosen in
# the configuration cell; an unsupported language fails with a KeyError
# instead of silently using the wrong model.
# The selected pipeline must already be installed (see `spacy download`).
SPACY_MODELS = {
    "Italian": "it_core_news_md",
    "Spanish": "es_core_news_md",
}
nlp = spacy.load(SPACY_MODELS[language])


In [94]:
# Tokenize every translated sentence and collect one entry per token in four
# parallel lists: token ID, surface form, POS tag and lemma.
tk_ids, tks_list, pos_list, lemma_list = [], [], [], []
total = 0  # running token count over all sentences
for sid, tgt_s in zip(sids, tgt_sents):
    doc = nlp(tgt_s)
    total += len(doc)
    # Token IDs are 1-based and zero-padded to three digits: <sentence id>.t001, ...
    for i, token in enumerate(doc, start=1):
        tk_ids.append(f'{sid}.t{i:03}')
        tks_list.append(token.text)
        pos_list.append(token.pos_)
        lemma_list.append(token.lemma_)

# The four lists become DataFrame columns, so they must stay aligned.
assert len(tk_ids) == len(tks_list) == len(pos_list) == len(lemma_list) == total
len(tk_ids), len(tks_list), len(pos_list), len(lemma_list)

(2739, 2739, 2739, 2739)

In [96]:
# Assemble the token table, one row per token, from the four parallel lists.
tks_df = pd.DataFrame(
    data={
        'Token ID': tk_ids,
        'Token': tks_list,
        'Lemma': lemma_list,
        'POS': pos_list,
    }
)
tks_df.head()

Unnamed: 0,Token ID,Token,Lemma,POS
0,d001.s002.t001,L,L,DET
1,d001.s002.t002,EPAR,EPAR,PROPN
2,d001.s002.t003,descrive,descrivere,VERB
3,d001.s002.t004,il,il,DET
4,d001.s002.t005,modo,modo,NOUN


In [97]:
# Write the token table as a tab-separated file without the DataFrame index.
# NOTE(review): the input is read with csv.QUOTE_NONE but this write uses pandas'
# default quoting, so tokens containing a double quote are written quoted.
# Confirm that downstream readers expect that.
tks_df.to_csv(path_or_buf=TOKENS_FILE, index=False, sep='\t')