You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
%%time
# Clean, tokenize, and apply padding / truncating such that each document length = 70
# also, retain only the top 8,000 words in the vocabulary and set the remaining words
# to 1 which will become common index for rare words
body_pp = processor(keep_n=8000, padding_maxlen=70)
train_body_vecs = body_pp.fit_transform(train_body_raw)
I end up with
WARNING:root:....tokenizing data
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/multiprocess/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/opt/conda/lib/python3.6/site-packages/multiprocess/pool.py", line 44, in mapstar
return list(map(*args))
File "/opt/conda/lib/python3.6/site-packages/ktext/preprocess.py", line 90, in process_text
return [tokenizer(cleaner(doc)) for doc in text]
File "/opt/conda/lib/python3.6/site-packages/ktext/preprocess.py", line 90, in <listcomp>
return [tokenizer(cleaner(doc)) for doc in text]
File "/opt/conda/lib/python3.6/site-packages/ktext/preprocess.py", line 57, in textacy_cleaner
no_accents=True)
File "/opt/conda/lib/python3.6/site-packages/textacy/preprocess.py", line 217, in preprocess_text
text = fix_bad_unicode(text, normalization='NFC')
File "/opt/conda/lib/python3.6/site-packages/textacy/preprocess.py", line 38, in fix_bad_unicode
return fix_text(text, normalization=normalization)
File "/opt/conda/lib/python3.6/site-packages/ftfy/__init__.py", line 156, in fix_text
while pos < len(text):
TypeError: object of type 'float' has no len()
"""
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
/opt/conda/lib/python3.6/site-packages/ktext/preprocess.py in apply_parallel(func, data, cpu_cores)
73 pool = Pool(cpu_cores)
---> 74 transformed_data = pool.map(func, chunked(data, chunk_size), chunksize=1)
75 finally:
/opt/conda/lib/python3.6/site-packages/multiprocess/pool.py in map(self, func, iterable, chunksize)
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
/opt/conda/lib/python3.6/site-packages/multiprocess/pool.py in get(self, timeout)
643 else:
--> 644 raise self._value
645
TypeError: object of type 'float' has no len()
During handling of the above exception, another exception occurred:
UnboundLocalError Traceback (most recent call last)
<timed exec> in <module>()
/opt/conda/lib/python3.6/site-packages/ktext/preprocess.py in fit_transform(self, data)
336
337 """
--> 338 tokenized_data = self.fit(data, return_tokenized_data=True)
339
340 logging.warning(f'...fit is finished, beginning transform')
/opt/conda/lib/python3.6/site-packages/ktext/preprocess.py in fit(self, data, return_tokenized_data)
278 now = get_time()
279 logging.warning(f'....tokenizing data')
--> 280 tokenized_data = self.parallel_process_text(data)
281
282 if not self.padding_maxlen:
/opt/conda/lib/python3.6/site-packages/ktext/preprocess.py in parallel_process_text(self, data)
233 end_tok=self.end_tok)
234 n_cores = self.num_cores
--> 235 return flattenlist(apply_parallel(process_text, data, n_cores))
236
237 def generate_doc_length_stats(self):
/opt/conda/lib/python3.6/site-packages/ktext/preprocess.py in apply_parallel(func, data, cpu_cores)
76 pool.close()
77 pool.join()
---> 78 return transformed_data
79
80
UnboundLocalError: local variable 'transformed_data' referenced before assignment
The text was updated successfully, but these errors were encountered:
I was somehow able to get past this error by changing transformed_data = pool.map(func, chunked(data, chunk_size), chunksize=1) to transformed_data = pool.imap(func, chunked(data, chunk_size), chunksize=1)
Hi,
Using ktext with another github issue dataset I am getting an exception
UnboundLocalError: local variable 'transformed_data' referenced before assignment
when following https://github.com/hamelsmu/Seq2Seq_Tutorial/blob/master/notebooks/Tutorial.ipynb. My dataset (similar, just not from Kaggle) looks like this:
Then splitting it into train and test:
and when running
I end up with
The text was updated successfully, but these errors were encountered: