Description
What happened?
No hail log file is available.
On 0.2.109: 5k samples and 8 interval lists -- WORKED
5k samples and 1 interval list -- WORKED
On 0.2.120: 2k samples and 1 interval list -- WORKED
On 0.2.120: 2k samples and 2 interval lists -- WORKED
On 0.2.120: 2k samples and 4 interval lists -- ERROR
On 0.2.120: 2k samples and 8 interval lists -- ERROR
All of these runs were on a driver with 96 CPUs and 684 GB RAM.
Workers: 4 CPUs and 8 GB RAM.
The Spark configuration allocated 512 GB for the driver.
I have tried the above in various configurations... Maybe a specific interval list is problematic, but that does not seem to be the case.
The interval lists are the same across runs.
And lastly, the error is the usual Py4J error. Usually I address this with more driver RAM, but I can't go any higher, and this used to work fine in Hail 0.2.109.
I tried downgrading from 0.2.120 to 0.2.109, but I don't believe that I can in Terra, due to Spark incompatibilities.
filtered_mt is a MatrixTable that has already been split and filtered (to drop irrelevant variants). By the time the [following] code blocks are run,
filtered_mt = hl.read_matrix_table(filtered_mt_url)
has been executed.
Some more information: The code after this (not shown [in the below code blocks]) does additional filtering. If I skip the step variant_data.export(f"{variant_stat_file_path_stem}_FULL.tsv"), I can complete successfully. The issue is that we need the *_FULL.tsv output. So, I believe that this is likely a RAM issue on the driver, but this used to work.
# Annotate every column of filtered_mt with one '<interval_name>_result'
# struct of variant counts per interval list (see generate_variant_stats).
variant_mt = generate_variant_stats(filtered_mt, interval_names, interval_table_dict)
# Compute the variant stats and save them to files.
# File path stem to use for saving variant stats over different interval lists
variant_stat_file_path_stem = f"{bucket}/batchE/{workflow_nickname}/variant_stats"
# Keep only the column table of the annotated MatrixTable; the counts were
# added with annotate_cols, so this is where they live.
variant_data = variant_mt.cols()
# Show the schema of the column table before exporting it.
variant_data.describe()
# Alternative export route through pandas, left disabled:
#variant_data.to_pandas().to_csv(f"{variant_stat_file_path_stem}_FULL.tsv", sep='\t', index=False)
# Write the complete column table to '<stem>_FULL.tsv'. This is the call that
# appears in the reported traceback (Table.export).
variant_data.export(f"{variant_stat_file_path_stem}_FULL.tsv")
def generate_variant_stats(mt, interval_names, interval_table_dict):
    """Annotate columns with SNP and indel genotype counts per interval list.

    Args:
        mt: Hail MatrixTable with ``alleles`` row field and ``GT`` entry
            field. The code only inspects ``alleles[0]`` and ``alleles[1]``.
        interval_names: Iterable of row-field names to aggregate over; each
            must be a key of ``interval_table_dict``.
        interval_table_dict: Mapping of row-field name to a boolean row
            expression; passed as keyword arguments to ``annotate_rows``.

    Returns:
        The MatrixTable with the fields of ``interval_table_dict`` added as
        row annotations and, for every name in ``interval_names``, a column
        field ``<name>_result`` holding a struct of eight counts. The counts
        cover transition / transversion / insertion / deletion, each split
        into ``Het`` (``GT.is_het_ref()``) and ``Hom`` (``GT.is_hom_var()``).
    """
    mt = mt.annotate_rows(**interval_table_dict)

    def _het(call):
        return call.is_het_ref()

    def _hom(call):
        return call.is_hom_var()

    def _count(allele_test, genotype_test):
        # New expressions are built on every call, so each counter owns its
        # own predicate exactly as if it had been written out by hand.
        return hl.agg.count_where(
            (allele_test(mt.alleles[0], mt.alleles[1])) & (genotype_test(mt.GT))
        )

    def _counts_within(interval_name):
        # Restrict the aggregation to rows flagged for this interval list.
        return hl.agg.filter(
            mt[interval_name],
            hl.struct(
                # SNP counts
                SNP_Ti_count_Het=_count(hl.is_transition, _het),
                SNP_Tv_count_Het=_count(hl.is_transversion, _het),
                SNP_Ti_count_Hom=_count(hl.is_transition, _hom),
                SNP_Tv_count_Hom=_count(hl.is_transversion, _hom),
                # Indel counts
                INDEL_Ins_count_Het=_count(hl.is_insertion, _het),
                INDEL_Del_count_Het=_count(hl.is_deletion, _het),
                INDEL_Ins_count_Hom=_count(hl.is_insertion, _hom),
                INDEL_Del_count_Hom=_count(hl.is_deletion, _hom),
            ),
        )

    per_interval_results = {}
    for interval_name in interval_names:
        per_interval_results[interval_name + '_result'] = _counts_within(interval_name)

    return mt.annotate_cols(**per_interval_results)
# Map each interval-list name to a boolean row expression that is true when
# the locus of filtered_mt has an entry in the corresponding interval table.
interval_table_dict = {
    interval_name: hl.is_defined(interval_table[filtered_mt.locus])
    for interval_name, interval_table in zip(interval_names, interval_tables)
}
Version
0.2.126
Relevant log output
---------------------------------------------------------------------------
RemoteDisconnected Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:703, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
704 conn,
705 method,
706 url,
707 timeout=timeout_obj,
708 body=body,
709 headers=headers,
710 chunked=chunked,
711 )
713 # If we're going to release the connection in ``finally:``, then
714 # the response doesn't need to know about the connection. Otherwise
715 # it will also try to release it and we'll have a double-release
716 # mess.
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:449, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
445 except BaseException as e:
446 # Remove the TypeError from the exception chain in
447 # Python 3 (including for exceptions like SystemExit).
448 # Otherwise it looks like a bug in the code.
--> 449 six.raise_from(e, None)
450 except (SocketTimeout, BaseSSLError, SocketError) as e:
File <string>:3, in raise_from(value, from_value)
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:444, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
443 try:
--> 444 httplib_response = conn.getresponse()
445 except BaseException as e:
446 # Remove the TypeError from the exception chain in
447 # Python 3 (including for exceptions like SystemExit).
448 # Otherwise it looks like a bug in the code.
File /opt/conda/lib/python3.10/http/client.py:1375, in HTTPConnection.getresponse(self)
1374 try:
-> 1375 response.begin()
1376 except ConnectionError:
File /opt/conda/lib/python3.10/http/client.py:318, in HTTPResponse.begin(self)
317 while True:
--> 318 version, status, reason = self._read_status()
319 if status != CONTINUE:
File /opt/conda/lib/python3.10/http/client.py:287, in HTTPResponse._read_status(self)
284 if not line:
285 # Presumably, the server closed the connection before
286 # sending a valid response.
--> 287 raise RemoteDisconnected("Remote end closed connection without"
288 " response")
289 try:
RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/requests/adapters.py:487, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
486 try:
--> 487 resp = conn.urlopen(
488 method=request.method,
489 url=url,
490 body=request.body,
491 headers=request.headers,
492 redirect=False,
493 assert_same_host=False,
494 preload_content=False,
495 decode_content=False,
496 retries=self.max_retries,
497 timeout=timeout,
498 chunked=chunked,
499 )
501 except (ProtocolError, OSError) as err:
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:787, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
785 e = ProtocolError("Connection aborted.", e)
--> 787 retries = retries.increment(
788 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
789 )
790 retries.sleep()
File /opt/conda/lib/python3.10/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
549 if read is False or not self._is_method_retryable(method):
--> 550 raise six.reraise(type(error), error, _stacktrace)
551 elif read is not None:
File /opt/conda/lib/python3.10/site-packages/urllib3/packages/six.py:769, in reraise(tp, value, tb)
768 if value.__traceback__ is not tb:
--> 769 raise value.with_traceback(tb)
770 raise value
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:703, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
704 conn,
705 method,
706 url,
707 timeout=timeout_obj,
708 body=body,
709 headers=headers,
710 chunked=chunked,
711 )
713 # If we're going to release the connection in ``finally:``, then
714 # the response doesn't need to know about the connection. Otherwise
715 # it will also try to release it and we'll have a double-release
716 # mess.
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:449, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
445 except BaseException as e:
446 # Remove the TypeError from the exception chain in
447 # Python 3 (including for exceptions like SystemExit).
448 # Otherwise it looks like a bug in the code.
--> 449 six.raise_from(e, None)
450 except (SocketTimeout, BaseSSLError, SocketError) as e:
File <string>:3, in raise_from(value, from_value)
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:444, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
443 try:
--> 444 httplib_response = conn.getresponse()
445 except BaseException as e:
446 # Remove the TypeError from the exception chain in
447 # Python 3 (including for exceptions like SystemExit).
448 # Otherwise it looks like a bug in the code.
File /opt/conda/lib/python3.10/http/client.py:1375, in HTTPConnection.getresponse(self)
1374 try:
-> 1375 response.begin()
1376 except ConnectionError:
File /opt/conda/lib/python3.10/http/client.py:318, in HTTPResponse.begin(self)
317 while True:
--> 318 version, status, reason = self._read_status()
319 if status != CONTINUE:
File /opt/conda/lib/python3.10/http/client.py:287, in HTTPResponse._read_status(self)
284 if not line:
285 # Presumably, the server closed the connection before
286 # sending a valid response.
--> 287 raise RemoteDisconnected("Remote end closed connection without"
288 " response")
289 try:
ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
File <timed exec>:9
File <decorator-gen-1235>:2, in export(self, output, types_file, header, parallel, delimiter)
File ~/.local/lib/python3.10/site-packages/hail/typecheck/check.py:587, in _make_dec.<locals>.wrapper(__original_func, *args, **kwargs)
584 @decorator
585 def wrapper(__original_func: Callable[..., T], *args, **kwargs) -> T:
586 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 587 return __original_func(*args_, **kwargs_)
File ~/.local/lib/python3.10/site-packages/hail/table.py:1153, in Table.export(self, output, types_file, header, parallel, delimiter)
1150 hl.current_backend().validate_file(output)
1152 parallel = ir.ExportType.default(parallel)
-> 1153 Env.backend().execute(
1154 ir.TableWrite(self._tir, ir.TableTextWriter(output, types_file, header, parallel, delimiter)))
File ~/.local/lib/python3.10/site-packages/hail/backend/backend.py:178, in Backend.execute(self, ir, timed)
176 payload = ExecutePayload(self._render_ir(ir), '{"name":"StreamBufferSpec"}', timed)
177 try:
--> 178 result, timings = self._rpc(ActionTag.EXECUTE, payload)
179 except FatalError as e:
180 raise e.maybe_user_error(ir) from None
File ~/.local/lib/python3.10/site-packages/hail/backend/py4j_backend.py:210, in Py4JBackend._rpc(self, action, payload)
208 path = action_routes[action]
209 port = self._backend_server_port
--> 210 resp = self._requests_session.post(f'http://localhost:{port}{path}', data=data)
211 if resp.status_code >= 400:
212 error_json = orjson.loads(resp.content)
File /opt/conda/lib/python3.10/site-packages/requests/sessions.py:635, in Session.post(self, url, data, json, **kwargs)
624 def post(self, url, data=None, json=None, **kwargs):
625 r"""Sends a POST request. Returns :class:`Response` object.
626
627 :param url: URL for the new :class:`Request` object.
(...)
632 :rtype: requests.Response
633 """
--> 635 return self.request("POST", url, data=data, json=json, **kwargs)
File /opt/conda/lib/python3.10/site-packages/requests/sessions.py:587, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
582 send_kwargs = {
583 "timeout": timeout,
584 "allow_redirects": allow_redirects,
585 }
586 send_kwargs.update(settings)
--> 587 resp = self.send(prep, **send_kwargs)
589 return resp
File /opt/conda/lib/python3.10/site-packages/requests/sessions.py:701, in Session.send(self, request, **kwargs)
698 start = preferred_clock()
700 # Send the request
--> 701 r = adapter.send(request, **kwargs)
703 # Total elapsed time of the request (approximately)
704 elapsed = preferred_clock() - start
File /opt/conda/lib/python3.10/site-packages/requests/adapters.py:502, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
487 resp = conn.urlopen(
488 method=request.method,
489 url=url,
(...)
498 chunked=chunked,
499 )
501 except (ProtocolError, OSError) as err:
--> 502 raise ConnectionError(err, request=request)
504 except MaxRetryError as e:
505 if isinstance(e.reason, ConnectTimeoutError):
506 # TODO: Remove this in 3.0.0: see #2811
ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))