Skip to content

Commit 6b7a847

Browse files
fix(parser): memory leak in cache
- A global cache for tokens/parser results was used and never cleared, creating a slow memory leak that eventually caused training to slow down (after ~2000 episodes).
1 parent 910bcd6 commit 6b7a847

2 files changed

Lines changed: 16 additions & 12 deletions

File tree

libraries/mathy_python/mathy/core/parser.py

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -103,9 +103,6 @@ def contains(self, type: int) -> bool:
103103
_IS_EXP: TokenSet = TokenSet(TokenExponent)
104104
_IS_EQUAL: TokenSet = TokenSet(TokenEqual)
105105

106-
_parse_cache: Dict[str, MathExpression] = {}
107-
_tokens_cache: Dict[str, List[Token]] = {}
108-
109106

110107
# NOTE: This cannot be shared between threads because it stores state in self.current_token and self.tokens
111108
class ExpressionParser:
@@ -146,27 +143,33 @@ class ExpressionParser:
146143
```
147144
"""
148145

146+
_parse_cache: Dict[str, MathExpression]
147+
_tokens_cache: Dict[str, List[Token]]
148+
149149
# Initialize the tokenizer.
150150
def __init__(self):
151151
self.tokenizer = Tokenizer()
152+
self.clear_cache()
153+
154+
def clear_cache(self):
155+
self._tokens_cache = {}
156+
self._parse_cache = {}
152157

153158
def tokenize(self, input_text: str):
154-
global _tokens_cache
155-
if input_text not in _tokens_cache:
156-
_tokens_cache[input_text] = self.tokenizer.tokenize(input_text)
157-
return _tokens_cache[input_text][:]
159+
if input_text not in self._tokens_cache:
160+
self._tokens_cache[input_text] = self.tokenizer.tokenize(input_text)
161+
return self._tokens_cache[input_text][:]
158162

159163
def parse(self, input_text: str) -> MathExpression:
160164
"""Parse a string representation of an expression into a tree
161165
that can be later evaluated.
162166
163167
Returns : The evaluatable expression tree.
164168
"""
165-
global _parse_cache
166-
if input_text in _parse_cache:
167-
return _parse_cache[input_text]
168-
_parse_cache[input_text] = self._parse(self.tokenize(input_text))
169-
return _parse_cache[input_text]
169+
if input_text in self._parse_cache:
170+
return self._parse_cache[input_text]
171+
self._parse_cache[input_text] = self._parse(self.tokenize(input_text))
172+
return self._parse_cache[input_text]
170173

171174
def _parse(self, tokens: List[Token]) -> MathExpression:
172175
"""Parse a given list of tokens into an expression tree"""

libraries/mathy_python/mathy/env.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -360,6 +360,7 @@ def get_initial_state(
360360
prob: MathyEnvProblem = self.problem_fn(config)
361361
self.valid_actions_mask_cache = dict()
362362
self.valid_rules_cache = dict()
363+
self.parser.clear_cache()
363364
self.max_moves = self.max_moves_fn(prob, config)
364365

365366
# Build and return the initial state

0 commit comments

Comments
 (0)