Skip to content
This repository was archived by the owner on Apr 25, 2023. It is now read-only.

Commit fb9e856

Browse files
committed
Considering a pivot to generation mode, with simple top-k and sample-n options
1 parent 5f2c033 commit fb9e856

File tree

1 file changed

+7
-6
lines changed

1 file changed

+7
-6
lines changed

protbert/app.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -82,25 +82,26 @@ def generate_scoring_matrix(record, tokenizer, masked_model, output_path):
8282
def top_k(scoring_matrix_path, k, output_path):
8383
scoring_matrix = pd.read_csv(scoring_matrix_path)
8484

85-
# Convert the data types of the scoring matrix columns to float
86-
for col in scoring_matrix.columns[2:]:
87-
scoring_matrix[col] = scoring_matrix[col].astype(float)
85+
8886

87+
print("generating top k sequences")
8988
top_k_sequences = []
90-
9189
for i, row in scoring_matrix.iterrows():
9290
# Excluding the first two columns (position and identity) and finding the k largest probabilities
93-
top_k_tokens = row.iloc[2:].nlargest(k).index.values
91+
numeric_series = row.iloc[2:].astype(float)
92+
top_k_tokens = numeric_series.nlargest(k).index.values
9493
if i == 0:
9594
for token in top_k_tokens:
9695
top_k_sequences.append(token)
96+
print(top_k_sequences)
9797
else:
9898
new_sequences = []
9999
for seq in top_k_sequences:
100100
for token in top_k_tokens:
101101
new_sequences.append(seq + token)
102+
print(new_sequences)
102103
top_k_sequences = new_sequences
103-
104+
print(top_k_sequences)
104105

105106
# Convert the resulting sequences to SeqRecord objects
106107
seq_records = []

0 commit comments

Comments
 (0)