In [27]:
import numpy as np
import re
import openai
from sklearn.model_selection import train_test_split
import pandas as pd
import unicodedata
import os
from together import Together
import string

In [50]:
# load CSV
df = pd.read_csv('/Users/ohmpatel/Downloads/nytcrosswords.csv', encoding="cp1252")

In [51]:
sample_df = df.sample(n=2500)

In [85]:
def find_answer(sentence):
    for word in sentence.split():
        if len(word) >= 3 and word.upper() == word:
            return word
    return "NULL"

In [53]:
def preprocess_clue_fn(clue):
    '''
    Function from Berkeley Automated Crossword Solving
    '''
    
    clue = str(clue)

    clue = ''.join(c for c in unicodedata.normalize('NFD', clue) if unicodedata.category(c) != 'Mn')

    clue = re.sub("\x17|\x18|\x93|\x94|“|”|''|\"\"", "\"", clue)
    clue = re.sub("\x85|…", "...", clue)
    clue = re.sub("\x91|\x92|‘|’", "'", clue)

    clue = re.sub("‚", ",", clue)
    clue = re.sub("—|–", "-", clue)
    clue = re.sub("¢", " cents", clue)
    clue = re.sub("¿|¡|^;|\{|\}", "", clue)
    clue = re.sub("÷", "division", clue)
    clue = re.sub("°", " degrees", clue)

    euro = re.search("^£[0-9]+(,*[0-9]*){0,}| £[0-9]+(,*[0-9]*){0,}", clue)
    if euro:
        num = clue[:euro.end()]
        rest_clue = clue[euro.end():]
        clue = num + " Euros" + rest_clue
        clue = re.sub(", Euros", " Euros", clue)
        clue = re.sub("Euros [Mm]illion", "million Euros", clue)
        clue = re.sub("Euros [Bb]illion", "billion Euros", clue)
        clue = re.sub("Euros[Kk]", "K Euros", clue)
        clue = re.sub(" K Euros", "K Euros", clue)
        clue = re.sub("£", "", clue)

    clue = re.sub(" *\(\d{1,},*\)$| *\(\d{1,},* \d{1,}\)$", "", clue)

    clue = re.sub("&amp;", "&", clue)
    clue = re.sub("&lt;", "<", clue)
    clue = re.sub("&gt;", ">", clue)

    clue = re.sub("e\.g\.|for ex\.", "for example", clue)
    clue = re.sub(": [Aa]bbreviat\.|: [Aa]bbrev\.|: [Aa]bbrv\.|: [Aa]bbrv|: [Aa]bbr\.|: [Aa]bbr", " abbreviation", clue)
    clue = re.sub("abbr\.|abbrv\.", "abbreviation", clue)
    clue = re.sub("Abbr\.|Abbrv\.", "Abbreviation", clue)
    clue = re.sub("\(anag\.\)|\(anag\)", "(anagram)", clue)
    clue = re.sub("org\.", "organization", clue)
    clue = re.sub("Org\.", "Organization", clue)
    clue = re.sub("Grp\.|Gp\.", "Group", clue)
    clue = re.sub("grp\.|gp\.", "group", clue)
    clue = re.sub(": Sp\.", " (Spanish)", clue)
    clue = re.sub("\(Sp\.\)|Sp\.", "(Spanish)", clue)
    clue = re.sub("Ave\.", "Avenue", clue)
    clue = re.sub("Sch\.", "School", clue)
    clue = re.sub("sch\.", "school", clue)
    clue = re.sub("Agcy\.", "Agency", clue)
    clue = re.sub("agcy\.", "agency", clue)
    clue = re.sub("Co\.", "Company", clue)
    clue = re.sub("co\.", "company", clue)
    clue = re.sub("No\.", "Number", clue)
    clue = re.sub("Mo\.", "Month", clue)
    clue = re.sub("mo\.", "month", clue)
    clue = re.sub("no\.", "number", clue)
    clue = re.sub(": [Vv]ar\.", " variable", clue)
    clue = re.sub("Subj\.", "Subject", clue)
    clue = re.sub("subj\.", "subject", clue)
    clue = re.sub("Subjs\.", "Subjects", clue)
    clue = re.sub("subjs\.", "subjects", clue)

    theme_clue = re.search("^.+\|[A-Z]{1,}", clue)
    if theme_clue:
        clue = re.sub("\|", " | ", clue)

    if "Partner of" in clue:
        clue = re.sub("Partner of", "", clue)
        clue = clue + " and ___"

    link = re.search("^.+-.+ [Ll]ink$", clue)
    if link:
        no_link = re.search("^.+-.+ ", clue)
        x_y = clue[no_link.start():no_link.end() - 1]
        x_y_lst = x_y.split("-")
        clue = x_y_lst[0] + " ___ " + x_y_lst[1]

    follower = re.search("^.+ [Ff]ollower$", clue)
    if follower:
        no_follower = re.search("^.+ ", clue)
        x = clue[:no_follower.end() - 1]
        clue = x + " ___"

    preceder = re.search("^.+ [Pp]receder$", clue)
    if preceder:
        no_preceder = re.search("^.+ ", clue)
        x = clue[:no_preceder.end() - 1]
        clue = "___ " + x

    if re.search("--[^A-Za-z]|--$", clue):
        clue = re.sub("--", "__", clue)
    if not re.search("_-[A-Za-z]|_-$", clue):
        clue = re.sub("_-", "__", clue)

    clue = re.sub("_{2,}", "___", clue)

    clue = re.sub("\?$", " (wordplay)", clue)

    nonverbal = re.search("\[[^0-9]+,* *[^0-9]*\]", clue)
    if nonverbal:
        clue = re.sub("\[|\]", "", clue)
        clue = clue + " (nonverbal)"

    if clue[:4] == "\"\"\" " and clue[-4:] == " \"\"\"":
        clue = "\"" + clue[4:-4] + "\""
    if clue[:4] == "''' " and clue[-4:] == " '''":
        clue = "'" + clue[4:-4] + "'"
    if clue[:3] == "\"\"\"" and clue[-3:] == "\"\"\"":
        clue = "\"" + clue[3:-3] + "\""
    if clue[:3] == "'''" and clue[-3:] == "'''":
        clue = "'" + clue[3:-3] + "'"

    return clue

  clue = re.sub("¿|¡|^;|\{|\}", "", clue)
  clue = re.sub(" *\(\d{1,},*\)$| *\(\d{1,},* \d{1,}\)$", "", clue)
  clue = re.sub("e\.g\.|for ex\.", "for example", clue)
  clue = re.sub(": [Aa]bbreviat\.|: [Aa]bbrev\.|: [Aa]bbrv\.|: [Aa]bbrv|: [Aa]bbr\.|: [Aa]bbr", " abbreviation", clue)
  clue = re.sub("abbr\.|abbrv\.", "abbreviation", clue)
  clue = re.sub("Abbr\.|Abbrv\.", "Abbreviation", clue)
  clue = re.sub("\(anag\.\)|\(anag\)", "(anagram)", clue)
  clue = re.sub("org\.", "organization", clue)
  clue = re.sub("Org\.", "Organization", clue)
  clue = re.sub("Grp\.|Gp\.", "Group", clue)
  clue = re.sub("grp\.|gp\.", "group", clue)
  clue = re.sub(": Sp\.", " (Spanish)", clue)
  clue = re.sub("\(Sp\.\)|Sp\.", "(Spanish)", clue)
  clue = re.sub("Ave\.", "Avenue", clue)
  clue = re.sub("Sch\.", "School", clue)
  clue = re.sub("sch\.", "school", clue)
  clue = re.sub("Agcy\.", "Agency", clue)
  clue = re.sub("agcy\.", "agency", clue)
  clue = re.sub("Co\.", "Company", clue)
  clue = re.sub

In [68]:
answers = []

client = Together(api_key='cbb9cc63e434536b5583155ac26b44bd457098245baf94ca1fc186981f49ad74')

clues = sample_df.Clue.to_list()
words = sample_df.Word.to_list()
count = 0

for i, clue in enumerate(clues):
    length = len(words[i])
    
    response = client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=[{"role": "user", "content": f"The crossword clue is {clue}. The length of the answer is {length} characters. Write the answer in all caps and with no spaces."}]
    )
    
    answer = response.choices[0].message.content
    stripped = answer.translate(str.maketrans('', '', string.punctuation))
    answers.append((i, clue, words[i], stripped))
    
    count += 1
    if count % 50 == 0:
        print(count)
        print(i, words[i], stripped)

50
49 SONS The answer is WISH
100
99 OTITIS The answer is INNEREAR
150
149 BUN LEASH
200
199 NEBR The answer is ILLS
250
249 TOL The answer is TOLD
300
299 BEEFS The answer is CRIES
350
349 PEACE The answer is MAHALO
400
399 RATING The answer is FIVESTAR
450
449 BENT The answer is CURVE
500
499 DIBS I think I can help you with that

The answer to the crossword clue Claim exclamation with a length of 4 characters is ISNT

Here it is in all caps and without spaces ISNT
550
549 TAUT The answer is TAUT
600
599 SEISMO I think I can help you with that

The answer to the crossword clue Shaky start with a length of 6 characters is TREMBL
650
649 ROBE ROBE
700
699 TILT The answer is BUSTS
750
749 DES The answer is FRENCH
800
799 LISBON The answer is LISBON
850
849 READ The answer is AUDIT
900
899 AMUSERS The answer is TOYBOXES
950
949 HAMS The answer is LAMB
1000
999 UNISEX The answer is ALOPECIA
1050
1049 YESLETS The answer is ABSOLUTELY
1100
1099 LEWIS The answer is LEWIS
1150
1149 NEE The an

In [86]:
stripped = []
answers_df = pd.DataFrame(answers, columns=['index','Clue','Word','Guess'])
for output in answers_df['Guess']:
    stripped.append(find_answer(output))

In [88]:
answers_df['Cleaned_Guess'] = stripped

In [135]:
words = answers_df['Word'].to_list()
guesses = answers_df['Cleaned_Guess'].to_list()

In [94]:
# calculate accuracy by word
count = 0
for i in range(len(words)):
    word, guess = words[i], guesses[i]
    if word == guess:
        count += 1
print(f"Correct word prediction accuracy: {count / len(words)}")

Llama prediction accuracy: 0.1936


In [107]:
# calculate by letter accuracy
def letter_accuracy(words, guesses):
    correct_letters, total_letters = 0, 0
    for i in range(len(words)):
        word, guess = words[i], guesses[i]
        # null guesses
        if guess == "NULL":
            total_letters += len(word)
        else:
            # correct guess
            if word == guess:
                correct_letters += len(word)
                total_letters += len(word)

            else: 
                # Case 1: guess too short, adding padding
                if len(word) > len(guess):
                    while len(guess) < len(word):
                        guess += '!'
                # Case 2: guess too long, crop to len(word)
                elif len(word) < len(guess):
                    guess = guess[:len(word)]

                # Word, Guess now guaranteed to be same length
                for i in range(len(word)):
                    if word[i] == guess[i]:
                        total_letters += 1
                        correct_letters += 1
                    else: 
                        total_letters += 1 
    return correct_letters, total_letters

In [113]:
correct, total = letter_accuracy(words, guesses)
print(f"Correct letter prediction accuracy: {correct / total}")

Correct letter prediction accuracy: 0.2905750535508321


In [124]:
set_word_len = set([len(word) for word in words])

In [125]:
for length in set_word_len:
    idxs = [i for i in range(len(words)) if len(words[i]) == length]
    subgroup_words = [words[i] for i in idxs]
    subgroup_guesses = [guesses[i] for i in idxs]
    correct, total = letter_accuracy(subgroup_words, subgroup_guesses)
    print(f"Correct {length}-letter prediction accuracy: {correct / total}")

Correct 3-letter prediction accuracy: 0.37166324435318276
Correct 4-letter prediction accuracy: 0.34477124183006536
Correct 5-letter prediction accuracy: 0.28698752228163993
Correct 6-letter prediction accuracy: 0.26811594202898553
Correct 7-letter prediction accuracy: 0.24879614767255218
Correct 8-letter prediction accuracy: 0.20646067415730338
Correct 9-letter prediction accuracy: 0.18018018018018017
Correct 10-letter prediction accuracy: 0.18125
Correct 11-letter prediction accuracy: 0.18181818181818182
Correct 12-letter prediction accuracy: 0.08333333333333333
Correct 13-letter prediction accuracy: 0.0
Correct 14-letter prediction accuracy: 0.14285714285714285
Correct 15-letter prediction accuracy: 0.03333333333333333


In [142]:
# Python program to generate word vectors using Word2Vec

# importing all necessary modules
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [143]:
def load_embedding_model():
    """ Load GloVe Vectors
        Return:
            wv_from_bin: All 400000 embeddings, each length 200
    """
    import gensim.downloader as api
    wv_from_bin = api.load("glove-wiki-gigaword-200")
    print("Loaded vocab size %i" % len(list(wv_from_bin.index_to_key)))
    return wv_from_bin
wv_from_bin = load_embedding_model()

Loaded vocab size 400000


In [146]:
wv_from_bin.distance('page', 'talk')

0.6915422677993774

In [159]:
def cos_similarity_incl_null(words, guesses):
    cos_sim = []
    for i in range(len(words)):
        try: 
            cos_sim.append(wv_from_bin.distance(words[i].lower(), guesses[i].lower()))
        except:
            cos_sim.append(0)
    return sum(cos_sim) / len(cos_sim)

def cos_similarity_excl_null(words, guesses):
    cos_sim = []
    for i in range(len(words)):
        try: 
            cos_sim.append(wv_from_bin.distance(words[i].lower(), guesses[i].lower()))
        except:
            pass
    if len(cos_sim):
        return sum(cos_sim) / len(cos_sim)
    else:
        return 0

In [160]:
print(f"Cosine similarity w/ 0s for null guesses: {cos_similarity_incl_null(words, guesses)}")
print(f"Cosine similarity excluding null guesses: {cos_similarity_excl_null(words, guesses)}")

Cosine similarity w/ 0s for null guesses: 0.45806048088092355
Cosine similarity excluding null guesses: 0.6292039572540159


In [161]:
for length in set_word_len:
    idxs = [i for i in range(len(words)) if len(words[i]) == length]
    subgroup_words = [words[i] for i in idxs]
    subgroup_guesses = [guesses[i] for i in idxs]
    print(f"Cosine similarity {length}-letter w/ 0s for null guesses: {cos_similarity_incl_null(subgroup_words, subgroup_guesses)}")
    print(f"Cosine similarity {length}-letter excluding null guesses: {cos_similarity_excl_null(subgroup_words, subgroup_guesses)}")
    print()

Cosine similarity 3-letter w/ 0s for null guesses: 0.6275630133911141
Cosine similarity 3-letter excluding null guesses: 0.6447746572182965

Cosine similarity 4-letter w/ 0s for null guesses: 0.5327656174121194
Cosine similarity 4-letter excluding null guesses: 0.6279902886290775

Cosine similarity 5-letter w/ 0s for null guesses: 0.41969531750117234
Cosine similarity 5-letter excluding null guesses: 0.6295429762517585

Cosine similarity 6-letter w/ 0s for null guesses: 0.3633225873463612
Cosine similarity 6-letter excluding null guesses: 0.6205262011608643

Cosine similarity 7-letter w/ 0s for null guesses: 0.2829566756995876
Cosine similarity 7-letter excluding null guesses: 0.5995986699348405

Cosine similarity 8-letter w/ 0s for null guesses: 0.19600043441639858
Cosine similarity 8-letter excluding null guesses: 0.5451262082206085

Cosine similarity 9-letter w/ 0s for null guesses: 0.07995470653514604
Cosine similarity 9-letter excluding null guesses: 0.7395810354501009

Cosine sim