In [269]:
import torch as t
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import json
import random
import csv
from math import factorial

In [270]:
# Load per-word scores from funny_words.csv.
# Each data row is read as (word, score); the header row is recognised by its
# first cell being the literal 'word'.
# presumably the score is a human "funniness" rating - confirm against the data source.
funnyWords1 = {}   # score -> word; kept for backward compatibility (lossy: equal scores overwrite)
funnyWords = {}    # word -> score, restricted to scores strictly above 2.8

# newline='' is what the csv module expects so quoted fields with embedded
# newlines are parsed correctly.
with open('funny_words.csv', newline='') as f:
    reader = csv.reader(f, delimiter = ',')
    for row in reader:
        # Skip blank lines (csv yields []) and the header row.
        if not row or row[0] == 'word':
            continue
        word, score = row[0], float(row[1])
        funnyWords1[score] = word
        # Filter straight from the row instead of via funnyWords1: going through
        # the score-keyed dict silently dropped every word that shared its score
        # with a later row.
        if score > 2.8:
            funnyWords[word] = score


In [271]:
# Read the word list, one entry per line.
with open('common_words_no_names.txt') as f:
    common_words = f.read().splitlines()

# Seeded torch random generator.
g = t.Generator().manual_seed(214743647)

# Map each word to its line index in the file; for a repeated word the last
# occurrence wins.
numerated = {entry: position for position, entry in enumerate(common_words)}

# From here on only membership/iteration is needed, so collapse the list to a set.
common_words = set(common_words)

In [272]:
# Tokenizer and language model are both loaded from the 'distilgpt2' checkpoint.
# NOTE(review): `dtype = t.float16` is passed to the tokenizer, not the model -
# confirm whether it was meant for GPT2LMHeadModel.from_pretrained instead.
tokenizer = GPT2Tokenizer.from_pretrained(
    'distilgpt2',
    clean_up_tokenization_spaces = False,
    dtype = t.float16,
)
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Put the model in evaluation mode. eval() is the last expression of the cell,
# so the notebook displays the module structure it returns.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [273]:
def get_prob(text):
    """Score `text` by the language model's probability of its token sequence.

    The text is tokenized with the notebook-level `tokenizer` and run through
    the notebook-level `model`. The result is the product, over every token
    after the first, of the probability the model assigns to that token given
    the tokens before it.

    Args:
        text: The string to score.

    Returns:
        A 0-dim tensor holding the probability. `t.tensor(0)` is returned when
        `text` contains no space or tokenizes to fewer than two tokens (there
        is nothing to predict in that case).
    """
    if ' ' not in text:
        return t.tensor(0)
    input_ids = tokenizer.encode(text, return_tensors='pt')
    # A single token leaves no "next token" to score.
    if input_ids.size(1) < 2:
        return t.tensor(0)
    # Use the real model output; random placeholder logits make the score
    # (and any ranking built on it) meaningless. no_grad avoids building an
    # autograd graph for pure inference.
    with t.no_grad():
        logits = model(input_ids).logits
    # The logits at position i predict the token at position i + 1, so drop the
    # last position and line the rest up against tokens 1..n-1.
    probabilities = F.softmax(logits[0, :-1, :], dim=-1)
    next_token_ids = input_ids[0, 1:].unsqueeze(-1)
    token_probs = probabilities.gather(-1, next_token_ids).squeeze(-1)
    return token_probs.prod()

In [274]:
class A:
    """Multi-word anagram finder with a JSON-persisted memoization cache.

    Attributes are plain instance fields:
      common_words - list of candidate words read from common_words_no_names.txt
      memoDict     - maps a string of remaining letters to the list of anagram
                     strings that can be built from exactly those letters
      length       - number of letters in the phrase currently being solved;
                     used to decide which sub-results are worth caching

    Every anagram string is a sequence of words each followed by a single
    space, e.g. 'act ' or 'a b ' (note the trailing space).
    """

    def __init__(self):
        # Context managers so the file handles are closed deterministically.
        with open('common_words_no_names.txt') as f:
            self.common_words = f.read().split()
        try:
            with open('memo_dict.json', 'r') as f:
                self.memoDict = json.load(f)
        except FileNotFoundError:
            # First run: no cache has been saved yet.
            self.memoDict = {}
        self.length = 0

    def save_anagrams(self):
        """Write the memoization cache to memo_dict.json."""
        with open('memo_dict.json', 'w') as f:
            json.dump(self.memoDict, f)

    def anagrams_helper(self, word):
        """Return every anagram string that uses exactly the letters of `word`.

        Args:
            word: The letters still to be used.

        Returns:
            A list of anagram strings; [''] for the empty string and [] when no
            combination of words consumes all the letters.
        """
        if word == '':
            return ['']
        if word in self.memoDict:
            return self.memoDict[word]
        # candidate word -> letters left over after removing that word's letters
        leftovers = {}
        # Iterate the instance's own word list (not a notebook global) so the
        # class is self-contained and the result order is deterministic.
        for w in self.common_words:
            if not w:
                # An empty candidate consumes nothing and would recurse forever.
                continue
            wordHusk = word
            for c in w:
                if c not in wordHusk:
                    break
                # Remove the first occurrence of the matched letter.
                wordHusk = wordHusk.replace(c, '', 1)
            else:
                leftovers[w] = wordHusk
        # Build the result once; it is both cached (when small) and returned.
        results = [w + ' ' + rest
                   for w, husk in leftovers.items()
                   for rest in self.anagrams_helper(husk)]
        if len(word) < self.length - 2:     # adds small words to memoization dictionary for speed
            self.memoDict[word] = results
        return results

    def anagrams(self, input_word):
        """Return all anagram strings of `input_word`, shortest string first.

        The input is lower-cased and everything except the letters a-z is
        discarded before searching.

        Args:
            input_word: The phrase to anagram.

        Returns:
            A new list of anagram strings sorted by string length.
        """
        word = ''.join(c for c in input_word.lower()
                       if c in 'abcdefghijklmnopqrstuvwxyz')
        self.length = len(word)
        # Search on the cleaned letters; spaces or punctuation left in the
        # input could never be consumed by a word and would yield no anagrams.
        words = self.anagrams_helper(word)
        # sorted() returns a copy, so a cached list is never reordered in place.
        return sorted(words, key = len)

In [275]:
# Build the anagram finder (reads the word list and the saved memo cache).
# NOTE: this rebinds `g`, which an earlier cell assigned to the seeded torch Generator.
g = A()

In [276]:
# Phrase whose letters are rearranged into multi-word anagrams.
input_word = 'machinelearning'
# List of anagram strings, sorted by string length (shortest first).
words = g.anagrams(input_word)
# Last expression of the cell: displays how many anagram strings were found.
len(words)

8349186

In [277]:
# Per-letter weights, listed from the most to the least heavily weighted letter.
# presumably these are relative English letter frequencies scaled to roughly
# percent / 10 - confirm against the intended source table.
# 'o' was previously .2, which broke the otherwise descending order between
# 'i' (.75) and 't' (.7); .72 restores it.
_LETTER_WEIGHTS = (
    ('e', 1.12), ('a', .85), ('r', .76), ('i', .75), ('o', .72), ('t', .7),
    ('n', .67), ('s', .57), ('l', .55), ('c', .45), ('u', .36), ('d', .34),
    ('p', .32), ('m', .3), ('h', .3), ('g', .25), ('b', .21), ('f', .18),
    ('y', .18), ('w', .13), ('k', .11), ('v', .1), ('x', .03), ('z', .027),
    ('j', .02), ('q', .02),
)


def letter_likelihood(word):
    """Return the rounded sum of per-letter weights for `word`.

    The word is lower-cased first; characters other than a-z contribute
    nothing.

    Args:
        word: The string to score.

    Returns:
        The weighted letter total rounded to the nearest int (Python `round`).
    """
    word = word.lower()
    # Terms are summed in table order.
    total = sum(weight * word.count(letter) for letter, weight in _LETTER_WEIGHTS)
    return round(total)

In [278]:
# Number of anagrams to rank: a fixed budget of 2,000,000 divided by the
# factorial of the phrase's letter score. The anagram list is sorted by string
# length, so this keeps the shortest strings.
# NOTE(review): `slice` shadows the Python builtin; the name is kept because
# other cells may refer to it.
slice = 2000000 // factorial(letter_likelihood(input_word))
words1 = words[:slice]
print(len(words1))

# Highest language-model probability first, one anagram per line.
ranked = sorted(words1, key = get_prob, reverse = True)
print("\n".join(ranked))
# print("\n".join(words[slice:]))

# 11:15


0

