In [1]:
#source data: https://github.com/orgtre/google-books-ngram-frequency/tree/main
#potential source https://github.com/krishnakt031990/Crawl-Wiki-For-Acronyms
#potential source https://blog.gdeltproject.org/announcing-the-new-web-news-ngrams-3-0-dataset/

In [2]:
import csv
from itertools import combinations

# Keys grouped by finger for the Colemak-DH layout.
# Each dict maps the typed character to its keycode name. The number in the
# variable name is the finger-group id the key is registered under in
# usable_keys; odd ids are listed in left_groups and even ids in right_groups.
keygroup9 = {'q':'KC_Q','a':'KC_A','z':'KC_Z'}
keygroup7 = {'w':'KC_W','r':'KC_R','x':'KC_X'}
keygroup5 = {'f':'KC_F','s':'KC_S','c':'KC_C'}
keygroup3 = {'p':'KC_P','t':'KC_T','d':'KC_D','b':'KC_B','g':'KC_G','v':'KC_V'}
keygroup2 = {'j':'KC_J','m':'KC_M','k':'KC_K','l':'KC_L','n':'KC_N','h':'KC_H'}
keygroup4 = {'u':'KC_U','e':'KC_E'}
keygroup6 = {'y':'KC_Y','i':'KC_I'}
keygroup8 = {';':'KC_SCLN','o':'KC_O','/':'KC_SLSH',"'":'KC_QUOT','-':'KC_MINS'}

# Maximum number of combos to produce. The forced entries in item_abrevs
# below count towards this limit.
combo_list_limit = 2000

# A thumb key that can be used in combos without affecting typing (tab, enter or a function key could also work).
# Change to None if this will not work for your keyboard.
# Using a function key could allow 2-key combos for heavy typing, but it is a large investment for such a valuable position.
special_key = 'KC_BSPC'
#special_key = None

# Space uses the other thumb.
space_key = 'KC_SPC'

# Forced abbreviations for known issues/preferences.
# Keys are tuples of the keys pressed together (" " stands for the space key);
# values are the text the combo should produce. Dict order is significant:
# it determines the order (and therefore the numbering) of the emitted combos.
item_abrevs = {
    ("k", "b"): "keyboard",
    ("k", "b", " "): "keyboard ",
    ("k", "y", " "): "key ",
    # SQL
    ("s", "l", " "): "select ",
    ("w", "e", " "): "where ",
    ("u", "p", " "): "update ",
    ("a", "p", " "): "append ",
    ("c", "t", " "): "create ",
    ("i", "s", " "): "insert ",
    ("d", "a", "s"): "database",
    ("d", "e", " "): "delete ",
    ("g", "y", " "): "group by ",
    ("h", "g", " "): "having ",
    ("l", "e", " "): "like ",
    ("o", "b", " "): "order by ",
    ("d", "s", " "): "distinct ",
    ("v", "l", " "): "values ",
    ("u", "n", " "): "union ",
    ("t", "l", " "): "table ",
    ("j", "o", " "): "join ",
    # Python
    ("t", "u", " "): "True ",
    ("f", "l", " "): "False ",
    ("v", "l", "u"): "value",
    # NOTE(review): the Python keyword is spelled "None"; confirm that the
    # lowercase text below is intentional.
    ("n", "e", " "): "none ",
    ("w", "l", " "): "while ",
    ("e", "f", " "): "elif ",
    ("e", "l", "s"): "else",
    ("e", "x", "c"): "except",
    ("l", "d", " "): "lambda ",
}

# Finger-group ids split by hand. check_combo uses these lists to count how
# many fingers of one hand a combo needs.
left_groups = [1, 3, 5, 7, 9]
right_groups = [0, 2, 4, 6, 8]

# usable_keys maps every key that may take part in a combo to its keycode
# ('kc') and the finger group ('group') that presses it.
usable_keys = {}
usable_keys[' '] = {'kc': space_key, 'group': 0}
# special_key is optional: the configuration allows None, in which case no
# placeholder entry must be registered.
if special_key is not None:
    usable_keys[special_key] = {'kc': special_key, 'group': 1}

# Register the letter keys group by group (same insertion order as the ids).
for group_id, keygroup in (
    (2, keygroup2),
    (3, keygroup3),
    (4, keygroup4),
    (5, keygroup5),
    (6, keygroup6),
    (7, keygroup7),
    (8, keygroup8),
    (9, keygroup9),
):
    for key, val in keygroup.items():
        usable_keys[key] = {'kc': val, 'group': group_id}


In [3]:
def get_gram_dict(file, dictname):
    """Load an n-gram frequency CSV into ``dictname`` (modified in place).

    The first row is treated as a header and skipped. For every following
    non-empty row, column 0 with a single trailing space appended becomes the
    key and column 1, converted with ``int``, becomes the value.

    Args:
        file: Path of the CSV file to read (UTF-8 text).
        dictname: Dictionary that receives the ``ngram + " "`` -> count entries.

    Returns:
        None. The result is written into ``dictname``.

    Raises:
        IndexError: A non-empty row has fewer than two columns.
        ValueError: The count column is not an integer.
    """
    # newline='' is what the csv module expects for file objects; an explicit
    # encoding keeps the result independent of the platform default.
    with open(file, mode='r', newline='', encoding='utf-8') as csv_handle:
        reader = csv.reader(csv_handle)
        next(reader, None)  # skip the header row (no-op for an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one)
                continue
            dictname[row[0] + " "] = int(row[1])

In [4]:
def _combo_taken(combo):
    """Return True if the same keys, in any order, are already in abrevs_list."""
    wanted = sorted(combo)
    return any(
        len(existing) == len(wanted) and sorted(existing) == wanted
        for existing in abrevs_list
    )


def check_combo(combo):
    """Return True when ``combo`` is acceptable and not yet assigned.

    Args:
        combo: Tuple of key names (keys of ``usable_keys``).

    Returns:
        bool. Combos with fewer than two keys are always accepted.
        A 2-key combo is rejected when its two keys occur next to each other
        (either order) in ``shinglesk2``, or share a finger group.
        A combo of 3+ keys is rejected when two keys share a finger group, or
        when it uses a pinky group (8 or 9) together with two or more other
        groups of the same hand.
        Any combo is rejected when the same set of keys is already present in
        ``abrevs_list``. That comparison ignores key order, because a combo is
        pressed simultaneously: ("k", "b") and ("b", "k") are the same combo.

    Raises:
        KeyError: A key of a 2+ key combo is missing from ``usable_keys``.
    """
    size = len(combo)
    if size < 2:
        return True

    if size == 2:
        first, second = combo[0], combo[1]
        # Keys that are typed consecutively in real words would misfire.
        if first + second in shinglesk2 or second + first in shinglesk2:
            return False
        if usable_keys[first]['group'] == usable_keys[second]['group']:
            return False
    else:
        finger_groups = [usable_keys[key]['group'] for key in combo]
        # One finger cannot hold two keys at once.
        if len(set(finger_groups)) != len(finger_groups):
            return False
        # Ban the pinky finger if 2 other fingers on the same hand are already
        # used (might be ok if all are on the same row).
        if 8 in finger_groups:
            right_count = sum(1 for group in finger_groups if group in right_groups)
            if right_count > 2:
                return False
        if 9 in finger_groups:
            left_count = sum(1 for group in finger_groups if group in left_groups)
            if left_count > 2:
                return False

    return not _combo_taken(combo)

In [5]:
# General English corpus: n-gram counts, word fragments ("shingles") and
# per-character totals.
ngrams1 = {}
ngrams2 = {}
ngrams3 = {}
ngrams4 = {}
ngrams5 = {}
shingles = {}
shinglesk2 = set()
letter_ranks = {}

for csv_name, target in (
    ('1grams_english.csv', ngrams1),
    ('2grams_english.csv', ngrams2),
    ('3grams_english.csv', ngrams3),
    ('4grams_english.csv', ngrams4),
    ('5grams_english.csv', ngrams5),
):
    get_gram_dict(csv_name, target)

for word, freq in ngrams1.items():
    # Frequency-weighted count of every character of the entry.
    for char in word:
        char = char.lower()
        letter_ranks[char] = letter_ranks.get(char, 0) + freq

    word_len = len(word)
    if word_len <= 3:
        continue

    # Every pair of neighbouring characters; check_combo consults this set
    # to find keys that are never adjacent.
    shinglesk2.update(word[start:start + 2].lower() for start in range(word_len - 1))

    # Every fragment of 3 .. word_len - 1 characters inherits the entry's
    # frequency.
    for k in range(3, word_len):
        for start in range(word_len - k + 1):
            piece = word[start:start + k].lower()
            shingles[piece] = shingles.get(piece, 0) + freq

# Pool all n-grams (later dicts win on duplicate keys), then add the
# fragment frequencies on top.
candidate_dict = {}
for source in (ngrams1, ngrams2, ngrams3, ngrams4, ngrams5):
    candidate_dict.update(source)
for piece, freq in shingles.items():
    candidate_dict[piece] = candidate_dict.get(piece, 0) + freq

# Score = frequency * (length - 2); candidates of length <= 2 are dropped.
candidate_scores = {
    key: (len(key) - 2) * val
    for key, val in candidate_dict.items()
    if len(key) > 2
}

# Scale so the best candidate scores 1.0.
max_score = max(candidate_scores.values())
candidate_scores = {key: val / max_score for key, val in candidate_scores.items()}

In [6]:
# Fiction corpus: same scoring as for the general corpus, then a 50/50 blend
# into candidate_scores.
# NOTE: the blend updates candidate_scores in place, so re-running this cell
# without first re-running the cell that creates candidate_scores applies the
# weighting a second time.
ngrams1f = {}
ngrams2f = {}
ngrams3f = {}
ngrams4f = {}
ngrams5f = {}
shingles = {}
# shinglesk2 is not rebuilt here: the first dataset alone is used to find
# letters that are never adjacent.
letter_ranks = {}

for csv_name, target in (
    ('1grams_english-fiction.csv', ngrams1f),
    ('2grams_english-fiction.csv', ngrams2f),
    ('3grams_english-fiction.csv', ngrams3f),
    ('4grams_english-fiction.csv', ngrams4f),
    ('5grams_english-fiction.csv', ngrams5f),
):
    get_gram_dict(csv_name, target)

for word, freq in ngrams1f.items():
    # letter_ranks was reset above, so it ends up holding fiction counts only.
    for char in word:
        char = char.lower()
        letter_ranks[char] = letter_ranks.get(char, 0) + freq

    word_len = len(word)
    if word_len > 3:
        # Fragments of 3 .. word_len - 1 characters inherit the frequency.
        for k in range(3, word_len):
            for start in range(word_len - k + 1):
                piece = word[start:start + k].lower()
                shingles[piece] = shingles.get(piece, 0) + freq

# Pool all fiction n-grams (later dicts win on duplicate keys), then add the
# fragment frequencies on top.
candidate_dict2 = {}
for source in (ngrams1f, ngrams2f, ngrams3f, ngrams4f, ngrams5f):
    candidate_dict2.update(source)
for piece, freq in shingles.items():
    candidate_dict2[piece] = candidate_dict2.get(piece, 0) + freq

# Score = frequency * (length - 2); candidates of length <= 2 are dropped.
candidate_scores2 = {
    key: (len(key) - 2) * val
    for key, val in candidate_dict2.items()
    if len(key) > 2
}

# Blend: each corpus contributes half of the final score. A candidate that
# is missing from one corpus contributes 0 for that half.
max_score = max(candidate_scores2.values())
for key, val in candidate_scores2.items():
    fiction_half = val / max_score * 0.5
    if key in candidate_scores:
        candidate_scores[key] = (candidate_scores[key] * 0.5) + fiction_half
    else:
        candidate_scores[key] = fiction_half

for key in candidate_scores:
    if key not in candidate_scores2:
        candidate_scores[key] = candidate_scores[key] * 0.5

In [7]:
# Assign a key combo to each candidate text, best score first.
#
# For every candidate the search goes through up to three stages and stops at
# the first combo accepted by check_combo:
#   stage 1: first letter + one other letter of the candidate (2 keys)
#   stage 2: two of the candidate's other letters (2 keys)
#   stage 3: three keys drawn from the candidate's letters, its first letter,
#            the letters that start its words, space and special_key
# Within a stage the combo with the lowest summed letter rank is tried first.

# Forced user preferences claim their key tuples first.
abrevs_list = list(item_abrevs.keys())

sorted_candidate_scores = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)

# Thumb keys get large negative ranks so the min()-based selection below
# reaches for them first.
letter_ranks[' '] = -99999999
if special_key is not None:
    # Keyed by the configured special key (not a hard-coded keycode) so that
    # the lookup in stage 3 works for any choice of special_key.
    letter_ranks[special_key] = -9999

for items in sorted_candidate_scores:
    # Nothing can be added once the limit is reached, so stop scanning the
    # remaining candidates.
    if len(abrevs_list) >= combo_list_limit:
        break
    item = items[0]
    # Check if the new item is just part of a higher scored existing item,
    # e.g. if we already have 'between' then ignore 'etween'.
    used = any(
        len(old_item) > len(item) and item in old_item
        for old_item in item_abrevs.values()
    )
    if len(item) > 2 and not used:
        temp_dict = {}   # combos still to try in the current stage -> rank sum
        temp_dict2 = {}  # every usable letter of the item (except the first) -> rank
        combo_len = 1    # current stage (see the overview above)
        f_letter = item[0].lower()
        if f_letter in usable_keys:
            f_letter_group = usable_keys[f_letter]['group']
            for letter in item[1:]:
                letter = letter.lower()
                if letter in usable_keys and letter != f_letter:
                    # Stage 1 only pairs the first letter with keys of a
                    # different finger group.
                    if usable_keys[letter]['group'] != f_letter_group:
                        temp_dict[letter] = letter_ranks[letter]
                    temp_dict2[letter] = letter_ranks[letter]

        # No stage-1 partner at all: skip this candidate.
        combo_found = len(temp_dict) == 0
        while not combo_found:
            # Reset every round so a result from an earlier round is never reused.
            check_res = False
            if combo_len == 1:
                # Try to get 2 letter combos when both keys are never
                # consecutive in a word.
                cand_key = min(temp_dict, key=temp_dict.get)
                del temp_dict[cand_key]
                combo = tuple(sorted((f_letter, cand_key)))
                check_res = check_combo(combo)
            elif len(temp_dict) > 0:
                combo = min(temp_dict, key=temp_dict.get)
                del temp_dict[combo]
                combo = tuple(sorted(combo))
                # A combo must be shorter than the text it produces.
                if combo_len < len(item):
                    check_res = check_combo(combo)

            if check_res is True:
                item_abrevs[combo] = item
                abrevs_list.append(combo)
                combo_found = True
            elif len(temp_dict) == 0 and combo_len == 1:
                # Stage 1 exhausted: queue every pair of the item's letters.
                combo_len = 2
                temp_dict = {}
                for combo in combinations(list(temp_dict2.keys()), 2):
                    temp_dict[combo] = temp_dict2[combo[0]] + temp_dict2[combo[1]]
            elif len(temp_dict) == 0 and combo_len == 2:
                # Stage 2 exhausted: queue every triple.
                combo_len = 3
                temp_dict = {}
                temp_dict2[f_letter] = -9999
                # Bias towards the first letters of all words.
                word_start = False
                for check_letter in item:
                    check_letter = check_letter.lower()
                    # Only keys that exist in usable_keys may enter a combo;
                    # check_combo looks every key up there.
                    if word_start is True and check_letter in usable_keys:
                        temp_dict2[check_letter] = -9999
                    if check_letter == ' ':
                        temp_dict2[' '] = letter_ranks[check_letter]
                        word_start = True
                    else:
                        word_start = False
                if special_key is not None:
                    temp_dict2[special_key] = letter_ranks[special_key]
                for combo in combinations(list(temp_dict2.keys()), 3):
                    temp_dict[combo] = temp_dict2[combo[0]] + temp_dict2[combo[1]] + temp_dict2[combo[2]]
            elif len(temp_dict) == 0 and combo_len == 3:
                # All stages exhausted: give up on this candidate.
                combo_found = True

print(item_abrevs)

{('k', 'b'): 'keyboard', ('k', 'b', ' '): 'keyboard ', ('k', 'y', ' '): 'key ', ('s', 'l', ' '): 'select ', ('w', 'e', ' '): 'where ', ('u', 'p', ' '): 'update ', ('a', 'p', ' '): 'append ', ('c', 't', ' '): 'create ', ('i', 's', ' '): 'insert ', ('d', 'a', 's'): 'database', ('d', 'e', ' '): 'delete ', ('g', 'y', ' '): 'group by ', ('h', 'g', ' '): 'having ', ('l', 'e', ' '): 'like ', ('o', 'b', ' '): 'order by ', ('d', 's', ' '): 'distinct ', ('v', 'l', ' '): 'values ', ('u', 'n', ' '): 'union ', ('t', 'l', ' '): 'table ', ('j', 'o', ' '): 'join ', ('t', 'u', ' '): 'True ', ('f', 'l', ' '): 'False ', ('v', 'l', 'u'): 'value', ('n', 'e', ' '): 'none ', ('w', 'l', ' '): 'while ', ('e', 'f', ' '): 'elif ', ('e', 'l', 's'): 'else', ('e', 'x', 'c'): 'except', ('l', 'd', ' '): 'lambda ', (' ', 'KC_BSPC', 't'): 'the ', (' ', 'KC_BSPC', 'a'): 'and ', (' ', 'h', 't'): 'that ', (' ', 'o', 't'): 'of the ', (' ', 'KC_BSPC', 'h'): 'her ', (' ', 'h', 's'): 'his ', (' ', 'i', 't'): 'tion ', (' ', 'K

In [8]:
# Print one SUBS(<id>, "<text>", <keycodes>) line per combo, in the order of
# abrevs_list (presumably for pasting into a keyboard-firmware combo
# definition file; confirm the target format).
i = 0
for i, key in enumerate(abrevs_list, start=1):
    val = item_abrevs[key]
    # Zero-pad to four digits; zfill keeps all digits of longer numbers, so
    # ids stay unique past 9999.
    combo_id = "COMBO" + str(i).zfill(4)
    combo_keys = ", ".join(usable_keys[key_entry]['kc'] for key_entry in key)
    # The text is emitted inside double quotes, so escape backslashes and
    # double quotes to keep the string literal well-formed.
    escaped_val = val.replace('\\', '\\\\').replace('"', '\\"')
    print("SUBS(" + combo_id + ',  "' + escaped_val + '",  ' + combo_keys + ")")

SUBS(COMBO0001,  "keyboard",  KC_K, KC_B)
SUBS(COMBO0002,  "keyboard ",  KC_K, KC_B, KC_SPC)
SUBS(COMBO0003,  "key ",  KC_K, KC_Y, KC_SPC)
SUBS(COMBO0004,  "select ",  KC_S, KC_L, KC_SPC)
SUBS(COMBO0005,  "where ",  KC_W, KC_E, KC_SPC)
SUBS(COMBO0006,  "update ",  KC_U, KC_P, KC_SPC)
SUBS(COMBO0007,  "append ",  KC_A, KC_P, KC_SPC)
SUBS(COMBO0008,  "create ",  KC_C, KC_T, KC_SPC)
SUBS(COMBO0009,  "insert ",  KC_I, KC_S, KC_SPC)
SUBS(COMBO0010,  "database",  KC_D, KC_A, KC_S)
SUBS(COMBO0011,  "delete ",  KC_D, KC_E, KC_SPC)
SUBS(COMBO0012,  "group by ",  KC_G, KC_Y, KC_SPC)
SUBS(COMBO0013,  "having ",  KC_H, KC_G, KC_SPC)
SUBS(COMBO0014,  "like ",  KC_L, KC_E, KC_SPC)
SUBS(COMBO0015,  "order by ",  KC_O, KC_B, KC_SPC)
SUBS(COMBO0016,  "distinct ",  KC_D, KC_S, KC_SPC)
SUBS(COMBO0017,  "values ",  KC_V, KC_L, KC_SPC)
SUBS(COMBO0018,  "union ",  KC_U, KC_N, KC_SPC)
SUBS(COMBO0019,  "table ",  KC_T, KC_L, KC_SPC)
SUBS(COMBO0020,  "join ",  KC_J, KC_O, KC_SPC)
SUBS(COMBO0021,  "True ",  KC_