In [1]:
# uses https://github.com/dwyl/english-words

In [2]:
import pandas as pd
import json
import re
import random

In [3]:
with open('words_dictionary.json', "r") as file:
    data = json.load(file)

In [4]:
words = [None]*len(data.keys())
idx = 0
for word in data.keys():
    words[idx] = word
    idx += 1
    


In [5]:
word_table = {}
count = 0
for key in data.keys():
    word_table[key] = count
    count += 1
    


In [6]:
word_table = {"word": words, "word_id": [x for x in range(0, len(data))]}

In [7]:
word_table_df = pd.DataFrame(word_table)
                             
word_table_df.set_index("word_id")

Unnamed: 0_level_0,word
word_id,Unnamed: 1_level_1
0,a
1,aa
2,aaa
3,aah
4,aahed
...,...
370095,zwinglianism
370096,zwinglianist
370097,zwitter
370098,zwitterion


In [9]:
word_table_df[word_table_df['word'] == 'joey']

Unnamed: 0,word,word_id
161523,joey,161523


In [10]:
word_table_df

Unnamed: 0,word,word_id
0,a,0
1,aa,1
2,aaa,2
3,aah,3
4,aahed,4
...,...,...
370095,zwinglianism,370095
370096,zwinglianist,370096
370097,zwitter,370097
370098,zwitterion,370098


In [11]:
start_UN = r"^un\w*"
two_E = r"(?:.*e.*){2}"
end_LY = r"\w*ly$"
eight_letters = r"^\w{8}$"
all_five_vowels = r"^(?=.*a)(?=.*e)(?=.*i)(?=.*o)(?=.*u).*"
has_Z = r".*z.*"

In [14]:
category_table = {
    "category": ["start_UN", "two_E", "end_LY", "eight_letters", "all_five_vowels", "has_Z"],
    "regex": [start_UN, two_E, end_LY, eight_letters, all_five_vowels, has_Z],
    "conflicting categories": [0,0,0,0,0,0],
    "category_id": [x for x in range(0,6)],
    # difficulty -- 1-5 higher is harder
    "difficulty": [1, 1, 2, 3, 5, 2]
}

In [15]:
category_table_df = pd.DataFrame(category_table)
category_table_df.set_index("category_id")

Unnamed: 0_level_0,category,regex,conflicting categories,difficulty
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,start_UN,^un\w*,0,1
1,two_E,(?:.*e.*){2},0,1
2,end_LY,\w*ly$,0,2
3,eight_letters,^\w{8}$,0,3
4,all_five_vowels,^(?=.*a)(?=.*e)(?=.*i)(?=.*o)(?=.*u).*,0,5
5,has_Z,.*z.*,0,2


In [16]:
word_matches_category_dfs = [None]*6
for idx, cat_row in category_table_df.iterrows():
    pattern = cat_row['regex']
    word_matches_category_dfs[idx] = word_table_df[word_table_df['word'].str.contains(pattern, regex=True)]

In [17]:
running_len = 0
for i in word_matches_category_dfs:
    running_len += len(i)

word_category_table = [[None, None] for _ in range(running_len)]

In [76]:
category_idx = 0
running_idx = 0
for category_sub_dfs in word_matches_category_dfs:
    for idx, word_row in category_sub_dfs.iterrows():
        
        word_id = word_row['word_id']
        word_category_table[running_idx] = [word_id, category_idx]
        running_idx += 1

    category_idx += 1

In [77]:
word_category_table

[[334308, 0],
 [334309, 0],
 [334310, 0],
 [334311, 0],
 [334312, 0],
 [334313, 0],
 [334314, 0],
 [334315, 0],
 [334316, 0],
 [334317, 0],
 [334318, 0],
 [334319, 0],
 [334320, 0],
 [334321, 0],
 [334322, 0],
 [334323, 0],
 [334324, 0],
 [334325, 0],
 [334326, 0],
 [334327, 0],
 [334328, 0],
 [334329, 0],
 [334330, 0],
 [334331, 0],
 [334332, 0],
 [334333, 0],
 [334334, 0],
 [334335, 0],
 [334336, 0],
 [334337, 0],
 [334338, 0],
 [334339, 0],
 [334340, 0],
 [334341, 0],
 [334342, 0],
 [334343, 0],
 [334344, 0],
 [334345, 0],
 [334346, 0],
 [334347, 0],
 [334348, 0],
 [334349, 0],
 [334350, 0],
 [334351, 0],
 [334352, 0],
 [334353, 0],
 [334354, 0],
 [334355, 0],
 [334356, 0],
 [334357, 0],
 [334358, 0],
 [334359, 0],
 [334360, 0],
 [334361, 0],
 [334362, 0],
 [334363, 0],
 [334364, 0],
 [334365, 0],
 [334366, 0],
 [334367, 0],
 [334368, 0],
 [334369, 0],
 [334370, 0],
 [334371, 0],
 [334372, 0],
 [334373, 0],
 [334374, 0],
 [334375, 0],
 [334376, 0],
 [334377, 0],
 [334378, 0],
 [3343

In [78]:
word_category_table_df = pd.DataFrame(word_category_table, columns=['word_id', 'category_id'])

In [79]:
word_category_table_df

Unnamed: 0,word_id,category_id
0,334308,0
1,334309,0
2,334310,0
3,334311,0
4,334312,0
...,...,...
209483,370095,5
209484,370096,5
209485,370097,5
209486,370098,5


In [81]:
# Group by 'word_id' and count the unique 'category_id' values for each word_id
grouped = word_category_table_df.groupby('word_id')['category_id'].nunique()

# Filter out word_ids that are associated with more than one category_id
matching_word_ids = grouped[grouped > 1].index

# Display the matching word_ids
print(matching_word_ids)

Index([    83,    115,    202,    224,    244,    316,    327,    334,    341,
          354,
       ...
       370045, 370053, 370059, 370062, 370067, 370068, 370073, 370084, 370091,
       370093],
      dtype='int64', name='word_id', length=33759)


In [None]:
# number of words in more than one category
len(matching_word_ids)

33759

In [84]:
from itertools import combinations

In [86]:
# Step 1: Group by 'word_id' and aggregate categories
# This will create a DataFrame where each word_id has a list of category_id(s)
grouped = word_category_table_df.groupby('word_id')['category_id'].apply(set)

# Step 2: Filter words that belong to two or more categories
# Only keep words with 2 or more categories
filtered_group = grouped[grouped.apply(len) > 1]

# Step 3: Generate all combinations of category pairs (0,1), (0,2), ..., (4,5)
category_pairs = list(combinations(range(6), 2))

# Step 4: Count how many words belong to each category pair
category_pair_counts = {pair: [] for pair in category_pairs}

# Iterate through each word and its associated category set
for word_id, categories in filtered_group.items():
    # Get all combinations of the categories for this word_id
    for pair in category_pairs:
        if pair[0] in categories and pair[1] in categories:
            category_pair_counts[pair].append(word_id)

# Step 5: Create a DataFrame for each pair with the count of words in each category combination
category_pair_dfs = {}

for pair, word_ids in category_pair_counts.items():
    category_pair_dfs[pair] = pd.DataFrame({
        'category_pair': [pair] * len(word_ids),
        'word_id': word_ids
    })

    # Optional: If you want the count of words in each combination (not just the words)
    category_pair_dfs[str(pair) + '_count'] = pd.DataFrame({
        'category_pair': [pair],
        'count': [len(word_ids)]
    })

# Step 6: Display the results
for pair, df in category_pair_dfs.items():
    print(f"Words in category pair {pair}:")
    print(df)
    print("\n")

Words in category pair (0, 1):
     category_pair  word_id
0           (0, 1)   334323
1           (0, 1)   334328
2           (0, 1)   334329
3           (0, 1)   334338
4           (0, 1)   334340
...            ...      ...
6960        (0, 1)   354417
6961        (0, 1)   354421
6962        (0, 1)   354428
6963        (0, 1)   354444
6964        (0, 1)   354446

[6965 rows x 2 columns]


Words in category pair (0, 1)_count:
  category_pair  count
0        (0, 1)   6965


Words in category pair (0, 2):
     category_pair  word_id
0           (0, 2)   334313
1           (0, 2)   334316
2           (0, 2)   334320
3           (0, 2)   334322
4           (0, 2)   334332
...            ...      ...
2701        (0, 2)   354385
2702        (0, 2)   354402
2703        (0, 2)   354414
2704        (0, 2)   354434
2705        (0, 2)   354443

[2706 rows x 2 columns]


Words in category pair (0, 2)_count:
  category_pair  count
0        (0, 2)   2706


Words in category pair (0, 3):
     catego

In [113]:
display(category_table_df[category_table_df['category_id'] == 4])
display(category_table_df[category_table_df['category_id'] == 5])

for word_id in category_pair_dfs[(4,5)]['word_id']:
    display(word_table_df[word_table_df['word_id'] == word_id]['word'])

Unnamed: 0,category,regex,conflicting categories,category_id
4,all_five_vowels,^(?=.*a)(?=.*e)(?=.*i)(?=.*o)(?=.*u).*,0,4


Unnamed: 0,category,regex,conflicting categories,category_id
5,has_Z,.*z.*,0,5


1000    absolutize
Name: word, dtype: object

1881    accustomize
Name: word, dtype: object

1882    accustomized
Name: word, dtype: object

7080    aizoaceous
Name: word, dtype: object

7517    albumenization
Name: word, dtype: object

15260    antifeudalization
Name: word, dtype: object

18290    aquotize
Name: word, dtype: object

21105    asexualization
Name: word, dtype: object

23255    auctorizate
Name: word, dtype: object

23834    authorizable
Name: word, dtype: object

23837    authorize
Name: word, dtype: object

23838    authorized
Name: word, dtype: object

23839    authorizer
Name: word, dtype: object

23840    authorizers
Name: word, dtype: object

23841    authorizes
Name: word, dtype: object

23998    autodepolymerization
Name: word, dtype: object

24116    autoimmunize
Name: word, dtype: object

24117    autoimmunized
Name: word, dtype: object

24203    automatize
Name: word, dtype: object

24204    automatized
Name: word, dtype: object

24205    automatizes
Name: word, dtype: object

24259    autonomize
Name: word, dtype: object

24337    autoracemization
Name: word, dtype: object

24367    autoschediaze
Name: word, dtype: object

24373    autosensitization
Name: word, dtype: object

24374    autosensitized
Name: word, dtype: object

24450    autotomize
Name: word, dtype: object

24451    autotomized
Name: word, dtype: object

24490    autoxidizable
Name: word, dtype: object

24491    autoxidize
Name: word, dtype: object

24492    autoxidizer
Name: word, dtype: object

25313    azoisobutyronitrile
Name: word, dtype: object

25340    azorubine
Name: word, dtype: object

25341    azosulphine
Name: word, dtype: object

25389    azoxytoluidine
Name: word, dtype: object

31761    benzenediazonium
Name: word, dtype: object

31790    benzoazurine
Name: word, dtype: object

31801    benzofuroquinoxaline
Name: word, dtype: object

31849    benzoquinoxaline
Name: word, dtype: object

31858    benzothiofuran
Name: word, dtype: object

31866    benzotrifuran
Name: word, dtype: object

38932    boulevardize
Name: word, dtype: object

43023    bureaucratization
Name: word, dtype: object

49302    cataloguize
Name: word, dtype: object

50014    cauponize
Name: word, dtype: object

50103    cauterization
Name: word, dtype: object

50979    centrifugalization
Name: word, dtype: object

52068    chalaziferous
Name: word, dtype: object

55915    chouanize
Name: word, dtype: object

56252    chromazurine
Name: word, dtype: object

61823    coeducationalize
Name: word, dtype: object

62024    coequalize
Name: word, dtype: object

63118    colloquialize
Name: word, dtype: object

63119    colloquializer
Name: word, dtype: object

63545    columnarized
Name: word, dtype: object

64225    communalize
Name: word, dtype: object

64226    communalized
Name: word, dtype: object

64227    communalizer
Name: word, dtype: object

64986    computerizable
Name: word, dtype: object

64987    computerization
Name: word, dtype: object

65173    conceptualization
Name: word, dtype: object

65174    conceptualizations
Name: word, dtype: object

65175    conceptualize
Name: word, dtype: object

65176    conceptualized
Name: word, dtype: object

65177    conceptualizer
Name: word, dtype: object

65178    conceptualizes
Name: word, dtype: object

65179    conceptualizing
Name: word, dtype: object

66489    connaturalize
Name: word, dtype: object

67055    constitutionalize
Name: word, dtype: object

67425    contextualize
Name: word, dtype: object

70944    counterorganization
Name: word, dtype: object

71012    counterpropagandize
Name: word, dtype: object

71348    courtezanship
Name: word, dtype: object

75717    customizable
Name: word, dtype: object

77790    debituminization
Name: word, dtype: object

78078    decarburization
Name: word, dtype: object

78116    decasualization
Name: word, dtype: object

78597    decolourization
Name: word, dtype: object

79521    defunctionalization
Name: word, dtype: object

79522    defunctionalize
Name: word, dtype: object

79752    dehumanization
Name: word, dtype: object

79807    deindividualization
Name: word, dtype: object

79810    deindustrialization
Name: word, dtype: object

79822    deinstitutionalization
Name: word, dtype: object

79825    deintellectualization
Name: word, dtype: object

80388    demasculinization
Name: word, dtype: object

80959    demutization
Name: word, dtype: object

80989    denaturalization
Name: word, dtype: object

81007    denaturization
Name: word, dtype: object

81096    deneutralization
Name: word, dtype: object

81355    denuclearization
Name: word, dtype: object

81535    depauperization
Name: word, dtype: object

81732    depopularize
Name: word, dtype: object

81943    deputationize
Name: word, dtype: object

81958    deputization
Name: word, dtype: object

82039    deregulationize
Name: word, dtype: object

82536    desexualization
Name: word, dtype: object

82825    despiritualization
Name: word, dtype: object

83022    desulfurization
Name: word, dtype: object

83037    desulphurization
Name: word, dtype: object

83716    devulcanization
Name: word, dtype: object

87130    disauthorize
Name: word, dtype: object

88020    disequalization
Name: word, dtype: object

89119    dispopularize
Name: word, dtype: object

92105    douzaine
Name: word, dtype: object

92106    douzaines
Name: word, dtype: object

92107    douzainier
Name: word, dtype: object

96980    electrocauterization
Name: word, dtype: object

103168    equalization
Name: word, dtype: object

104780    esugarization
Name: word, dtype: object

105371    eudaemonize
Name: word, dtype: object

105508    eulogization
Name: word, dtype: object

105619    euphemization
Name: word, dtype: object

105826    europeanization
Name: word, dtype: object

105827    europeanize
Name: word, dtype: object

106530    exauthorize
Name: word, dtype: object

106531    exauthorizeexc
Name: word, dtype: object

111837    feudalization
Name: word, dtype: object

117506    formularize
Name: word, dtype: object

117507    formularized
Name: word, dtype: object

117508    formularizer
Name: word, dtype: object

120135    functionalize
Name: word, dtype: object

120136    functionalized
Name: word, dtype: object

120342    furazolidone
Name: word, dtype: object

120477    furodiazole
Name: word, dtype: object

121548    galvanocauterization
Name: word, dtype: object

125893    glamourize
Name: word, dtype: object

125894    glamourizer
Name: word, dtype: object

126036    glaucophanize
Name: word, dtype: object

128345    gourmandize
Name: word, dtype: object

128346    gourmandizer
Name: word, dtype: object

130596    gruneritization
Name: word, dtype: object

130652    guaconize
Name: word, dtype: object

130671    guaiacolize
Name: word, dtype: object

136096    hemiazygous
Name: word, dtype: object

138617    hydrodesulfurization
Name: word, dtype: object

138618    hydrodesulphurization
Name: word, dtype: object

138891    hydropneumatization
Name: word, dtype: object

139890    hyperazoturia
Name: word, dtype: object

140198    hyperimmunization
Name: word, dtype: object

140208    hyperinsulinization
Name: word, dtype: object

140916    hypodiazeuxis
Name: word, dtype: object

153943    institutionalize
Name: word, dtype: object

153944    institutionalized
Name: word, dtype: object

153945    institutionalizes
Name: word, dtype: object

154279    intellectualization
Name: word, dtype: object

154280    intellectualizations
Name: word, dtype: object

158788    isobenzofuran
Name: word, dtype: object

161891    journalize
Name: word, dtype: object

161892    journalized
Name: word, dtype: object

161893    journalizer
Name: word, dtype: object

161894    journalizes
Name: word, dtype: object

168480    lardizabalaceous
Name: word, dtype: object

171088    leucoquinizarin
Name: word, dtype: object

176469    luteinization
Name: word, dtype: object

182222    mediumization
Name: word, dtype: object

182258    medullization
Name: word, dtype: object

183714    mercurialization
Name: word, dtype: object

183733    mercurization
Name: word, dtype: object

186297    microminiaturize
Name: word, dtype: object

186298    microminiaturized
Name: word, dtype: object

188749    misauthorize
Name: word, dtype: object

188750    misauthorized
Name: word, dtype: object

190454    miszealous
Name: word, dtype: object

191175    modularize
Name: word, dtype: object

191176    modularized
Name: word, dtype: object

191177    modularizes
Name: word, dtype: object

192962    monumentalization
Name: word, dtype: object

192963    monumentalize
Name: word, dtype: object

192964    monumentalized
Name: word, dtype: object

192965    monumentalizing
Name: word, dtype: object

194449    mozambique
Name: word, dtype: object

196385    muzzleloading
Name: word, dtype: object

197674    nebularization
Name: word, dtype: object

197691    nebulization
Name: word, dtype: object

199261    neurotization
Name: word, dtype: object

199310    neutralization
Name: word, dtype: object

199311    neutralizations
Name: word, dtype: object

202979    nondesulfurization
Name: word, dtype: object

203590    nonequalization
Name: word, dtype: object

203591    nonequalized
Name: word, dtype: object

203592    nonequalizing
Name: word, dtype: object

204488    nonhumanized
Name: word, dtype: object

207771    nontautomerizable
Name: word, dtype: object

208325    nonvisualized
Name: word, dtype: object

208374    nonvulcanizable
Name: word, dtype: object

208375    nonvulcanized
Name: word, dtype: object

209045    noumenalize
Name: word, dtype: object

209331    nucleization
Name: word, dtype: object

210087    obituarize
Name: word, dtype: object

217766    outtyrannize
Name: word, dtype: object

217767    outtyrannized
Name: word, dtype: object

218451    overbrutalization
Name: word, dtype: object

218452    overbrutalize
Name: word, dtype: object

218453    overbrutalized
Name: word, dtype: object

218454    overbrutalizing
Name: word, dtype: object

219751    overhumanize
Name: word, dtype: object

219752    overhumanized
Name: word, dtype: object

219753    overhumanizing
Name: word, dtype: object

219833    overindividualization
Name: word, dtype: object

219842    overindustrialization
Name: word, dtype: object

219843    overindustrialize
Name: word, dtype: object

219844    overindustrialized
Name: word, dtype: object

219845    overindustrializes
Name: word, dtype: object

219846    overindustrializing
Name: word, dtype: object

219885    overintellectualization
Name: word, dtype: object

219886    overintellectualize
Name: word, dtype: object

219887    overintellectualized
Name: word, dtype: object

219888    overintellectualizing
Name: word, dtype: object

220346    overneutralization
Name: word, dtype: object

220347    overneutralize
Name: word, dtype: object

220348    overneutralized
Name: word, dtype: object

220349    overneutralizer
Name: word, dtype: object

220350    overneutralizing
Name: word, dtype: object

220762    overrapturize
Name: word, dtype: object

221716    overurbanization
Name: word, dtype: object

221717    overurbanize
Name: word, dtype: object

221718    overurbanized
Name: word, dtype: object

221719    overurbanizing
Name: word, dtype: object

225350    parabenzoquinone
Name: word, dtype: object

227432    pasteurization
Name: word, dtype: object

228145    pauperization
Name: word, dtype: object

228705    peculiarization
Name: word, dtype: object

232671    pezizaceous
Name: word, dtype: object

235636    phthalylsulfathiazole
Name: word, dtype: object

243313    popularize
Name: word, dtype: object

243314    popularized
Name: word, dtype: object

243315    popularizer
Name: word, dtype: object

243316    popularizes
Name: word, dtype: object

244229    postinfluenzal
Name: word, dtype: object

244874    poulardize
Name: word, dtype: object

247330    preequalization
Name: word, dtype: object

249612    pressurization
Name: word, dtype: object

250091    preutilization
Name: word, dtype: object

251391    procrusteanize
Name: word, dtype: object

254783    pseudographize
Name: word, dtype: object

254815    pseudoinfluenza
Name: word, dtype: object

255012    pseudopeziza
Name: word, dtype: object

255236    pseudozoological
Name: word, dtype: object

256062    puebloization
Name: word, dtype: object

256453    pulverization
Name: word, dtype: object

256454    pulverizator
Name: word, dtype: object

257370    puzzleation
Name: word, dtype: object

257843    quakerization
Name: word, dtype: object

258051    quarterization
Name: word, dtype: object

258102    quartziferous
Name: word, dtype: object

258669    quinazoline
Name: word, dtype: object

261549    reactualization
Name: word, dtype: object

262160    reauthorization
Name: word, dtype: object

262161    reauthorize
Name: word, dtype: object

262162    reauthorized
Name: word, dtype: object

262163    reauthorizing
Name: word, dtype: object

262624    recarburization
Name: word, dtype: object

265875    regularization
Name: word, dtype: object

266050    rehumanization
Name: word, dtype: object

266267    reindustrialization
Name: word, dtype: object

268366    repopularization
Name: word, dtype: object

268367    repopularize
Name: word, dtype: object

268368    repopularized
Name: word, dtype: object

268369    repopularizing
Name: word, dtype: object

268737    republicanization
Name: word, dtype: object

271052    reutilization
Name: word, dtype: object

271053    reutilizations
Name: word, dtype: object

271408    revisualization
Name: word, dtype: object

272188    rhizocephalous
Name: word, dtype: object

272222    rhizophoraceous
Name: word, dtype: object

275412    ruggedization
Name: word, dtype: object

280131    scheuchzeriaceous
Name: word, dtype: object

280208    schizaeaceous
Name: word, dtype: object

280244    schizolaenaceous
Name: word, dtype: object

280257    schizoneura
Name: word, dtype: object

282902    secularization
Name: word, dtype: object

284748    semipopularized
Name: word, dtype: object

285450    sensualization
Name: word, dtype: object

286960    sexualization
Name: word, dtype: object

297112    somnambulize
Name: word, dtype: object

307923    suberinization
Name: word, dtype: object

307933    suberization
Name: word, dtype: object

308056    subgelatinization
Name: word, dtype: object

308161    subhorizontalness
Name: word, dtype: object

308300    subjectivization
Name: word, dtype: object

309244    subspecialization
Name: word, dtype: object

309587    subtrapezoid
Name: word, dtype: object

309588    subtrapezoidal
Name: word, dtype: object

309857    succinylsulfathiazole
Name: word, dtype: object

309858    succinylsulphathiazole
Name: word, dtype: object

310387    sulfamethylthiazole
Name: word, dtype: object

310412    sulfathiazole
Name: word, dtype: object

310440    sulfisoxazole
Name: word, dtype: object

310618    sulphathiazole
Name: word, dtype: object

310631    sulphazotize
Name: word, dtype: object

310657    sulphisoxazole
Name: word, dtype: object

310675    sulphoazotize
Name: word, dtype: object

310788    sulphozincate
Name: word, dtype: object

311421    superazotation
Name: word, dtype: object

311470    supercanonization
Name: word, dtype: object

311481    supercarbonization
Name: word, dtype: object

311482    supercarbonize
Name: word, dtype: object

311539    supercivilization
Name: word, dtype: object

312368    superorganization
Name: word, dtype: object

312369    superorganize
Name: word, dtype: object

312619    supersensitization
Name: word, dtype: object

313235    suprarenalectomize
Name: word, dtype: object

317156    tautologize
Name: word, dtype: object

317157    tautologized
Name: word, dtype: object

317158    tautologizer
Name: word, dtype: object

317167    tautomerizable
Name: word, dtype: object

317168    tautomerization
Name: word, dtype: object

317169    tautomerize
Name: word, dtype: object

317170    tautomerized
Name: word, dtype: object

317171    tautomerizing
Name: word, dtype: object

320260    tetrazolium
Name: word, dtype: object

320342    teutonization
Name: word, dtype: object

326657    tourmalinize
Name: word, dtype: object

329753    trifluoperazine
Name: word, dtype: object

331900    tubercularization
Name: word, dtype: object

331929    tuberculinization
Name: word, dtype: object

331937    tuberculization
Name: word, dtype: object

331965    tuberization
Name: word, dtype: object

332765    turkomanize
Name: word, dtype: object

334016    ultraorganized
Name: word, dtype: object

334070    ultraspecialization
Name: word, dtype: object

334759    unagonize
Name: word, dtype: object

334794    unalcoholized
Name: word, dtype: object

334821    unallegorized
Name: word, dtype: object

334929    unamortized
Name: word, dtype: object

334962    unanalogized
Name: word, dtype: object

334970    unanatomizable
Name: word, dtype: object

334971    unanatomized
Name: word, dtype: object

335043    unantagonizable
Name: word, dtype: object

335044    unantagonized
Name: word, dtype: object

335046    unanthologized
Name: word, dtype: object

335068    unapostatized
Name: word, dtype: object

335072    unapostrophized
Name: word, dtype: object

335230    unaromatized
Name: word, dtype: object

335454    unauthorizable
Name: word, dtype: object

335456    unauthorize
Name: word, dtype: object

335457    unauthorized
Name: word, dtype: object

335458    unauthorizedly
Name: word, dtype: object

335459    unauthorizedness
Name: word, dtype: object

335540    unazotized
Name: word, dtype: object

336032    unbohemianize
Name: word, dtype: object

336457    uncanonize
Name: word, dtype: object

336458    uncanonized
Name: word, dtype: object

336462    uncantonized
Name: word, dtype: object

336507    uncarbonized
Name: word, dtype: object

336576    uncategorized
Name: word, dtype: object

336590    uncatholicize
Name: word, dtype: object

336591    uncatholicized
Name: word, dtype: object

337169    uncognizable
Name: word, dtype: object

337351    uncompartmentalize
Name: word, dtype: object

337352    uncompartmentalized
Name: word, dtype: object

337353    uncompartmentalizes
Name: word, dtype: object

337486    unconceptualized
Name: word, dtype: object

337919    unconventionalize
Name: word, dtype: object

337920    unconventionalized
Name: word, dtype: object

337921    unconventionalizes
Name: word, dtype: object

338716    undemocratization
Name: word, dtype: object

338717    undemocratize
Name: word, dtype: object

338718    undemocratized
Name: word, dtype: object

338719    undemocratizing
Name: word, dtype: object

338731    undemoralized
Name: word, dtype: object

338748    undenominationalize
Name: word, dtype: object

338942    undercapitalization
Name: word, dtype: object

339540    underorganization
Name: word, dtype: object

340052    underutilization
Name: word, dtype: object

340551    undisorganized
Name: word, dtype: object

341538    unetymologizable
Name: word, dtype: object

342584    unformalized
Name: word, dtype: object

342594    unformularizable
Name: word, dtype: object

342595    unformularize
Name: word, dtype: object

343531    unharmonize
Name: word, dtype: object

343532    unharmonized
Name: word, dtype: object

343755    unhypnotizable
Name: word, dtype: object

343884    unhospitalized
Name: word, dtype: object

344134    uniformalize
Name: word, dtype: object

344196    unilateralization
Name: word, dtype: object

344280    unimmortalize
Name: word, dtype: object

344281    unimmortalized
Name: word, dtype: object

345126    universalization
Name: word, dtype: object

345159    univocalized
Name: word, dtype: object

345187    unjeopardized
Name: word, dtype: object

345225    unjournalized
Name: word, dtype: object

345590    unlycanthropize
Name: word, dtype: object

345740    unlocalizable
Name: word, dtype: object

345741    unlocalize
Name: word, dtype: object

345742    unlocalized
Name: word, dtype: object

345948    unmammonized
Name: word, dtype: object

346205    unmemorialized
Name: word, dtype: object

346573    unmoralize
Name: word, dtype: object

346574    unmoralized
Name: word, dtype: object

346593    unmortalize
Name: word, dtype: object

346773    unnationalized
Name: word, dtype: object

346936    unnormalized
Name: word, dtype: object

347232    unorganizable
Name: word, dtype: object

347233    unorganized
Name: word, dtype: object

347234    unorganizedly
Name: word, dtype: object

347235    unorganizedness
Name: word, dtype: object

347318    unoxidizable
Name: word, dtype: object

347418    unparagonized
Name: word, dtype: object

347561    unpatronizable
Name: word, dtype: object

347562    unpatronized
Name: word, dtype: object

347782    unpersonalized
Name: word, dtype: object

347783    unpersonalizing
Name: word, dtype: object

347884    unphosphatized
Name: word, dtype: object

347904    unpictorialize
Name: word, dtype: object

347905    unpictorialized
Name: word, dtype: object

348119    unpolarizable
Name: word, dtype: object

348120    unpolarized
Name: word, dtype: object

348162    unpopularize
Name: word, dtype: object

348163    unpopularized
Name: word, dtype: object

348171    unporcelainized
Name: word, dtype: object

348710    unprotestantize
Name: word, dtype: object

349062    unrationalized
Name: word, dtype: object

349200    unrecognizable
Name: word, dtype: object

349201    unrecognizableness
Name: word, dtype: object

349202    unrecognizably
Name: word, dtype: object

349531    unreorganized
Name: word, dtype: object

349848    unrevelationize
Name: word, dtype: object

350041    unroyalized
Name: word, dtype: object

350058    unromanticized
Name: word, dtype: object

350638    unsectionalized
Name: word, dtype: object

351500    unsocializable
Name: word, dtype: object

351501    unsocialized
Name: word, dtype: object

352898    untheorizable
Name: word, dtype: object

353661    unvaporized
Name: word, dtype: object

353911    unvocalized
Name: word, dtype: object

353928    unvolatilize
Name: word, dtype: object

353929    unvolatilized
Name: word, dtype: object

354329    unwomanize
Name: word, dtype: object

354330    unwomanized
Name: word, dtype: object

356199    utopianize
Name: word, dtype: object

356200    utopianizer
Name: word, dtype: object

357046    vapocauterization
Name: word, dtype: object

357121    vapourizable
Name: word, dtype: object

357123    vapourize
Name: word, dtype: object

357124    vapourized
Name: word, dtype: object

357125    vapourizer
Name: word, dtype: object

357857    velloziaceous
Name: word, dtype: object

358668    vernacularization
Name: word, dtype: object

369137    zeuctocoelomatic
Name: word, dtype: object

369143    zeuglodontia
Name: word, dtype: object

369144    zeuglodontidae
Name: word, dtype: object

369150    zeugobranchia
Name: word, dtype: object

369151    zeugobranchiata
Name: word, dtype: object

369462    zingiberaceous
Name: word, dtype: object

369490    zinziberaceous
Name: word, dtype: object

369678    zomotherapeutic
Name: word, dtype: object

369734    zonuridae
Name: word, dtype: object

In [101]:
# Step 6: Display the results
for pair, df in category_pair_dfs.items():
    print(f"\nWords in category pair {pair}:", len(df))
    if len(df) == 1:
        print(category_table_df['category'][category_table_df['category_id'] == int(pair[1])])
        print(category_table_df['category'][category_table_df['category_id'] == int(pair[4])])
        print(category_table_df['category'][category_table_df['category_id'] == int(pair[4])])
        # print(word_table_df['word'][word_table_df['word_id'] == ])
    print(df)



Words in category pair (0, 1): 6965
     category_pair  word_id
0           (0, 1)   334323
1           (0, 1)   334328
2           (0, 1)   334329
3           (0, 1)   334338
4           (0, 1)   334340
...            ...      ...
6960        (0, 1)   354417
6961        (0, 1)   354421
6962        (0, 1)   354428
6963        (0, 1)   354444
6964        (0, 1)   354446

[6965 rows x 2 columns]

Words in category pair (0, 1)_count: 1
0    start_UN
Name: category, dtype: object
1    two_E
Name: category, dtype: object
1    two_E
Name: category, dtype: object
  category_pair  count
0        (0, 1)   6965

Words in category pair (0, 2): 2706
     category_pair  word_id
0           (0, 2)   334313
1           (0, 2)   334316
2           (0, 2)   334320
3           (0, 2)   334322
4           (0, 2)   334332
...            ...      ...
2701        (0, 2)   354385
2702        (0, 2)   354402
2703        (0, 2)   354414
2704        (0, 2)   354434
2705        (0, 2)   354443

[2706 rows x 2 c

In [104]:
print(word_table_df[word_table_df['word_id'] == 334352])
print(word_table_df[word_table_df['word_id'] == 334353])
print(word_table_df[word_table_df['word_id'] == 354363])


              word  word_id
334352  unabortive   334352
                word  word_id
334353  unabortively   334353
                 word  word_id
354363  unworkmanlike   354363


In [105]:
print(len(word_table_df))
print(len(category_table_df))
print(len(word_category_table_df))

370100
6
209488


EVERYTHING BELOW IS EXTRANEOUS


---------------------------------------------


---------------------------------------------

In [64]:
word_category = []

# Iterate over each word
count = 0
for _, word_row in word_table_df.iterrows():
    word_id = word_row['word_id']
    word = word_row['word']
    
    # Iterate over each category
    for _, cat_row in category_table_df.iterrows():
        category_id = cat_row['category_id']
        pattern = cat_row['regex']

        # Check if the word matches the regex
        if re.match(pattern, word):
            word_category.append({'word_category_id': len(word_category), 'word_id': word_id, 'category_id': category_id})
    count += 1

    if count % 10000 == 0:
        print(count)
    

# Convert to DataFrame
word_category_df = pd.DataFrame(word_category)

10000


KeyboardInterrupt: 

In [58]:
print(category_table_df['regex'].apply(type))

0    <class 'tuple'>
1    <class 'tuple'>
2    <class 'tuple'>
3    <class 'tuple'>
4      <class 'str'>
5      <class 'str'>
Name: regex, dtype: object


Just tests below this
`````````````````````````````````````````
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [14]:
test_dict = {}
idx = 0
for word in data.keys():
    test_dict[word] = [random.choice([0, 1]) for _ in range(50)]
    idx += 1
    if idx % 100 == 0:
        print(idx)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
1850

In [16]:
df = pd.DataFrame(test_dict)

In [23]:
idx_list = [str(x) for x in range(0,50)]
idx_list[0] = 'double e'
df.index = idx_list
df

Unnamed: 0,a,aa,aaa,aah,aahed,aahing,aahs,aal,aalii,aaliis,...,zwanziger,zwieback,zwiebacks,zwieselite,zwinglian,zwinglianism,zwinglianist,zwitter,zwitterion,zwitterionic
double e,0,0,1,1,1,0,0,0,0,1,...,0,0,0,1,0,1,0,1,0,1
1,1,0,0,0,1,0,1,1,1,1,...,1,0,1,1,0,0,1,1,1,0
2,0,0,1,1,1,1,0,0,1,0,...,1,1,1,1,0,0,1,0,1,0
3,0,0,1,0,1,1,0,1,0,1,...,0,0,1,1,0,0,0,0,1,0
4,0,1,0,1,1,1,1,0,0,1,...,1,0,0,0,0,1,1,1,1,0
5,0,1,1,1,0,0,1,0,1,1,...,0,0,0,1,1,1,1,0,0,0
6,0,0,1,0,1,1,1,0,1,0,...,1,1,1,1,0,0,1,0,0,0
7,1,1,1,0,1,1,0,1,0,1,...,1,1,1,0,0,0,1,0,1,0
8,1,0,1,0,0,0,1,0,0,0,...,0,1,1,1,0,1,0,0,0,0
9,1,0,1,0,0,1,1,0,1,0,...,0,1,0,1,0,0,1,0,0,0


In [24]:
df.to_json('dataframe.json', orient='records')

In [25]:
df.to_csv('dataframe.csv', index=False)

In [2]:
csv_df = pd.read_csv('dataframe.csv')

In [3]:
csv_df['jack']

0     1
1     0
2     1
3     1
4     0
5     0
6     1
7     1
8     1
9     0
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    0
18    0
19    1
20    0
21    0
22    0
23    1
24    0
25    0
26    0
27    0
28    1
29    1
30    0
31    0
32    0
33    0
34    0
35    1
36    0
37    0
38    1
39    0
40    0
41    1
42    1
43    1
44    0
45    1
46    0
47    1
48    0
49    1
Name: jack, dtype: int64