In [1]:
import brown_clustering
import json
import numpy as np
import pandas as pd

from brown_clustering import BigramCorpus, BrownClustering
from collections import defaultdict
from spacy.lang.en import English

# Load NLP Pipeline

In [2]:
# Instantiate spaCy's English language class and keep a handle on its tokenizer,
# which is what the tokenization cells further down call.
nlp = English()
tokenizer = nlp.tokenizer

In [10]:
# Extend the default stop-word set in place with punctuation and symbol tokens.
# The set is only consulted by the stop-word filter, which is commented out in the
# per-prompt loop and active in the OLD section.
nlp.Defaults.stop_words |= {'.', ',', '"', '(', ')', '?', '!', '...', '-', "'", '©', '°'}

# Load Data

In [3]:
# Dataset names; each one maps to a `<name>-analysis_set.parsed.csv` file loaded below.
model_options = ['language', 'main-idea', 'organization', 'support']

In [4]:
# Load each option's parsed analysis set, printing the option name and then its row count.
datasets = {}
for opt in model_options:
    print(opt)
    frame = pd.read_csv(f'{opt}-analysis_set.parsed.csv')
    datasets[opt] = frame
    print(len(frame))

language
892
main-idea
1195
organization
775
support
583


In [5]:
# Distinct values of the `prompt` column in the main-idea dataset.
set(datasets['main-idea']['prompt'])

{'tii/america_singing',
 'tii/ap_eng_owning_yourself_informative_3_0_0',
 'tii/ap_hist_local_and_global_historical_analysis_1_0_0',
 'tii/earth_is_cruel_analysis_1_0_0',
 'tii/laughter_narrative_3_0_0',
 'tii/london_eyes_open_narrative_3_0_0',
 'tii/nature_by_design_informative_3_0_0',
 'tii/patience_narrative_3_0_0',
 'tii/tell_tale_heart_narrative_3_0_0',
 'tii/the_giver_analysis_1_0_0',
 'tii/tomorrow_seeds_informative_3_0_0',
 'tii/uniforms_argumentative_3_0_0'}

In [6]:
# Pool the distinct texts and prompts of every dataset, reporting the running
# number of distinct texts after each dataset has been merged in.
all_texts, all_prompts = set(), set()
for opt in model_options:
    frame = datasets[opt]
    print(f"Adding texts from [{opt}] ... ", end='')
    all_texts |= set(frame['text'])
    all_prompts |= set(frame['prompt'])
    print("DONE.")
    print(len(all_texts))

Adding texts from [language] ... DONE.
892
Adding texts from [main-idea] ... DONE.
1746
Adding texts from [organization] ... DONE.
1989
Adding texts from [support] ... DONE.
2277


In [7]:
# Rebuild all_texts as a mapping from prompt to the set of distinct texts written for it.
# NOTE: this rebinds `all_texts`, replacing the flat set built in the previous cell.
all_texts = defaultdict(set)
for opt in model_options:
    print (f"Adding texts from [{opt}] ... ", end='')
    # Pair the two columns row by row. Feeding the index labels from .items() into the
    # positional .iloc only picks the right row when the index is the default 0..n-1.
    for prompt, text in zip(datasets[opt]['prompt'], datasets[opt]['text']):
        all_texts[prompt].add(text)
    print("DONE.")
    # Running number of distinct prompts seen so far.
    print (len(all_texts))

Adding texts from [language] ... DONE.
12
Adding texts from [main-idea] ... DONE.
18
Adding texts from [organization] ... DONE.
19
Adding texts from [support] ... DONE.
24


In [8]:
all_clusters = []
all_codes = []

# One Brown clustering per prompt. Prompts are visited in sorted order, so
# all_clusters and all_codes line up with sorted(all_texts).
for i, k in enumerate(sorted(all_texts)):
    print('*' * 50)
    print(f'Processing {len(all_texts[k])} documents for prompt, `{k}` ... ({i+1} out of {len(all_texts)})')

    # Tokenize every document: strip and lower-case each token and drop the ones that
    # are empty after stripping. Stop words are kept; to drop them as well, also
    # require `tok.lower() not in nlp.Defaults.stop_words`.
    processed_tokens = []
    for text in sorted(all_texts[k]):
        stripped = (str(t).strip() for t in tokenizer(text))
        processed_tokens.append([tok.lower() for tok in stripped if tok != ''])

    corpus = BigramCorpus(processed_tokens, alpha=0.25, min_count=3)
    corpus.print_stats()

    clustering = BrownClustering(corpus, m=500)
    clusters = clustering.train()

    all_clusters.append(clusters)
    all_codes.append(clustering.codes())

**************************************************
Processing 97 documents for prompt, `tii/ad_me_informative_3_0_0` ... (1 out of 24)
Vocab count: 1269
Token count: 39786
unique 2gram count: 16187
2gram count: 39883.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1269/1269 [00:55<00:00, 22.83it/s]


**************************************************
Processing 101 documents for prompt, `tii/america_singing` ... (2 out of 24)
Vocab count: 1166
Token count: 46411
unique 2gram count: 13957
2gram count: 46512.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1166/1166 [00:41<00:00, 28.17it/s]


**************************************************
Processing 37 documents for prompt, `tii/ap_eng_memorialize_argumentative_3_0_0` ... (3 out of 24)
Vocab count: 945
Token count: 21851
unique 2gram count: 9062
2gram count: 21888.0
Laplace smoothing: 0.25


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 945/945 [00:28<00:00, 33.09it/s]


**************************************************
Processing 136 documents for prompt, `tii/ap_eng_owning_yourself_informative_3_0_0` ... (4 out of 24)
Vocab count: 1605
Token count: 65797
unique 2gram count: 22826
2gram count: 65933.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1605/1605 [01:05<00:00, 24.62it/s]


**************************************************
Processing 161 documents for prompt, `tii/ap_hist_local_and_global_historical_analysis_1_0_0` ... (5 out of 24)
Vocab count: 1405
Token count: 50919
unique 2gram count: 17617
2gram count: 51080.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1405/1405 [00:54<00:00, 25.71it/s]


**************************************************
Processing 43 documents for prompt, `tii/call_or_text_informative_3_0_0` ... (6 out of 24)
Vocab count: 617
Token count: 14968
unique 2gram count: 6688
2gram count: 15011.0
Laplace smoothing: 0.25


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 617/617 [00:11<00:00, 55.30it/s]


**************************************************
Processing 55 documents for prompt, `tii/earth_is_cruel_analysis_1_0_0` ... (7 out of 24)
Vocab count: 1075
Token count: 30356
unique 2gram count: 11445
2gram count: 30411.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1075/1075 [00:35<00:00, 29.89it/s]


**************************************************
Processing 9 documents for prompt, `tii/human_language_historical_analysis_1_0_0` ... (8 out of 24)
Vocab count: 262
Token count: 3982
unique 2gram count: 2116
2gram count: 3991.0
Laplace smoothing: 0.25


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 262/262 [00:01<00:00, 165.50it/s]


**************************************************
Processing 68 documents for prompt, `tii/julius_caesar_analysis_1_0_0` ... (9 out of 24)
Vocab count: 980
Token count: 39686
unique 2gram count: 11657
2gram count: 39754.0
Laplace smoothing: 0.25


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 980/980 [00:31<00:00, 31.59it/s]


**************************************************
Processing 55 documents for prompt, `tii/just_because_narrative_3_0_0` ... (10 out of 24)
Vocab count: 1118
Token count: 27443
unique 2gram count: 12346
2gram count: 27498.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1118/1118 [00:37<00:00, 29.89it/s]


**************************************************
Processing 176 documents for prompt, `tii/laughter_narrative_3_0_0` ... (11 out of 24)
Vocab count: 2359
Token count: 87303
unique 2gram count: 32780
2gram count: 87479.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2359/2359 [01:47<00:00, 21.85it/s]


**************************************************
Processing 96 documents for prompt, `tii/london_eyes_open_narrative_3_0_0` ... (12 out of 24)
Vocab count: 1109
Token count: 34308
unique 2gram count: 13344
2gram count: 34404.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1109/1109 [00:37<00:00, 29.49it/s]


**************************************************
Processing 128 documents for prompt, `tii/man_in_the_water_analysis_1_0_0` ... (13 out of 24)
Vocab count: 1079
Token count: 45441
unique 2gram count: 13320
2gram count: 45569.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1079/1079 [00:36<00:00, 29.51it/s]


**************************************************
Processing 45 documents for prompt, `tii/naps_argumentative_3_0_0` ... (14 out of 24)
Vocab count: 893
Token count: 26192
unique 2gram count: 9797
2gram count: 26237.0
Laplace smoothing: 0.25


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 893/893 [00:25<00:00, 34.69it/s]


**************************************************
Processing 80 documents for prompt, `tii/nature_by_design_informative_3_0_0` ... (15 out of 24)
Vocab count: 1221
Token count: 43551
unique 2gram count: 12756
2gram count: 43631.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1221/1221 [00:43<00:00, 27.83it/s]


**************************************************
Processing 187 documents for prompt, `tii/patience_narrative_3_0_0` ... (16 out of 24)
Vocab count: 1964
Token count: 80052
unique 2gram count: 27904
2gram count: 80239.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1964/1964 [01:25<00:00, 22.98it/s]


**************************************************
Processing 49 documents for prompt, `tii/prep_work_argumentative_3_0_0` ... (17 out of 24)
Vocab count: 665
Token count: 18283
unique 2gram count: 7717
2gram count: 18332.0
Laplace smoothing: 0.25


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 665/665 [00:13<00:00, 48.00it/s]


**************************************************
Processing 39 documents for prompt, `tii/social_argumentative_3_0_0` ... (18 out of 24)
Vocab count: 594
Token count: 11643
unique 2gram count: 6092
2gram count: 11682.0
Laplace smoothing: 0.25


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 594/594 [00:09<00:00, 59.46it/s]


**************************************************
Processing 93 documents for prompt, `tii/tell_tale_heart_narrative_3_0_0` ... (19 out of 24)
Vocab count: 1302
Token count: 41384
unique 2gram count: 16045
2gram count: 41477.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1302/1302 [00:48<00:00, 26.90it/s]


**************************************************
Processing 135 documents for prompt, `tii/the_giver_analysis_1_0_0` ... (20 out of 24)
Vocab count: 1388
Token count: 57010
unique 2gram count: 18460
2gram count: 57145.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1388/1388 [00:53<00:00, 25.76it/s]


**************************************************
Processing 95 documents for prompt, `tii/tomorrow_seeds_informative_3_0_0` ... (21 out of 24)
Vocab count: 1190
Token count: 52234
unique 2gram count: 13245
2gram count: 52329.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1190/1190 [00:42<00:00, 27.83it/s]


**************************************************
Processing 304 documents for prompt, `tii/uniforms_argumentative_3_0_0` ... (22 out of 24)
Vocab count: 1589
Token count: 93951
unique 2gram count: 24498
2gram count: 94255.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1589/1589 [01:05<00:00, 24.28it/s]


**************************************************
Processing 52 documents for prompt, `tii/womens_suffrage_and_equal_rights_analysis_1_0_0` ... (23 out of 24)
Vocab count: 1023
Token count: 28477
unique 2gram count: 10389
2gram count: 28529.0
Laplace smoothing: 0.25


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1023/1023 [00:32<00:00, 31.16it/s]


**************************************************
Processing 36 documents for prompt, `tii/youve_got_a_friend_narrative_3_0_0` ... (24 out of 24)
Vocab count: 547
Token count: 10408
unique 2gram count: 5267
2gram count: 10444.0
Laplace smoothing: 0.25


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 547/547 [00:07<00:00, 73.60it/s]


In [9]:
import pickle

# Persist the per-prompt results as one (all_clusters, all_codes) tuple.
# Both lists were filled while iterating sorted(all_texts), so position p in each
# belongs to the p-th prompt in sorted order.
with open('brown_clusters.by_prompt.with_stopwords.pkl', 'wb') as f:
    pickle.dump((all_clusters, all_codes), f)

In [14]:
MAX_GROUPS = 40

all_groups = []

# For each prompt, bucket tokens by the first n characters of their code, trying
# n = 10 down to 2 and keeping the first (longest) prefix length that yields at
# most MAX_GROUPS buckets. Each kept bucket is stored as a set of tokens.
for clusters, codes in zip(all_clusters, all_codes):
    for n in reversed(range(2, 11)):
        groups = {}
        for token, code in codes.items():
            groups.setdefault(code[:n], []).append(token)

        if len(groups) > MAX_GROUPS:
            continue
        all_groups.append([set(members) for members in groups.values()])
        break

In [15]:
# One line per prompt: the sizes of its token groups.
for groups in all_groups:
    print(list(map(len, groups)))

[249, 6, 72, 24, 29, 42, 78, 82, 32, 107, 235, 39, 76, 21, 26, 21, 25, 31, 29, 11, 4, 8, 6, 4, 2, 2, 2, 1, 1, 1, 2, 1]
[80, 18, 71, 118, 254, 16, 36, 24, 386, 21, 16, 27, 21, 6, 11, 15, 9, 5, 4, 6, 6, 5, 1, 1, 3, 1, 1, 1, 1, 1, 1]
[134, 459, 58, 49, 71, 33, 10, 11, 16, 12, 24, 13, 23, 9, 2, 3, 3, 2, 5, 1, 2, 1, 1, 1, 1, 1]
[123, 16, 62, 89, 96, 141, 625, 54, 41, 146, 28, 50, 22, 13, 25, 15, 6, 8, 12, 3, 5, 5, 4, 3, 2, 2, 1, 1, 1, 3, 1, 2]
[20, 72, 47, 34, 120, 231, 18, 51, 321, 50, 71, 24, 87, 144, 47, 27, 6, 15, 5, 6, 2, 4, 1, 1, 1]
[18, 58, 63, 34, 116, 41, 42, 31, 11, 17, 22, 30, 19, 12, 9, 11, 5, 21, 4, 3, 3, 13, 7, 4, 3, 5, 5, 3, 1, 1, 4, 1]
[73, 108, 549, 35, 69, 48, 24, 35, 11, 25, 11, 19, 9, 6, 13, 3, 20, 4, 4, 4, 1, 1, 1, 1, 1]
[84, 22, 20, 27, 13, 6, 13, 7, 6, 18, 2, 10, 2, 3, 16, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[11, 17, 41, 125, 50, 362, 49, 56, 19, 38, 17, 18, 67, 21, 41, 4, 13, 5, 3, 3, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1]
[197, 37, 168, 74, 97, 306, 15, 30, 41, 28, 8, 9, 11, 53, 5

In [16]:
from typing import List, Set

In [17]:
def common_sets(lists_of_sets: List[List[Set[str]]], threshold: float = 0.5) -> List[Set[str]]:
    """
    Find the word sets that have a similar counterpart across all of the given lists.

    Every set is compared with every set of every *other* list using the Jaccard
    similarity ``|A & B| / |A | B|``. When a pair scores at or above ``threshold``,
    both sets are credited with the indices of the two lists involved. A set is
    returned once it has been credited with the index of every list.

    :param lists_of_sets: A list of lists of sets of words.
    :param threshold: Minimum Jaccard similarity for two sets to count as a match.
    :return: The qualifying sets as frozensets, in the order in which they were first
        matched. For a given input order the result order is reproducible, so results
        can be addressed by position. Empty when fewer than two lists are given or
        when no set qualifies.
    """
    # Maps each matched set to the indices of the lists it has been matched in.
    # A dict (rather than a set of results) keeps the first-matched order.
    matched_lists = {}
    for i, sets_i in enumerate(lists_of_sets):
        for j, sets_j in enumerate(lists_of_sets[i+1:], start=i+1):
            for set_i in sets_i:
                for set_j in sets_j:
                    union_size = len(set_i.union(set_j))
                    if union_size == 0:
                        # Two empty sets: Jaccard is undefined (0/0); treat as no match.
                        continue
                    similarity = len(set_i.intersection(set_j)) / union_size
                    if similarity >= threshold:
                        for matched in (set_i, set_j):
                            matched_lists.setdefault(frozenset(matched), set()).update((i, j))

    # Keep only the sets that were matched in every list.
    n_lists = len(lists_of_sets)
    return [words for words, list_ids in matched_lists.items() if len(list_ids) == n_lists]

In [24]:
# Low threshold: two sets count as a match when their intersection is at least
# 10% of their union.
test_out = common_sets(all_groups, .1)

In [25]:
# Number of sets returned by common_sets.
len(test_out)

55

In [26]:
# Show each common set as a comma-separated line followed by a blank line.
for s in test_out:
    print(', '.join(s), end='\n\n')

does, did, is, was, actually, should, have, could, had, also, can, 's, were, still, would, are, will

that, from, to, during, over, but, on, ", as, between, which, against, with, because, by

into, for, from, on, at, of, with, in

about, for, from, on, of, as, with, in

,, .

but, on, between, ', it, when, out, by, how, with, each, because, in, if, what, that, moki, and, he, of, as, one, which, they, about, from, for, to, we, ", there, both, this, where

there, this, it, owning, he

then, after, what, 2, all, many, no, an, though, one, which, when, although, shirely, while, these, since, why, how, most, both, two, if

everybody, all, many, (, eliminating, like, ', who, some, by, just, into, why, made, how, also, nobody, making, at, assigned, with, :, than, every, -, then, page, having, even, given, though, which, although, or, about, these, from, since, for, only, we, ?, lois, two, without, being, where

a, the

what, that, how, but, because, who, when, if

that, about, for, from, how,

In [27]:
# NOTE(review): this call fails with the AttributeError recorded below. test_out is a flat
# list of sets, but common_sets expects a list of lists of sets, so its inner loops
# iterate over the words of each set and call .intersection on a str.
test_out = common_sets(test_out, .1)

AttributeError: 'str' object has no attribute 'intersection'

In [216]:
def print_columns(words_list, start_idx=0):
    """
    Print lists of words side by side, one list per column, under a header and a dashed rule.

    Column ``j`` is headed ``Group {j + start_idx}`` and is as wide as the longer of its
    header and its longest word. Columns with fewer words are padded with blank cells so
    that every row has one cell per column. Every cell is followed by two spaces.

    :param words_list: A list of lists of words; each inner list becomes one column.
    :param start_idx: Number shown in the header of the first column.
    :return: None. Nothing is printed when ``words_list`` is empty.
    """
    num_cols = len(words_list)
    if num_cols == 0:
        return

    # Size each column from the header that is actually printed, so long labels line up.
    headers = [f'Group {j + start_idx}' for j in range(num_cols)]
    # default=0 keeps an empty column from raising inside max().
    widths = [
        max(len(header), max((len(word) for word in column), default=0))
        for header, column in zip(headers, words_list)
    ]

    def print_row(cells):
        """Print one left-aligned row, padding every cell to its column width."""
        for cell, width in zip(cells, widths):
            print(cell.ljust(width), end='  ')
        print()

    print_row(headers)
    print_row(['-' * width for width in widths])

    # One printed row per word position; exhausted columns contribute an empty cell.
    n_rows = max(len(column) for column in words_list)
    for i in range(n_rows):
        print_row([column[i] if i < len(column) else '' for column in words_list])

In [226]:
# Print the common sets as tables of n_columns columns each, numbering groups from 1.
# Slicing past the end of the list is clamped, so the last (shorter) chunk needs no special case.
n_columns = 10
for i in range(0, len(test_out), n_columns):
    chunk = test_out[i:i + n_columns]
    print_columns([sorted(t) for t in chunk], i + 1)
    print()

Group 1        Group 2      Group 3     Group 4    Group 5    Group 6      Group 7    Group 8    Group 9     Group 10   
-------------  -----------  ----------  ---------  ---------  -----------  ---------  ---------  ----------  ---------  
advantage      ;            attention   asked      believe    1            know       bad        carolyn     bad        
answer         able         based       came       different  actually     need       care       different   bullying   
best           actually     believe     day        example    author       people     come       fell        children   
communication  age          clothing    found      good       happened     play       cry        important   clothing   
conclusion     appeal       come        going      having     life         tell       dad        karcyn      cost       
cons           better       game        know       identity   note         want       divice     lesson      decrease   
convenient     big          help

### Feature 1:
```best, things, easier, advantage, instead, quick, going, reasons, great, think, pros, answer, know, hand, time, thing, easy, example, day, convenient, communication, faster, cons, conclusion```

Efficiency or convenience

### Feature 2:
```school, business, ;, pretty, enjoy, favorite, getting, searches, capture, girl, history, thing, tend, teens, food, age, friends, appeal, conclusion, brand, interest, high, better, million, love, likely, right, depends, search, follow, describe, usually, businesses, pay, actually, interests, comes, consider, good, google, majority, offline, important, directed, need, completely, youtube, girls, times, targeting, find, going, makeup, music, fun, watch, seeing, profile, able, related, personality, nt, main, type, big, create, feel```

Informal teen interests (may be a prompt)

### Feature 3:
```ways, makes, come, want, phone, receive, game, website, based, send, money, seen, interested, try, help, pop, trying, shows, looking, clothing, attention, believe```

Teen values (may be a prompt)

### Feature 4:
```came, found, time, going, ran, started, think, day, looking, know, asked```

Basic actions/activities

### Feature 5:
```way, like, important, identity, thing, different, think, example, mean, having, believe, good, know, life```

Opinion declarations

### Feature 6:
```story, way, represents, sharing, showed, life, happened, represented, shown, told, 1, shows, real, quote, note, author, actually```

Storytelling, quoting (analysis genre reflections)

### Feature 7:
```play, need, want, tell, people, know```

Simple communicative actions

### Feature 8:
```sister, come, picked, cry, having, playing, night, math, bad, divice, dad, little, wanted, saying, ended, care, sun, hallway, s```

Informal (including misspellings) tangent (personal narrative)

### Feature 9:
```sister, carolyn, different, stop, lesson, work, riding, fell, life, okay, important, nt, ran, thing, little, trying, yes, real, tire, quote, needed, karcyn, pressurize```

Personal narrative

### Feature 10:
```way, idea, prevent, decrease, stop, people, good, student, bad, help, children, thing, expensive, cost, schools, clothing, bullying```

Cost analysis (in schools) (prompt)

### Feature 11:
```elders, way, want, feel, choices, different, think, having, know, life, feelings, like, world, choice, nt, utopia, lives, perfect, live```

Feelings/ideals

### Feature 12:
```things, want, grades, different, think, having, focus, good, work, able, know, life, career, plan, future, children, look, mind, ready```

Life goals/priorities

### Feature 13:
```like, saw, find, look, people```

Simple observations

### Feature 14:
```things, way, life, thing, people, know```

Simple general concepts

### Feature 15:
```things, happen, turn, countries, country, life, fair, conspiring, bad, nation, like, time, world, help, poor, little, need, humble, small, poorest```

Societal fairness (may be interesting if that gets groups with demographics)

### Feature 16:
```best, decided, story, mother, come, want, taking, phone, friend, started, great, takes, comes, knew, better, able, asked, bad, family, help, tell, games, wanted, parents, thought```

Parental intent/decisions (personal narrative)

### Feature 17:
```purpose, said, way, different, tone, persuasive, trying, shows, better, similar```

Meta-persuasive terms

### Feature 18:
```things, like, look, know, different, people, think```

General population comparison

### Feature 19:
```times, want, die, conspirators, friend, leader, wrong, power, think, reason, person, audience, ceasar, ambition, cassius, end, bad, right, like, thing, poor, example, wanted, romans, instance, hath, friends, feel```

Conflict, historical (prompt)

### Feature 20:
```bad, things, rules, good, sameness, thing, equality```

Sameness judgement

### Feature 21:
```going, online, posting, inspired, having, information, work, know, life, bullied, benefits, time, argue, communicate, looking, day, type, believe, look, lives```

Online interactions (prompts)

### Feature 22:
```years, begin, started, people, classes, better, student, later, kids, like, learn, important, time, colleges, help, earlier, starting, need, schools, believe, parents, process, feel```

School concepts


In [224]:
# Attach a human-readable label to each common set, addressed by its position in test_out.
# The labels follow the "Feature N" notes above, where Feature N corresponds to test_out[N-1].
# NOTE(review): the mapping is only valid for the exact test_out ordering that was labelled;
# index 8 is 'UNKNOWN GROUP' here although Feature 9 is annotated "Personal narrative" — confirm.
# The keys are stored in the pickle written below, so their spelling is left untouched.
word_groups = {
    'Efficiency or convinience' : test_out[0],
    'Informal teen interests' : test_out[1],
    'Teen values' : test_out[2],
    'Basic actions/activities' : test_out[3],
    'Opinion declarations' : test_out[4],
    'Storytelling, quoting' : test_out[5],
    'Simple communicative actions' : test_out[6],
    'Informal tangents' : test_out[7],
    'UNKNOWN GROUP' : test_out[8],
    'Cost anlaysis (in schools)' : test_out[9],
    'Feelings/ideals' : test_out[10],
    'Life goals/priorities' : test_out[11],
    'Simple observations' : test_out[12],
    'Simple general concepts' : test_out[13],
    'Societal fairness' : test_out[14],
    'Parental intent/decisiosn' : test_out[15],
    'Meta-persuasive terms' : test_out[16],
    'General population comparison' : test_out[17],
    'Conflict, historical' : test_out[18],
    'Sameness judgement' : test_out[19],
    'Online interactions' : test_out[20],
    'School concepts' : test_out[21],
}

In [225]:
# Persist the label -> word set mapping.
with open('named_common_clusters.pkl', 'wb') as f:
    pickle.dump(word_groups, f)

# OLD - DO NOT USE

In [7]:
# Fix an iteration order for the texts.
# NOTE(review): this expects all_texts to be the flat set of texts; if it is the
# prompt -> texts mapping instead, sorted() returns the prompt names.
all_texts = sorted(all_texts)

In [9]:
# Replace the prompt set with a sorted list of its members.
all_prompts = sorted(all_prompts)

In [11]:
# Tokenize every text: strip and lower-case each token, then drop empty tokens and stop words.
processed_tokens = []
for text in all_texts:
    kept = []
    for t in tokenizer(text):
        stripped = str(t).strip()
        if stripped == '':
            continue
        lowered = stripped.lower()
        if lowered not in nlp.Defaults.stop_words:
            kept.append(lowered)
    processed_tokens.append(kept)

In [12]:
# Single bigram corpus over all texts pooled together, with the same alpha and
# min_count as the per-prompt run.
corpus = BigramCorpus(processed_tokens, alpha=0.25, min_count=3)

In [13]:
# Print the corpus summary (vocabulary, token and bigram counts, smoothing).
corpus.print_stats()

Vocab count: 9270
Token count: 367485
unique 2gram count: 234776
2gram count: 369762.0
Laplace smoothing: 0.25


In [94]:
# m=1000 here, versus m=500 in the per-prompt run.
clustering = BrownClustering(corpus, m=1000)

In [95]:
# Slow: the recorded run of this cell took about 53 minutes.
clusters = clustering.train()

100%|█████████████████████████████████████████████████████████████████████████████████████████| 9272/9272 [53:17<00:00,  2.90it/s]


In [96]:
# Number of clusters returned by train().
len(clusters)

1000

In [101]:
# Load previously saved clustering results.
# NOTE(review): no visible cell writes this file — confirm where it is produced.
with open('brown_clusters.all.json', 'r') as f:
    c = json.load(f)

In [103]:
# Sizes of the 'clusters' and 'codes' entries of the loaded JSON.
len(c['clusters']), len(c['codes'])

(1000, 9272)

In [123]:
BITS = 5

# Bucket tokens by the first BITS characters of their code.
groups = defaultdict(list)
for token, code in c['codes'].items():
    prefix = code[:BITS]
    groups[prefix].append(token)

In [124]:
# Number of distinct code prefixes.
len(groups)

30

In [132]:
# For every group: its size right-aligned to 5 characters, then its first 10 tokens.
for members in groups.values():
    print(f'{len(members):>5}', ', '.join(members[:10]))
    print()

  446 flight, mr., air, sleeping, city, road, ice, pulled, homework, quiet

   77 melons, melon, drive, drove, 7, 9, culture, fully, plants, crops

  266 products, product, facebook, instagram, crime, leads, consider, considered, relationships, filled

  536 storm, statue, placed, greatly, shape, columbus, described, wind, hurricanes, street

  281 monuments, general, species, animals, believes, believed, aristotle, sartre, differences, sort

   25 persuade, convince, kill, killing, cried, bury, roman, slaves, julius, death

   44 appeals, devices, rhetorical, literary, sameness, individuals, ads, advertisements, logos, ethos

  891 met, near, success, successful, earthquake, tragedy, britian, bengal, facts, statement

  263 joke, jokes, prepare, preparing, lots, rich, works, worked, faster, cousin

 6013 territories, ball, conflicts, spot, leader, friendly, position, sight, crown, fruit

   36 higher, lower, math, science, improve, increase, lowry, released, utopia, provide

   44 pru

In [136]:
# Numbered listing of the first 100 tokens of every group with at least 5 tokens.
# Numbers count all groups, so skipped groups leave gaps in the numbering.
for number, members in enumerate(groups.values(), start=1):
    if len(members) >= 5:
        print(f'{number}. ', ', '.join(members[:100]))
        print()

1.  flight, mr., air, sleeping, city, road, ice, pulled, homework, quiet, dear, suddenly, immediately, loud, noise, sent, watched, scared, excited, fell, stopped, practice, clean, floor, yelled, colleges, hot, replied, oh, test, dog, forward, washington, grabbed, park, cars, bike, stood, laughs, mall, ok, quickly, hair, worried, etc, sitting, mans, movie, news, asleep, sudden, tree, running, acting, talked, nervous, town, officer, opened, standing, stayed, winter, jumped, partner, answered, confused, followed, guys, lunch, cat, officers, slowly, rope, missing, riding, 90, awake, fire, perfectly, warm, chair, crying, sit, lay, snow, cry, broken, sorry, clock, dressed, gift, office, stomach, busy, hurricane, neighbor, scream, son, stairs, hotel

2.  melons, melon, drive, drove, 7, 9, culture, fully, plants, crops, seven, 12, opposite, pueblo, safety, reach, watermelon, grew, grown, 30, 20, 8, ceasar, b, 15, plant, planted, 11, forbidden, notes, welcomed, 25, bloody, diane, hundreds, thir

## Group names

1. Miscellaneous Actions: This list contains a mix of actions that do not have a specific relationship with one another. Some are actions such as "jumped" and "fell" while others are verbs like "grabbed" and "watched." Additionally, there are words that can be categorized as places ("washington," "city") and words that can be categorized as feelings ("scared," "excited"). This list is a mix of different actions and feelings that do not fit into any particular category.

1. Historical Agriculture and Numbers: This list includes words related to farming and agriculture, such as "melons," "plants," and "crops." The list also contains specific numbers and names, including "7," "30," and "ceasar." This list seems to be centered around agriculture in a historical context.

1. Personal Growth and Social Media: This list contains words that relate to personal growth, such as "achieve," "complete," and "prepared." It also includes words related to education ("colleges," "education"), technology ("facebook," "internet"), and emotions ("annoying," "happier"). This list is related to personal growth and development, including education, technology, and emotions.

1. Natural Disasters and Phenomena: This list contains words related to natural disasters and phenomena, such as "storm," "hurricane," and "wind." It also includes words related to emotions ("sadness," "soul") and other items that may be present during a natural disaster ("trash," "deed"). This list is related to natural disasters and the emotions that come with them.

1. Memorials and Representations: This list includes words related to memorials and representations, such as "monuments," "memorials," and "possessions." It also includes words related to communication ("communication," "listen") and other items that can be represented ("animals," "songs"). This list seems to be related to memorials and the items and feelings that can be represented through them.

1. Death and Emotion: This list contains words related to death and emotion, such as "kill," "death," and "heart." It also includes words related to literature ("poems," "words") and ancient history ("roman," "julius"). This list seems to be related to death and the emotions and literature surrounding it.

1. Rhetorical Devices: This list contains words related to rhetorical devices, such as "pathos," "ethos," and "logos." It also includes words related to literature ("ads," "advertisements") and politics ("citizens," "government"). This list is related to rhetorical devices and their use in literature and politics.

1. Events and Personal Growth: This list includes words related to events and personal growth, such as "success," "tragedy," and "search." It also includes words related to emotion ("fear," "anger") and geography ("britian," "europe"). This list is related to personal growth and development through the events and experiences that shape it.

1. Fashion and Self-Expression: This list contains words related to fashion and self-expression, such as "cloths," "colors," and "personality." It also includes words related to education ("grade," "teacher") and economic status ("poor," "afford"). This list is related to fashion and how it can be used for self-expression.

1. Leadership and Conflict: This list contains words related to leadership and conflict, such as "leader," "enemies," and "trading." It also includes words related to geography ("planet," "island") and events ("disasters," "relief"). This list is related to leadership and the conflicts that arise.

1. Health and Performance: This set of words focuses on terms related to health and improvement, such as "health," "beneficial," and "alertness," as well as terms associated with performance like "improve" and "performance."

1. War and Colonization: This set of words consists of terms related to wars, battles, and colonization, including names of countries involved such as Prussia, Austria, Britain, and France.

1. Dress Code and Bullying: This set of words revolves around the topic of dress codes and the controversy around their implementation, including terms like "mandatory," "forced," and "safe," as well as terms related to bullying such as "worry," "bullied," and "afford."

1. Persuasive Speaking: This set of words consists of terms related to persuasive speaking, including techniques such as "repetition" and "emotional appeal," as well as notable figures like Shakespeare and his character Cassius.

1. Writing and Communication: This set of words revolves around writing and communication, including terms such as "lines," "paragraph," and "article," as well as terms related to culture like "Hopi" and "Spanish."

20\. School Uniforms and Bullying Prevention: This set of words consists of terms related to school uniforms and the debate around their effectiveness in preventing bullying, including terms like "uniform," "bullying," and "cost."

21\. Patience and Waiting: This set of words focuses on the theme of patience and waiting, including terms like "wait," "patience," and "took."

22\. Poetry and Singing: This set of words consists of terms related to poetry and singing, including names of notable poets like Langston Hughes and Walt Whitman.

23\. Reasoning and Persuasion: This set of words revolves around the theme of reasoning and persuasion, including terms like "idea," "reason," and "speech."

25\. General Descriptors: This set of words includes general descriptors and common terms used to describe things, including words like "great," "important," and "different."

In [97]:
# For every cluster: its index, its size, and its first 10 members.
for i, cluster in enumerate(clusters):
    print(f'{i:>2}. {len(cluster):>6} {cluster[:10]}')

 0.      1 ['people']
 1.      1 ['like']
 2.      1 ['school']
 3.      1 ['uniforms']
 4.      1 ['time']
 5.      1 ['life']
 6.      1 ['man']
 7.      1 ['things']
 8.      1 ['way']
 9.      1 ['think']
10.      1 ['person']
11.      1 ['good']
12.      1 ['day']
13.      1 ['said']
14.      1 ['house']
15.      1 ['want']
16.      1 ['students']
17.      1 ['help']
18.      1 ['know']
19.      1 ['wear']
20.      1 ['different']
21.      1 ['kids']
22.      1 ['thing']
23.      1 ['going']
24.      1 ['america']
25.      1 ['caesar']
26.      1 ['got']
27.      1 ['wasp']
28.      1 ['brutus']
29.      1 ['feel']
30.      1 ['able']
31.      1 ['world']
32.      1 ['better']
33.      1 ['shows']
34.      1 ['society']
35.      1 ['right']
36.      1 ['story']
37.      1 ['need']
38.      1 ['poem']
39.      1 ['water']
40.      1 ['look']
41.      1 ['new']
42.      1 ['whitman']
43.      1 ['went']
44.      1 ['laughter']
45.      1 ['british']
46.      1 ['having']
47.      1 

In [None]:
import json