# Wanille nlp analysis

In [2]:
from querygpt.utils import read_dataset_jsonl, ds_to_df
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import textstat
nlp = English()
# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)


In [3]:
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

In [4]:
ds = read_dataset_jsonl("../dataset.jsonl")
df = ds_to_df(ds)

In [5]:
df

Unnamed: 0,query_id,persona,topic_id,query
0,1,Karen,336,Black bear attack frequency
1,2,Karen,336,Causes of bear attacks
2,3,Karen,336,Locations of bear attacks
3,4,Karen,336,Preventing bear attacks
4,5,Karen,336,Surviving bear attacks
...,...,...,...,...
1876,6,Edith,435,Sustainable population definition
1877,7,Edith,435,Carrying capacity meaning
1878,8,Edith,435,Demographic transition definition
1879,9,Edith,435,Family planning explained


In [6]:
query_by_persona = df.groupby("persona")["query"].agg(list).to_dict()

In [7]:
words_by_persona = {persona: [] for persona in query_by_persona.keys()}
for persona in query_by_persona.keys():
    for query in query_by_persona[persona]:
        query = query.lower()
        query = query.replace("?", "")
        vocab_temp = tokenizer(query)
        words_by_persona[persona].extend([token.text for token in vocab_temp])

## Vocab

### vocab size

In [8]:
{persona: len(set(words_by_persona[persona])) for persona in words_by_persona.keys()}

{'Edith': 499, 'Karen': 487, 'Markus': 506, 'Mr. John Doe': 488}

### unique vocab

In [9]:
from collections import Counter

In [10]:
persona_word_counts = {}
for key in words_by_persona.keys():
    word_counts = dict(Counter(words_by_persona[key]))
    persona_word_counts[key] = word_counts

unique_words_per_persona = {}

for persona in persona_word_counts.keys():
    other_personas = [p for p in persona_word_counts.keys() if persona != p]
    word_counts = persona_word_counts[persona]
    unique_words_persona = word_counts.keys()
    unique_other_persona_words = set.union(*[set(persona_word_counts[persona].keys()) for persona in other_personas])
    persona_special_words = unique_words_persona - unique_other_persona_words
    unique_words_per_persona[persona] = {word: persona_word_counts[persona][word] for word in persona_special_words}

In [11]:
df[df["query"].str.contains("Markovic's")]

Unnamed: 0,query_id,persona,topic_id,query
1253,1,Mr. John Doe,423,Markovic's role explained
1254,2,Mr. John Doe,423,Markovic's charges history
1255,3,Mr. John Doe,423,Markovic's political ideology
1256,4,Mr. John Doe,423,Markovic's political influence
1257,5,Mr. John Doe,423,Markovic's personal background
1258,6,Mr. John Doe,423,Markovic's relationship with Milosevic
1259,7,Mr. John Doe,423,Markovic's involvement in politics
1260,8,Mr. John Doe,423,Markovic's impact on Yugoslavia
1261,9,Mr. John Doe,423,Markovic's legacy in Serbia
1262,10,Mr. John Doe,423,Markovic's ideology explained


In [12]:
df[df["topic_id"] == 423]

Unnamed: 0,query_id,persona,topic_id,query
460,1,Karen,423,Mirjana Markovic biography
461,2,Karen,423,Mirjana Markovic politics
462,3,Karen,423,Mirjana Markovic influence
463,4,Karen,423,Mirjana Markovic relationship Milosevic
464,5,Karen,423,Mirjana Markovic residence
465,6,Karen,423,Mirjana Markovic death date
466,7,Karen,423,Mirjana Markovic controversy
467,8,Karen,423,Mirjana Markovic ideology
468,9,Karen,423,Mirjana Markovic background
469,10,Karen,423,Mirjana Markovic legacy


usage of 's

check significants score

In [13]:
df[df["query"].str.contains("'")]["persona"].value_counts()

persona
Mr. John Doe    23
Edith            9
Karen            7
Markus           3
Name: count, dtype: int64

additional ideas

- query length comparison
- find dataset whith word frequencies and see who uses "rarer words"

...

In [29]:
import csv
fn = "uqv/uqv-core17.txt"
with open(fn, "r") as fp:
    reader = csv.reader(fp, delimiter=',')
    lines = list(reader)

In [33]:
idx, persona_id, topic_id, queries = zip(*lines)

## Sentence embeddings (Clustering queries)

idea which personas ask similar questions?

In [14]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering

  from .autonotebook import tqdm as notebook_tqdm


In [43]:
model = SentenceTransformer('all-MiniLM-L6-v2')
all_queries = df["query"].to_list()
all_queries.extend(queries)
embeddings = model.encode(queries)

In [44]:
clusters = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
clusters.fit(embeddings)

In [45]:
len(set(clusters.labels_))

322

In [47]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(clusters.labels_):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(all_queries[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  171
['Black bear attack frequency', 'Causes of bear attacks', 'Surviving bear attacks', 'Countries protecting wildlife', 'Preventing extinction in countries', 'Efforts to save wildlife', 'Actions to prevent extinction', 'Wildlife conservation in countries', 'Women representation in parliaments', 'Number of women in parliaments', 'Gender gap in politics', 'Challenges for women politicians', 'History of women in politics', 'Solutions for women representation', 'Cult activities examples', 'Specific cult name', 'Cult belief system description', 'Cult influence analysis', 'E-mail abuses solutions suggested', 'Radio waves and cancer', 'Benefits of airport security', 'Iran-Iraq cooperation definition', 'Iran-Iraq friendly ties', 'Benefits of Iran-Iraq cooperation', 'Scientific investigations in Antarctica', 'Planned Antarctica explorations', 'Safety for freelance journalists', 'Ocean Remote Sensing Methods']

Cluster  122
['Locations of bear attacks', "Other countries' wildlife effor