In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
!pip install lda

In [None]:
from __future__ import division
import pandas as pd
import numpy as np
from numpy import sum
from numpy.linalg import norm
import re
import os
import string
from string import punctuation
from collections import Counter, defaultdict
from urllib.request import urlretrieve
# from tqdm import tqdm
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import pickle
# import benepar
# from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ProcessPoolExecutor, as_completed
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
import argparse
import warnings
import sys
import csv
import codecs

import lda
import lda.datasets

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

# import stanza
# stanza.download('en')
# nlp = stanza.Pipeline('en')

# import spacy
# nlp = spacy.load('en_core_web_sm')
# benepar.download("benepar_en3")
# nlp.add_pipe("benepar", config={"model": "benepar_en3"})

import nltk
from nltk import word_tokenize, pos_tag, sent_tokenize
from nltk.corpus import stopwords
from nltk.tag import map_tag
from nltk.util import ngrams
from nltk.tree import Tree
from nltk.stem import PorterStemmer
# Fetch the NLTK resources used below: tokenizer models, the stop-word list,
# the POS tagger and the tag-set mappings.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('universal_tagset')

# Seed NumPy's global random number generator.
np.random.seed(1337)

# English stop-word set; read by stopwords_frequency() and fil_sent().
stop_words = set(stopwords.words('english'))
print(len(stop_words))


# Preprocess datasets

## DiffusionDB

In [None]:
# 1. Download the parquet table
table_url = 'https://huggingface.co/datasets/poloclub/diffusiondb/resolve/main/metadata.parquet'
urlretrieve(table_url, 'metadata.parquet')

parquet_file = 'metadata.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')

print('Original Initial Size :', len(df))

# 2. remove duplicates (same user submitting the same prompt more than once)
df = df.drop_duplicates(subset=['user_name', 'prompt'])

print('Original Size (without duplicates) :', len(df))

# 3. remove rows that contain at least one NaN or Null character
#    (textual null markers and empty cells are normalised to NaN first)
df = df.replace(['null', 'NULL', 'NaN', 'nan', '', ' '], np.nan)
df = df.dropna(how='any')
print('Original Size (without duplicates and NaN/Null character)', len(df))

# 4. remove rows that contain non-English characters
def contains_non_english(text):
    """Return True when ``text`` is not a string or holds a non-ASCII character."""
    if not isinstance(text, str):
        return True
    return bool(re.search(r'[^\x00-\x7F]', text))

df = df[~df['prompt'].apply(contains_non_english)]

print('Original Size (without duplicates, NaN/Null and non-English characters)', len(df))

# 5. remove rows that contain only blanks
def contains_nothing(text):
  """Return True when ``text`` is empty after stripping surrounding whitespace."""
  return len(text.strip()) == 0

df = df[~df['prompt'].apply(contains_nothing)]
print('Original Size (without duplicates, NaN/Null and non-English characters, blank lines)', len(df))

# 6. remove authors who deleted their accounts.
#    DiffusionDB marks them with the user_name 'deleted_account'; the previous
#    spelling 'deleted_acount' never matched, so nothing was filtered.
df = df[df['user_name'] != 'deleted_account']
print('Original Size (without duplicates, NaN/Null and non-English characters, blank lines, deleted account)', len(df))

df = df.reset_index(drop=True)

# Save the cleaned dataset
df.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb.csv', index=False)
df

In [None]:
# Reload the cleaned DiffusionDB table from the CSV written by the preprocessing cell.
df_diffusiondb = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb.csv')
# Disabled debugging loop: printed every row whose prompt is not a string.
# for idx, row in df_diffusiondb.iterrows():
#   text = row['prompt']
#   if type(text) != str:
#     print(idx)
#     print(text)
#     print()

In [None]:
# Ad-hoc sanity check: print the prompt stored under the hard-coded label
# 1407722 in the in-memory frame and in the frame reloaded from CSV.
print(df['prompt'][1407722])
print(df_diffusiondb['prompt'][1407722])
# Show how word_tokenize handles a prompt with repeated spaces.
text = 'a     portrait of a female robot made from code, very intricate details, octane render, 8 k, trending on artstation '
print(nltk.word_tokenize(text))
# Row counts of both frames, for comparison.
print(len(df))
print(len(df_diffusiondb))

# Calculate dataset statistics

In [None]:
def tokenize(text):
  """Split ``text`` into word tokens using NLTK's ``word_tokenize``."""
  return nltk.word_tokenize(text)

def tokenize_sentences(text):
  """Split ``text`` into sentences using NLTK's ``sent_tokenize``."""
  return nltk.sent_tokenize(text)


# --- Lexical Features --- #

# Lexical feature (word level)
def total_word_lengths(tokens):
  """Return the summed character length of all tokens.

  ``tokens`` is either a sequence of strings or the string repr of one,
  in which case it is evaluated back into a Python object first.
  """
  # NOTE(review): eval() executes arbitrary code; only pass trusted strings.
  token_list = eval(tokens) if type(tokens) == str else tokens
  lengths = [len(token) for token in token_list]
  return sum(lengths)

def num_short_words(tokens):
  """Count the tokens that are at most three characters long.

  ``tokens`` is either a sequence of strings or the string repr of one,
  in which case it is evaluated back into a Python object first.
  """
  # NOTE(review): eval() executes arbitrary code; only pass trusted strings.
  if type(tokens) == str:
    tokens = eval(tokens)
  # Count with len() rather than sum(generator): this file rebinds ``sum`` to
  # numpy.sum, and calling numpy.sum on a generator is deprecated.
  return len([word for word in tokens if len(word) <= 3])

def stopwords_frequency(tokens):
  """Return ``{stop_word: occurrences}`` for every word in ``stop_words``.

  Stop words that do not occur in ``tokens`` are present with a count of 0.
  ``tokens`` is either a list of strings or the string repr of one.
  """
  # NOTE(review): eval() executes arbitrary code; only pass trusted strings.
  if type(tokens) == str:
    tokens = eval(tokens)
  # Count all tokens once instead of rescanning the list for each stop word.
  token_counts = Counter(tokens)
  return {word: token_counts[word] for word in stop_words}

def total_words(tokens):
  """Return the number of tokens.

  ``tokens`` is either a sequence or the string repr of one, in which case
  it is evaluated back into a Python object first.
  """
  # NOTE(review): eval() executes arbitrary code; only pass trusted strings.
  token_list = eval(tokens) if type(tokens) == str else tokens
  return len(token_list)


# Lexical feature (character level)
def total_digits(text):
  """Count the characters of ``text`` for which ``str.isdigit`` is true."""
  # len() of a list avoids passing a generator to ``sum``, which this file
  # rebinds to numpy.sum (deprecated for generators).
  return len([c for c in text if c.isdigit()])

def total_uppercase(text):
  """Count the characters of ``text`` for which ``str.isupper`` is true."""
  # len() of a list avoids passing a generator to ``sum``, which this file
  # rebinds to numpy.sum (deprecated for generators).
  return len([c for c in text if c.isupper()])

def letter_frequency(text):
  """Return ``{letter: count}`` for 'a'..'z', counted case-insensitively."""
  lowered = text.lower()
  counts = {}
  for letter in string.ascii_lowercase:
    counts[letter] = lowered.count(letter)
  return counts

def digit_frequency(text):
  """Return ``{digit: count}`` for each of the characters '0'..'9'."""
  counts = {}
  for digit in string.digits:
    counts[digit] = text.count(digit)
  return counts

def total_characters(text):
  """Return the length of ``text`` (whitespace and punctuation included)."""
  n_characters = len(text)
  return n_characters


# Lexical feature (sentence level)
def total_sentences(text):
  """Return how many sentences NLTK's ``sent_tokenize`` finds in ``text``."""
  return len(sent_tokenize(text))


# Lexical feature (vocabulary richness)
# def hapax_legomena(tokens):
#   tokens = eval(tokens)
#   word_counts = Counter(tokens)
#   return sum(1 for count in word_counts.values() if count == 1)

# def dis_legomena(tokens):
#   tokens = eval(tokens)
#   word_counts = Counter(tokens)
#   return sum(1 for count in word_counts.values() if count == 2)


# --- Ngram Features --- #

def calculate_ngrams(corpus, n, mode):
  """Count the n-grams of ``corpus``.

  Args:
    corpus: the raw text ('char' mode), a token list ('word' mode) or a list
      of ``(word, tag)`` pairs ('pos' mode). In 'word' and 'pos' mode a string
      is treated as the repr of such a list and evaluated first.
    n: n-gram size.
    mode: 'char', 'word' or 'pos'. Any other value leaves ``corpus`` untouched.

  Returns:
    A ``Counter`` mapping n-gram tuples to their number of occurrences.
  """
  if mode == 'char':
    # Character n-grams ignore spaces.
    corpus = corpus.replace(" ", "")
  elif mode == 'word':
    # NOTE(review): eval() executes arbitrary code; only pass trusted strings.
    if type(corpus) == str:
      corpus = eval(corpus)
    # Substring test against string.punctuation drops punctuation tokens.
    corpus = [token for token in corpus if token not in string.punctuation]
  elif mode == 'pos':
    # NOTE(review): eval() executes arbitrary code; only pass trusted strings.
    if type(corpus) == str:
      corpus = eval(corpus)
    # Keep the full tag sequence. The previous code took the keys of a
    # FreqDist, i.e. each tag only once, so the n-grams did not reflect the
    # actual tag order of the text.
    corpus = [map_tag('en-ptb', 'universal', tag) for (word, tag) in corpus]

  return Counter(ngrams(corpus, n))



# --- Syntactic Features --- #

def punctuation_frequency(text):
  """Return ``{punctuation_char: count}`` for the punctuation found in ``text``."""
  counts = {}
  for char in text:
    if char in string.punctuation:
      counts[char] = counts.get(char, 0) + 1
  return counts

def pos_frequency(pos_tags):
  """Return ``{universal_tag: occurrences}`` for a tagged token list.

  ``pos_tags`` is a list of ``(word, tag)`` pairs with Penn Treebank tags, or
  the string repr of such a list. Tags are mapped to the universal tag set
  before counting.
  """
  # NOTE(review): eval() executes arbitrary code; only pass trusted strings.
  if type(pos_tags) == str:
    pos_tags = eval(pos_tags)
  # Count every occurrence. The previous code counted the distinct keys of a
  # FreqDist, so every tag that appeared was reported with a count of 1.
  universal_tags = (map_tag('en-ptb', 'universal', tag) for (word, tag) in pos_tags)
  return dict(Counter(universal_tags))

# def extract_phrase_structures(subtree):
#   phrase_structures = []
#   if isinstance(subtree, Tree):
#     phrase_structures.append(subtree.label())
#     for child in subtree:
#       phrase_structures.extend(extract_phrase_structures(child))
#   return phrase_structures

# def phrase_structures(text):
#   doc = nlp(text)
#   sentence = doc.sentences[0]
#   tree = sentence.constituency
#   nltk_tree = Tree.fromstring(str(tree))
#   phrase_structures = extract_phrase_structures(nltk_tree)
#   return phrase_structures

# def dependency_paths(text):
#   doc = nlp(text)
#   sentence = doc.sentences[0]

#   dependency_paths = []
#   for dep_edge in sentence.dependencies:
#     head = dep_edge[0].text
#     dependent = dep_edge[2].text
#     dep_type = dep_edge[1]
#     dependency_paths.append((head, dep_type, dependent))
#   return dependency_paths

# def phrase_structures(doc):
#   phrase_structures = []
#   for chunk in doc.noun_chunks:
#     phrase_structures.append(chunk.text)
#   return phrase_structures

# def dependency_paths(doc):
#   paths = []
#   for token in doc:
#     paths.append((token.dep_, token.head.text, token.text))
#   return paths

# def to_nltk_tree(node):
#     if isinstance(node, spacy.tokens.Token):
#         return Tree(node.tag_, [node.text])
#     else:
#         return Tree(node.label_, [to_nltk_tree(child) for child in node.children])

# # Function to extract phrases from the constituency tree
# def extract_phrases(tree):
#     phrases = []
#     if isinstance(tree, Tree):
#         if tree.height() > 2:  # Ignore pre-terminals
#             phrases.append(tree.label())
#         for child in tree:
#             phrases.extend(extract_phrases(child))
#     return phrases

def get_dependency_path(token):
    """Return the path from the dependency root down to ``token``.

    The result is a list of ``(label, head_text)`` pairs. It starts with
    ``('ROOT', root_text)`` and continues with the ``(dep_, head.text)`` pair of
    each token on the way down, ending with the pair of ``token`` itself.
    A token whose ``head`` is itself is treated as the root.
    """
    path = []
    while token.head != token:
        path.append((token.dep_, token.head.text))
        token = token.head
    path.append(('ROOT', token.text))
    path.reverse()
    return path

def dependency_ngrams(doc, n=3):
    """Build dependency-label n-grams over the non-punctuation tokens of ``doc``.

    For each window of ``n`` consecutive non-punctuation tokens, the dependency
    labels along each token's root path are concatenated into one tuple.

    Args:
        doc: iterable of tokens exposing ``text``, ``head`` and ``dep_``.
        n: window size in tokens.

    Returns:
        A list with one tuple of labels per window; empty when ``doc`` has
        fewer than ``n`` non-punctuation tokens.
    """
    # Slide over the filtered tokens themselves. The previous version looked
    # the window tokens up again by text, which is ambiguous for repeated
    # words, and its debug branch read ``paths``/``dep_ngram`` before they
    # were assigned.
    kept = [token for token in doc if token.text not in string.punctuation]
    dep_ngrams = []
    for start in range(len(kept) - n + 1):
        dep_ngram = []
        for token in kept[start:start + n]:
            dep_ngram.extend(step[0] for step in get_dependency_path(token))
        dep_ngrams.append(tuple(dep_ngram))
    return dep_ngrams

# def process_text(text, n=2):
#     doc = nlp(text)
#     print(doc)
#     root = [sent.root for sent in doc.sents][0]
#     print(root)
#     tree = to_nltk_tree(root)
#     print(tree)
#     return extract_phrase_ngrams(tree, n)
def process_text(text):
  """Run ``text`` through the global ``nlp`` pipeline and return the result.

  NOTE(review): ``nlp`` is only created in commented-out code in this part of
  the notebook; it must be defined before this function is called.
  """
  parsed = nlp(text)
  return parsed

def process_dependency(doc, n):
  """Count the dependency n-grams that ``dependency_ngrams(doc, n)`` produces."""
  return Counter(dependency_ngrams(doc, n))

# def parallel_apply(series, func, n_jobs=4):
#     results = Parallel(n_jobs=n_jobs)(delayed(func)(row) for row in tqdm(series))
#     return results

# --- N-gram based --- #

def get_pos_tags(tokens):
    """Tag ``tokens`` with NLTK's ``pos_tag``.

    ``tokens`` is a token list or the string repr of one, which is evaluated
    back into a Python object first.
    """
    # NOTE(review): eval() executes arbitrary code; only pass trusted strings.
    token_list = eval(tokens) if type(tokens) == str else tokens
    return nltk.pos_tag(token_list)



# --- Hardness --- #

# Text pre-processing as required to compare the content of different authors
def fil_sent(sent):
    """
    Drop stop words and punctuation characters from ``sent``.

    The sentence is split on whitespace, words found in ``stop_words`` are
    removed, every punctuation character is deleted from what remains, and the
    result is returned as a list of whitespace-separated words.
    """
    kept_words = [word for word in sent.split() if word not in stop_words]
    rejoined = ' '.join(kept_words)
    without_punct = ''.join(char for char in rejoined if char not in punctuation)
    return without_punct.strip().split()

def process(sent):
    """
    Tokenize, lowercase and stem ``sent``, then filter it with ``fil_sent``.

    ``sent`` is converted to ``str`` first. Returns the list of remaining
    stemmed words.
    """
    text = str(sent)
    stems = [ps.stem(str(token).lower()) for token in word_tokenize(text)]
    return fil_sent(' '.join(stems))

# Globals read by fil_sent() and process(). ``stop_words`` repeats the
# assignment made in the setup cell; ``ps`` is the stemmer used by process().
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Similarity between two authors' content
def jaccard_similarity(list1, list2):
    """Return the Jaccard index of the first 512 items of each list.

    Both inputs are truncated to 512 items and compared as sets:
    ``|A & B| / |A | B|``. Returns 0.0 when both are empty.
    """
    set1 = set(list1[:512])
    set2 = set(list2[:512])

    # Size the union on the sets. The previous code used the list lengths, so
    # repeated tokens inflated the denominator while the intersection was
    # already set-based.
    union = len(set1 | set2)
    if union == 0:
        return 0.0
    return len(set1 & set2) / union

# Combines all the text per author in a list of lists
def all_text_all_category(df, col_id, col_text):
  """Return one flat list of processed words per distinct ``col_id`` value.

  For every distinct value of ``df[col_id]``, each text in ``df[col_text]``
  belonging to it is run through ``process`` and the resulting word lists are
  concatenated. The outer list follows set iteration order.
  """
  all_text = []

  for val in list(set(df[col_id].values)):
    sub_df = df[df[col_id] == val]
    processed = sub_df[col_text].apply(process)
    # Concatenate every document of this category. The previous
    # ``list(itertools.chain(values))[0]`` kept only the first document.
    all_text.append(list(itertools.chain.from_iterable(processed)))

  return all_text

# Computes the relative hardness
def rel_hardness(df, col_id, col_text):
  """Return the mean pairwise Jaccard similarity between categories.

  Similarities from ``jaccard_similarity`` are summed over every pair of
  per-category word lists from ``all_text_all_category`` (pairs of identical
  lists are skipped) and divided by the number of category pairs,
  ``n * (n - 1) / 2``. Returns 0.0 when there are fewer than two categories.
  """
  # Count categories (col_id). The previous code counted distinct texts
  # (col_text), which made the normaliser far too large.
  n_labels = len(set(df[col_id].values))
  if n_labels < 2:
    return 0.0

  texts = all_text_all_category(df, col_id, col_text)
  similarities = [jaccard_similarity(a, b) for a, b in itertools.combinations(texts, 2) if a != b]
  return 1 / (n_labels * (n_labels - 1) * 0.5) * sum(similarities)

In [None]:
def calculate_frequency(dictionary):
  """Normalise the values of ``dictionary`` by their total.

  Returns a new dict with the same keys; every value is 0 when the total is 0.
  """
  total = sum(list(dictionary.values()))
  if total != 0:
    return {key: value / total for key, value in dictionary.items()}
  return {key: 0 for key in dictionary}


def extract_features_lexical(df):
  """Add per-document lexical feature columns to ``df`` and return it.

  Reads the 'tokenized_prompt' and 'prompt' columns. Raw counts are added
  first, followed by ratios derived from them. ``df`` is modified in place.

  NOTE(review): ``progress_apply`` only exists after ``tqdm.pandas()`` has
  been called; that call is not visible in this part of the notebook.
  """
  # Word-level counts
  df['stopwords_num'] = df['tokenized_prompt'].progress_apply(stopwords_frequency)
  df['total_word_lengths'] = df['tokenized_prompt'].progress_apply(total_word_lengths)
  df['num_short_words'] = df['tokenized_prompt'].progress_apply(num_short_words)
  df['total_words'] = df['tokenized_prompt'].progress_apply(total_words)

  # Character-level counts
  df['letter_num'] = df['prompt'].progress_apply(letter_frequency)
  df['digit_num'] = df['prompt'].progress_apply(digit_frequency)
  df['total_digits'] = df['prompt'].progress_apply(total_digits)
  df['total_uppercase'] = df['prompt'].progress_apply(total_uppercase)
  df['total_characters'] = df['prompt'].progress_apply(total_characters)

  # Sentence-level counts
  df['total_sentences'] = df['prompt'].progress_apply(total_sentences)

  # Derived ratios
  df['average_word_length'] = df['total_word_lengths'] / df['total_words']
  df['short_words_ratio'] = df['num_short_words'] / df['total_words']
  df['stopwords_frequency'] = df['stopwords_num'].apply(calculate_frequency)
  df['percentage_of_digits'] = df['total_digits'] / df['total_characters']
  df['percentage_of_uppercase'] = df['total_uppercase'] / df['total_characters']
  df['letter_frequency'] = df['letter_num'].apply(calculate_frequency)
  df['digit_frequency'] = df['digit_num'].apply(calculate_frequency)
  # Words per sentence, matching the dataset-level metric in
  # calculate_statistics_lexical. This column previously held the sentence
  # count itself.
  df['average_sentence_length'] = df['total_words'] / df['total_sentences']
  # For a single document the document length is its word count.
  df['average_document_length'] = df['total_words']

  return df


# def extract_features_syntactic(df):
#   df['punctuation_frequency'] = df['prompt'].progress_apply(punctuation_frequency)
#   df['pos_frequency'] = df['pos_tags'].progress_apply(pos_frequency)
#   # df['spacy_doc'] = df['prompt'].progress_apply(process_text)
#   # df['dependency_unigrams'] = df['spacy_doc'].progress_apply(lambda x: process_dependency(x, 1))
#   # df['dependency_bigrams'] = df['spacy_doc'].progress_apply(lambda x: process_dependency(x, 2))
#   # df['dependency_trigrams'] = df['spacy_doc'].progress_apply(lambda x: process_dependency(x, 3))
#   # df['dependency_fourgrams'] = df['spacy_doc'].progress_apply(lambda x: process_dependency(x, 4))
#   # tqdm.pandas()
#   # df['spacy_doc'] = df['prompt'].progress_apply(process_text)
#   # df['punctuation_frequency'] = df['prompt'].progress_apply(punctuation_frequency)
#   # df['pos_frequency'] = df['pos_tags'].progress_apply(pos_frequency)
#   # df['phrase_structures'] = df['spacy_doc'].progress_apply(phrase_structures)
#   # df['dependency_paths'] = df['spacy_doc'].progress_apply(dependency_paths)

#   return df

def extract_features_syntactic(df):
  """Add POS and punctuation count/frequency columns to ``df`` and return it.

  Reads the 'pos_tags' and 'prompt' columns; ``df`` is modified in place.
  """
  count_columns = (
    ('pos_num', 'pos_tags', pos_frequency),
    ('punctuation_num', 'prompt', punctuation_frequency),
  )
  for target, source, count_fn in count_columns:
    df[target] = df[source].progress_apply(count_fn)

  # Normalise the raw counts into relative frequencies.
  for target, source in (('pos_frequency', 'pos_num'), ('punctuation_frequency', 'punctuation_num')):
    df[target] = df[source].progress_apply(calculate_frequency)

  return df

def extract_features_ngrams(df):
  """Add character, word and POS n-gram counter columns to ``df`` and return it.

  Each new column holds the ``calculate_ngrams`` result for the listed source
  column, n-gram size and mode. ``df`` is modified in place.
  """
  # (target column, source column, n, mode)
  ngram_specs = [
    ('char_bigrams', 'prompt', 2, 'char'),
    ('char_trigrams', 'prompt', 3, 'char'),
    ('word_unigrams', 'tokenized_prompt', 1, 'word'),
    ('word_bigrams', 'tokenized_prompt', 2, 'word'),
    ('word_trigrams', 'tokenized_prompt', 3, 'word'),
    ('pos_bigrams', 'pos_tags', 2, 'pos'),
    ('pos_trigrams', 'pos_tags', 3, 'pos'),
  ]
  for target, source, size, mode in ngram_specs:
    df[target] = df[source].progress_apply(calculate_ngrams, args=(size, mode))

  return df

def _collect_tokens(df):
  """Return every token of ``df['tokenized_prompt']`` as one flat list.

  Cells holding the string repr of a token list are evaluated first.
  """
  words_all = []
  for words in df['tokenized_prompt']:
    # NOTE(review): eval() executes arbitrary code; only pass trusted strings.
    if type(words) == str:
      words = eval(words)
    words_all.extend(words)
  return words_all

def syntactic_richness(df):
    """Return distinct tokens / total tokens over the corpus (0.0 if empty)."""
    words_all = _collect_tokens(df)
    if not words_all:
      return 0.0
    return len(set(words_all)) / len(words_all)

def hapax_legomena(df):
  """Return the share of tokens whose word occurs exactly once (0.0 if empty)."""
  words_all = _collect_tokens(df)
  if not words_all:
    return 0.0
  word_counts = Counter(words_all)
  # len() of a list avoids passing a generator to ``sum``, which this file
  # rebinds to numpy.sum (deprecated for generators).
  return len([count for count in word_counts.values() if count == 1]) / len(words_all)

def dis_legomena(df):
  """Return the number of words occurring exactly twice / total tokens (0.0 if empty)."""
  words_all = _collect_tokens(df)
  if not words_all:
    return 0.0
  word_counts = Counter(words_all)
  return len([count for count in word_counts.values() if count == 2]) / len(words_all)

def herdan_ttr(df):
  """Return log(distinct tokens) / log(total tokens) (0.0 for an empty corpus)."""
  words_all = _collect_tokens(df)
  if not words_all:
    return 0.0
  return np.log(len(set(words_all))) / np.log(len(words_all))

def combine_dictionary_results(df, col_name):
  """Merge the per-row count dicts of ``df[col_name]`` into relative frequencies.

  Each cell is a dict, a ``Counter`` or the string repr of one. Values are
  summed per key across all rows and divided by the grand total; every value
  is 0 when the grand total is 0.
  """
  totals = {}
  grand_total = 0
  for _, row in df.iterrows():
    cell = row[col_name]
    # NOTE(review): eval() executes arbitrary code; only pass trusted strings.
    if type(cell) == str:
      cell = eval(cell)
    if isinstance(cell, Counter):
      cell = dict(cell)
    for key, value in cell.items():
      totals[key] = totals[key] + value if key in totals else value
      grand_total += value

  if grand_total == 0:
    return {key: 0 for key in totals}
  return {key: value / grand_total for key, value in totals.items()}

def calculate_statistics_lexical(df):
  """Aggregate the per-document lexical columns into dataset-level metrics.

  Expects the count columns added by ``extract_features_lexical`` plus
  'tokenized_prompt' and 'user_name'. Returns a dict of metrics.
  """
  word_total = df['total_words'].sum()
  character_total = df['total_characters'].sum()
  document_total = len(df)
  documents_per_author = df['user_name'].value_counts()
  author_total = df['user_name'].nunique()

  metrics = {}

  # Lexical feature (word-level)
  metrics['average_word_length'] = df['total_word_lengths'].sum() / word_total
  metrics['short_words_ratio'] = df['num_short_words'].sum() / word_total
  metrics['stopwords_frequency'] = combine_dictionary_results(df, 'stopwords_num')
  metrics['syntactic_richness'] = syntactic_richness(df)
  metrics['number_of_words'] = word_total

  # Lexical feature (character-level)
  metrics['percentage_of_digits'] = df['total_digits'].sum() / character_total
  metrics['percentage_of_uppercase'] = df['total_uppercase'].sum() / character_total
  metrics['letter_frequency'] = combine_dictionary_results(df, 'letter_num')
  metrics['digit_frequency'] = combine_dictionary_results(df, 'digit_num')

  # Lexical feature (sentence-level)
  metrics['average_sentence_length'] = word_total / df['total_sentences'].sum()
  metrics['average_document_length'] = word_total / document_total

  # Lexical feature (dataset-level)
  metrics['number_of_authors'] = author_total
  metrics['author_document_counts'] = documents_per_author.to_dict()
  metrics['std_dev_document_counts'] = documents_per_author.std()
  metrics['number_of_documents'] = document_total
  metrics['avg_documents_per_author'] = document_total / author_total

  # Lexical feature (vocabulary richness)
  metrics['hapax_legomena_ratio'] = hapax_legomena(df)
  metrics['dis_legomena_ratio'] = dis_legomena(df)
  metrics['herdan_ttr'] = herdan_ttr(df)

  return metrics

def get_most_common_ngrams(counter, n=100):
    """Return the ``n`` highest-valued ``(key, value)`` pairs of ``counter``.

    ``counter`` may be any mapping accepted by ``collections.Counter``.
    """
    return Counter(counter).most_common(n)

# def calculate_statistics_ngrams(df):

#   aggregate_word_unigrams = Counter()
#   aggregate_word_bigrams = Counter()
#   aggregate_word_trigrams = Counter()

#   aggregate_char_bigrams = Counter()
#   aggregate_char_trigrams = Counter()

#   aggregate_pos_bigrams = Counter()
#   aggregate_pos_trigrams = Counter()

#   # Process each row
#   for index, row in tqdm(df.iterrows(), total=len(df)):
#     # if index % 10000 == 0:
#     #   print(index)
#     # Character n-grams
#     char_text = row['prompt'].replace(" ", "")  # Remove spaces for character n-grams
#     aggregate_char_bigrams.update(calculate_ngrams(char_text, 2))
#     aggregate_char_trigrams.update(calculate_ngrams(char_text, 3))

#     # Word n-grams
#     words = eval(row['tokenized_prompt'])
#     # print(words)
#     words = [token for token in words if token not in string.punctuation]
#     aggregate_word_unigrams.update(calculate_ngrams(words, 1))
#     aggregate_word_bigrams.update(calculate_ngrams(words, 2))
#     aggregate_word_trigrams.update(calculate_ngrams(words, 3))

#     # POS n-grams
#     pos_tags = eval(row['pos_tags'])
#     # print(pos_tags)
#     pos_tag = ['ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRT', 'PRON', 'VERB', '.', 'X']
#     tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag) for (word, tag) in pos_tags)
#     pos_tags = list(tag_fd.keys())
#     # print(pos_tags)
#     aggregate_pos_bigrams.update(calculate_ngrams(pos_tags, 2))
#     aggregate_pos_trigrams.update(calculate_ngrams(pos_tags, 3))


#   metrics = {
#     'char_bigrams': get_most_common_ngrams(aggregate_char_bigrams),
#     'char_trigrams': get_most_common_ngrams(aggregate_char_trigrams),
#     'word_unigrams': get_most_common_ngrams(aggregate_word_unigrams),
#     'word_bigrams': get_most_common_ngrams(aggregate_word_bigrams),
#     'word_trigrams': get_most_common_ngrams(aggregate_word_trigrams),
#     'pos_bigrams': get_most_common_ngrams(aggregate_pos_bigrams),
#     'pos_trigrams': get_most_common_ngrams(aggregate_pos_trigrams)
#   }

#   return metrics

def common_ngrams(ngram_counter, common_ngrams):
    """Return the counts of the listed n-grams that occur in ``ngram_counter``.

    ``common_ngrams`` is a sequence whose items carry the n-gram in position 0
    (e.g. ``(ngram, value)`` pairs). N-grams absent from ``ngram_counter`` are
    left out of the result.
    """
    wanted = (tuple(entry[0]) for entry in common_ngrams)
    return {key: ngram_counter[key] for key in wanted if key in ngram_counter}

def common_ngrams_frequency(ngram_counter, common_ngrams):
    """Return the relative frequency of the listed n-grams within ``ngram_counter``.

    Each listed n-gram (position 0 of every item of ``common_ngrams``) that
    occurs in ``ngram_counter`` maps to its count divided by the sum of all
    counts in ``ngram_counter``.
    """
    total = sum(list(ngram_counter.values()))
    wanted = (tuple(entry[0]) for entry in common_ngrams)
    return {key: ngram_counter[key] / total for key in wanted if key in ngram_counter}


def calculate_statistics_ngrams(df):
  """Find the most common n-grams per n-gram column and annotate ``df``.

  For each n-gram column, the dataset-wide relative frequencies from
  ``combine_dictionary_results`` are reduced to the top entries by
  ``get_most_common_ngrams``. ``df`` then gains '<column>_common' (per-row
  counts of those n-grams) and '<column>_common_frequency' (per-row relative
  frequencies) columns. Returns the dict of top n-grams keyed by column name.
  """
  ngram_columns = [
    'char_bigrams',
    'char_trigrams',
    'word_unigrams',
    'word_bigrams',
    'word_trigrams',
    'pos_bigrams',
    'pos_trigrams',
  ]

  metrics = {}
  for column in ngram_columns:
    metrics[column] = get_most_common_ngrams(combine_dictionary_results(df, column))

  # Per-row counts of the dataset-wide common n-grams.
  for column in ngram_columns:
    df[column + '_common'] = df[column].apply(
      lambda row_counter, top=metrics[column]: common_ngrams(row_counter, top))

  # Per-row relative frequencies of the same n-grams.
  for column in ngram_columns:
    df[column + '_common_frequency'] = df[column].apply(
      lambda row_counter, top=metrics[column]: common_ngrams_frequency(row_counter, top))

  return metrics


def calculate_statistics_syntactic(df):
  """Return dataset-wide punctuation and POS relative frequencies.

  Aggregates the 'punctuation_num' and 'pos_num' columns with
  ``combine_dictionary_results``.
  """
  source_columns = (
    ('punctuation_frequency', 'punctuation_num'),
    ('pos_frequency', 'pos_num'),
  )
  return {name: combine_dictionary_results(df, column) for name, column in source_columns}


# def calculate_statistics_syntactic(df):
#   def get_most_common_ngrams(counter, n=100):
#     return counter.most_common(n)

#   def calculate_ngrams(corpus, n):
#     return Counter(ngrams(corpus, n))

#   def combine_dictionary_results(df, col_name):
#     results = {}
#     count = 0
#     for idx, row in df.iterrows():
#       # print(row[col_name])
#       # print(type(row[col_name]))
#       if type(row[col_name]) == str:
#         dic = eval(row[col_name])
#       else:
#         dic = row[col_name]
#       for key, value in dic.items():
#         if key in results.keys():
#           results[key] += value
#         else:
#           results[key] = value
#         count += value
#     for key, value in results.items():
#       if count == 0:
#         results[key] = 0
#       else:
#         results[key] = value / count
#     return results

#   # aggregate_phrase_unigrams = Counter()
#   # aggregate_phrase_bigrams = Counter()
#   # aggregate_phrase_trigrams = Counter()
#   # aggregate_phrase_fourgrams = Counter()

#   # aggregate_dependency_unigrams = Counter()
#   # aggregate_dependency_bigrams = Counter()
#   # aggregate_dependency_trigrams = Counter()
#   # aggregate_dependency_fourgrams = Counter()

#   # Process each row
#   # for index, row in tqdm(df.iterrows(), total=df.shape[0]):

#   #   # # Phrase structure n-grams
#   #   # if type(row['phrase_structures']) == str:
#   #   #   phrase_structures = eval(row['phrase_structures'])
#   #   # else:
#   #   #   phrase_structures = row['phrase_structures']
#   #   # aggregate_phrase_unigrams.update(calculate_ngrams(phrase_structures, 1))
#   #   # aggregate_phrase_bigrams.update(calculate_ngrams(phrase_structures, 2))
#   #   # aggregate_phrase_trigrams.update(calculate_ngrams(phrase_structures, 3))
#   #   # aggregate_phrase_fourgrams.update(calculate_ngrams(phrase_structures, 4))

#   #   # Dependency structure n-grams
#   #   text = row['prompt']
#   #   doc = nlp(text)
#   #   # print(dependency_ngrams(doc, 1))
#   #   # print(Counter(dependency_ngrams(doc, 1)))
#   #   # print(dependency_ngrams(doc, 2))
#   #   # print(Counter(dependency_ngrams(doc, 2)))
#   #   aggregate_dependency_unigrams.update(Counter(dependency_ngrams(doc, 1)))
#   #   aggregate_dependency_bigrams.update(Counter(dependency_ngrams(doc, 2)))
#   #   aggregate_dependency_trigrams.update(Counter(dependency_ngrams(doc, 3)))
#   #   aggregate_dependency_fourgrams.update(Counter(dependency_ngrams(doc, 4)))


#   metrics = {
#     # 'phrase_unigrams': get_most_common_ngrams(aggregate_phrase_unigrams),
#     # 'phrase_bigrams': get_most_common_ngrams(aggregate_phrase_bigrams),
#     # 'phrase_trigrams': get_most_common_ngrams(aggregate_phrase_trigrams),
#     # 'phrase_fourgrams': get_most_common_ngrams(aggregate_phrase_fourgrams),
#     'punctuation_frequency': combine_dictionary_results(df, 'punctuation_frequency'),
#     'pos_frequency': combine_dictionary_results(df, 'pos_frequency'),
#     # 'dependency_unigrams': get_most_common_ngrams(aggregate_dependency_unigrams),
#     # 'dependency_bigrams': get_most_common_ngrams(aggregate_dependency_bigrams),
#     # 'dependency_trigrams': get_most_common_ngrams(aggregate_dependency_trigrams),
#     # 'dependency_fourgrams': get_most_common_ngrams(aggregate_dependency_fourgrams)
#   }

#   return metrics

In [None]:
# Topic analysis

def clean_str(string):
    """Normalise ``string`` for bag-of-words processing.

    Characters other than letters, digits and ``( ) , ! ? ' ` `` become spaces,
    digit runs are removed, English clitics ('s, 've, n't, 're, 'd, 'll) are
    split off, and ``, ! ( ) ?`` are padded with spaces. Whitespace runs are
    collapsed and the result is stripped.
    """
    # (pattern, replacement) pairs applied in order. The replacements are plain
    # text: the previous " \( ", " \) " and " \? " wrote literal backslashes
    # into the output and are invalid escape sequences in a string literal.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\d+", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip()


def sample_prompts(group, n=100):
    """Return ``group`` unchanged if it has at most ``n`` rows, else ``n`` sampled rows.

    Sampling uses a fixed ``random_state`` of 42.
    """
    if len(group) <= n:
        return group
    return group.sample(n=n, random_state=42)


def load_diffusiondb(data_path):
    """Load prompts and authors from a CSV with 'user_name' and 'prompt' columns.

    Only users with at least 100 prompts are kept. When at least 100 such users
    exist, 100 of them are sampled (random_state=42) and each contributes at
    most 100 prompts via ``sample_prompts``; otherwise all rows of the
    remaining users are used.

    Returns:
        ``(X, dict_author)``: ``X`` is the list of prompts cleaned with
        ``clean_str``; ``dict_author`` maps each position in ``X`` to the
        prompt's user name.
    """
    df = pd.read_csv(data_path)
    df = df[['user_name', 'prompt']]
    print('Unique Users:', df['user_name'].nunique())

    df_filtered = df.groupby('user_name').filter(lambda x: len(x) >= 100)
    # Count the users that survived the filter. The previous code re-counted
    # the unfiltered frame, so both the message and the branch below were wrong.
    unique_user_names_count = df_filtered['user_name'].nunique()
    print(f"Number of users with at least 100 prompts: {unique_user_names_count}")

    if unique_user_names_count >= 100:
        random_users = df_filtered['user_name'].drop_duplicates().sample(n=100, random_state=42).tolist()
        sub_df = df_filtered[df_filtered['user_name'].isin(random_users)]
        data = sub_df.groupby('user_name').apply(sample_prompts).reset_index(drop=True)
    else:
        data = df_filtered
    # Positional ids must run 0..len-1 in both branches; the filtered frame
    # still carries the row labels of the original CSV.
    data = data.reset_index(drop=True)
    print(f'totally {len(data)} data')

    X = [clean_str(prompt) for prompt in data['prompt']]
    dict_author = dict(enumerate(data['user_name']))    # id doc: author_name
    return X, dict_author


def JSD(P, Q):
    """Return the Jensen-Shannon divergence of ``P`` and ``Q``.

    Both inputs are L1-normalised first. The result is the mean of
    ``entropy(p, m)`` and ``entropy(q, m)``, where ``m`` is the midpoint of the
    two normalised vectors.
    """
    p_dist = P / norm(P, ord=1)
    q_dist = Q / norm(Q, ord=1)
    midpoint = 0.5 * (p_dist + q_dist)
    divergence_p = entropy(p_dist, midpoint)
    divergence_q = entropy(q_dist, midpoint)
    return 0.5 * (divergence_p + divergence_q)


def topics_analysis(args):
    """Fit LDA topic models on a corpus and compare authors by topic usage.

    For every requested topic count the function fits an LDA model on the
    bag-of-words matrix, prints the top words of each topic, prints how often
    each topic is the dominant one per author, averages the document-topic
    distributions per author, computes the pairwise Jensen-Shannon divergence
    between those author averages, and saves the divergence matrix as a
    heatmap PNG.

    Args:
        args: Namespace with ``data`` (dataset name), ``data_path`` (path
            handed to the dataset loader) and ``n_topics`` (a string holding a
            Python literal: a list of ints, or a single int).

    Raises:
        ValueError: If ``args.data`` is not a known dataset name.
    """
    import ast  # local import: not part of the notebook's top import cell

    warnings.simplefilter(action='ignore', category=FutureWarning)

    data = args.data
    # literal_eval only accepts Python literals, so the --n_topics string
    # cannot execute arbitrary code the way eval() could.
    number_of_topics_list = ast.literal_eval(args.n_topics)
    if isinstance(number_of_topics_list, int):
        # Accept a single topic count such as "10" as well as a list.
        number_of_topics_list = [number_of_topics_list]
    d_path = args.data_path

    if data == "ccat10" or data == 'ccat50':
        X, dict_author = load_ccat(d_path)
    elif data == "judgment":
        X, dict_author = load_judgment(d_path)
    elif data == "imdb":
        X, dict_author = load_imdb62(d_path)
    elif data in ('diffusiondb', 'twitter_micro', 'blogs', 'blogs50', 'imdb62'):
        X, dict_author = load_diffusiondb(d_path)
    else:
        # Fail early with a clear message instead of fitting on an empty corpus.
        raise ValueError(f"unknown dataset name: {data!r}")

    # create vocabulary
    print ("creating vocabulary..")
    print ("---------------------------")

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=5, stop_words='english')
    X_tf = tf_vectorizer.fit_transform(X)
    vocab = tf_vectorizer.get_feature_names_out()
    print("shape: {}\n".format(X_tf.shape))
    print(vocab)

    # author -> document ids; independent of the topic count, so built once.
    docs_by_author = defaultdict(list)
    for doc_id, author in dict_author.items():
        docs_by_author[author].append(doc_id)

    for number_of_topics in number_of_topics_list:

        title = str(data) + " (N_topic = " + str(number_of_topics) + ")"

        # building topic model using LDA
        print ("building model..")
        print ("---------------------------")
        model = lda.LDA(n_topics=number_of_topics, n_iter=1000, random_state=1000)
        model.fit(X_tf)
        topic_word = model.topic_word_
        print("shape: {}".format(topic_word.shape))

        # show detail of topic: the n_top_words highest-weighted words, best first
        n_top_words = 10
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
            print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))

        print ("document topic model..")
        print ("---------------------------")
        doc_topic = model.doc_topic_
        # One list per author, so a document's dominant topic is always stored
        # under its own author even when authors' documents are interleaved.
        topic_most = defaultdict(list)
        for doc_id in range(len(doc_topic)):
            topic_most[dict_author[doc_id]].append(doc_topic[doc_id].argmax())

        for i, (author_p, topic_p) in enumerate(topic_most.items()):
            print (i, author_p, Counter(topic_p))

        # Mean document-topic distribution of every author.
        author_means = [
            doc_topic[doc_ids].sum(axis=0) / len(doc_ids)
            for doc_ids in docs_by_author.values()
        ]

        # calculating JS divergence between authors
        print ("calculating JS divergence..")
        print ("---------------------------")
        KL = [[JSD(p, q) for q in author_means] for p in author_means]
        # The mean includes the zero-valued diagonal (each author vs. itself).
        print ("Average JS Divergence", np.mean(KL))

        # create confusion matrix
        print ("creating heatmap..")
        print ("---------------------------")

        plt.figure(figsize=(10, 8))
        ax = sns.heatmap(np.array(KL))
        ax.set_ylabel('author')
        ax.set_xlabel('author')
        ax.set_title(title)
        ax.collections[0].colorbar.set_label("JS Divergence")
        plt.savefig(f'/content/drive/MyDrive/msc_project/data/{data}/clean/heatmap_{data}_{number_of_topics}.png')
        plt.close()

## DiffusionDB

In [None]:
# Tokenize every DiffusionDB prompt and save the result as a CSV; the
# syntactic and n-gram cells of this section read that file back.
df_diffusiondb = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb.csv')
df_diffusiondb['tokenized_prompt'] = df_diffusiondb['prompt'].apply(tokenize)
df_diffusiondb.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv', index=False)

In [None]:
# Lexical
# The extraction step is commented out: this cell loads the lexical features
# saved by an earlier run and only recomputes the corpus-level statistics.
# df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')

# df_diffusiondb_features = extract_features_lexical(df_diffusiondb_tokenized)
# df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_lexical.csv', index=False)

df_diffusiondb_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_lexical.csv')

diffusiondb_statistics = calculate_statistics_lexical(df_diffusiondb_features)
# print(diffusiondb_statistics)

# Stringify every statistic value and store the result as a two-column
# (Metric, DiffusionDB) table.
flattened_data = {k: str(v) for k, v in diffusiondb_statistics.items()}
df_diffusiondb_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'DiffusionDB'])
df_diffusiondb_statistics.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_statistics_lexical.csv', index=False)
df_diffusiondb_statistics

In [None]:
# Syntactic
# Extract syntactic features from the tokenized prompts, save them, and
# compute the corpus-level syntactic statistics.
tqdm.pandas()  # registers the pandas progress_apply helpers
df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')
df_diffusiondb_features = extract_features_syntactic(df_diffusiondb_tokenized)
# print(df_diffusiondb_features)
df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_syntactic.csv', index=False)

# The features are read back from the file just written, so the statistics
# are computed on the CSV-serialised values.
df_diffusiondb_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_syntactic.csv')

diffusiondb_statistics = calculate_statistics_syntactic(df_diffusiondb_features)
# print(diffusiondb_statistics)

# Stringify every statistic value and store the result as a two-column
# (Metric, DiffusionDB) table.
flattened_data = {k: str(v) for k, v in diffusiondb_statistics.items()}
df_diffusiondb_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'DiffusionDB'])
df_diffusiondb_statistics.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_statistics_syntactic.csv', index=False)
df_diffusiondb_statistics

In [None]:
# N-gram based
# The POS-tagging step is commented out: the tokenized CSV loaded below is
# expected to already carry the columns calculate_statistics_ngrams needs
# (presumably including 'pos_tags' -- confirm against that function).
# df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')
# df_diffusiondb_tokenized['pos_tags'] = df_diffusiondb_tokenized['tokenized_prompt'].apply(get_pos_tags)
# df_diffusiondb_tokenized.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv', index=False)

df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')

# df_diffusiondb_features = extract_features_ngrams(df_diffusiondb_tokenized)
# df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_ngrams.csv', index=False)

# df_diffusiondb_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_ngrams.csv')

diffusiondb_statistics = calculate_statistics_ngrams(df_diffusiondb_tokenized)
# print(diffusiondb_statistics)

# Each metric's value is iterated as (ngram, count) pairs and turned into a
# dict keyed by the n-gram as a tuple.
diffusiondb_statistics = {k: {tuple(ngram): count for ngram, count in v} for k, v in diffusiondb_statistics.items()}
df_diffusiondb_statistics = pd.DataFrame(list(diffusiondb_statistics.items()), columns=['Metric', 'DiffusionDB'])
df_diffusiondb_statistics.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_statistics_ngrams.csv', index=False)
df_diffusiondb_statistics

In [None]:
# Hardness
def sample_prompts(group, n=100):
    """Return ``group`` capped at ``n`` rows (fixed-seed sample when larger)."""
    return group.sample(n=n, random_state=42) if len(group) > n else group


df = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb.csv')
unique_user_names_count = df['user_name'].nunique()
print('Unique Users:', unique_user_names_count)
df_filtered = df.groupby('user_name').filter(lambda x: len(x) >= 100)
# Count on the filtered frame so the printed number matches its label.
unique_user_names_count = df_filtered['user_name'].nunique()
print(f"Number of users with at least 100 prompts: {unique_user_names_count}")

# Relative hardness for the `lim` most prolific users, each capped at 100 prompts.
rh_blog = []
for lim in [5, 10, 25, 50, 75, 100, 150, 200]:
  print(lim)
  # value_counts() sorts by frequency, so the first `lim` labels are the top users.
  list_spk = df_filtered['user_name'].value_counts().index[:lim].tolist()
  sub_df = df_filtered[df_filtered['user_name'].isin(list_spk)]
  sampled_df = sub_df.groupby('user_name').apply(sample_prompts).reset_index(drop=True)
  rh_blog.append(rel_hardness(sampled_df, 'user_name', 'prompt'))

rh_blog

In [None]:
# Topics

# Build the argument namespace expected by topics_analysis. The parser
# defaults point at a CCAT10 file, but all three options are overridden below.
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, default='ccat10', help='data')
parser.add_argument('--data_path', type=str, default='/home/yunita/Data/Dataset/Stamatatos/c10_traintest.csv', help='data path')
parser.add_argument('--n_topics', help='number of topics')

# parse_args() without arguments reads sys.argv, so an explicit argument list
# is supplied instead.
# args = parser.parse_args()
# --n_topics is a string holding a list literal; topics_analysis turns it
# into a Python list.
args = parser.parse_args(args=[
    '--data', 'diffusiondb',
    '--data_path', '/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb.csv',
    '--n_topics', '[3, 10, 20, 30, 40, 50]'
])

topics_analysis(args)

### test

In [None]:
# Scratch check: dump whatever df_diffusiondb_features currently holds
# (depends on which feature cell was run last).
print(df_diffusiondb_features)

In [None]:
# Scratch check: syntactic statistics on the first 50 feature rows only.
# Overwrites the global diffusiondb_statistics.
diffusiondb_statistics = calculate_statistics_syntactic(df_diffusiondb_features.head(50))

In [None]:
# Scratch check: run the full syntactic pipeline on the first 5 prompts.
# Overwrites the global df_diffusiondb_features and diffusiondb_statistics.
df_diffusiondb_features = extract_features_syntactic(df_diffusiondb_tokenized.head(5))
diffusiondb_statistics = calculate_statistics_syntactic(df_diffusiondb_features)
print(diffusiondb_statistics)

In [None]:
# Scratch cell: only the reload of the tokenized CSV is active. The
# commented-out lines are an experiment that parsed every prompt with
# process_text, cached the parsed docs with pickle, and fed them to
# extract_features_syntactic.
tqdm.pandas()
df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')

# df_diffusiondb_tokenized = df_diffusiondb_tokenized.head(5)
# df_diffusiondb_tokenized['spacy_doc'] = df_diffusiondb_tokenized['prompt'].progress_apply(process_text)

# with open('/content/drive/MyDrive/msc_project/data/spacy_docs.pkl', 'wb') as f:
#     pickle.dump(df_diffusiondb_tokenized['spacy_doc'].tolist(), f)

# with open('/content/drive/MyDrive/msc_project/data/spacy_docs.pkl', 'rb') as f:
#     spacy_docs = pickle.load(f)

# df_diffusiondb_tokenized['spacy_doc'] = spacy_docs
# print(df_diffusiondb_tokenized)
# # print(df_diffusiondb_features)

# df_diffusiondb_features = extract_features_syntactic(df_diffusiondb_tokenized)
# df_diffusiondb_features

In [None]:
# Scratch check: run process_text on a single example sentence. The
# commented-out lines are the DataFrame-based variant of the same check.
text = "The four main performers are riveting."
# df_diffusiondb_tokenized = pd.DataFrame(data)
# df_diffusiondb_tokenized['spacy_doc'] = df_diffusiondb_tokenized['prompt'].progress_apply(process_text)
doc = process_text(text)
# df_diffusiondb_features = extract_features_syntactic(df_diffusiondb_tokenized)
# print(df_diffusiondb_features)
# diffusiondb_statistics = calculate_statistics_syntactic(df_diffusiondb_features)
# print(diffusiondb_statistics)

In [None]:
import spacy
import benepar

# Load SpaCy model
# This rebinds the global `nlp` to a pipeline with the benepar constituency
# parser added; benepar.download fetches the model each time the cell is run.
nlp = spacy.load('en_core_web_sm')
benepar.download("benepar_en3")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})
# Sample sentence
sentence = "The four main performers are riveting."

# Parse the sentence
doc = nlp(sentence)
# Only the first sentence of the parsed document is inspected.
sent = list(doc.sents)[0]

# Print the phrase structure tree
print(sent._.parse_string)

# Display the tree using nltk.Tree
from nltk import Tree

def print_tree(tree):
    """Pretty-print ``tree`` when it is an nltk ``Tree``; otherwise do nothing."""
    if isinstance(tree, Tree):
        tree.pretty_print()

# Rebuild an nltk Tree from the parse string of the sentence and display it.
nltk_tree = Tree.fromstring(sent._.parse_string)
print_tree(nltk_tree)


def extract_ngrams(tree, n):
    """Collect every run of ``n`` adjacent sibling subtrees found in ``tree``.

    Nodes are visited parent-first, children left to right. At each node every
    window of ``n`` consecutive children is kept when all of them are ``Tree``
    instances (windows containing a leaf string are dropped).

    Returns:
        list: The kept windows, each a list of ``n`` subtrees, in visit order.
    """
    collected = []
    # Explicit stack instead of recursion; children are pushed reversed so
    # they are popped left to right, which keeps the parent-first order.
    stack = [tree]
    while stack:
        node = stack.pop()
        if not isinstance(node, Tree):
            continue
        for start in range(len(node) - n + 1):
            window = node[start:start + n]
            if all(isinstance(child, Tree) for child in window):
                collected.append(window)
        stack.extend(reversed(node))
    return collected

def print_ngrams(ngrams):
    """Print each n-gram as its leaf words, then its raw repr, then a blank line."""
    for group in ngrams:
        surface = ' '.join(' '.join(subtree.leaves()) for subtree in group)
        print(surface)
        print(group)
        print()

# Extract n-grams of phrase structures
n = 2  # Change this to any n you want
# The result is stored under its own name: assigning it to `ngrams` would
# rebind the `ngrams` function imported from nltk.util at the top of the
# notebook for every cell run afterwards.
phrase_ngrams = extract_ngrams(nltk_tree, n)

# Print the extracted n-grams
print_ngrams(phrase_ngrams)

In [None]:
# Scratch cell: re-keys whatever diffusiondb_statistics currently holds; each
# value is iterated as (ngram, count) pairs and becomes a dict keyed by tuple.
# NOTE(review): the output path is the same file the "# Syntactic" DiffusionDB
# cell writes, so running this cell overwrites that table -- confirm intended.
diffusiondb_statistics = {k: {tuple(ngram): count for ngram, count in v} for k, v in diffusiondb_statistics.items()}
df_diffusiondb_statistics = pd.DataFrame(list(diffusiondb_statistics.items()), columns=['feature', 'ngrams'])
df_diffusiondb_statistics.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_statistics_syntactic.csv', index=False)
df_diffusiondb_statistics

In [None]:
# Scratch comparison, variant 1: features and statistics for the prompts in
# '1.csv' (path relative to the kernel's working directory).
# NOTE(review): extract_features / calculate_statistics are not defined in
# the visible part of the notebook -- confirm they exist before running.
# df = pd.read_csv('/content/drive/MyDrive/authorship_inference_attack/real_diffusionDB_large_all_features 1.csv')
df_diffusiondb1 = pd.read_csv('1.csv')
# Force string dtype so non-string prompt values do not break text processing.
df_diffusiondb1['prompt'] = df_diffusiondb1['prompt'].astype(str)
# df_diffusiondb1['tokenized_prompt'] = df_diffusiondb1['prompt'].apply(tokenize)
# df_diffusiondb1.to_csv('1.csv', index=False)

# df_diffusiondb.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv', index=False)

# df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')

df_diffusiondb_features1 = extract_features(df_diffusiondb1)
# df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_lexical.csv', index=False)

diffusiondb_statistics1 = calculate_statistics(df_diffusiondb_features1)
# print(diffusiondb_statistics1)

# One row per statistic, single 'DiffusionDB' value column.
df_diffusiondb_statistics1 = pd.DataFrame.from_dict(diffusiondb_statistics1, orient='index', columns=['DiffusionDB'])
df_diffusiondb_statistics1

In [None]:
# Scratch comparison, variant 2: same pipeline as the previous cell on
# '2.csv'. The commented-out lines show how '2.csv' was produced, with
# prompts tokenized by plain whitespace split.
# NOTE(review): extract_features / calculate_statistics are not defined in
# the visible part of the notebook -- confirm they exist before running.
# df_diffusiondb2 = pd.read_csv('/content/drive/MyDrive/authorship_inference_attack/real_diffusionDB_large_all_features 1.csv')

# df_diffusiondb2['prompt'] = df_diffusiondb2['prompt'].astype(str)
# df_diffusiondb2['tokenized_prompt'] = df_diffusiondb2['prompt'].apply(lambda x: x.split())
# df_diffusiondb2.to_csv('2.csv', index=False)
df_diffusiondb2 = pd.read_csv('2.csv')
df_diffusiondb2['prompt'] = df_diffusiondb2['prompt'].astype(str)
# df_diffusiondb.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv', index=False)

# df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')

df_diffusiondb_features2 = extract_features(df_diffusiondb2)
# df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_lexical.csv', index=False)

diffusiondb_statistics2 = calculate_statistics(df_diffusiondb_features2)
# print(diffusiondb_statistics2)

# One row per statistic, single 'DiffusionDB' value column.
df_diffusiondb_statistics2 = pd.DataFrame.from_dict(diffusiondb_statistics2, orient='index', columns=['DiffusionDB'])
df_diffusiondb_statistics2

In [None]:
# Scratch check: the first 26 characters of ascii_letters are the lowercase a-z.
print(string.ascii_letters[:26])

## Twitter

In [None]:
# Load the Twitter corpus, rename its two columns to the names used
# throughout the notebook, then tokenize and POS-tag every text and save
# the result for the feature cells below.
df_twitter = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/author_texts_cleaned.csv')
# Assumes the file has exactly two columns, author first and text second.
df_twitter.columns = ['user_name', 'prompt']
df_twitter['tokenized_prompt'] = df_twitter['prompt'].apply(tokenize)
df_twitter['pos_tags'] = df_twitter['tokenized_prompt'].apply(get_pos_tags)
df_twitter.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized.csv', index=False)
df_twitter

In [None]:
# Lexical
# The extraction step is commented out: this cell loads the lexical features
# saved by an earlier run and only recomputes the corpus-level statistics.
tqdm.pandas()
# df_twitter_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized.csv')

# df_twitter_features = extract_features_lexical(df_twitter_tokenized)
# df_twitter_features.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized_lexical.csv', index=False)

df_twitter_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized_lexical.csv')

twitter_statistics = calculate_statistics_lexical(df_twitter_features)
# print(twitter_statistics)

# Stringify every statistic value and store the result as a two-column
# (Metric, Twitter) table.
# NOTE(review): this writes '..._statistics_lexical_1.csv', while the
# normalisation cell that follows reads '..._statistics_lexical.csv' (no
# '_1' suffix) -- confirm which file is meant.
flattened_data = {k: str(v) for k, v in twitter_statistics.items()}
df_twitter_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'Twitter'])
df_twitter_statistics.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized_statistics_lexical_1.csv', index=False)
df_twitter_statistics

In [None]:
import pandas as pd
import ast

def normalize_dict_str(dict_str):
    """Normalise the values of a stringified dict so that they sum to 1.

    Args:
        dict_str: A cell value from the statistics table. Only non-empty
            strings that start with ``{`` are treated as dict literals.

    Returns:
        The ``str()`` of the normalised dict. ``dict_str`` is returned
        unchanged when it is not a dict string or when its values sum to zero.
    """
    if not dict_str or not isinstance(dict_str, str) or not dict_str.startswith('{'):
        return dict_str  # Return original if not a dictionary string
    dict_obj = ast.literal_eval(dict_str)
    # Accumulate by hand instead of calling sum(): the notebook's top-level
    # `from numpy import sum` shadows the built-in, and numpy's sum does not
    # add up a dict_values view, which made the division below fail.
    total = 0
    for value in dict_obj.values():
        total += value
    if total == 0:
        # Nothing to scale; avoids a division by zero.
        return dict_str
    normalized_dict = {k: v / total for k, v in dict_obj.items()}
    return str(normalized_dict)

# Normalise the three frequency-dictionary rows of the Twitter lexical
# statistics table. The file is read and then overwritten in place, and the
# global name `df` is rebound to this table.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized_statistics_lexical.csv')
rows_to_normalize = ['stopwords_frequency', 'letter_frequency', 'digit_frequency']
# Only the selected Metric rows are rewritten; every other row keeps its value.
df.loc[df['Metric'].isin(rows_to_normalize), 'Twitter'] = df.loc[df['Metric'].isin(rows_to_normalize), 'Twitter'].apply(normalize_dict_str)
df.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized_statistics_lexical.csv', index=False)
df

In [None]:
# Syntactic
# Extract syntactic features from the tokenized Twitter texts, save them,
# and compute the corpus-level syntactic statistics.
tqdm.pandas()
df_twitter_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized.csv')
# df_twitter_tokenized['pos_tags'] = df_twitter_tokenized['tokenized_prompt'].progress_apply(get_pos_tags)
df_twitter_features = extract_features_syntactic(df_twitter_tokenized)
# print(df_twitter_features)
df_twitter_features.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized_syntactic.csv', index=False)

# The features are read back from the file just written, so the statistics
# are computed on the CSV-serialised values.
df_twitter_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized_syntactic.csv')

twitter_statistics = calculate_statistics_syntactic(df_twitter_features)
# print(twitter_statistics)

# Stringify every statistic value and store the result as a two-column
# (Metric, Twitter) table.
flattened_data = {k: str(v) for k, v in twitter_statistics.items()}
df_twitter_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'Twitter'])
df_twitter_statistics.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized_statistics_syntactic.csv', index=False)
df_twitter_statistics

In [None]:
# N-gram based
# N-gram statistics are computed directly from the tokenized, POS-tagged CSV
# written by the first cell of this section.
df_twitter_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized.csv')
# df_twitter_tokenized['pos_tags'] = df_twitter_tokenized['tokenized_prompt'].apply(get_pos_tags)
# df_twitter_tokenized.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized.csv', index=False)

# df_twitter_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized.csv')

twitter_statistics = calculate_statistics_ngrams(df_twitter_tokenized)
# print(twitter_statistics)

# Each metric's value is iterated as (ngram, count) pairs and turned into a
# dict keyed by the n-gram as a tuple.
twitter_statistics = {k: {tuple(ngram): count for ngram, count in v} for k, v in twitter_statistics.items()}
df_twitter_statistics = pd.DataFrame(list(twitter_statistics.items()), columns=['Metric', 'Twitter'])
df_twitter_statistics.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/preprocessed_twitter_tokenized_statistics_ngrams.csv', index=False)
df_twitter_statistics

In [None]:
# Hardness
def sample_prompts(group, n=100):
    """Return ``group`` capped at ``n`` rows (fixed-seed sample when larger)."""
    return group.sample(n=n, random_state=42) if len(group) > n else group


df = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/author_texts_cleaned.csv')
df.columns = ['user_name', 'prompt']
unique_user_names_count = df['user_name'].nunique()
print('Unique Users:', unique_user_names_count)
df_filtered = df.groupby('user_name').filter(lambda x: len(x) >= 100)
# Count on the filtered frame so the printed number matches its label.
unique_user_names_count = df_filtered['user_name'].nunique()
print(f"Number of users with at least 100 prompts: {unique_user_names_count}")

# Relative hardness for the `lim` most prolific users, each capped at 100 texts.
rh_blog = []
for lim in [5, 10, 25, 50, 75, 100, 150, 200]:
  print(lim)
  # value_counts() sorts by frequency, so the first `lim` labels are the top users.
  list_spk = df_filtered['user_name'].value_counts().index[:lim].tolist()
  sub_df = df_filtered[df_filtered['user_name'].isin(list_spk)]
  sampled_df = sub_df.groupby('user_name').apply(sample_prompts).reset_index(drop=True)
  rh_blog.append(rel_hardness(sampled_df, 'user_name', 'prompt'))

rh_blog

In [None]:
# Topics

# Build the argument namespace expected by topics_analysis. The parser
# defaults point at a CCAT10 file, but all three options are overridden below.
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, default='ccat10', help='data')
parser.add_argument('--data_path', type=str, default='/home/yunita/Data/Dataset/Stamatatos/c10_traintest.csv', help='data path')
parser.add_argument('--n_topics', help='number of topics')

# parse_args() without arguments reads sys.argv, so an explicit argument list
# is supplied instead.
# args = parser.parse_args()
# --n_topics is a string holding a list literal; topics_analysis turns it
# into a Python list.
args = parser.parse_args(args=[
    '--data', 'twitter_micro',
    '--data_path', '/content/drive/MyDrive/msc_project/data/twitter_micro/author_texts_cleaned.csv',
    '--n_topics', '[3, 10, 20, 30, 40, 50]'
])

topics_analysis(args)

## IMDB62

In [None]:
# Load the IMDB62 corpus, rename its two columns to the names used
# throughout the notebook, then tokenize and POS-tag every text (with a
# progress bar) and save the result for the feature cells below.
tqdm.pandas()
df_imdb62 = pd.read_csv('/content/drive/MyDrive/msc_project/data/imdb62/imdb62.csv')
# Assumes the file has exactly two columns, author first and text second.
df_imdb62.columns = ['user_name', 'prompt']
df_imdb62['tokenized_prompt'] = df_imdb62['prompt'].progress_apply(tokenize)
df_imdb62['pos_tags'] = df_imdb62['tokenized_prompt'].progress_apply(get_pos_tags)
df_imdb62.to_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized.csv', index=False)
df_imdb62

In [None]:
# Lexical
# Extract lexical features from the tokenized IMDB62 texts, save them, and
# compute the corpus-level lexical statistics.
tqdm.pandas()
df_imdb62_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized.csv')

df_imdb62_features = extract_features_lexical(df_imdb62_tokenized)
df_imdb62_features.to_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized_lexical.csv', index=False)

# The features are read back from the file just written, so the statistics
# are computed on the CSV-serialised values.
df_imdb62_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized_lexical.csv')

imdb62_statistics = calculate_statistics_lexical(df_imdb62_features)
# print(imdb62_statistics)

# Stringify every statistic value and store the result as a two-column
# (Metric, IMDB62) table.
flattened_data = {k: str(v) for k, v in imdb62_statistics.items()}
df_imdb62_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'IMDB62'])
df_imdb62_statistics.to_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized_statistics_lexical.csv', index=False)
df_imdb62_statistics

In [None]:
# Syntactic
# Extract syntactic features from the tokenized IMDB62 texts, save them, and
# compute the corpus-level syntactic statistics.
tqdm.pandas()
df_imdb62_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized.csv')

df_imdb62_features = extract_features_syntactic(df_imdb62_tokenized)
# print(df_imdb62_features)
df_imdb62_features.to_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized_syntactic.csv', index=False)

# The features are read back from the file just written, so the statistics
# are computed on the CSV-serialised values.
df_imdb62_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized_syntactic.csv')

imdb62_statistics = calculate_statistics_syntactic(df_imdb62_features)
# print(imdb62_statistics)

# Stringify every statistic value and store the result as a two-column
# (Metric, IMDB62) table.
flattened_data = {k: str(v) for k, v in imdb62_statistics.items()}
df_imdb62_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'IMDB62'])
df_imdb62_statistics.to_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized_statistics_syntactic.csv', index=False)
df_imdb62_statistics

In [None]:
# N-gram based
# N-gram statistics are computed directly from the tokenized, POS-tagged CSV
# written by the first cell of this section.
df_imdb62_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized.csv')

imdb62_statistics = calculate_statistics_ngrams(df_imdb62_tokenized)
# print(imdb62_statistics)

# Each metric's value is iterated as (ngram, count) pairs and turned into a
# dict keyed by the n-gram as a tuple.
imdb62_statistics = {k: {tuple(ngram): count for ngram, count in v} for k, v in imdb62_statistics.items()}
df_imdb62_statistics = pd.DataFrame(list(imdb62_statistics.items()), columns=['Metric', 'IMDB62'])
df_imdb62_statistics.to_csv('/content/drive/MyDrive/msc_project/data/imdb62/preprocessed_imdb62_tokenized_statistics_ngrams.csv', index=False)
df_imdb62_statistics

In [None]:
# Hardness
def sample_prompts(group, n=100):
    """Return ``group`` capped at ``n`` rows (fixed-seed sample when larger)."""
    return group.sample(n=n, random_state=42) if len(group) > n else group


df = pd.read_csv('/content/drive/MyDrive/msc_project/data/imdb62/imdb62.csv')
df.columns = ['user_name', 'prompt']
unique_user_names_count = df['user_name'].nunique()
print('Unique Users:', unique_user_names_count)
df_filtered = df.groupby('user_name').filter(lambda x: len(x) >= 100)
# Count on the filtered frame so the printed number matches its label.
unique_user_names_count = df_filtered['user_name'].nunique()
print(f"Number of users with at least 100 prompts: {unique_user_names_count}")

# Relative hardness for the `lim` most prolific users, each capped at 100 texts.
rh_blog = []
for lim in [5, 10, 25, 50, 75, 100, 150, 200]:
  print(lim)
  # value_counts() sorts by frequency, so the first `lim` labels are the top users.
  list_spk = df_filtered['user_name'].value_counts().index[:lim].tolist()
  sub_df = df_filtered[df_filtered['user_name'].isin(list_spk)]
  sampled_df = sub_df.groupby('user_name').apply(sample_prompts).reset_index(drop=True)
  rh_blog.append(rel_hardness(sampled_df, 'user_name', 'prompt'))

rh_blog

In [None]:
# Topics

# Build the argument namespace expected by topics_analysis. The parser
# defaults point at a CCAT10 file, but all three options are overridden below.
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, default='ccat10', help='data')
parser.add_argument('--data_path', type=str, default='/home/yunita/Data/Dataset/Stamatatos/c10_traintest.csv', help='data path')
parser.add_argument('--n_topics', help='number of topics')

# parse_args() without arguments reads sys.argv, so an explicit argument list
# is supplied instead.
# args = parser.parse_args()
# --n_topics is a string holding a list literal; topics_analysis turns it
# into a Python list.
args = parser.parse_args(args=[
    '--data', 'imdb62',
    '--data_path', '/content/drive/MyDrive/msc_project/data/imdb62/imdb62.csv',
    '--n_topics', '[3, 10, 20, 30, 40, 50]'
])

topics_analysis(args)

## Blogs

In [None]:
# Load the Blogs corpus, rename its two columns to the names used
# throughout the notebook, then tokenize and POS-tag every text (with a
# progress bar) and save the result for the feature cells below.
tqdm.pandas()
df_blogs = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs/blogs.csv')
# Assumes the file has exactly two columns, author first and text second.
df_blogs.columns = ['user_name', 'prompt']
df_blogs['tokenized_prompt'] = df_blogs['prompt'].progress_apply(tokenize)
df_blogs['pos_tags'] = df_blogs['tokenized_prompt'].progress_apply(get_pos_tags)
df_blogs.to_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized.csv', index=False)
df_blogs

In [None]:
# Lexical
# Extract lexical features from the tokenized Blogs texts, save them, and
# compute the corpus-level lexical statistics.
tqdm.pandas()
df_blogs_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized.csv')

df_blogs_features = extract_features_lexical(df_blogs_tokenized)
df_blogs_features.to_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized_lexical.csv', index=False)

# The features are read back from the file just written, so the statistics
# are computed on the CSV-serialised values.
df_blogs_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized_lexical.csv')

blogs_statistics = calculate_statistics_lexical(df_blogs_features)
# print(blogs_statistics)

# Stringify every statistic value and store the result as a two-column
# (Metric, blogs) table.
flattened_data = {k: str(v) for k, v in blogs_statistics.items()}
df_blogs_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'blogs'])
df_blogs_statistics.to_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized_statistics_lexical.csv', index=False)
df_blogs_statistics

In [None]:
# Syntactic
# Extract syntactic features from the tokenized Blogs texts, save them, and
# compute the corpus-level syntactic statistics.
tqdm.pandas()
df_blogs_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized.csv')

df_blogs_features = extract_features_syntactic(df_blogs_tokenized)
# print(df_blogs_features)
df_blogs_features.to_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized_syntactic.csv', index=False)

# The features are read back from the file just written, so the statistics
# are computed on the CSV-serialised values.
df_blogs_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized_syntactic.csv')

blogs_statistics = calculate_statistics_syntactic(df_blogs_features)
# print(blogs_statistics)

# Stringify every statistic value and store the result as a two-column
# (Metric, blogs) table.
flattened_data = {k: str(v) for k, v in blogs_statistics.items()}
df_blogs_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'blogs'])
df_blogs_statistics.to_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized_statistics_syntactic.csv', index=False)
df_blogs_statistics

In [None]:
# N-gram based
# N-gram statistics are computed directly from the tokenized, POS-tagged CSV
# written by the first cell of this section.
df_blogs_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized.csv')

blogs_statistics = calculate_statistics_ngrams(df_blogs_tokenized)
# print(blogs_statistics)

# Each metric's value is iterated as (ngram, count) pairs and turned into a
# dict keyed by the n-gram as a tuple.
blogs_statistics = {k: {tuple(ngram): count for ngram, count in v} for k, v in blogs_statistics.items()}
df_blogs_statistics = pd.DataFrame(list(blogs_statistics.items()), columns=['Metric', 'blogs'])
df_blogs_statistics.to_csv('/content/drive/MyDrive/msc_project/data/blogs/preprocessed_blogs_tokenized_statistics_ngrams.csv', index=False)
df_blogs_statistics

In [None]:
# Hardness
def sample_prompts(group, n=100):
    """Return ``group`` capped at ``n`` rows (fixed-seed sample when larger)."""
    return group.sample(n=n, random_state=42) if len(group) > n else group


df = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs/blogs.csv')
df.columns = ['user_name', 'prompt']
unique_user_names_count = df['user_name'].nunique()
print('Unique Users:', unique_user_names_count)
df_filtered = df.groupby('user_name').filter(lambda x: len(x) >= 100)
# Count on the filtered frame so the printed number matches its label.
unique_user_names_count = df_filtered['user_name'].nunique()
print(f"Number of users with at least 100 prompts: {unique_user_names_count}")

# Relative hardness for the `lim` most prolific users, each capped at 100 texts.
rh_blog = []
for lim in [5, 10, 25, 50]:
  print(lim)
  # value_counts() sorts by frequency, so the first `lim` labels are the top users.
  list_spk = df_filtered['user_name'].value_counts().index[:lim].tolist()
  sub_df = df_filtered[df_filtered['user_name'].isin(list_spk)]
  sampled_df = sub_df.groupby('user_name').apply(sample_prompts).reset_index(drop=True)
  rh_blog.append(rel_hardness(sampled_df, 'user_name', 'prompt'))

rh_blog

In [None]:
# Topics

# Build the argument namespace expected by topics_analysis. The parser
# defaults point at a CCAT10 file, but all three options are overridden below.
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, default='ccat10', help='data')
parser.add_argument('--data_path', type=str, default='/home/yunita/Data/Dataset/Stamatatos/c10_traintest.csv', help='data path')
parser.add_argument('--n_topics', help='number of topics')

# parse_args() without arguments reads sys.argv, so an explicit argument list
# is supplied instead.
# args = parser.parse_args()
# --n_topics is a string holding a list literal; topics_analysis turns it
# into a Python list.
args = parser.parse_args(args=[
    '--data', 'blogs',
    '--data_path', '/content/drive/MyDrive/msc_project/data/blogs/blogs.csv',
    '--n_topics', '[3, 10, 20, 30, 40, 50]'
])

topics_analysis(args)

## Blogs50

In [None]:
# Load the Blogs50 corpus, rename its two columns to the names used
# throughout the notebook, then tokenize and POS-tag every text (with a
# progress bar) and save the result for the feature cells below.
# This section reuses the df_blogs* variable names of the Blogs section.
tqdm.pandas()
df_blogs = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs50/blogs50.csv')
# Assumes the file has exactly two columns, author first and text second.
df_blogs.columns = ['user_name', 'prompt']
df_blogs['tokenized_prompt'] = df_blogs['prompt'].progress_apply(tokenize)
df_blogs['pos_tags'] = df_blogs['tokenized_prompt'].progress_apply(get_pos_tags)
df_blogs.to_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized.csv', index=False)
df_blogs

In [None]:
# Lexical
# Extract lexical features from the tokenized Blogs50 texts, save them, and
# compute the corpus-level lexical statistics.
tqdm.pandas()
df_blogs_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized.csv')

df_blogs_features = extract_features_lexical(df_blogs_tokenized)
df_blogs_features.to_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized_lexical.csv', index=False)

# The features are read back from the file just written, so the statistics
# are computed on the CSV-serialised values.
df_blogs_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized_lexical.csv')

blogs_statistics = calculate_statistics_lexical(df_blogs_features)
# print(blogs_statistics)

# Stringify every statistic value and store the result as a two-column
# (Metric, blogs50) table.
flattened_data = {k: str(v) for k, v in blogs_statistics.items()}
df_blogs_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'blogs50'])
df_blogs_statistics.to_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized_statistics_lexical.csv', index=False)
df_blogs_statistics

In [None]:
# Syntactic
# Extract syntactic features from the tokenized Blogs50 texts, save them,
# and compute the corpus-level syntactic statistics.
tqdm.pandas()
df_blogs_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized.csv')

df_blogs_features = extract_features_syntactic(df_blogs_tokenized)
# print(df_blogs_features)
df_blogs_features.to_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized_syntactic.csv', index=False)

# The features are read back from the file just written, so the statistics
# are computed on the CSV-serialised values.
df_blogs_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized_syntactic.csv')

blogs_statistics = calculate_statistics_syntactic(df_blogs_features)
# print(blogs_statistics)

# Stringify every statistic value and store the result as a two-column
# (Metric, blogs50) table.
flattened_data = {k: str(v) for k, v in blogs_statistics.items()}
df_blogs_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'blogs50'])
df_blogs_statistics.to_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized_statistics_syntactic.csv', index=False)
df_blogs_statistics

In [None]:
# N-gram based
# N-gram statistics are computed directly from the tokenized, POS-tagged CSV
# written by the first cell of this section.
df_blogs_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized.csv')

blogs_statistics = calculate_statistics_ngrams(df_blogs_tokenized)
# print(blogs_statistics)

# Each metric's value is iterated as (ngram, count) pairs and turned into a
# dict keyed by the n-gram as a tuple.
blogs_statistics = {k: {tuple(ngram): count for ngram, count in v} for k, v in blogs_statistics.items()}
df_blogs_statistics = pd.DataFrame(list(blogs_statistics.items()), columns=['Metric', 'blogs50'])
df_blogs_statistics.to_csv('/content/drive/MyDrive/msc_project/data/blogs50/preprocessed_blogs50_tokenized_statistics_ngrams.csv', index=False)
df_blogs_statistics

In [None]:
# Hardness
def sample_prompts(group, n=100):
    """Cap a group at `n` rows.

    Groups with at most `n` rows are returned unchanged; larger groups are
    down-sampled to exactly `n` rows with a fixed seed (random_state=42).
    """
    if len(group) <= n:
        return group
    return group.sample(n=n, random_state=42)


# Relative hardness of blogs50 for an increasing number of candidate authors.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs50/blogs50.csv')
df.columns = ['user_name', 'prompt']
unique_user_names_count = df['user_name'].nunique()
print('Unique Users:', unique_user_names_count)

# Keep only authors that contributed at least 100 texts.
df_filtered = df.groupby('user_name').filter(lambda x: len(x) >= 100)
# Count on the *filtered* frame; counting on `df` would just repeat the total above.
unique_user_names_count = df_filtered['user_name'].nunique()
print(f"Number of users with at least 100 prompts: {unique_user_names_count}")

rh_blog = []
for lim in [5, 10, 25, 50]:
  print(lim)
  # The `lim` most prolific authors (value_counts sorts by count, descending).
  list_spk = list(df_filtered['user_name'].value_counts()[:lim].index)
  sub_df = df_filtered[df_filtered['user_name'].isin(list_spk)]
  # Cap every author at 100 texts so that each one carries equal weight.
  sampled_df = sub_df.groupby('user_name').apply(sample_prompts).reset_index(drop=True)
  rh_blog.append(rel_hardness(sampled_df, 'user_name', 'prompt'))

rh_blog

In [None]:
# Topics

# Build an argparse namespace by hand so that `topics_analysis` can be driven from the notebook.
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, default='ccat10', help='data')
# The default path below is never used here: --data_path is always passed explicitly.
parser.add_argument('--data_path', type=str, default='/home/yunita/Data/Dataset/Stamatatos/c10_traintest.csv', help='data path')
parser.add_argument('--n_topics', help='number of topics')

# An explicit `args=[...]` list is passed so that argparse does not read the kernel's own sys.argv.
# args = parser.parse_args()
args = parser.parse_args(args=[
    '--data', 'blogs50',
    '--data_path', '/content/drive/MyDrive/msc_project/data/blogs50/blogs50.csv',
    # No `type=` is declared, so this reaches topics_analysis as a plain string;
    # presumably it is parsed into a list there — confirm.
    '--n_topics', '[3, 10, 20, 30, 40, 50]'
])

topics_analysis(args)

# Compare datasets

In [None]:
# Load the first DiffusionDB random-100 split and stack train and test into one frame.
df_train = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/train_random100_1.csv')
df_test = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_1.csv')

# reset_index(drop=True) discards the per-file indices so that row labels are unique again.
df = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)
df

In [None]:
# Tokenize and tag the prompts, then derive all three feature families in place on `df`.
tqdm.pandas()
df['tokenized_prompt'] = df['prompt'].apply(tokenize)
df['pos_tags'] = df['tokenized_prompt'].apply(get_pos_tags)
df = extract_features_lexical(df)
df = extract_features_syntactic(df)
df = extract_features_ngrams(df)

# Corpus-level statistics per feature family.
lexical1 = calculate_statistics_lexical(df)
ngrams1 = calculate_statistics_ngrams(df)
syntactic1 = calculate_statistics_syntactic(df)
print(lexical1)
print(ngrams1)
print(syntactic1)

# Stringify every value so that each statistic fits in one CSV cell.
lexical2 = {metric: str(value) for metric, value in lexical1.items()}
ngrams2 = {metric: str(value) for metric, value in ngrams1.items()}
syntactic2 = {metric: str(value) for metric, value in syntactic1.items()}

lexical3 = pd.DataFrame(list(lexical2.items()), columns=['Metric', 'DiffusionDB'])
ngrams3 = pd.DataFrame(list(ngrams2.items()), columns=['Metric', 'DiffusionDB'])
syntactic3 = pd.DataFrame(list(syntactic2.items()), columns=['Metric', 'DiffusionDB'])

# One table: lexical rows first, then n-gram rows, then syntactic rows.
statistics = pd.concat([lexical3, ngrams3, syntactic3], ignore_index=True)

df.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/features_random100_1.csv', index=False)
statistics.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/statistics_random100_1.csv', index=False)
statistics

In [None]:
# Display the feature-augmented frame.
df

In [None]:
# List the feature columns that are now present on `df`.
df.columns

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scalar features to plot: one histogram (with KDE overlay) per column of `df`.
columns = ['average_word_length', 'short_words_ratio', 'percentage_of_digits',
       'percentage_of_uppercase', 'average_sentence_length', 'average_document_length']

for column in columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    # kde=True overlays the kernel density estimate on the 30-bin histogram.
    sns.histplot(df[column], kde=True, bins=30, ax=ax)
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()


In [None]:

# from sklearn.model_selection import train_test_split
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/author_texts_cleaned.csv')
# df.columns = ['user_name', 'prompts']
# # df_selected = df[['user_name', 'prompt']]
# df_filtered = df.groupby('user_name').filter(lambda x: len(x) >= 100)
# print('number of authors', len(df_filtered['user_name'].drop_duplicates()))

# for idx in range(3):
#   sampled_authors = df_filtered['user_name'].drop_duplicates().sample(n=100)
#   df_sampled = df_filtered[df_filtered['user_name'].isin(sampled_authors)]
#   df_final = df_sampled.groupby('user_name').apply(lambda x: x.sample(n=100)).reset_index(drop=True)

#   train_data = pd.DataFrame()
#   test_data = pd.DataFrame()

#   for author in df_final['user_name'].unique():
#       author_data = df_final[df_final['user_name'] == author]
#       train, test = train_test_split(author_data, test_size=0.2)
#       train_data = pd.concat([train_data, train])
#       test_data = pd.concat([test_data, test])

#   train_data.to_csv(f'/content/drive/MyDrive/msc_project/data/twitter_micro/processed/train_random100_{idx+1}.csv', index=False)
#   test_data.to_csv(f'/content/drive/MyDrive/msc_project/data/twitter_micro/processed/test_random100_{idx+1}.csv', index=False)
#   # print(train_data)
#   # print(test_data)

In [None]:
# Load the first Twitter-micro random-100 split and stack train and test into one frame.
# NOTE: this rebinds `df`, replacing the DiffusionDB frame built in the cells above.
df_train = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/train_random100_1.csv')
df_test = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/test_random100_1.csv')

df = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)
# Rename to the column names that the feature pipeline below reads ('prompt').
df.columns = ['user_name', 'prompt']
df

In [None]:
# Tokenize and tag the tweets, then derive all three feature families in place on `df`.
tqdm.pandas()
df['tokenized_prompt'] = df['prompt'].apply(tokenize)
df['pos_tags'] = df['tokenized_prompt'].apply(get_pos_tags)
df = extract_features_lexical(df)
df = extract_features_syntactic(df)
df = extract_features_ngrams(df)

# Corpus-level statistics per feature family.
lexical1 = calculate_statistics_lexical(df)
ngrams1 = calculate_statistics_ngrams(df)
syntactic1 = calculate_statistics_syntactic(df)
print(lexical1)
print(ngrams1)
print(syntactic1)

# Stringify every value so that each statistic fits in one CSV cell.
lexical2 = {metric: str(value) for metric, value in lexical1.items()}
ngrams2 = {metric: str(value) for metric, value in ngrams1.items()}
syntactic2 = {metric: str(value) for metric, value in syntactic1.items()}

lexical3 = pd.DataFrame(list(lexical2.items()), columns=['Metric', 'Twitter_micro'])
ngrams3 = pd.DataFrame(list(ngrams2.items()), columns=['Metric', 'Twitter_micro'])
syntactic3 = pd.DataFrame(list(syntactic2.items()), columns=['Metric', 'Twitter_micro'])

# One table: lexical rows first, then n-gram rows, then syntactic rows.
statistics = pd.concat([lexical3, ngrams3, syntactic3], ignore_index=True)

df.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/features_random100_1.csv', index=False)
statistics.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/statistics_random100_1.csv', index=False)
statistics

In [None]:
# Display the feature-augmented frame.
df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scalar features to plot: one histogram (with KDE overlay) per column of `df`.
columns = ['average_word_length', 'short_words_ratio', 'percentage_of_digits',
       'percentage_of_uppercase', 'average_sentence_length', 'average_document_length']

for column in columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    # kde=True overlays the kernel density estimate on the 30-bin histogram.
    sns.histplot(df[column], kde=True, bins=30, ax=ax)
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()


In [None]:
# test whether the data is normally distributed
import scipy.stats as stats
from scipy.stats import skew, kurtosis

def test_normal_distribution(df):
  columns = ['average_word_length', 'short_words_ratio', 'percentage_of_digits',
        'percentage_of_uppercase', 'average_sentence_length', 'average_document_length']

  for column in columns:
    print(column)
    data = df[column].tolist()
    # Shapiro-Wilk Test
    stat, p = stats.shapiro(data)
    print(f'Shapiro-Wilk Test: Statistics={stat}, p={p}')

    # Kolmogorov-Smirnov Test
    stat, p = stats.kstest(data, 'norm')
    print(f'Kolmogorov-Smirnov Test: Statistics={stat}, p={p}')

    # Anderson-Darling Test
    result = stats.anderson(data, dist='norm')
    print(f'Anderson-Darling Test: Statistic={result.statistic}')
    for i in range(len(result.critical_values)):
        sl = result.significance_level[i]
        cv = result.critical_values[i]
        if result.statistic < cv:
            print(f'{sl}%: {cv}, data looks normal (fail to reject H0)')
        else:
            print(f'{sl}%: {cv}, data does not look normal (reject H0)')

    # D'Agostino's K^2 Test
    stat, p = stats.normaltest(data)
    print(f'D\'Agostino\'s K^2 Test: Statistics={stat}, p={p}')

    skewness = skew(data)
    kurt = kurtosis(data)

    print(f'Skewness: {skewness}')
    print(f'Kurtosis: {kurt}')  # If kurtosis() gives excess kurtosis (subtract 3 from kurtosis)

    print()

# NOTE(review): within the visible part of this notebook, `df_diffusiondb` and
# `df_twitter` are first assigned in a later cell (the one that reads the two
# features_random100_1.csv files). On a fresh kernel that cell has to run before this one.
print('---diffusiondb---')
test_normal_distribution(df_diffusiondb)
print('---df_twitter---')
test_normal_distribution(df_twitter)

In [None]:
# compare datasets
import scipy.stats as stats

def compare_two_datasets(df1, df2, columns=None, alpha=0.05):
  """Compare two datasets column by column with the Mann-Whitney U test.

  For every column this prints the U statistic and p-value, the normal
  approximation z-score of U, the effect size r = z / sqrt(n1 + n2), and a
  reject / fail-to-reject verdict at significance level `alpha`.

  Args:
    df1: DataFrame for the first dataset.
    df2: DataFrame for the second dataset.
    columns: iterable of column names present in both frames. Defaults to the
      six scalar lexical features used throughout this notebook.
    alpha: significance level for the verdict (default 0.05).

  Returns:
    None. All results are printed.
  """
  if columns is None:
    columns = ['average_word_length', 'short_words_ratio', 'percentage_of_digits',
          'percentage_of_uppercase', 'average_sentence_length', 'average_document_length']

  for column in columns:
    print(column)
    data1 = df1[column].tolist()
    data2 = df2[column].tolist()

    # Mann-Whitney U Test
    stat, p = stats.mannwhitneyu(data1, data2)

    # Convert U statistic to z-score (normal approximation, without tie correction).
    # This has to happen *before* z and r are printed; printing them first raised
    # UnboundLocalError on the very first column.
    n1 = len(data1)
    n2 = len(data2)
    mean_u = n1 * n2 / 2
    std_u = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
    z = (stat - mean_u) / std_u

    # Calculate the effect size (Cohen's r); the sign follows the direction of the shift.
    r = z / np.sqrt(n1 + n2)

    print(f'Mann-Whitney U Test: Statistics={stat}, p={p}')
    print(f"z-score: {z}")
    print(f"Effect size (Cohen's r): {r}")

    # Interpretation
    if p < alpha:
      print("The two datasets have significantly different distributions (reject H0).")
    else:
      print("The two datasets do not have significantly different distributions (fail to reject H0).")

    print()


# NOTE(review): `df_diffusiondb` / `df_twitter` are first assigned in a later cell of the
# visible notebook (the features_random100_1.csv loads); run that cell first.
compare_two_datasets(df_diffusiondb, df_twitter)


In [None]:
!pip install scikit-bio

In [None]:
# Reload the per-document feature tables written by the feature-extraction cells above.
# Dict-valued feature columns come back from CSV as strings.
# NOTE(review): the normality and Mann-Whitney cells earlier in the notebook also use
# these two names, so this cell should be moved above them.
df_diffusiondb = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/features_random100_1.csv')
df_twitter = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/features_random100_1.csv')

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform  # Correct import
from skbio.stats.ordination import pcoa
from skbio.stats.distance import permanova, DistanceMatrix  # Import DistanceMatrix

# PERMANOVA
def compare_two_datasets_multiple(df1, df2, columns=None, permutations=999):
    """Run a PERMANOVA per dict-valued feature column to compare two datasets.

    Every cell of a feature column holds a frequency mapping (stored as its
    string repr after a CSV round trip). The mappings of both datasets are
    expanded into one wide numeric table (missing keys become 0), a Euclidean
    distance matrix is built over all rows, and PERMANOVA tests whether the two
    datasets differ. The result and the effect size eta^2 (= R^2) are printed.

    Args:
        df1: feature DataFrame of the first dataset.
        df2: feature DataFrame of the second dataset.
        columns: iterable of dict-valued column names. Defaults to the twelve
            frequency features used in this notebook.
        permutations: number of permutations passed to `permanova`.

    Returns:
        None. All results are printed. The input frames are left untouched, so
        the function can be called repeatedly on the same frames.
    """
    if columns is None:
        columns = ['digit_frequency', 'pos_frequency', 'stopwords_frequency',
                   'punctuation_frequency', 'letter_frequency',
                   'char_bigrams_common_frequency', 'char_trigrams_common_frequency',
                   'word_unigrams_common_frequency', 'word_bigrams_common_frequency',
                   'word_trigrams_common_frequency', 'pos_bigrams_common_frequency',
                   'pos_trigrams_common_frequency']

    def _parse_cell(cell):
        # SECURITY(review): eval() executes arbitrary code found in the CSV. It is kept
        # because the stored reprs may not all be plain literals; switch to
        # ast.literal_eval once that has been confirmed.
        # Cells that are already parsed (not strings) are passed through unchanged.
        return eval(cell) if isinstance(cell, str) else cell

    for column in columns:
        # Parse into local lists instead of overwriting df1[column] / df2[column]:
        # the in-place overwrite made a second call fail, because eval() was then
        # applied to dicts rather than strings.
        records1 = [_parse_cell(cell) for cell in df1[column]]
        records2 = [_parse_cell(cell) for cell in df2[column]]
        df11 = pd.json_normalize(records1)
        df22 = pd.json_normalize(records2)

        # Concatenate the two DataFrames along rows; keys missing on one side become NaN.
        combined_df = pd.concat([df11, df22], axis=0)
        if combined_df.isnull().values.any():
            print(f"NaN values found in {column}. Filling NaNs with 0.")
            combined_df = combined_df.fillna(0)  # an absent key means frequency 0

        # Square Euclidean distance matrix wrapped in the type `permanova` expects.
        distance_matrix = DistanceMatrix(squareform(pdist(combined_df, metric='euclidean')))

        # Create a grouping variable that distinguishes between df1 and df2
        labels = np.array([0]*len(df11) + [1]*len(df22))

        # Perform PERMANOVA
        result = permanova(distance_matrix, labels, permutations=permutations)

        # Effect size. With pseudo-F = (SSB / df_between) / (SSW / df_within):
        #   eta^2 = SSB / (SSB + SSW) = F * df_between / (F * df_between + df_within)
        # The previous expression cancelled F and returned the same constant for every column.
        f_stat = result['test statistic']
        df_between = len(np.unique(labels)) - 1
        df_within = len(labels) - len(np.unique(labels))
        eta_squared = (f_stat * df_between) / (f_stat * df_between + df_within)

        print(f"PERMANOVA result for {column}:")
        print(result)
        print(f"Eta^2 (Effect Size) for {column}: {eta_squared}")
        print("\n")

# Run the PERMANOVA comparison on the two feature frames loaded from features_random100_1.csv.
compare_two_datasets_multiple(df_diffusiondb, df_twitter)


# DiffusionDB (clean)

In [None]:
# Tokenize every prompt of the cleaned DiffusionDB export and cache the result as CSV.
# NOTE: this rebinds `df_diffusiondb`, which the comparison cells above use for the feature frame.
df_diffusiondb = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/clean/all.csv')
df_diffusiondb['tokenized_prompt'] = df_diffusiondb['prompt'].apply(tokenize)
df_diffusiondb.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/clean/preprocessed_diffusiondb_tokenized.csv', index=False)

In [None]:
# Lexical
# Pipeline for cleaned DiffusionDB: tokenized CSV -> lexical feature CSV ->
# summary statistics CSV (one row per metric, values stringified).
tqdm.pandas()
df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/clean/preprocessed_diffusiondb_tokenized.csv')

df_diffusiondb_features = extract_features_lexical(df_diffusiondb_tokenized)
df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/clean/preprocessed_diffusiondb_tokenized_lexical.csv', index=False)

# Read back the file that was just written; statistics are computed on the round-tripped frame.
df_diffusiondb_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/clean/preprocessed_diffusiondb_tokenized_lexical.csv')

diffusiondb_statistics = calculate_statistics_lexical(df_diffusiondb_features)
# print(diffusiondb_statistics)

# Every statistic value is converted with str() so that it fits in a single CSV cell.
flattened_data = {k: str(v) for k, v in diffusiondb_statistics.items()}
df_diffusiondb_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'DiffusionDB'])
df_diffusiondb_statistics.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/clean/preprocessed_diffusiondb_tokenized_statistics_lexical.csv', index=False)
df_diffusiondb_statistics

In [None]:
# Topics

# Build an argparse namespace by hand so that `topics_analysis` can be driven from the notebook.
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, default='ccat10', help='data')
# The default path below is never used here: --data_path is always passed explicitly.
parser.add_argument('--data_path', type=str, default='/home/yunita/Data/Dataset/Stamatatos/c10_traintest.csv', help='data path')
parser.add_argument('--n_topics', help='number of topics')

# An explicit `args=[...]` list is passed so that argparse does not read the kernel's own sys.argv.
# args = parser.parse_args()
args = parser.parse_args(args=[
    '--data', 'diffusiondb',
    '--data_path', '/content/drive/MyDrive/msc_project/data/diffusiondb/clean/all.csv',
    # No `type=` is declared, so this reaches topics_analysis as a plain string;
    # presumably it is parsed into a list there — confirm.
    '--n_topics', '[3, 10, 20, 30, 40, 50]'
])

topics_analysis(args)

# DiffusionDB 10000

In [None]:
# Tokenize every prompt of the paraphrased (final_llama3.csv) export and cache the result as CSV.
df_diffusiondb = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/final_llama3.csv')
df_diffusiondb['tokenized_prompt'] = df_diffusiondb['prompt'].apply(tokenize)
df_diffusiondb.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/preprocessed_diffusiondb_tokenized.csv', index=False)

In [None]:
# Lexical
# Pipeline for paraphrased DiffusionDB: tokenized CSV -> lexical feature CSV ->
# summary statistics CSV (one row per metric, values stringified).
tqdm.pandas()
df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/preprocessed_diffusiondb_tokenized.csv')

df_diffusiondb_features = extract_features_lexical(df_diffusiondb_tokenized)
df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/preprocessed_diffusiondb_tokenized_lexical.csv', index=False)

# Read back the file that was just written; statistics are computed on the round-tripped frame.
df_diffusiondb_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/preprocessed_diffusiondb_tokenized_lexical.csv')

diffusiondb_statistics = calculate_statistics_lexical(df_diffusiondb_features)
# print(diffusiondb_statistics)

# Every statistic value is converted with str() so that it fits in a single CSV cell.
flattened_data = {k: str(v) for k, v in diffusiondb_statistics.items()}
df_diffusiondb_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'DiffusionDB'])
df_diffusiondb_statistics.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/preprocessed_diffusiondb_tokenized_statistics_lexical.csv', index=False)
df_diffusiondb_statistics

In [None]:
# Syntactic
# NOTE(review): unlike the Lexical cell of this section, every path here points at the
# top-level data/ folder rather than data/diffusiondb/paraphrased/ — confirm which
# tokenized file is intended.
tqdm.pandas()
df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')
df_diffusiondb_features = extract_features_syntactic(df_diffusiondb_tokenized)
# print(df_diffusiondb_features)
df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_syntactic.csv', index=False)

# Read back the file that was just written; statistics are computed on the round-tripped frame.
df_diffusiondb_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_syntactic.csv')

diffusiondb_statistics = calculate_statistics_syntactic(df_diffusiondb_features)
# print(diffusiondb_statistics)

# Every statistic value is converted with str() so that it fits in a single CSV cell.
flattened_data = {k: str(v) for k, v in diffusiondb_statistics.items()}
df_diffusiondb_statistics = pd.DataFrame(list(flattened_data.items()), columns=['Metric', 'DiffusionDB'])
df_diffusiondb_statistics.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_statistics_syntactic.csv', index=False)
df_diffusiondb_statistics

In [None]:
# N-gram based
# One-off step (kept commented out): add a `pos_tags` column to the tokenized CSV in place.
# df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')
# df_diffusiondb_tokenized['pos_tags'] = df_diffusiondb_tokenized['tokenized_prompt'].apply(get_pos_tags)
# df_diffusiondb_tokenized.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv', index=False)

df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')

# Per-document n-gram feature extraction is disabled; statistics are computed from the tokenized frame.
# df_diffusiondb_features = extract_features_ngrams(df_diffusiondb_tokenized)
# df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_ngrams.csv', index=False)

# df_diffusiondb_features = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_ngrams.csv')

diffusiondb_statistics = calculate_statistics_ngrams(df_diffusiondb_tokenized)
# print(diffusiondb_statistics)

# Each statistic is iterated as (ngram, count) pairs and rebuilt as a dict keyed by tuple(ngram).
diffusiondb_statistics = {k: {tuple(ngram): count for ngram, count in v} for k, v in diffusiondb_statistics.items()}
df_diffusiondb_statistics = pd.DataFrame(list(diffusiondb_statistics.items()), columns=['Metric', 'DiffusionDB'])
df_diffusiondb_statistics.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_statistics_ngrams.csv', index=False)
df_diffusiondb_statistics

In [None]:
# Hardness
def sample_prompts(group, n=100):
    """Return `group` limited to `n` rows.

    A group that already has `n` rows or fewer is handed back as is; a larger
    one is reduced to `n` rows drawn with random_state=42.
    """
    needs_sampling = len(group) > n
    if not needs_sampling:
        return group
    return group.sample(n=n, random_state=42)


# Relative hardness of DiffusionDB for an increasing number of candidate authors.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb.csv')
unique_user_names_count = df['user_name'].nunique()
print('Unique Users:', unique_user_names_count)

# Keep only users that contributed at least 100 prompts.
df_filtered = df.groupby('user_name').filter(lambda x: len(x) >= 100)
# Count on the *filtered* frame; counting on `df` would just repeat the total above.
unique_user_names_count = df_filtered['user_name'].nunique()
print(f"Number of users with at least 100 prompts: {unique_user_names_count}")

# The result list keeps the name `rh_blog` from the blogs50 cell, although it holds DiffusionDB results.
rh_blog = []
for lim in [5, 10, 25, 50, 75, 100, 150, 200]:
  print(lim)
  # The `lim` most prolific users (value_counts sorts by count, descending).
  list_spk = list(df_filtered['user_name'].value_counts()[:lim].index)
  sub_df = df_filtered[df_filtered['user_name'].isin(list_spk)]
  # Cap every user at 100 prompts so that each one carries equal weight.
  sampled_df = sub_df.groupby('user_name').apply(sample_prompts).reset_index(drop=True)
  rh_blog.append(rel_hardness(sampled_df, 'user_name', 'prompt'))

rh_blog

In [None]:
# Topics

# Build an argparse namespace by hand so that `topics_analysis` can be driven from the notebook.
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, default='ccat10', help='data')
# The default path below is never used here: --data_path is always passed explicitly.
parser.add_argument('--data_path', type=str, default='/home/yunita/Data/Dataset/Stamatatos/c10_traintest.csv', help='data path')
parser.add_argument('--n_topics', help='number of topics')

# An explicit `args=[...]` list is passed so that argparse does not read the kernel's own sys.argv.
# args = parser.parse_args()
args = parser.parse_args(args=[
    '--data', 'diffusiondb',
    '--data_path', '/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/final_llama3.csv',
    # No `type=` is declared, so this reaches topics_analysis as a plain string;
    # presumably it is parsed into a list there — confirm.
    '--n_topics', '[3, 10, 20, 30, 40, 50]'
])

topics_analysis(args)

In [None]:
# Debug output: shows whichever feature frame was assigned to this name last.
print(df_diffusiondb_features)

In [None]:
# Smoke test on the first 50 rows only. NOTE: this overwrites `diffusiondb_statistics`.
diffusiondb_statistics = calculate_statistics_syntactic(df_diffusiondb_features.head(50))

In [None]:
# Smoke test on the first 5 rows only. NOTE: this overwrites both
# `df_diffusiondb_features` and `diffusiondb_statistics` with the 5-row results.
df_diffusiondb_features = extract_features_syntactic(df_diffusiondb_tokenized.head(5))
diffusiondb_statistics = calculate_statistics_syntactic(df_diffusiondb_features)
print(diffusiondb_statistics)

In [None]:
# Reload the tokenized DiffusionDB table; the spaCy-doc caching experiment below is disabled.
tqdm.pandas()
df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')

# df_diffusiondb_tokenized = df_diffusiondb_tokenized.head(5)
# df_diffusiondb_tokenized['spacy_doc'] = df_diffusiondb_tokenized['prompt'].progress_apply(process_text)

# with open('/content/drive/MyDrive/msc_project/data/spacy_docs.pkl', 'wb') as f:
#     pickle.dump(df_diffusiondb_tokenized['spacy_doc'].tolist(), f)

# with open('/content/drive/MyDrive/msc_project/data/spacy_docs.pkl', 'rb') as f:
#     spacy_docs = pickle.load(f)

# df_diffusiondb_tokenized['spacy_doc'] = spacy_docs
# print(df_diffusiondb_tokenized)
# # print(df_diffusiondb_features)

# df_diffusiondb_features = extract_features_syntactic(df_diffusiondb_tokenized)
# df_diffusiondb_features

In [None]:
# Run `process_text` on a single example sentence.
# NOTE(review): `process_text` is not defined in the visible part of the notebook — confirm it exists above.
text = "The four main performers are riveting."
# df_diffusiondb_tokenized = pd.DataFrame(data)
# df_diffusiondb_tokenized['spacy_doc'] = df_diffusiondb_tokenized['prompt'].progress_apply(process_text)
doc = process_text(text)
# df_diffusiondb_features = extract_features_syntactic(df_diffusiondb_tokenized)
# print(df_diffusiondb_features)
# diffusiondb_statistics = calculate_statistics_syntactic(df_diffusiondb_features)
# print(diffusiondb_statistics)

In [None]:
import spacy
import benepar

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')
# Fetch the benepar_en3 model and add the "benepar" component to the spaCy pipeline.
benepar.download("benepar_en3")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})
# Sample sentence
sentence = "The four main performers are riveting."

# Parse the sentence
doc = nlp(sentence)
# Only the first sentence of the document is inspected.
sent = list(doc.sents)[0]

# Print the phrase structure tree
print(sent._.parse_string)

# Display the tree using nltk.Tree
from nltk import Tree

def print_tree(tree):
    """Pretty-print `tree` if it is an nltk Tree; silently ignore any other value."""
    if isinstance(tree, Tree):
        tree.pretty_print()

# Turn the bracketed parse string into an nltk Tree and draw it as ASCII art.
nltk_tree = Tree.fromstring(sent._.parse_string)
print_tree(nltk_tree)


def extract_ngrams(tree, n):
    """Collect every run of `n` adjacent sibling subtrees in `tree`.

    Nodes are visited in pre-order. For each Tree node, every window of `n`
    consecutive children is kept when all of its members are Trees themselves
    (windows that contain a leaf token are skipped).

    Returns:
        A list of windows; each window is the slice `node[i:i+n]`.
    """
    collected = []
    pending = [tree]
    while pending:
        node = pending.pop()
        if not isinstance(node, Tree):
            continue
        for start in range(len(node) - n + 1):
            window = node[start:start + n]
            if all(isinstance(member, Tree) for member in window):
                collected.append(window)
        # Push children reversed so that the first child is popped next (pre-order).
        pending.extend(reversed(node))
    return collected

def print_ngrams(ngrams):
    """Print each n-gram as its surface text, then its raw repr, then a blank line.

    Every member of an n-gram must provide `leaves()` returning its tokens.
    """
    for group in ngrams:
        surface = ' '.join(' '.join(member.leaves()) for member in group)
        print(surface)
        print(group)
        print()

# Extract n-grams of phrase structures
n = 2  # Change this to any n you want
# Pass `n` itself (the literal 2 was hard-coded before, so changing `n` had no effect).
# The result is stored as `phrase_ngrams`, not `ngrams`: a top-level `ngrams` would
# overwrite the `nltk.util.ngrams` function imported at the top of the notebook.
phrase_ngrams = extract_ngrams(nltk_tree, n)

# Print the extracted n-grams
print_ngrams(phrase_ngrams)

In [None]:
# Relies on `diffusiondb_statistics` left behind by an earlier cell; every value must still
# iterate as (ngram, count) pairs.
# NOTE(review): running this conversion twice in a row would iterate the dict produced by the
# first run (its keys, not pairs) — confirm the cell is only run once per fresh statistics.
diffusiondb_statistics = {k: {tuple(ngram): count for ngram, count in v} for k, v in diffusiondb_statistics.items()}
df_diffusiondb_statistics = pd.DataFrame(list(diffusiondb_statistics.items()), columns=['feature', 'ngrams'])
# NOTE(review): this is the same file that the "Syntactic" cell writes with columns
# ['Metric', 'DiffusionDB']; whichever cell runs last wins.
df_diffusiondb_statistics.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_statistics_syntactic.csv', index=False)
df_diffusiondb_statistics

In [None]:
# Variant 1: statistics for the local file '1.csv' (relative to the kernel's working directory).
# NOTE(review): `extract_features` / `calculate_statistics` (without a _lexical/_syntactic/_ngrams
# suffix) are not defined in the visible part of the notebook — confirm they still exist.
# df = pd.read_csv('/content/drive/MyDrive/authorship_inference_attack/real_diffusionDB_large_all_features 1.csv')
df_diffusiondb1 = pd.read_csv('1.csv')
# Force string dtype so that empty prompts read as NaN become the text 'nan' instead of a float.
df_diffusiondb1['prompt'] = df_diffusiondb1['prompt'].astype(str)
# df_diffusiondb1['tokenized_prompt'] = df_diffusiondb1['prompt'].apply(tokenize)
# df_diffusiondb1.to_csv('1.csv', index=False)

# df_diffusiondb.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv', index=False)

# df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')

df_diffusiondb_features1 = extract_features(df_diffusiondb1)
# df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_lexical.csv', index=False)

diffusiondb_statistics1 = calculate_statistics(df_diffusiondb_features1)
# print(diffusiondb_statistics)

# One row per statistic (dict keys become the index), with a single 'DiffusionDB' column.
df_diffusiondb_statistics1 = pd.DataFrame.from_dict(diffusiondb_statistics1, orient='index', columns=['DiffusionDB'])
df_diffusiondb_statistics1

In [None]:
# Variant 2: statistics for the local file '2.csv'. The commented-out lines show how it was
# produced: same source CSV, but tokenized with a plain whitespace split.
# df_diffusiondb2 = pd.read_csv('/content/drive/MyDrive/authorship_inference_attack/real_diffusionDB_large_all_features 1.csv')

# df_diffusiondb2['prompt'] = df_diffusiondb2['prompt'].astype(str)
# df_diffusiondb2['tokenized_prompt'] = df_diffusiondb2['prompt'].apply(lambda x: x.split())
# df_diffusiondb2.to_csv('2.csv', index=False)
df_diffusiondb2 = pd.read_csv('2.csv')
# Force string dtype so that empty prompts read as NaN become the text 'nan' instead of a float.
df_diffusiondb2['prompt'] = df_diffusiondb2['prompt'].astype(str)
# df_diffusiondb.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv', index=False)

# df_diffusiondb_tokenized = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized.csv')

# NOTE(review): `extract_features` / `calculate_statistics` are not defined in the visible
# part of the notebook — confirm they still exist.
df_diffusiondb_features2 = extract_features(df_diffusiondb2)
# df_diffusiondb_features.to_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb_tokenized_lexical.csv', index=False)

diffusiondb_statistics2 = calculate_statistics(df_diffusiondb_features2)
# print(diffusiondb_statistics)

# One row per statistic (dict keys become the index), with a single 'DiffusionDB' column.
df_diffusiondb_statistics2 = pd.DataFrame.from_dict(diffusiondb_statistics2, orient='index', columns=['DiffusionDB'])
df_diffusiondb_statistics2

In [None]:
# The first 26 characters of string.ascii_letters are exactly the lowercase alphabet.
print(string.ascii_lowercase)