In [None]:
datapath = "./dataset/MassiveSumm/"
column_order = [
  'text', 'label', 'length', 'source', 'language', 'domain', 'topic',
]
selected = [
  'ar', 'ca', 'cs', 'de', 'en', 'es',
  'nl', 'pt', 'ru', 'uk', 'zh',
]

# persist the data directory so the shell cells can read it with `cat datapath.txt`
with open('datapath.txt', 'w') as path_file:
    path_file.write(datapath)

In [None]:
!mkdir -p $(cat datapath.txt)

In [None]:
#mount GDrive if datapath is on the drive
from google.colab import drive
drive.mount('/content/drive')

# Download Data


*   Downloading via the Wayback Machine is problematic, so only the Common Crawl (cc) links and MassiveSumm-full (the version provided by the authors) are used.



In [None]:
!git clone https://github.com/danielvarab/massive-summ.git
!pip install -r massive-summ/requirements.txt

In [None]:
import gdown
import pandas as pd
import glob

urls = pd.read_csv('massive-summ/urls.tsv', sep='\t')

In [None]:
urls

## Common Crawl

In [None]:
# Collect the Google Drive file ids of the Common Crawl URL lists.
# Missing cells come out of `read_csv` as NaN (never `None`), so `pd.isna` is
# needed to skip them; "-" is the explicit "no link" placeholder.
url_list = []
for l in urls['cc']:
  if pd.isna(l) or l == "-": continue
  # the id is the last path component once the trailing '/view)' is removed
  url_list.append(l.replace('/view)','').split('/')[-1])
# download every URL list into the current working directory
for file_id in url_list:
  url = f'https://drive.google.com/uc?id={file_id}'
  gdown.download(url, quiet=False)

In [None]:
%%bash

# For every downloaded Common Crawl URL list: fetch the pages, extract the
# dataset, copy it to the data directory and delete the URL list.
for f in $(ls massive-in-cc.*.jsonl.gz); do
  python massive-summ/scripts/download.py --urls $f --archive temp --n_proc 4;
  # ${f#*.} removes everything up to the first dot, i.e. the "massive-in-cc." prefix
  python massive-summ/scripts/extract.py --archive temp --dataset ${f#*.};
  # datapath.txt holds the target directory written by the configuration cell
  cp ${f#*.} $(cat datapath.txt)
  rm $f
  #break
done

## WayBack

In [None]:
!git clone https://github.com/danielvarab/da-newsroom.git
!pip install -r da-newsroom/requirements.txt
!pip install -e ./da-newsroom/newsroom-lib

In [None]:
# Collect the Google Drive file ids of the Wayback URL lists.
# Missing cells come out of `read_csv` as NaN (never `None`), so `pd.isna` is
# needed to skip them; "-" is the explicit "no link" placeholder.
url_list = []
for l in urls['wayback']:
  if pd.isna(l) or l == "-": continue
  # the id is the last path component once the trailing '/view)' is removed
  url_list.append(l.replace('/view)','').split('/')[-1])
# download every URL list into the current working directory
for file_id in url_list:
  url = f'https://drive.google.com/uc?id={file_id}'
  gdown.download(url, quiet=False)

In [None]:
%%bash

# Prefix every *.jsonl.gz file with "massive-in-wb.", skipping files whose
# name already contains "wb-" or "-wb" (grep -v drops those from the list).
for f in $(ls -1 *.jsonl.gz | grep -v "wb-\|-wb"); do
  mv $f "massive-in-wb.$f";
done

In [None]:
%%bash

# Download the pages of every Wayback URL list into the "temp" archive.
# Extraction, copying and clean-up are disabled (commented out) for this source.
for f in $(ls -1 massive-in-wb.*.jsonl.gz); do
  python massive-summ/scripts/download.py --urls $f --archive temp --n_proc 4;
  #python massive-summ/scripts/extract.py --archive temp --dataset wb-${f#*.};
  #cp wb-${f#*.} $(cat datapath.txt)
  #rm $f
  #break
done

## MassiveSumm-full


*   direct links to dataset per language received from authors



In [None]:
urls = pd.read_csv(datapath + 'massive-summ by language (links) - massive-summ-full.csv')

In [None]:
# language codes whose "<code>.all.jsonl.gz" archive is already in the data directory
processed = [path.split('/')[-1].split('.')[0] for path in glob.glob(datapath + '*.all.jsonl.gz')]
# keep only the links of languages that still have to be downloaded
temp = [x not in processed for x in urls.alpha_3]
urls = urls[temp]
urls

In [None]:
# Collect the Google Drive file ids of the per-language MassiveSumm-full archives.
# An empty CSV cell is a float NaN, which is neither `None` nor `pd.NA`; without
# `pd.isna` it would be turned into the bogus file id 'nan' by `str(l)`.
url_list = []
for l in urls['link']:
  if pd.isna(l) or l == "-": continue
  # the id is the last path component once '/view' is removed
  url_list.append(str(l).replace('/view','').split('/')[-1])
# download every archive into the current working directory
for file_id in url_list:
  url = f'https://drive.google.com/uc?id={file_id}'
  gdown.download(url, quiet=False)

In [None]:
%%bash

# Copy every downloaded "<code>.all.jsonl.gz" archive to the data directory
# (read from datapath.txt) and remove the local copy afterwards.
for f in $(ls *.all.jsonl.gz); do
  cp ${f} $(cat datapath.txt)
  rm $f
  #break
done

# Preprocessing (Clean & Clear) Human-Text Dataset

In [None]:
!sudo apt-get install libicu-dev > /dev/null
!pip install polyglot PyICU pycld2 morfessor > /dev/null

In [None]:
#[1] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, Bag of Tricks for Efficient Text Classification
#[2] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, FastText.zip: Compressing text classification models
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O lid.176.bin > /dev/null
!pip install fasttext > /dev/null

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import glob
import shutil
from langcodes import *
import polyglot
from polyglot.text import Text, Word
from polyglot.detect.base import logger as polyglot_logger
polyglot_logger.setLevel("ERROR")
import fasttext
fasttext_model = fasttext.load_model('lid.176.bin')
from collections import Counter

In [None]:
filelist = glob.glob(datapath + '*.jsonl.gz')

In [None]:
# keep only the archives whose language code (file-name prefix before the
# first dot, normalised by standardize_tag) is one of the selected languages
selected_filelist = [
  path for path in filelist
  if standardize_tag(path.split('/')[-1].split('.')[0]) in selected
]
filelist = selected_filelist

In [None]:
%%time

def clear_dataset(df):
  """Strip leading/trailing whitespace from every text column of `df`.

  Both `object` columns and columns of pandas' dedicated string dtype are
  processed. `select_dtypes(['object'])` alone does not select `string`
  columns, so frames converted with `astype("string")` were left untouched.

  Args:
    df: DataFrame to clean; it is modified in place.

  Returns:
    The same DataFrame object.
  """
  for column in df.columns:
    dtype = df[column].dtype
    if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
      df[column] = df[column].str.strip()
  return df

def try_to_detect(text):
  """Detect the language of `text` with FastText and with polyglot.

  FastText is given only the first line of `text`; polyglot gets the whole
  text. Detection is best effort: if a detector fails, the values that were
  not determined yet stay at 0.

  Args:
    text: the string to analyse.

  Returns:
    Tuple `(fasttext_pred, fasttext_prob, polyglot_pred, polyglot_prob)`.
  """
  fasttext_pred, fasttext_prob, polyglot_pred, polyglot_prob = 0, 0, 0, 0
  try:
    pred, prob = fasttext_model.predict(text.split('\n')[0])
    fasttext_pred = pred[0].replace('__label__', '')
    fasttext_prob = prob[0]
    language = Text(text).language
    polyglot_pred = language.code
    polyglot_prob = language.confidence
  except Exception:
    # Deliberately ignored: an undetectable sample simply keeps the 0
    # defaults. `Exception` (instead of a bare `except`) lets
    # KeyboardInterrupt/SystemExit through so a long run can be stopped.
    pass
  return fasttext_pred, fasttext_prob, polyglot_pred, polyglot_prob

def try_to_detect_language(title, text):
  """Vote on the language of a sample using its title and its body.

  Both detectors are run on the title and on the text, which gives four
  predictions. A prediction whose confidence is below 0.9 counts as
  "unknown".

  Returns:
    The label shared by more than two of the four predictions, otherwise
    "unknown".
  """
  min_confidence = 0.9
  votes = []
  for sample in (title, text):
    ft_pred, ft_prob, pg_pred, pg_prob = try_to_detect(sample)
    votes.append("unknown" if ft_prob < min_confidence else ft_pred)
    votes.append("unknown" if pg_prob < min_confidence else pg_pred)

  winner, winner_votes = Counter(votes).most_common()[0]
  return winner if winner_votes > 2 else "unknown"

# `stat` records, per file, how many rows are left after each filtering stage
# (the stage name is stored in the 'note' column).
stat = pd.DataFrame(columns=['language', 'size', 'note'])
# `massivesumm` accumulates the cleaned samples of all files
massivesumm = pd.DataFrame()
for f in filelist:
  #if ("wb-" in f) or (".all." in f): continue
  df = pd.DataFrame()
  temp = pd.read_json(f, lines=True, chunksize=10000, nrows=50000) #up to 50k samples per file taken
  # read the file chunk by chunk, converting every column to the string dtype
  for chunk in temp:
    df = pd.concat([df, chunk.astype("string")], copy=False)
  df.drop(columns=['date', 'summary'], inplace=True)
  # files without ".all." in their name additionally carry an 'archive' column;
  # the language code is the file name without its extension(s)
  if (".all." not in f):
    df.drop(columns=['archive'], inplace=True)
    language_source = standardize_tag(f.split('/')[-1].replace('.jsonl.gz', ''))
  else:
    language_source = standardize_tag(f.split('/')[-1].replace('.all.jsonl.gz', ''))
  df['language'] = language_source
  stat.loc[len(stat.index)] = [language_source, len(df), 'original']
  # stage 1: drop rows with empty/missing values and exact duplicate rows
  df = clear_dataset(df)
  df.replace('', pd.NA, inplace=True)
  df.dropna(inplace=True)
  df.drop_duplicates(inplace=True)
  #df.drop_duplicates(subset=['text'], inplace=True)
  #df.drop_duplicates(subset=['title'], inplace=True)
  stat.loc[len(stat.index)] = [language_source, len(df), 'NA_dup_removed']
  # stage 2: keep titles with at least 2 and texts with at least 6
  # whitespace-separated tokens ('temp' is a scratch column)
  df['temp'] = df.title.str.split().apply(len)
  df = df[df.temp > 1]
  df = df.drop(columns=['temp'])
  df['temp'] = [len(x.split()) for x in df.text] #df.text.str.split().apply(len)
  df = df[df.temp > 5]#.copy()
  df = df.drop(columns=['temp'])
  stat.loc[len(stat.index)] = [language_source, len(df), 'min_textsize_applied']
  # stage 3: keep only samples whose detected language equals the file's language
  df['detected_language'] = df.apply(lambda x: try_to_detect_language(x['title'], x['text']), axis = 1)
  df = df[(df.language == df.detected_language)]
  df = df.drop(columns=['detected_language'])
  stat.loc[len(stat.index)] = [language_source, len(df), 'language_checked']
  # take at most 5000 samples per file (fixed seed), then shuffle them
  df = df.sample(min(5000, len(df)), random_state = 0).sample(frac=1., random_state = 0).reset_index(drop=True)
  massivesumm = pd.concat([massivesumm, df], ignore_index=True, copy=False)
  #break

In [None]:
# rows left per language after each preprocessing stage (one column per stage)
stage_order = ['original', 'NA_dup_removed', 'min_textsize_applied', 'language_checked']
per_stage = {
  stage: stat[stat.note == stage].groupby(by=['language']).sum(numeric_only=True)['size']
  for stage in stage_order
}
original, NA_dup_removed, min_textsize_applied, language_checked = (per_stage[stage] for stage in stage_order)
temp = pd.concat([per_stage[stage] for stage in stage_order], axis=1)
temp.columns = stage_order
temp

In [None]:
total = stat.groupby('note').sum(numeric_only=True)
total

In [None]:
total = total.T
total = total.rename(columns={'note':'language'}, index={'size':'Total'})
total.columns.name = 'language'
total

In [None]:
# Stack the per-language table and the "Total" row, then print it as LaTeX.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is its replacement.
temp = pd.concat([temp, total]).reset_index()
print(temp.to_latex(index=False, na_rep=0, escape=False, formatters={"text": str.lower}, float_format="{:.0f}".format))

In [None]:
massivesumm.language.value_counts()

In [None]:
#remove duplicates (among files - e.g. from CC and all)
massivesumm.drop_duplicates(inplace=True)

In [None]:
massivesumm.language.value_counts()

In [None]:
massivesumm.to_csv(datapath + 'MassiveSumm_selected.csv', index=False)

In [None]:
# remove news venue from text (e.g. CNN, BBC)
#massivesumm['temp'] = massivesumm['url'].str.replace('https://', '', regex=False).str.replace('http://', '', regex=False).str.replace('www.', '', regex=False).apply(lambda x: x.split('/')[0].lower())
#massivesumm['temp'] = massivesumm['temp'].str.replace('.com.', '.', regex=False).str.replace('.co.', '.', regex=False).str.replace('.org.', '.', regex=False).str.replace('.net.', '.', regex=False)
#massivesumm['temp'] = massivesumm['temp'].apply(lambda x: x.split('.')[-2])
#massivesumm['text'] = [x.replace(str(y), '').replace(str(y).upper(), '').replace('  ', ' ') for x, y  in massivesumm[['text','temp']].to_numpy()]
#massivesumm.drop(columns=['temp'], inplace=True)

In [None]:
#massivesumm.to_csv(datapath + 'MassiveSumm_removed_platform_from_text.csv', index=False)

# Convert to Unified Form

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
massivesumm = pd.read_csv(datapath + 'MassiveSumm_selected.csv')

In [None]:
massivesumm = massivesumm.astype('string')

In [None]:
massivesumm.info()

In [None]:
%%time
# Add the columns of the unified dataset form to the human-written samples.
massivesumm['label'] = 'human'
# TODO: use list() for Chinese to obtain letters or use some NLP library to get words
# length = number of whitespace-separated tokens
massivesumm['length'] = [len(x.split()) for x in massivesumm.text] #massivesumm['text'].str.split().apply(len)
# host part of the URL: scheme and 'www.' removed, cut at the first '/', lower-cased
massivesumm['temp'] = massivesumm['url'].str.replace('https://', '', regex=False).str.replace('http://', '', regex=False).str.replace('www.', '', regex=False).apply(lambda x: x.split('/')[0].lower())
# collapse two-part suffixes (e.g. 'bbc.co.uk' -> 'bbc.uk') so that the venue
# name is always the second-to-last dot-separated part
massivesumm['temp'] = massivesumm['temp'].str.replace('.com.', '.', regex=False).str.replace('.co.', '.', regex=False).str.replace('.org.', '.', regex=False).str.replace('.net.', '.', regex=False)
# NOTE(review): raises IndexError for a host without any dot
massivesumm['temp'] = massivesumm['temp'].apply(lambda x: x.split('.')[-2])
massivesumm['source'] = 'MassiveSumm_' + massivesumm['temp']
massivesumm.drop(columns=['temp'], inplace=True)
massivesumm['domain'] = 'news'
massivesumm['topic'] = 'unknown'

# keep only texts with more than 5 tokens
massivesumm = massivesumm[massivesumm.length > 5].reset_index(drop=True)

# Number of human samples to keep per language (one table instead of one
# copy-pasted statement per language).
keep_per_language = {
  'en': 3300, 'es': 1300, 'ru': 1300,
  'ar': 300, 'ca': 300, 'cs': 300, 'de': 300,
  'nl': 300, 'pt': 300, 'uk': 300, 'zh': 300,
}
massivesumm['keep'] = False
for language_code, n_samples in keep_per_language.items():
  # fixed seed -> reproducible selection; `sample` raises ValueError when a
  # language has fewer rows than requested
  kept_index = massivesumm[massivesumm.language == language_code].sample(n_samples, random_state = 0).index
  massivesumm.loc[massivesumm.index.isin(kept_index), 'keep'] = True

#remaining data: everything not selected above, de-duplicated by text and by
#title, shuffled with a fixed seed and stored as additional training material
massivesumm_rem = (
  massivesumm.loc[~massivesumm.keep]
  .drop_duplicates(subset=['text'])
  .drop_duplicates(subset=['title'])
  .drop(columns=['keep'])
  .sample(frac=1., random_state = 0)
  .reset_index(drop=True)
)
massivesumm_rem['split'] = "train"
massivesumm_rem.to_csv(datapath + 'MassiveSumm_selected2.csv', index=False)

# keep only the selected samples, shuffled with a fixed seed
massivesumm = (
  massivesumm.loc[massivesumm.keep]
  .drop(columns=['keep'])
  .sample(frac=1., random_state = 0)
  .reset_index(drop=True)
)

# 300 samples per language form the test split, all other samples the train split
massivesumm['split'] = "train"
test_split = massivesumm.groupby(['language']).sample(300, random_state = 0)
massivesumm.loc[massivesumm.index.isin(test_split.index), 'split'] = "test"

#massivesumm.drop(columns=['url', 'title'], inplace=True)
#massivesumm = massivesumm[column_order]
# NOTE: this overwrites the file the section loaded its input from
massivesumm.to_csv(datapath + 'MassiveSumm_selected.csv', index=False)

In [None]:
print(massivesumm[massivesumm.split == "train"].language.value_counts())
print(massivesumm[massivesumm.split == "test"].language.value_counts())

In [None]:
massivesumm.language.value_counts()

In [None]:
massivesumm.head()

In [None]:
massivesumm.source.value_counts().reset_index()

In [None]:
massivesumm.drop_duplicates(keep=False, inplace=True)

In [None]:
massivesumm.language.value_counts()

In [None]:
len(massivesumm.language.unique())

In [None]:
str(sorted(massivesumm.language.unique()))

# Check New Samples

In [None]:
from tqdm import tqdm

In [None]:
df = pd.read_csv(datapath + f'MassiveSumm_selected.csv')
print(df[df.duplicated(['title'], keep='first')].language.value_counts())
print(df[df.duplicated(['text'], keep='first')].language.value_counts())
print(df[df.duplicated(['title','text'], keep='first')].language.value_counts())

In [None]:
df2 = pd.read_csv(datapath + f'MassiveSumm_selected2.csv')
df2[df2.duplicated(['title'], keep=False)].language.value_counts()

In [None]:
# Flag every remaining sample (df2) whose title also occurs in the selected
# set (df). The titles are put into a set once; rebuilding `df.title.to_list()`
# and scanning it for every row made the check quadratic.
known_titles = set(df.title.to_list())
present = [title in known_titles for title in tqdm(df2.title, total=len(df2))]
df2['present'] = present

In [None]:
df2.present.value_counts()

In [None]:
to_be_changed = df[df.duplicated(['text']) | df.duplicated(['title'])]
df = df[~df.duplicated(['text']) & ~df.duplicated(['title'])]
to_be_changed.language.value_counts()

In [None]:
# Draw replacement samples from the data that is not part of the selected set.
# `.copy()` makes `new_data` independent of df2, so adding the helper column
# does not write to a slice (SettingWithCopyWarning).
new_data = df2[~df2.present].copy()
# number of replacements needed per language
# (presumably the duplicate counts shown by `to_be_changed` — TODO confirm)
replacements_per_language = {'en': 215, 'es': 79, 'pt': 15, 'de': 11, 'ar': 1, 'zh': 1}
new_data['keep'] = False
for language_code, n_samples in replacements_per_language.items():
  picked = new_data[new_data.language == language_code].sample(n_samples, random_state = 0).index
  new_data.loc[new_data.index.isin(picked), 'keep'] = True
new_data = new_data.loc[new_data.keep]
new_data = new_data.drop(columns=['keep'])
# shuffle with a fixed seed
new_data = new_data.sample(frac=1., random_state = 0).reset_index(drop=True)
new_data.language.value_counts()

In [None]:
new_data.present.value_counts()

# Preprocess Generated Data

Run this section after the machine texts have been generated by the LLMs from the human texts obtained above.

In [None]:
#[1] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, Bag of Tricks for Efficient Text Classification
#[2] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, FastText.zip: Compressing text classification models
#!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O lid.176.bin > /dev/null
#!pip install fasttext > /dev/null
#import fasttext
#fasttext_model = fasttext.load_model('lid.176.bin')

In [None]:
!pip install fasttext-langdetect language_data > /dev/null
#!python -m spacy download zh_core_web_sm > /dev/null
!pip install -U git+https://github.com/aboSamoor/polyglot.git@master --quiet

In [None]:
import pandas as pd
import numpy as np
from ftlangdetect import detect
from tqdm import tqdm
from collections import Counter
from langcodes import *
from polyglot.text import Text, Word
import regex
#import spacy
#nlp_zh = spacy.load('zh_core_web_sm')
pd.set_option('display.max_rows', 100)
tqdm.pandas()

In [None]:
models = ['text-davinci-003', 'gpt-3.5-turbo', 'gpt-4', 'alpaca-lora-30b', 'vicuna-13b', 'llama-65b', 'opt-66b', 'opt-iml-max-1.3b']
datasets = {}

multitude = pd.read_csv(datapath + f'MassiveSumm_selected.csv')

for model in models:
  temp = pd.read_csv(datapath + f'MassiveSumm_selected_{model}.csv')
  datasets[model] = temp

In [None]:
#213 duplicated titles, 310 duplicated texts, 3 titles could be in both splits
multitude[multitude.duplicated(['title'], keep=False)].groupby(['title']).split.value_counts()

In [None]:
#remove whitespaces around texts
def clear_dataset(df):
  """Strip leading/trailing whitespace from every text column of `df`.

  Both `object` columns and columns of pandas' dedicated string dtype are
  processed; `select_dtypes(['object'])` alone does not select the latter.

  Args:
    df: DataFrame to clean; it is modified in place.

  Returns:
    The same DataFrame object.
  """
  for column in df.columns:
    dtype = df[column].dtype
    if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
      df[column] = df[column].str.strip()
  return df

#remove some unicode chars making problems in polyglot
#https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790
def remove_bad_chars(text):
  """Return `text` without characters of the Unicode categories Cc and Cs.

  Cc are control characters (this includes newlines and tabs), Cs are
  surrogate code points.
  """
  return regex.sub(r"[\p{Cc}\p{Cs}]+", "", text)

#remove prompts from generated text
def remove_prompts(row):
  """Strip the generation prompt and the headline from `row.generated`.

  Removed, in this order: the Alpaca-style prompt, the plain prompt (each
  also in its whitespace-free form), the quoted headline and the bare
  headline. The statements that used to follow the first `return` were
  unreachable and have been dropped; the result is unchanged.

  Args:
    row: record with `language` (language code), `title` (headline) and
      `generated` (raw model output).

  Returns:
    The cleaned text, without surrounding whitespace.
  """
  language_name = Language.make(language=row.language).display_name()
  headline = row.title
  prompt = f'You are a multilingual journalist.\n\nTask: Write a news article in {language_name} for the following headline: "{headline}". Leave out the instructions, return just the text of the article.\n\nOutput:'
  #alpaca prompt
  prompt2 = f'<unk>### Instruction:\nYou are a multilingual journalist.\n\nTask: Write a news article in {language_name} for the following headline: "{headline}". Leave out the instructions, return just the text of the article.\n\n\n\n### Response:'
  text = str(row.generated).strip()
  for fragment in (prompt2, ''.join(prompt2.split()), prompt, ''.join(prompt.split())):
    text = text.replace(fragment, '').strip()
  # a missing headline (NaN) is not a string and cannot be searched for
  if isinstance(headline, str):
    text = text.replace(f'"{headline}"', '').strip()
    text = text.replace(headline, '').strip()
  return text

#remove unfinished final sentence from generated text
def remove_unended_sentence(row):
  """Drop the last sentence of `row.generated` if it lacks closing punctuation.

  Empty texts and texts with a single sentence are returned unchanged.
  """
  sentence_enders = ['。', '؟', '!', '?', '.']
  parsed = Text(row.generated, hint_language_code=row.language)
  if (row.generated == '') or (len(parsed.sentences) <= 1):
    return row.generated
  last_sentence = parsed.sentences[-1]
  if last_sentence.words[-1] in sentence_enders: #final sentence is properly ended
    return row.generated
  return row.generated.removesuffix(str(last_sentence))

#detect language of generated text
def fasttext_detect_language(dataset):
  """Return the FastText language code of every `generated` text in `dataset`.

  Newlines are replaced by spaces before detection. A row whose text prints
  as "nan" keeps its expected language (`row.language`), so it never counts
  as a mismatch.
  """
  def detect_row(row):
    if str(row.generated) == "nan":
      return row.language
    return detect(text=row.generated.replace('\n', ' '), low_memory=False)['lang']

  return [detect_row(row) for _, row in tqdm(dataset.iterrows(), total=dataset.shape[0])]

#shorten generated texts
def shorten_generated(row):
  """Trim `row.generated` sentence by sentence to about the human text length.

  The last sentence is removed while the generated text is more than 5 words
  longer than `row.text`. Words are whitespace-separated tokens, except for
  Chinese ('zh'), where polyglot's tokenisation is used.

  Args:
    row: record with `generated`, `text` and `language`.

  Returns:
    The (possibly shortened) generated text, stripped of surrounding
    whitespace. A single remaining sentence is never removed.
  """
  def count_words(value):
    if (row.language == 'zh'):
      return len(Text(value, hint_language_code=row.language).words)
    return len(value.split())

  generated = str(row.generated).strip()
  if (generated == ''):
    return generated
  # measure the normalised string, so a non-string value (e.g. NaN) cannot fail here
  generated_length = count_words(generated)
  human_length = count_words(row.text)

  if (human_length == 0):
    return generated

  while (human_length < (generated_length - 5)): #remove last sentence while more than 5 words longer
    text = Text(generated, hint_language_code=row.language)
    if (len(text.sentences) < 2): #single sentence will not be removed
      return generated
    shortened = generated.removesuffix(str(text.sentences[-1])).strip()
    if (shortened == generated): #nothing was cut off; stop instead of looping forever
      return generated
    generated = shortened
    generated_length = count_words(generated)
  return generated

#unify dataset form
def unify_form(dataset, model):
  """Bring a frame of generated texts into the unified dataset form.

  Sets `label` to the generator name, copies `generated` into `text`,
  recomputes `length` (polyglot word count for non-empty Chinese texts,
  whitespace tokens otherwise) and prefixes `source` with "MULTITuDE_".
  """
  def word_count(text, language):
    if (language != 'zh') or (text == ''):
      return len(text.split())
    return len(Text(text, hint_language_code=language).words)

  dataset = clear_dataset(dataset)
  dataset['label'] = model
  dataset['text'] = dataset['generated']
  dataset['length'] = [word_count(text, language) for (text, language) in zip(dataset.text, dataset.language)]
  dataset['source'] = [f'MULTITuDE_{name}' for name in dataset.source]
  return dataset

#uniqueness/repetitiveness - share of unique sentences in row.text
def unique_sentences(row):
  """Return the ratio of distinct sentences to all sentences in `row.text`.

  1.0 means no sentence is repeated. 0 is returned for an empty text and for
  a text in which no sentence is found.
  """
  if row.text == '':
    return 0
  sentences = Text(row.text, hint_language_code=row.language).sentences
  if len(sentences) == 0: #avoid ZeroDivisionError when nothing was segmented
    return 0
  return len(set(sentences)) / len(sentences)

#uniqueness/repetitiveness - share of unique words in row.text
def unique_words(row):
  """Return the ratio of distinct words to all words in `row.text`.

  1.0 means no word is repeated. 0 is returned for an empty text and for a
  text in which no word is found.
  """
  if row.text == '':
    return 0
  words = Text(row.text, hint_language_code=row.language).words
  if len(words) == 0: #avoid ZeroDivisionError when nothing was tokenised
    return 0
  return len(set(words)) / len(words)

In [None]:
%%time
# Clean the generated texts of every model, collect per-model statistics in
# `stat` and append the unified frames to `multitude`.
stat = {}
for model, dataset in datasets.items():
  print(f'Processing {model}')

  # cleaning pipeline; every step rewrites the 'generated' column
  dataset['generated'] = dataset.apply(lambda x: remove_prompts(x), axis=1)
  dataset['generated'] = dataset['generated'].apply(lambda x: remove_bad_chars(x))
  dataset['generated'] = dataset.apply(lambda x: remove_unended_sentence(x), axis=1)
  dataset['generated'] = dataset.progress_apply(lambda x: shorten_generated(x), axis=1)
  empty_generation = len(dataset[dataset.generated == ''])

  # share (in %) of non-empty texts whose detected language differs from the requested one
  dataset['generated_languages_fasttext'] = fasttext_detect_language(dataset)
  mismatched = dataset[(dataset.generated != '') & (dataset.language != dataset.generated_languages_fasttext)]
  mismatched = len(mismatched) / len(dataset) * 100
  print(f'{mismatched:0.2f}% mismatched based on FastText prediction')

  dataset = unify_form(dataset, model)

  # number of texts with fewer than 6 words
  shorts = len(dataset[dataset.length < 6])

  dataset['unique_sentences'] = [unique_sentences(row) for index, row in tqdm(dataset.iterrows())]
  dataset['unique_words'] = [unique_words(row) for index, row in tqdm(dataset.iterrows())]

  stat[model] = {'language_match' : (100 - mismatched), 'empty_generation' : empty_generation, 'short_texts': shorts, 'wordcount_mean' : dataset.length.mean(), 'wordcount_std' : dataset.length.std(), 'unique_sentences_mean' : dataset.unique_sentences.mean(), 'unique_sentences_std' : dataset.unique_sentences.std(), 'unique_words_mean' : dataset.unique_words.mean(), 'unique_words_std' : dataset.unique_words.std()}
  # generated rows are appended after the rows already in `multitude`
  multitude = pd.concat([multitude, dataset], ignore_index=True, copy=False)

In [None]:
#are instruction-based prompts removed? it's ok
multitude[multitude.text.str.contains('You are a multilingual')].label.value_counts()

In [None]:
pd.options.display.float_format = "{:,.2f}".format
pd.DataFrame(stat).T

In [None]:
# @title
#shorten human texts
def shorten_text(row, index, samples_per_generator=8300):
  """Trim a human text to about the average length of its generated versions.

  The generated versions are read positionally from the global `multitude`
  frame at `index + k * samples_per_generator` for k = 1..len(datasets); this
  assumes the human block is followed by one equally sized block per
  generator (TODO confirm when the sample counts change).

  The last sentence is removed while the text is more than 5 words longer
  than that average (unless the average is 0) or longer than 512 words. When
  no sentence can be removed any more, the text is cut to its first 512
  whitespace-separated tokens if it is still longer than that.

  Args:
    row: record with `text` and `language`.
    index: position of the human sample in `multitude`.
    samples_per_generator: number of rows per generator block.

  Returns:
    The (possibly shortened) text, stripped of surrounding whitespace.
  """
  def count_words(value):
    if (row.language == 'zh'):
      return len(Text(value, hint_language_code=row.language).words)
    return len(value.split())

  human = str(row.text).strip()
  if (human == ''):
    return human
  generated_lengths = [multitude.iloc[(index + (i+1)*samples_per_generator)].length for i in range(0, len(datasets))]
  generated_length = np.average(generated_lengths)
  human_length = count_words(human)

  while (((generated_length < (human_length - 5)) and (generated_length != 0)) or (human_length > 512)): #remove last sentence while more than 5 words longer or text longer than 512 words
    text = Text(human, hint_language_code=row.language)
    shortened = human
    if (len(text.sentences) >= 2): #single sentence will not be removed
      shortened = human.removesuffix(str(text.sentences[-1])).strip()
    if (shortened == human): #nothing (more) can be cut off; also prevents an endless loop
      if (len(human.split()) > 512):
        return ' '.join(human.split()[:512])
      return human
    human = shortened
    human_length = count_words(human)
  return human

In [None]:
# Apply the same clean-up and statistics to the human-written rows and write
# them back into `multitude`.
dataset = multitude[multitude.label.str.contains('human')].copy()
dataset['text'] = dataset['text'].apply(lambda x: remove_bad_chars(x))
# `index` is the row label, which shorten_text uses as a position in `multitude`
dataset['text'] = [shorten_text(row, index) for index, row in tqdm(dataset.iterrows(), total=len(dataset))]
human_language_fasttext = [detect(text=text.replace('\n', ' '), low_memory=False)['lang'] for text in tqdm(dataset.text, total=len(dataset))]
dataset['generated_languages_fasttext'] = human_language_fasttext
# share (in %) of human texts whose detected language differs from the expected one
mismatched = dataset[(dataset.language != dataset.generated_languages_fasttext)]
mismatched = len(mismatched) / len(dataset) *100
print(f'{mismatched:0.2f}% mismatched based on FastText prediction')
# word count: polyglot words for non-empty Chinese texts, whitespace tokens otherwise
dataset['length'] = [len(x.split()) if (y != 'zh') or (x == '') else len(Text(x, hint_language_code=y).words) for (x, y) in zip(dataset.text, dataset.language)]
dataset['unique_sentences'] = [unique_sentences(row) for index, row in tqdm(dataset.iterrows(), total=len(dataset))]
dataset['unique_words'] = [unique_words(row) for index, row in tqdm(dataset.iterrows(), total=len(dataset))]
multitude.loc[multitude.label.str.contains('human'),:] = dataset

In [None]:
multitude[multitude.label.str.contains('human')].length.describe()

In [None]:
multitude[~multitude.label.str.contains('human')].length.describe()

In [None]:
multitude[multitude.label.str.contains('human')].unique_sentences.describe()

In [None]:
#number of all samples containing some duplicated sentences
len(multitude[(multitude.unique_sentences < 1) & (multitude.unique_sentences > 0)])

In [None]:
stat['human'] = {'language_match' : (100 - mismatched), 'empty_generation' : 0, 'short_texts': 0, 'wordcount_mean' : dataset.length.mean(), 'wordcount_std' : dataset.length.std(), 'unique_sentences_mean' : dataset.unique_sentences.mean(), 'unique_sentences_std' : dataset.unique_sentences.std(), 'unique_words_mean' : dataset.unique_words.mean(), 'unique_words_std' : dataset.unique_words.std()}

In [None]:
pd.DataFrame(stat).T

In [None]:
temp = pd.DataFrame(stat).T.reset_index()
temp
with open('generated_stat.tex', 'wt') as out: temp.to_latex(buf=out, index=False, na_rep=0, escape=False, formatters={"text": str.lower}, float_format="{:.2f}".format)

In [None]:
#if length of human texts trimmed to 512 words
human_length = dataset.length.copy()
human_length = pd.Series([min(x, 512) for x in human_length])
human_length.describe()

In [None]:
#keep = [x not in to_be_changed.title.to_list() for x in multitude.title]
#multitude[keep].language.value_counts() / 7

## Language Mismatch Analysis

In [None]:
#human texts language mismatch
dataset = multitude[multitude.label.str.contains('human')]
mismatched = dataset[(dataset.language != dataset.generated_languages_fasttext)]

In [None]:
mismatched = multitude[(multitude.generated != '') & (multitude.generated_languages_fasttext.notna()) & (multitude.language != multitude.generated_languages_fasttext)]

In [None]:
mismatched[['language']].value_counts()

In [None]:
mismatched[['language', 'generated_languages_fasttext']].value_counts()

In [None]:
mismatched[['label']].value_counts()

In [None]:
mismatched[['label', 'language']].value_counts()

In [None]:
mismatched[mismatched.label == 'llama-65b'][['language', 'generated_languages_fasttext']].value_counts()

## Dataset Analysis & Clearing

In [None]:
multitude.head()

In [None]:
multitude.label.value_counts()

In [None]:
#move samples from duplicated titles that are in both splits from test split to train split
# titles that occur in both splits
shared_titles = [
  'Daybreak Africa | Voice of America - English',
  'International Edition 2330 EDT',
  'Voice of America - English',
]
# split counts before the change
for shared_title in shared_titles:
  print(multitude.groupby(['title']).split.value_counts()[shared_title])

#delete from test split
#multitude = multitude[((multitude.title != 'Daybreak Africa | Voice of America - English') & (multitude.title != 'International Edition 2330 EDT') & (multitude.title != 'Voice of America - English')) | (multitude.split != 'test')]

#just move to train split
multitude.loc[multitude.title.isin(shared_titles), 'split'] = "train"

# split counts after the change
for shared_title in shared_titles:
  print(multitude.groupby(['title']).split.value_counts()[shared_title])

In [None]:
#delete empty and too-short (less than 6 words) texts
multitude.drop(columns=['url', 'title', 'generated', 'generated_languages_fasttext', 'unique_sentences',	'unique_words'], inplace=True)
multitude.loc[multitude.text == "nan", "text"] = pd.NA
multitude.loc[multitude.text == "", "text"] = pd.NA
multitude.dropna(inplace=True)
multitude = multitude[multitude.length > 5]

In [None]:
multitude.label.value_counts()

In [None]:
#delete text duplicates
multitude = multitude.drop_duplicates(subset=['text'])

In [None]:
multitude.label.value_counts()

In [None]:
multitude.language.value_counts()

In [None]:
multitude.groupby(['label'])['language'].value_counts()

In [None]:
multitude.split.value_counts()

In [None]:
multitude.groupby(['split'])['language'].value_counts()

In [None]:
multitude['multi_label'] = multitude['label'].copy()
multitude['label'] = int(0)
multitude.loc[~multitude.multi_label.str.contains('human'), 'label'] = int(1)
multitude = multitude.sample(frac=1., random_state = 0).reset_index(drop=True)
multitude.to_csv(datapath + f'multitude.csv', index=False)

In [None]:
multitude.head()

In [None]:
print(multitude[multitude.multi_label.str.contains('human')].length.describe())
print(multitude[~multitude.multi_label.str.contains('human')].length.describe())

In [None]:
import pandas as pd
pd.set_option('display.max_rows', 100)
multitude = pd.read_csv(datapath + f'multitude.csv')

In [None]:
multitude.head()

In [None]:
#are instruction-based prompts removed? it's ok
phrases = ['You are a multilingual', 'Task:', 'Instruction:', 'Response:', 'Output:']
for phrase in phrases:
  print(f'{phrase}\n{multitude[multitude.text.str.contains(phrase)].multi_label.value_counts()}\n')

In [None]:
multitude[multitude.split=='train'].groupby(['multi_label']).language.value_counts()

In [None]:
multitude[multitude.split=='test'].groupby(['multi_label']).language.value_counts()

In [None]:
!pip install language_data > /dev/null
from langcodes import *
#multitude[multitude.split=='test'].language.value_counts().reset_index().sort_values(by=['index']).to_latex(index=False, formatters={"multi_label": str.lower}, float_format="{:.1f}".format)
# Train/test sample counts per language, exported as a LaTeX table.
temp_train = multitude[multitude.split=='train'].groupby('language')['text'].count().reset_index()
temp_test = multitude[multitude.split=='test'].groupby('language')['text'].count().reset_index()
temp = temp_train.merge(temp_test, how='outer', on=['language']).sort_values(by=['language'])
# replace the language codes by their display names and sort by name
temp['language'] = [Language.make(language=x).display_name() for x in temp['language']]
temp = temp.sort_values(by=['language'])
temp = temp.rename(columns={'language':'Language', 'text_x':'Train', 'text_y':'Test'})
total = temp.sum()
# raw string: in a normal literal '\t' is a TAB character, which turned the
# LaTeX command into "<TAB>extbf{Total}"
total['Language'] = r'\textbf{Total}'
# DataFrame.append was removed in pandas 2.0; the total is added as a one-row frame
temp = pd.concat([temp, total.to_frame().T.infer_objects()], ignore_index=True)
temp
with open('table.tex', 'wt') as out: temp.to_latex(buf=out, index=False, na_rep=0, escape=False, formatters={"text": str.lower}, float_format="{:.0f}".format)

In [None]:
# Train/test sample counts per generator, exported as a LaTeX table:
# one row per machine generator, their total, then the human row.
temp_train = multitude[multitude.split=='train'].groupby('multi_label')['text'].count().reset_index()
temp_test = multitude[multitude.split=='test'].groupby('multi_label')['text'].count().reset_index()
temp = temp_train.merge(temp_test, how='outer', on=['multi_label']).sort_values(by=['multi_label'])
temp = temp.rename(columns={'multi_label':'Generator', 'text_x':'Train', 'text_y':'Test'})
# .copy() so that the label can be changed below without writing to a slice of `temp`
human = temp[temp.Generator.str.contains('human')].copy()
machine = temp[~temp.Generator.str.contains('human')]
#temp = pd.concat([human, machine], ignore_index=True)
temp = machine
total = temp.sum()
# raw strings: in a normal literal '\t' is a TAB character, which broke '\textbf'
total['Generator'] = r'\textbf{machine}'
# DataFrame.append was removed in pandas 2.0
temp = pd.concat([temp, total.to_frame().T.infer_objects()], ignore_index=True)
human['Generator'] = r'\textbf{human}'
temp = pd.concat([temp, human], ignore_index=True)
temp
# NOTE(review): 'table.tex' is also the output file of the per-language table cell
with open('table.tex', 'wt') as out: temp.to_latex(buf=out, index=False, na_rep=0, escape=False, formatters={"text": str.lower}, float_format="{:.0f}".format)

In [None]:
multitude[multitude.split=='train'].groupby('language')['text'].count()

# Additional Human data
Run this section only if the dataset needs to be balanced with additional human-written samples.

In [None]:
import pandas as pd
pd.set_option('display.max_rows', 100)
multitude = pd.read_csv(datapath + f'multitude.csv')

In [None]:
multitude.head()

In [None]:
df = pd.read_csv(datapath + f'MassiveSumm_selected2.csv')

In [None]:
df.language.value_counts()

In [None]:
df.drop(columns=['url', 'title'], inplace=True)

In [None]:
#provide missing number of human samples in multitude dataset
temp = multitude.groupby(['language'])[['label']].value_counts().reset_index()
temp.loc[temp.label == 0, 0] *= -1
temp = temp.groupby('language')[0].sum().reset_index()
print(temp)
df_selected = pd.DataFrame()
for idx, row in temp.iterrows():
  df_selected = pd.concat([df_selected, df[df.language == row.language].sample(min(row[0], len(df[df.language == row.language])), random_state = 0)])
df_selected.language.value_counts()

In [None]:
df['multi_label'] = 'human'
df['label'] = 0

In [None]:
#shorten human texts to <512 words
def shorten_human_text(row, index):
  """Trim a human text to at most 512 words by dropping trailing sentences.

  Words are whitespace-separated tokens, except for Chinese ('zh'), where
  polyglot's tokenisation is used. When no sentence can be removed any more,
  the text is cut to its first 512 whitespace-separated tokens if it is still
  longer than that.

  Args:
    row: record with `text` and `language`.
    index: unused; kept for compatibility with existing callers.

  Returns:
    The (possibly shortened) text, stripped of surrounding whitespace.
  """
  def count_words(value):
    if (row.language == 'zh'):
      return len(Text(value, hint_language_code=row.language).words)
    return len(value.split())

  human = str(row.text).strip()
  if (human == ''):
    return human
  human_length = count_words(human)

  while (human_length > 512): #remove last sentence while text longer than 512 words
    text = Text(human, hint_language_code=row.language)
    shortened = human
    if (len(text.sentences) >= 2): #single sentence will not be removed
      shortened = human.removesuffix(str(text.sentences[-1])).strip()
    if (shortened == human): #nothing (more) can be cut off; also prevents an endless loop
      if (len(human.split()) > 512):
        return ' '.join(human.split()[:512])
      return human
    human = shortened
    human_length = count_words(human)
  return human

In [None]:
df['text'] = df['text'].apply(lambda x: remove_bad_chars(x))
df['text'] = [shorten_human_text(row, index) for index, row in tqdm(df.iterrows(), total=len(df))]
df['length'] = [len(x.split()) if (y != 'zh') or (x == '') else len(Text(x, hint_language_code=y).words) for (x, y) in zip(df.text, df.language)]

In [None]:
multitude2 = pd.concat([multitude, df])
multitude2 = multitude2.drop_duplicates(subset=['text'])
print(multitude2.label.value_counts())
print(multitude2.language.value_counts())
print(multitude2.multi_label.value_counts())

In [None]:
multitude2 = multitude2.sample(frac=1., random_state = 0).reset_index(drop=True)
multitude2.to_csv(datapath + f'MULTITuDE2.csv', index=False)