In [2]:
import pandas as pd

CHUNK_SIZE = 512

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

In [4]:
from multiprocessing import Pool
from tqdm.notebook import tqdm

def split_data(row):
    eid, values = row
    input_ids = tokenizer(values.text).input_ids
    chunked = [input_ids[chunk: chunk + CHUNK_SIZE] for chunk in range(0, len(input_ids), CHUNK_SIZE)]
    decoded_chunked = tokenizer.batch_decode(chunked)
    return pd.DataFrame({'id': [eid]*len(chunked),
                         'pretokenized_text': chunked,
                         'decoded_text': decoded_chunked})
                         
def build_chunk_dataframe(text_data, metadata=None, cores=10):
    with Pool(cores) as p:
        chunks = list(tqdm(p.imap_unordered(split_data, text_data.iterrows()),
                            total=len(text_data)))
    
    if metadata is not None:
        return pd.concat(chunks).merge(metadata, on='id')
    else:
        return pd.concat(chunks)

def clean_non_unique(data):
    nunique_ids = (data.id.value_counts() > 1)
    nunique_ids = nunique_ids[nunique_ids].index
    return data[data.id.isin(nunique_ids)]

# Blog data processing

In [None]:
print('Load data blog_as_csv.csv')
blog_corpus = pd.read_csv("data/nlp/blog_corpus/blog_as_csv.csv")

In [16]:
blog_corpus.text = blog_corpus.text.apply(lambda x: x.strip())
clean_blog_corpus = blog_corpus[['id', 'text']].groupby("id").agg(lambda x: '<\s>'.join(x))
meta_blog_corpus = blog_corpus[['id', 'age', 'topic', 'gender']].groupby("id").agg(lambda x: list(x)[0])
full_blog_corpus = meta_blog_corpus.merge(clean_blog_corpus, on='id')
full_blog_corpus

Unnamed: 0_level_0,age,topic,gender,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
blog_0,15,Student,male,"Info has been found (+/- 100 pages, and 4.5 MB..."
blog_1,33,InvestmentBanking,male,Thanks to Yahoo!'s Toolbar I can now 'capture'...
blog_10,25,indUnk,female,"Even though I am exhausted after today, I must..."
blog_100,26,indUnk,male,Hello again. This is the offical No Action bl...
blog_1000,16,Student,male,My 'band' got in its first fight tonight. most...
...,...,...,...,...
blog_9995,17,Communications-Media,male,"Good morning folks, How are me brothers and s..."
blog_9996,23,indUnk,female,"NEWater Ok, that's just gross. Another pot..."
blog_9997,26,Education,male,I love salsa. It's one of the greatest foods e...
blog_9998,13,Law,male,"Hey all, This is Jared, this is my first post ..."


In [94]:
chunked_blog_data = build_chunk_dataframe(full_blog_corpus, meta_blog_corpus)
nunique_blog_data = clean_non_unique(chunked_blog_data)
nunique_blog_data

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/19320 [00:00<?, ?it/s]

Unnamed: 0,id,pretokenized_text,decoded_text,age,topic,gender
0,blog_10,"[0, 8170, 600, 38, 524, 17067, 71, 452, 6, 38,...","<s>Even though I am exhausted after today, I m...",25,indUnk,female
1,blog_10,"[216, 24, 4, 152, 7105, 16, 3680, 684, 25, 121...",know it. This hell is otherwise known as U Vi...,25,indUnk,female
2,blog_10,"[17027, 12, 560, 12, 1610, 18, 2850, 12179, 33...","groom-to-be's scrotal bling, but not the Fout...",25,indUnk,female
3,blog_100,"[0, 31414, 456, 4, 1437, 152, 16, 5, 160, 3569...",<s>Hello again. This is the offical No Action...,26,indUnk,male
4,blog_100,"[5, 7884, 20774, 29, 31, 14, 6, 25, 157, 25, 5...","the singalongs from that, as well as the mino...",26,indUnk,male
...,...,...,...,...,...,...
382132,blog_9660,"[6, 53, 38, 174, 69, 52, 1017, 1153, 357, 2067...",", but I told her we'd probably better wait on ...",35,indUnk,male
382133,blog_9660,"[6, 61, 16, 182, 9327, 4, 1437, 1437, 38, 21, ...",", which is very unfortunate. I was pretty un...",35,indUnk,male
382134,blog_9660,"[9, 5, 2859, 9572, 6, 30005, 24, 6, 8, 122, 52...","of the heat strip, disconnected it, and now w...",35,indUnk,male
382135,blog_9660,"[24, 19, 162, 8, 3668, 19975, 24, 31509, 243, ...",it with me and absolutely hated it ('It's stu...,35,indUnk,male


In [95]:
nunique_blog_data.to_csv("data/nlp/blog_corpus/blog_as_csv_preprocessed.csv", index=False)

# Mail data processing

In [None]:
print('Load data mail_as_csv.csv')
mail_corpus = pd.read_csv("data/nlp/enron_mail_20150507/mail_as_csv.csv")
mail_corpus

In [5]:
import re
def clean_text(text):
    clean_mail = re.sub(r'(\\+r)?(\\+n)+', '\n', text)
    clean_mail = re.sub(r'\\+t', '\t', clean_mail)
    clean_mail = '\n'.join(clean_mail.strip().split('\n')[15:-1])
    clean_mail = re.sub(r'X-.+:.*\n', '<s>', clean_mail)
    clean_mail = re.sub(r'From:.*\n', '', clean_mail)
    clean_mail = re.sub(r"\\'", "'", clean_mail)

    return clean_mail

mail_corpus['clean_text'] = mail_corpus.text.apply(clean_text)

In [8]:
mail_corpus.columns = ['user', 'old_text', 'id', 'text']
mail_corpus.text = mail_corpus.text.apply(lambda x: x.strip())
clean_mail_corpus = mail_corpus[['id', 'text']].groupby("id").agg(lambda x: '<\s>'.join(x))

chunked_mail_data = build_chunk_dataframe(clean_mail_corpus, None)
nunique_mail_data = clean_non_unique(chunked_mail_data)
nunique_mail_data

  0%|          | 0/150 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (460644 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (454546 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (505057 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691872 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (863829 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence

Unnamed: 0,id,pretokenized_text,decoded_text
0,mail_101,"[0, 28915, 6, 50118, 6715, 3438, 162, 31, 110,...","<s>Brian,\nPlease remove me from your distribu..."
1,mail_101,"[4, 1437, 22381, 42, 16, 230, 35, 48669, 44426...",. Usually this is C:\\Program Files\\Microsof...
2,mail_101,"[50118, 39767, 21194, 41552, 37457, 29, 15698,...",\nGabriel<\s>Hey do you have to go to the harr...
3,mail_101,"[87, 706, 722, 49069, 37457, 29, 15698, 100, 2...",than 24 hours.<\s>I think harassing me is a v...
4,mail_101,"[31271, 4, 50118, 176, 73, 1549, 73, 2663, 143...","520.\n2/16/01 MANAGEMENT-PWR 177,196.<\s>I'm ..."
...,...,...,...
63596,mail_19,"[246, 495, 5214, 246, 495, 5214, 246, 495, 521...",3D=3D=3D=3D=3D\nThe object of humor notwithsta...
63597,mail_19,"[4, 1437, 6830, 5, 414, 6, 79, 64, 75, 224, 93...",". Without the data, she can't say anything\nc..."
63598,mail_19,"[50118, 42038, 1258, 4, 1437, 38, 74, 28, 55, ...",\nparticipation. I would be more than happy t...
63599,mail_19,"[0, 0, 0, 0, 0, 50118, 28409, 100, 4, 1437, 45...",<s><s><s><s><s>\nFYI. Thanks to Max at the PX...


In [10]:
nunique_mail_data.to_csv("data/nlp/enron_mail_20150507/mail_as_csv_preprocessed.csv", index=False)

# Book data processing

In [5]:
print('Load data book_as_csv.csv')
book_corpus = pd.read_csv("data/nlp/gutenberg/book_as_csv.csv")
book_corpus

Load data book_as_csv.csv


FileNotFoundError: [Errno 2] No such file or directory: 'data/nlp/gutenberg/book_as_csv.csv'

In [27]:
import re
def clean_text(text):
    return re.sub(r'\n\n+', '\n', text)[512:]

book_corpus['clean_text'] = book_corpus.text.apply(clean_text)

In [38]:
book_corpus.columns = ['old_text', 'id', 'title', 'author', 'authoryearofbirth',
                        'authoryearofdeath', 'language', 'downloads', 'subjects', 'id_2',
                        'text']
book_corpus.text = book_corpus.text.apply(lambda x: x.strip())
clean_book_corpus = book_corpus[['id', 'text']].groupby("id").agg(lambda x: '<\s>'.join(x))

chunked_book_data = build_chunk_dataframe(clean_book_corpus, book_corpus.drop(['old_text', 'text'], axis=1))
nunique_book_data = clean_non_unique(chunked_book_data)
nunique_book_data

  0%|          | 0/2934 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (39958 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (66207 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (63482 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (65489 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (61438 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Unnamed: 0,id,pretokenized_text,decoded_text,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,id_2
0,PG10007,"[0, 39986, 10, 2225, 7391, 7, 5, 36455, 3693, ...",<s>Upon a paper attached to the Narrative whic...,Carmilla,"Le Fanu, Joseph Sheridan",1814.0,1873.0,['en'],3626,"{'Vampires -- Fiction', 'Young women -- Fiction'}",book_0
1,PG10007,"[15, 10, 7019, 50118, 20554, 4086, 11, 10, 669...","on a slight\neminence in a forest. The road, ...",Carmilla,"Le Fanu, Joseph Sheridan",1814.0,1873.0,['en'],3626,"{'Vampires -- Fiction', 'Young women -- Fiction'}",book_0
2,PG10007,"[6, 50118, 8155, 56, 57, 19, 162, 31, 6, 38, 4...",",\nwho had been with me from, I might almost s...",Carmilla,"Le Fanu, Joseph Sheridan",1814.0,1873.0,['en'],3626,"{'Vampires -- Fiction', 'Young women -- Fiction'}",book_0
3,PG10007,"[802, 2185, 1937, 4, 38, 21, 45, 26851, 6, 13,...","thought myself alone. I was not frightened, f...",Carmilla,"Le Fanu, Joseph Sheridan",1814.0,1873.0,['en'],3626,"{'Vampires -- Fiction', 'Young women -- Fiction'}",book_0
4,PG10007,"[70, 363, 131, 8, 31, 14, 86, 10, 20667, 50118...",all night; and from that time a servant\nalwa...,Carmilla,"Le Fanu, Joseph Sheridan",1814.0,1873.0,['en'],3626,"{'Vampires -- Fiction', 'Young women -- Fiction'}",book_0
...,...,...,...,...,...,...,...,...,...,...,...
689426,PG36734,"[154, 36, 13728, 4, 939, 6, 33906, 4, 28222, 1...","ing (vol. i, pp. 169-70, of the\n seventeen...",The Browning Cyclopædia: A Guide to the Study ...,"Berdoe, Edward",1836.0,1916.0,['en'],100,"{'Browning, Robert, 1812-1889 -- Encyclopedias'}",book_1281
689427,PG36734,"[28173, 4, 925, 4, 24030, 2645, 5789, 7, 162, ...",satisfactory. Dr. Garnett writes to me on the...,The Browning Cyclopædia: A Guide to the Study ...,"Berdoe, Edward",1836.0,1916.0,['en'],100,"{'Browning, Robert, 1812-1889 -- Encyclopedias'}",book_1281
689428,PG36734,"[221, 2028, 271, 18, 22, 46354, 46439, 811, 38...","Pindar's ""Fourth Pythian Ode,"" where he speak...",The Browning Cyclopædia: A Guide to the Study ...,"Berdoe, Edward",1836.0,1916.0,['en'],100,"{'Browning, Robert, 1812-1889 -- Encyclopedias'}",book_1281
689429,PG36734,"[11005, 4, 50118, 10975, 401, 742, 20, 1065, 9...","Bible.\n[6] The above sonnet, by Robert Brown...",The Browning Cyclopædia: A Guide to the Study ...,"Berdoe, Edward",1836.0,1916.0,['en'],100,"{'Browning, Robert, 1812-1889 -- Encyclopedias'}",book_1281


In [33]:
book_corpus.columns

Index(['text', 'id', 'title', 'author', 'authoryearofbirth',
       'authoryearofdeath', 'language', 'downloads', 'subjects', 'id_2',
       'clean_text'],
      dtype='object')

In [39]:
nunique_book_data.to_csv("data/nlp/gutenberg/book_as_csv_preprocessed.csv", index=False)