In [4]:
import pandas as pd
import os

#read blogtext.csv from the data/nlp/blog_corpus directory
blog_data = pd.read_csv(os.path.join('data', 'nlp', 'blog_corpus', 'blogtext.csv'))

#replace unique id values with "blog_n" where n is a number beggining at 0
n_values = len(blog_data.id.unique())
author_mapping = {k: v for k, v in zip(blog_data.id.unique(), range(n_values))}

blog_data['id'] = blog_data['id'].apply(lambda x: 'blog_' + str(author_mapping[x]))

blog_data



Unnamed: 0,id,gender,age,topic,sign,date,text
0,blog_0,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,blog_0,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,blog_0,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,blog_0,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,blog_1,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
...,...,...,...,...,...,...,...
681279,blog_19319,male,23,Student,Taurus,"01,July,2004","Dear Susan, I could write some really ..."
681280,blog_19319,male,23,Student,Taurus,"01,July,2004","Dear Susan, 'I have the second yeast i..."
681281,blog_19319,male,23,Student,Taurus,"01,July,2004","Dear Susan, Your 'boyfriend' is fuckin..."
681282,blog_19319,male,23,Student,Taurus,"01,July,2004","Dear Susan: Just to clarify, I am as..."


In [7]:
enron_path = "data/nlp/enron_mail_20150507/maildir"

def recursive_file_read(path):
    for root, dirs, files in os.walk(path):
        for file in files:
            yield os.path.join(root, file)

user_frame = pd.DataFrame(columns=['user', 'text'])
examples = []
for user in os.listdir(enron_path):
    for file in recursive_file_read(os.path.join(enron_path, user)):
        with open(file, 'rb') as f:
            text = f.read()
            examples.append({'user': user, 'text': text})

enron_data = pd.DataFrame(examples, columns=['user', 'text'])
n_values = len(enron_data.user.unique())
email_mapping = {k: v for k, v in zip(enron_data.user.unique(), range(n_values))}

enron_data['id'] = enron_data['user'].apply(lambda x: 'mail_' + str(email_mapping[x]))
enron_data


Unnamed: 0,user,text
0,allen-p,"b""Message-ID: <18782981.1075855378110.JavaMail..."
1,allen-p,"b""Message-ID: <15464986.1075855378456.JavaMail..."
2,allen-p,"b""Message-ID: <24216240.1075855687451.JavaMail..."
3,allen-p,"b""Message-ID: <13505866.1075863688222.JavaMail..."
4,allen-p,"b""Message-ID: <30922949.1075863688243.JavaMail..."
...,...,...
517396,zufferli-j,"b""Message-ID: <26807948.1075842029936.JavaMail..."
517397,zufferli-j,b'Message-ID: <25835861.1075842029959.JavaMail...
517398,zufferli-j,"b""Message-ID: <28979867.1075842029988.JavaMail..."
517399,zufferli-j,b'Message-ID: <22052556.1075842030013.JavaMail...


In [14]:
meta = pd.read_csv('data/nlp/gutenberg/metadata/metadata.csv')

available_texts = list(os.listdir('data/nlp/gutenberg/data/text'))

clean_meta = meta[~meta.author.isin({'Anonymous', 'Various'})]
clean_meta = clean_meta[clean_meta.language.apply(lambda x: 'en' in x)]
clean_meta = clean_meta[clean_meta.id.apply(lambda x: f'{x}_text.txt' in available_texts)]

In [16]:
clean_meta

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
7,PG10007,Carmilla,"Le Fanu, Joseph Sheridan",1814.0,1873.0,['en'],3626,"{'Vampires -- Fiction', 'Young women -- Fiction'}",
13,PG10012,The Mountains of California,"Muir, John",1838.0,1914.0,['en'],88,"{'Natural history -- California', 'Muir, John,...",
21,PG1001,"Divine Comedy, Longfellow's Translation, Hell",Dante Alighieri,1265.0,1321.0,['en'],3358,"{'Epic poetry, Italian -- Translations into En...",
32,PG1002,"Divine Comedy, Longfellow's Translation, Purga...",Dante Alighieri,1265.0,1321.0,['en'],57,"{'Epic poetry, Italian -- Translations into En...",
34,PG10031,The Complete Poetical Works of Edgar Allan Poe...,"Poe, Edgar Allan",1809.0,1849.0,['en'],415,"{'Fantasy poetry, American'}",
...,...,...,...,...,...,...,...,...,...
29754,PG36828,"Pastor Pastorum; Or, The Schooling of the Apos...","Latham, Henry",1821.0,1902.0,['en'],33,"{'Teaching', 'Jesus Christ'}",
29759,PG36832,Conscript 2989: Experiences of a Drafted Man,"Crump, Irving",1887.0,1979.0,['en'],29,"{'Military training camps -- United States', '...",
29760,PG36833,"The Camp Fire Girls at Onoway House; Or, The M...","Frey, Hildegard G.",1891.0,1957.0,['en'],44,"{'Measles -- Juvenile fiction', 'Camp Fire Gir...",
29834,PG368,Acres of Diamonds: Our Every-day Opportunities,"Conwell, Russell H.",1843.0,1925.0,['en'],303,"{'Conwell, Russell H., 1843-1925', 'Success', ...",


In [18]:
books = []
for text_id in clean_meta.id:
    filename = f'data/nlp/gutenberg/data/text/{text_id}_text.txt'
    print(f'Reading {filename}')
    with open(filename, 'r') as f:
        text = f.read()
        books.append(pd.DataFrame({'text': [text], 'id': [text_id]}))

book_data = pd.concat(books)

Reading data/nlp/gutenberg/data/text/PG10007_text.txt
Reading data/nlp/gutenberg/data/text/PG10012_text.txt
Reading data/nlp/gutenberg/data/text/PG1001_text.txt
Reading data/nlp/gutenberg/data/text/PG1002_text.txt
Reading data/nlp/gutenberg/data/text/PG10031_text.txt
Reading data/nlp/gutenberg/data/text/PG10039_text.txt
Reading data/nlp/gutenberg/data/text/PG1003_text.txt
Reading data/nlp/gutenberg/data/text/PG1004_text.txt
Reading data/nlp/gutenberg/data/text/PG10052_text.txt
Reading data/nlp/gutenberg/data/text/PG1005_text.txt
Reading data/nlp/gutenberg/data/text/PG10070_text.txt
Reading data/nlp/gutenberg/data/text/PG100_text.txt
Reading data/nlp/gutenberg/data/text/PG10110_text.txt
Reading data/nlp/gutenberg/data/text/PG1013_text.txt
Reading data/nlp/gutenberg/data/text/PG10140_text.txt
Reading data/nlp/gutenberg/data/text/PG1014_text.txt
Reading data/nlp/gutenberg/data/text/PG10150_text.txt
Reading data/nlp/gutenberg/data/text/PG1015_text.txt
Reading data/nlp/gutenberg/data/text/P

In [39]:
book_data_meta = book_data.merge(clean_meta, on=['id']).drop('type', axis=1).dropna()
n_values = len(book_data_meta.author.unique())
book_mapping = {k: v for k, v in zip(book_data_meta.author.unique(), range(n_values))}

book_data_meta['id_2'] = book_data_meta.author.apply(lambda x: 'book_' + str(book_mapping[x]))
book_data_meta


Unnamed: 0,text,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,id_2
0,\n\n\n\nCarmilla\n\nby Joseph Sheridan Le Fanu...,PG10007,Carmilla,"Le Fanu, Joseph Sheridan",1814.0,1873.0,['en'],3626,"{'Vampires -- Fiction', 'Young women -- Fiction'}",book_0
1,\n[Illustration]\n\n\n\n\nThe Mountains of Cal...,PG10012,The Mountains of California,"Muir, John",1838.0,1914.0,['en'],88,"{'Natural history -- California', 'Muir, John,...",book_1
2,\n\n\n\nThe Divine Comedy\n\nof Dante Alighier...,PG1001,"Divine Comedy, Longfellow's Translation, Hell",Dante Alighieri,1265.0,1321.0,['en'],3358,"{'Epic poetry, Italian -- Translations into En...",book_2
3,\n\n\n\nThe Divine Comedy\n\nof Dante Alighier...,PG1002,"Divine Comedy, Longfellow's Translation, Purga...",Dante Alighieri,1265.0,1321.0,['en'],57,"{'Epic poetry, Italian -- Translations into En...",book_2
4,\n\n\n\n The Complete P...,PG10031,The Complete Poetical Works of Edgar Allan Poe...,"Poe, Edgar Allan",1809.0,1849.0,['en'],415,"{'Fantasy poetry, American'}",book_3
...,...,...,...,...,...,...,...,...,...,...
3432,\n\n\n\n\n Pastor ...,PG36828,"Pastor Pastorum; Or, The Schooling of the Apos...","Latham, Henry",1821.0,1902.0,['en'],33,"{'Teaching', 'Jesus Christ'}",book_1287
3433,\n\n\n\n\n[Illustration: I summoned “Local Boa...,PG36832,Conscript 2989: Experiences of a Drafted Man,"Crump, Irving",1887.0,1979.0,['en'],29,"{'Military training camps -- United States', '...",book_1288
3434,\n\n\n\n\n[Illustration: GLADYS TURNED THE CAR...,PG36833,"The Camp Fire Girls at Onoway House; Or, The M...","Frey, Hildegard G.",1891.0,1957.0,['en'],44,"{'Measles -- Juvenile fiction', 'Camp Fire Gir...",book_1289
3435,\n\n\n\n\nACRES OF DIAMONDS\n\nBy Russell H. C...,PG368,Acres of Diamonds: Our Every-day Opportunities,"Conwell, Russell H.",1843.0,1925.0,['en'],303,"{'Conwell, Russell H., 1843-1925', 'Success', ...",book_1290


In [38]:
all([x in book_data_meta.author.unique() for x in book_mapping.keys()])

True

In [41]:
blog_data.to_csv('data/nlp/blog_corpus/blog_as_csv.csv', index=False)
enron_data.to_csv('data/nlp/enron_mail_20150507/mail_as_csv.csv', index=False)
book_data_meta.to_csv('data/nlp/gutenberg/book_as_csv.csv', index=False)