In [25]:
from bs4 import BeautifulSoup
import gc
from nltk.tokenize import sent_tokenize
import numpy as np
import os
import pandas as pd
from pathlib import Path
from pprint import pprint

In [2]:
def get_paths(dirname):
    """Return list of paths a given directory.

    Parameters
    ----------
    dirname: str

    Returns
    -------
    List of path objects.
    """
    return list(Path(dirname).iterdir())

In [44]:
def get_text(path):
    """Read in xml file, extract relevant info, and return a dataframe.

    Parameters
    ----------
    path: pathlib.Path
        Specify which file to read in. Includes the name of the directory
        as well as the file.

    Returns
    -------
    pd.DataFrame containing a row for each sentence. There are also label 
    columns with age and sex.
    """
    sex, age = path.parts[-1].split('.')[1:3]
    with open(path, 'r', encoding='latin1') as f:
        soup = BeautifulSoup(f, 'xml')
    posts = [t.text.replace('\n', ' ').replace('\t', ' ').strip() 
             for t in soup.find_all('post')]
    sentences = [s for s in sent_tokenize(' '.join(posts))]
    return pd.DataFrame(dict(text=sentences, sex=sex, age=int(age)))

In [80]:
output_dir = 'data'
paths = get_paths('blogs')
len(paths)

19320

In [63]:
dfs = [get_text(path) for path in paths]

In [65]:
df = pd.concat(dfs, axis=0).reset_index(drop=True)

In [66]:
df.isnull().sum()

text    0
sex     0
age     0
dtype: int64

In [67]:
df.age.describe()

count    8.730028e+06
mean     2.422944e+01
std      7.861914e+00
min      1.300000e+01
25%      1.700000e+01
50%      2.400000e+01
75%      2.700000e+01
max      4.800000e+01
Name: age, dtype: float64

In [68]:
df.sex.value_counts(normalize=True)

female    0.522576
male      0.477424
Name: sex, dtype: float64

In [69]:
df.dtypes

text    object
sex     object
age      int64
dtype: object

In [70]:
df.text.str.len().quantile([0, .1, .25, .5, .75, .9, .95, .99, 1])

0.00        1.0
0.10       16.0
0.25       33.0
0.50       62.0
0.75      105.0
0.90      159.0
0.95      206.0
0.99      374.0
1.00    43280.0
Name: text, dtype: float64

In [78]:
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
df.to_csv('data/sentences.csv', index=False)