In [1]:
import numpy as np
import pandas as pd
from distributed import Client, progress
from distributed.client import futures_of
import hdfs3
import dask.dataframe as dd

# Cluster endpoints used throughout the notebook.
HDFS_NAMENODE = 'bernadette.lip.ens-lyon.fr'
DASK_SCHEDULER = 'orange.lip.ens-lyon.fr'

# Open a handle on the HDFS namenode, then connect to the Dask scheduler
# (port 8786).
hdfs = hdfs3.HDFileSystem(HDFS_NAMENODE)
client = Client(DASK_SCHEDULER + ':8786')

# Last expression of the cell: display the client summary.
client

0,1
Client  Scheduler: tcp://orange.lip.ens-lyon.fr:8786  Dashboard: http://orange.lip.ens-lyon.fr:8787,Cluster  Workers: 24  Cores: 364  Memory: 1153.39 GB


In [2]:
# HDFS glob (path only, no scheme or host) used both to list the input files
# and to build the URL the data is loaded from.
sosweet_text_undir_weighted_mention_network_thresh_5_glob = (
    '/sosweet-text/undir_weighted_mention_network_thresh_5/*'
)

# Load the data

In [3]:
# Show which files the glob resolves to, one path per line.
print('Reading from:',
      '\n'.join(hdfs.glob(sosweet_text_undir_weighted_mention_network_thresh_5_glob)),
      sep='\n')

# The files are read as header-less CSV with three columns; `timestamp` is
# parsed as a datetime. `persist()` starts the computation on the cluster and
# keeps a handle on the result.
df = dd.read_csv(
    'hdfs://' + HDFS_NAMENODE + sosweet_text_undir_weighted_mention_network_thresh_5_glob,
    header=None,
    names=['user_id', 'timestamp', 'body'],
    parse_dates=['timestamp'],
).persist()

# Last expression of the cell: display the progress widget.
progress(df)

Reading from:
/sosweet-text/undir_weighted_mention_network_thresh_5/2016-02-pipe-user_timestamp_body-csv-pipe-filter_users
/sosweet-text/undir_weighted_mention_network_thresh_5/2016-03-pipe-user_timestamp_body-csv-pipe-filter_users
/sosweet-text/undir_weighted_mention_network_thresh_5/2016-04-pipe-user_timestamp_body-csv-pipe-filter_users
/sosweet-text/undir_weighted_mention_network_thresh_5/2016-05-pipe-user_timestamp_body-csv-pipe-filter_users
/sosweet-text/undir_weighted_mention_network_thresh_5/2016-06-pipe-user_timestamp_body-csv-pipe-filter_users
/sosweet-text/undir_weighted_mention_network_thresh_5/2016-07-pipe-user_timestamp_body-csv-pipe-filter_users
/sosweet-text/undir_weighted_mention_network_thresh_5/2016-08-pipe-user_timestamp_body-csv-pipe-filter_users
/sosweet-text/undir_weighted_mention_network_thresh_5/2016-09-pipe-user_timestamp_body-csv-pipe-filter_users
/sosweet-text/undir_weighted_mention_network_thresh_5/2016-10-pipe-user_timestamp_body-csv-pipe-filter_users
/sosw

A Jupyter Widget

### Using Rust

In [4]:
import nw2vec

In [5]:
# Smoke test of the `nw2vec` module (presumably the Rust-backed extension this
# section is about — confirm): the call returns the sum rendered as a string,
# here '6'.
nw2vec.sum_as_string(3, 3)

'6'

In [6]:
import dask.delayed

In [7]:
# Check that `nw2vec.sum_as_string` can be used inside a dask task graph.
# Wrap it once, then build two lazy calls from the same wrapper.
delayed_sum = dask.delayed(nw2vec.sum_as_string)
x = delayed_sum(3, 4)
y = delayed_sum(5, 6)

# Both results are strings, so `+` concatenates them ('7' + '11') rather than
# adding numbers.
z = dask.delayed(lambda left, right: left + right)(x, y)
z.compute()

'711'

### Getting words

In [55]:
import unicodedata
import re

def strip_accents(s):
    """Return `s` with its combining accent marks removed.

    The string is decomposed with NFD normalization, then every character
    whose Unicode category is 'Mn' (nonspacing mark) is dropped, so that
    e.g. 'é' becomes 'e'.
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')


# A URL is 'http' followed by everything up to the next space.
url_regex = 'http[^ ]+'
url_cregex = re.compile(url_regex)
# Word separators: runs of punctuation and spaces. Written as a raw string so
# that the backslashes reach the regex engine untouched; the pattern value is
# unchanged, but the invalid '\-' string escape is gone.
word_split_regex = r'''[?!;:,.<>'"`=+\-*%$/\\|() ]+'''
word_split_cregex = re.compile(word_split_regex)


def extract_words(text):
    """Split a tweet body into normalized words.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-case, accent-free words in order of appearance, without URLs,
        without tokens starting with '@', and without the retweet marker
        'rt'.
    """
    # Transliterate accents and get lowercase
    text = strip_accents(text).lower()
    # Remove urls first: they contain ':' and '/' which are word separators
    text = url_cregex.sub('', text)
    # Split on runs of ?!;:,.<>'"`=+-*%$/\|() and spaces
    words = word_split_cregex.split(text)
    # Drop empty strings (leading/trailing separators), tokens starting with
    # '@', and the retweet marker. The text is already lower-case at this
    # point, so the marker must be matched as 'rt'; comparing against 'RT'
    # could never match.
    return [word for word in words
            if word and word[0] != '@' and word != 'rt']

In [67]:
def expand(group):
    """Explode tweets into one output row per extracted word.

    Parameters
    ----------
    group : pandas.DataFrame or pandas.Series
        Either a frame with a 'body' column, or a single row of such a frame
        (a Series, as yielded by `DataFrame.iterrows()`).

    Returns
    -------
    pandas.DataFrame
        One row per word returned by `extract_words` for each body. Every
        column other than 'body' is repeated on each row, and the word is
        stored in a 'words' column. The frame is empty (but keeps these
        columns) when no word is extracted.
    """
    if isinstance(group, pd.Series):
        # A single row: its index holds the column names.
        rows = [group]
        other_cols = [col for col in group.index if col != 'body']
    else:
        rows = (row for _, row in group.iterrows())
        other_cols = [col for col in group.columns if col != 'body']

    # Build all records first and create the frame once, instead of joining
    # or concatenating piecewise.
    records = [tuple(row[col] for col in other_cols) + (word,)
               for row in rows
               for word in extract_words(row['body'])]
    return pd.DataFrame(records, columns=other_cols + ['words'])

In [68]:
# Take the first rows of `df` as a small local sample to prototype on.
ldf = df.head()

In [None]:
# NOTE(review): this cell has no execution count, so it was never run to
# completion. With two grouping keys, `get_group` presumably expects a
# (user_id, timestamp) tuple rather than the scalar 0 — confirm the intent
# before re-running on the full dataset.
df.groupby(['user_id', 'timestamp']).get_group(0).compute()

In [70]:
# Group the local sample by its own index and display the group labelled 0.
ldf.groupby(ldf.index).get_group(0)

Unnamed: 0,user_id,timestamp,body
0,624590762,2016-02-02 09:28:43,Si j'ai 10 en histoire je sors grosse bouteill...


In [57]:
# Expand every row of the local sample and stack the per-row results.
# NOTE(review): this cell's execution count (57) is lower than that of the
# cell defining `expand` (67), so the stored output predates the current
# definition — re-run to refresh it.
pd.concat([expand(row) for _, row in ldf.iterrows()])

Unnamed: 0,timestamp,user_id,words
0,2016-02-02 09:28:43,624590762,si
1,2016-02-02 09:28:43,624590762,j
2,2016-02-02 09:28:43,624590762,ai
3,2016-02-02 09:28:43,624590762,10
4,2016-02-02 09:28:43,624590762,en
5,2016-02-02 09:28:43,624590762,histoire
6,2016-02-02 09:28:43,624590762,je
7,2016-02-02 09:28:43,624590762,sors
8,2016-02-02 09:28:43,624590762,grosse
9,2016-02-02 09:28:43,624590762,bouteille
