# Initialisation

In [1]:
import random
import re

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import seaborn as sns

In [2]:
%%time
# Load the passim output (a directory of parquet part files) as a pyarrow Table.
t = pq.read_table('./data/passim_output.parquet')

Ignoring path: ./data/passim_output.parquet/.part-00142-99958234-2046-42a1-a219-16e6daa3408f-c000.snappy.parquet.crc
Ignoring path: ./data/passim_output.parquet/.part-00044-99958234-2046-42a1-a219-16e6daa3408f-c000.snappy.parquet.crc
Ignoring path: ./data/passim_output.parquet/.part-00158-99958234-2046-42a1-a219-16e6daa3408f-c000.snappy.parquet.crc
Ignoring path: ./data/passim_output.parquet/.part-00016-99958234-2046-42a1-a219-16e6daa3408f-c000.snappy.parquet.crc
Ignoring path: ./data/passim_output.parquet/.part-00175-99958234-2046-42a1-a219-16e6daa3408f-c000.snappy.parquet.crc
Ignoring path: ./data/passim_output.parquet/.part-00165-99958234-2046-42a1-a219-16e6daa3408f-c000.snappy.parquet.crc
Ignoring path: ./data/passim_output.parquet/.part-00134-99958234-2046-42a1-a219-16e6daa3408f-c000.snappy.parquet.crc
Ignoring path: ./data/passim_output.parquet/.part-00196-99958234-2046-42a1-a219-16e6daa3408f-c000.snappy.parquet.crc
Ignoring path: ./data/passim_output.parquet/.part-00152-99958234

CPU times: user 7.24 s, sys: 3.98 s, total: 11.2 s
Wall time: 12.4 s


In [3]:
%%time
# Convert the pyarrow Table into a pandas DataFrame used by every later cell.
dataset = t.to_pandas()

CPU times: user 5.37 s, sys: 1.86 s, total: 7.23 s
Wall time: 7.02 s


# Add image URL column and exploration functions
## Image url

In [4]:
def build_iiif_link(row):
    """
    Build the IIIF image URL of the first page of an article.

    Example of a generated URL:
    http://dhlabsrv8.epfl.ch/iiif_letemps/JDG_1995_11_10_38/full/full/0/default.jpg

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``"id"``, formatted like ``JDG-1967-08-31-a_Ar01001``,
        and ``"page_no"``, an iterable of page numbers.

    Returns
    -------
    str or None
        URL of the first listed page, or ``None`` when ``page_no`` is empty.

    Raises
    ------
    ValueError
        If ``id`` contains no underscore, or if its issue part does not split
        into exactly five dash-separated fields.
    """
    # Root of the image URLs.
    base_url = "http://dhlabsrv8.epfl.ch/iiif_letemps/"

    # ids look like JDG-1967-08-31-a_Ar01001; the article part is not used.
    # Splitting once keeps ids whose article part itself contains "_" valid.
    issue_id, _article_id = row["id"].split("_", 1)

    # Issue ids look like JDG-1967-08-31-a: journal, year, month, day;
    # the edition letter is not used.
    journal, year, month, day, _edition = issue_id.split('-')

    # Only the first page is linked, so there is no need to build the others.
    first_page = next(iter(row["page_no"]), None)
    if first_page is None:
        return None

    return "{}{}_{}_{}_{}_{}/{}".format(
        base_url,
        journal,
        year,
        month,
        day,
        first_page,
        "full/full/0/default.jpg"
    )

In [5]:
# Add one IIIF image URL per article by applying build_iiif_link to every row.
dataset.loc[:,"iiif_links"] = dataset.apply(build_iiif_link, axis=1)

## Exploring function

In [6]:
def print_cluster(cluster_id, clusters_df):
    """
    Print every article belonging to one cluster.

    Rows of `clusters_df` whose "cluster" value equals `cluster_id` are
    printed in order: the cluster id and the article id separated by a tab,
    then the article text on the following line.

    Used by `explore_clusters`.
    """
    line_template = "{}\t{}\n{}"
    members = clusters_df.loc[clusters_df["cluster"] == cluster_id]
    for _, article in members.iterrows():
        rendered = line_template.format(
            article["cluster"], article["id"], article["text"]
        )
        print(rendered)

In [7]:
def explore_clusters(clusters_df, n):
    """
    Print `n` clusters picked at random.

    The distinct cluster ids are shuffled and the first `n` of them are
    printed with `print_cluster` (i.e. every article of each picked cluster),
    each followed by a line of asterisks.
    """
    shuffled_ids = list(set(clusters_df["cluster"]))
    random.shuffle(shuffled_ids)
    picked_ids = shuffled_ids[:n]
    for picked in picked_ids:
        print_cluster(picked, clusters_df)
        print("\n************\n")

# Cluster dataset
## Time Coverage

In [8]:
# Pull the first YYYY-MM-DD-like date out of every article id.
# The vectorised extract keeps the same pattern as before; an id without a
# date yields NaN instead of raising IndexError.
dataset.loc[:,'Publication Date'] = dataset['id'].str.extract(
    r"([0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2})", expand=False
)

In [9]:
# Parse the extracted date strings into pandas datetimes so they can be subtracted.
dataset['Publication Date'] = pd.to_datetime(dataset['Publication Date'])

In [10]:
%%time
# Per-cluster span between the latest and the earliest publication date.
# The result is indexed by cluster and its only column is still named
# 'Publication Date' at this point.
cluster_group = dataset[['cluster','Publication Date']].groupby('cluster')
cluster_dataset = cluster_group.max() - cluster_group.min()

CPU times: user 696 ms, sys: 96 ms, total: 792 ms
Wall time: 793 ms


In [35]:
# The span column is still called 'Publication Date' when the notebook is run
# top to bottom, so name it before using it (no-op if already renamed).
cluster_dataset = cluster_dataset.rename(columns={'Publication Date': 'Time Coverage'})
# Convert the timedelta span to a number of days; the dtype guard makes the
# cell safe to re-run once the column has already been converted.
if pd.api.types.is_timedelta64_dtype(cluster_dataset['Time Coverage']):
    cluster_dataset['Time Coverage'] = cluster_dataset['Time Coverage'] / np.timedelta64(1, 'D')

__Note:__ Time Coverage is expressed in days.

## Cluster Size

In [11]:
# Number of articles per cluster.
cluster_dataset.loc[:,'Size'] = dataset['cluster'].value_counts().sort_index()
# Rename by label rather than overwriting `.columns` positionally, which
# raises a length mismatch as soon as the frame holds any other number of
# columns (e.g. when this cell is re-run later in the session).
cluster_dataset = cluster_dataset.rename(columns={'Publication Date': 'Time Coverage'})

## Lexical Overlap
### Functions

In [12]:
def lexicaloverlap(row):
    """
    Count the distinct words shared by every text of a cluster.

    Each text is stripped of punctuation and digits, lower-cased and split on
    whitespace; the result is the size of the intersection of the word sets.

    Parameters
    ----------
    row : mapping or DataFrame
        Must provide ``'text'``, an iterable of strings (one per article).

    Returns
    -------
    int
        Number of distinct words present in all texts; 0 when there is no
        text. A single text yields its number of distinct words.
    """
    # Raw string: the escaped "]" is part of the character class.
    strip_chars = re.compile(r'[().,;:!0-9"{}\][»«]')
    common = None
    for text in row['text']:
        words = set(strip_chars.sub('', text).lower().split())
        common = words if common is None else common & words
        if not common:
            # Nothing shared any more; later texts cannot add words back.
            break
    return 0 if common is None else len(common)

In [13]:
# Order the rows by cluster id.
dataset = dataset.sort_values(by='cluster')

In [14]:
def countmaxlength(row):
    """
    Return the length, in words, of the longest text of a cluster.

    Each text is stripped of punctuation and digits, lower-cased and split on
    whitespace before its words are counted.

    Parameters
    ----------
    row : mapping or DataFrame
        Must provide ``'text'``, an iterable of strings (one per article).

    Returns
    -------
    float
        Largest word count among the texts; 0.0 when there is no text.
    """
    # Raw string: the escaped "]" is part of the character class.
    strip_chars = re.compile(r'[().,;:!0-9"{}\][»«]')
    word_counts = (
        len(strip_chars.sub('', text).lower().split()) for text in row['text']
    )
    # A float is returned to keep the dtype callers got from the previous
    # numpy-based implementation.
    return float(max(word_counts, default=0))

In [15]:
def apply_with_partition(dataset, agg_funct, partition):
    """
    Apply `agg_funct` to every cluster, processing the rows chunk by chunk.

    `partition` is walked in order: each value is the exclusive upper bound of
    a chunk of cluster ids, and the previous value (0 for the first chunk) is
    its inclusive lower bound. The rows of each chunk are grouped by
    "cluster" and `agg_funct` is applied to every group. After each chunk a
    100-character progress bar and the percentage of rows processed so far
    are printed.

    Rows whose cluster id is negative, or not below the last bound, are never
    visited.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a "cluster" column.
    agg_funct : callable
        Called once per cluster with that cluster's rows.
    partition : iterable of numbers
        Increasing cluster-id bounds.

    Returns
    -------
    The per-chunk ``groupby('cluster').apply(agg_funct)`` results concatenated
    with ``pd.concat``.

    NOTE(review): the cells consuming the result assign to ``.columns``, i.e.
    they expect a DataFrame. Whether the concatenation yields a DataFrame or a
    Series depends on what ``groupby.apply`` returns for each chunk (including
    empty ones) - confirm with the pandas version in use before changing how
    chunks are handled.
    """
    total = len(dataset['cluster'])
    processed = 0
    lower = 0
    parts = []
    for upper in partition:
        # Select the chunk once and reuse it for both the count and the apply.
        in_chunk = (dataset['cluster'] >= lower) & (dataset['cluster'] < upper)
        chunk = dataset[in_chunk]
        processed += len(chunk['cluster'])
        parts.append(chunk.groupby('cluster').apply(agg_funct))
        percent = round(processed / total * 100)
        print('{}{} {}%'.format('#' * percent, '.' * (100 - percent), percent))
        lower = upper
    return pd.concat(parts)

### Overlap

In [16]:
# Cluster-id bounds handed to apply_with_partition: each value is the
# exclusive upper bound of one chunk, the previous value its lower bound.
# NOTE(review): the bounds look hand-tuned to this dataset's cluster-id
# distribution - confirm before reusing them on other data.
partition = [0,4000000,10000000000,26000000000,50000000000,65000000000,85000000000,95000000000,130000000000,170000000000,230000000000,1000000000000000]

In [17]:
%%time
# Lexical overlap of every cluster (see lexicaloverlap), computed chunk by chunk.
overlap_serie = apply_with_partition(dataset, lexicaloverlap, partition)

.................................................................................................... 0%
#########........................................................................................... 9%
################.................................................................................... 16%
##############################...................................................................... 30%
#########################################........................................................... 41%
####################################################................................................ 52%
############################################################........................................ 60%
####################################################################................................ 68%
################################################################################.................... 80%
#########################################################

In [18]:
# Name the single column of the result and copy it into cluster_dataset.
# NOTE(review): this assumes apply_with_partition returned a one-column
# DataFrame rather than a Series - confirm with the pandas version in use.
overlap_serie.columns = pd.Index(['Lexical Overlap'])
cluster_dataset.loc[:,'Lexical Overlap'] = overlap_serie['Lexical Overlap']
cluster_dataset.head()

Unnamed: 0_level_0,Time Coverage,Size,Lexical Overlap
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0 days,3,50.0
1,0 days,2,192.0
2,181 days,2,36.0
3,0 days,2,85.0
4,0 days,2,189.0


### Maximal text length

In [19]:
%%time
# Word count of the longest text of every cluster (see countmaxlength), computed chunk by chunk.
maximaltextlength_serie = apply_with_partition(dataset, countmaxlength, partition)

.................................................................................................... 0%
#########........................................................................................... 9%
################.................................................................................... 16%
##############################...................................................................... 30%
#########################################........................................................... 41%
####################################################................................................ 52%
############################################################........................................ 60%
####################################################################................................ 68%
################################################################################.................... 80%
#########################################################

In [20]:
# Name the single column of the result and copy it into cluster_dataset.
# NOTE(review): this assumes apply_with_partition returned a one-column
# DataFrame rather than a Series - confirm with the pandas version in use.
maximaltextlength_serie.columns = pd.Index(['Max text length'])
cluster_dataset.loc[:,'Max text length'] = maximaltextlength_serie['Max text length']

### Relative Lexical Overlap

In [21]:
# Ratio of the lexical overlap to the word count of the cluster's longest text.
cluster_dataset['Relative Overlap']=cluster_dataset['Lexical Overlap']/cluster_dataset['Max text length']

# Save Cluster Dataset

In [40]:
# Persist the per-cluster features in two formats: pickle and CSV.
cluster_dataset.to_pickle('./data/cluster_dataset.pickle')
cluster_dataset.to_csv('./data/cluster_dataset.csv')