In [1]:
from impresso import connect, AND, OR

impresso = connect()

🎉 You are now connected to the Impresso API!  🎉
🔗 Using API: https://dev.impresso-project.ch/public-api/v1


# Text reuse clusters

## Find clusters mentioning text

In [2]:
impresso.text_reuse.clusters.find(
    term="banana",
    offset=2,
    limit=5,
    order_by="passages-count",
)

Unnamed: 0_level_0,lexicalOverlap,clusterSize,textSample,timeCoverage.startDate,timeCoverage.endDate
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tr-all-v1-24-c77310101729,79.439252,2,"sont pas admis.\nTaxe fr. 3, 50 ( bonification...",1887-06-22,1887-06-23
tr-all-v1-24-c77310101732,39.74359,2,"sont pas admis.\nTaxe fr. 3, 50 ( bonification...",1887-06-23,1887-06-23
tr-all-v1-24-c244588,78.823529,2,Suisses à l’étranger. — La Suisse aura mainten...,1884-08-20,1884-08-21


### Pagination

In [3]:
result = impresso.text_reuse.clusters.find(
    term="banana",
    limit=5,
    order_by="passages-count",
)

print(f"Total items in the result set: {result.total}. Limit: {result.limit}. Offset: {result.offset}. Size: {result.size}.")
for page in result.pages():
    print(
        f"Got page {page.offset} - {page.offset + page.size} of {page.total}. "
        + f"The first title is {page.raw['data'][0]['uid']}"
    )

Total items in the result set: 10. Limit: 5. Offset: 0. Size: 5.
Got page 0 - 5 of 10. The first title is tr-all-v1-24-c8590049316
Got page 5 - 10 of 10. The first title is tr-all-v1-24-c51540329030


## Cluster size

In [4]:
impresso.text_reuse.clusters.find(
    term="banana",
    cluster_size=(50, 100),
)

## Mentioning text in title

In [5]:
impresso.text_reuse.clusters.find(
    title=AND("luxembourg", "suisse"),
    cluster_size=(100, 200),
)

## Lexical overlap

In [6]:
impresso.text_reuse.clusters.find(
    term="banana",
    lexical_overlap=(50, 51),
)

## Day delta

Number of days between the first and last mention of the text in the cluster.

In [7]:
impresso.text_reuse.clusters.find(
    term="banana",
    day_delta=(50, 100),
)

Unnamed: 0_level_0,lexicalOverlap,clusterSize,textSample,timeCoverage.startDate,timeCoverage.endDate
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tr-all-v1-24-c137439334025,78.536585,2,Les Sociétés anonymes suisses\npendant la cris...,1937-05-08,1937-07-10


## Date range

In [8]:
from impresso import DateRange

impresso.text_reuse.clusters.find(
    term="banana", 
    date_range=DateRange("1921-05-21", "2001-01-02")
)

Unnamed: 0_level_0,lexicalOverlap,clusterSize,textSample,timeCoverage.startDate,timeCoverage.endDate
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tr-all-v1-24-c85900161645,53.061224,4,werden.\nAls Reise-Neuheiten präsentiert die p...,1977-01-20,1977-02-09
tr-all-v1-24-c77310034000,52.046784,2,Nos brochures con-\ntiennent davantage d'atout...,1977-04-06,1977-04-15
tr-all-v1-24-c51540329030,90.47619,2,Un slogan pour les catalogues Popularis en cou...,1977-04-06,1977-04-15


## Newspaper

In [9]:
impresso.text_reuse.clusters.find(
    term="banana", 
    newspaper_id=OR("EXP", "GDL"),
)

## Collection

In [10]:
impresso.text_reuse.clusters.find(collection_id="12312312")

## Front page

In [11]:
impresso.text_reuse.clusters.find(
    term="banana", 
    front_page=True,
)

## Topic

In [12]:
impresso.text_reuse.clusters.find(
    term="banana", 
    topic_id=OR("tm-fr-all-v2.0_tp07_fr", "tm-fr-all-v2.0_tp48_fr")
)

## Language

In [13]:
impresso.text_reuse.clusters.find(
    term="luxembourg", 
    language=OR("it", "en")
)

Unnamed: 0_level_0,lexicalOverlap,clusterSize,textSample,timeCoverage.startDate,timeCoverage.endDate
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tr-all-v1-24-c8589980257,66.153846,2,The British- Luxembourg Society\nbeg to call t...,1948-04-01,1948-04-01
tr-all-v1-24-c336675,76.923077,2,"Amsterdam, Atlanta, Barcelona, Brussels, Chica...",1989-12-02,1990-04-13
tr-all-v1-24-c10636,18.84058,30,INTERNATIONAL\nHOLDING S.A.\nSociété Anonyme\n...,1975-07-11,1997-03-26


## Country

In [14]:
impresso.text_reuse.clusters.find(
    term="schengen", 
    country=OR("FR", "CH")
)

Unnamed: 0_level_0,lexicalOverlap,clusterSize,textSample,timeCoverage.startDate,timeCoverage.endDate
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tr-all-v1-24-c68720183505,92.741935,2,Asile en Suisse\nRien ne va plus!\nIl faut en ...,1991-02-20,1991-02-27


## Entity mention

In [15]:
impresso.text_reuse.clusters.find(
    term="schengen", 
    mention=OR("Belval", "Lausanne")
)

## Entity by ID

In [16]:
impresso.text_reuse.clusters.find(
    term="banana",
    entity_id="aida-0001-54-Switzerland"
)

# Text reuse clusters facets

## Date range

In [17]:
impresso.text_reuse.clusters.facet("daterange", lexical_overlap=(1, 2))

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
1738-01-01T00:00:00Z,0
1739-01-01T00:00:00Z,0
1740-01-01T00:00:00Z,0


### Pagination

In [18]:
result = impresso.text_reuse.clusters.facet("daterange", lexical_overlap=(1, 2), limit=50)

print(f"Total items in the result set: {result.total}. Limit: {result.limit}. Offset: {result.offset}. Size: {result.size}.")
for page in result.pages():
    print(
        f"Got page {page.offset} - {page.offset + page.size} of {page.total}. "
        + f"The first title is {page.raw['data'][0]['value']}"
    )

Total items in the result set: 281. Limit: 50. Offset: 0. Size: 50.
Got page 0 - 50 of 281. The first title is 1738-01-01T00:00:00Z
Got page 50 - 100 of 281. The first title is 1788-01-01T00:00:00Z
Got page 100 - 150 of 281. The first title is 1838-01-01T00:00:00Z
Got page 150 - 200 of 281. The first title is 1888-01-01T00:00:00Z
Got page 200 - 250 of 281. The first title is 1938-01-01T00:00:00Z
Got page 250 - 281 of 281. The first title is 1988-01-01T00:00:00Z


## Cluster size

In [19]:
impresso.text_reuse.clusters.facet(
    "textReuseClusterSize",
    lexical_overlap=(1, 2)
)

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
1,2647
4223,0
8445,0


## Lexical overlap

In [20]:
impresso.text_reuse.clusters.facet(
    "textReuseClusterLexicalOverlap",
    lexical_overlap=(1, 2)
)

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
0,2647
10,0
20,0


## Day delta

In [21]:
impresso.text_reuse.clusters.facet(
    "textReuseClusterDayDelta",
    lexical_overlap=(1, 2)
)

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
0,2176
9470,284
18940,87


## Newspaper

In [22]:
impresso.text_reuse.clusters.facet(
    "newspaper",
    lexical_overlap=(1, 2)
)

Unnamed: 0_level_0,count,label
value,Unnamed: 1_level_1,Unnamed: 2_level_1
AV,1,Indicateur de Lausanne
BNN,10,Bündner Nachrichten
CDV,12,Courrier du Valais


# Text reuse passages

## Find passages mentioning text

In [23]:
impresso.text_reuse.passages.find(
    term="belval",
    offset=2,
    limit=5,
    order_by="clusterSize",
)

Unnamed: 0_level_0,content,contentItemId,offset.start,offset.end
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c103079374309-tageblatt-1946-04-12-a-i0117@23:645,Mme Jean BOULTGEN et son fils\nNicolas et les ...,tageblatt-1946-04-12-a-i0117,23,645
c103079419312-JV-1875-01-09-a-i0001@6137:7961,"se trouvaient MM. Buffet, le duc d’Audiffret-\...",JV-1875-01-09-a-i0001,6137,7961
c103079498166-tageblatt-1933-02-28-a-i0073@225:1486,"à Esch-sur-Alzette, rue de l'Industrie 48;\nPo...",tageblatt-1933-02-28-a-i0073,225,1486


### Pagination

In [24]:
result = impresso.text_reuse.passages.find(
    term="belval",
    offset=2,
    limit=50,
    order_by="clusterSize",
)

print(f"Total items in the result set: {result.total}. Limit: {result.limit}. Offset: {result.offset}. Size: {result.size}.")
for page in result.pages():
    print(
        f"Got page {page.offset} - {page.offset + page.size} of {page.total}. "
        + f"The first title is {page.raw['data'][0]['uid']}"
    )

Total items in the result set: 360. Limit: 50. Offset: 2. Size: 50.
Got page 2 - 52 of 360. The first title is c103079374309-tageblatt-1946-04-12-a-i0117@23:645
Got page 52 - 102 of 360. The first title is c137439596821-tageblatt-1946-12-06-a-i0087@16:897
Got page 102 - 152 of 360. The first title is c206159066754-tageblatt-1946-01-17-a-i0053@0:706
Got page 152 - 202 of 360. The first title is c51539607640-tageblatt-1949-07-21-a-i0052@827:1730
Got page 202 - 252 of 360. The first title is c68719908844-buergerbeamten-1911-04-01-b-i0009@2802:3174
Got page 252 - 302 of 360. The first title is c94489329094-tageblatt-1947-08-21-a-i0053@331:679
Got page 302 - 352 of 360. The first title is c94489496965-dunioun-1946-12-04-a-i0082@0:5041
Got page 352 - 360 of 360. The first title is c4836-dunioun-1947-06-27-a-i0120@589:1072


### Find passages for a cluster by its ID

In [25]:
impresso.text_reuse.passages.find(
    cluster_id="tr-nobp-all-v01-c137438978332",
    order_by="clusterSize",
)

## Cluster size

In [26]:
impresso.text_reuse.passages.find(
    term="banana",
    cluster_size=(50, 100),
)

## Mentioning text in title

In [27]:
impresso.text_reuse.passages.find(
    title=AND("luxembourg", "suisse"),
    cluster_size=(100, 200),
)

## Lexical overlap

In [28]:
impresso.text_reuse.passages.find(
    term="banana",
    lexical_overlap=(50, 51),
)

## Day delta

Number of days between the first and last mention of the text in the cluster.

In [29]:
impresso.text_reuse.passages.find(
    term="banana",
    day_delta=(50, 100),
)

Unnamed: 0_level_0,content,contentItemId,offset.start,offset.end
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c137439334025-LLS-1937-05-08-a-i0020@0:2709,Les Sociétés anonymes suisses\npendant la cris...,LLS-1937-05-08-a-i0020,0,2709


## Date range

In [30]:
from impresso import DateRange

impresso.text_reuse.passages.find(
    term="banana", 
    date_range=DateRange("1921-05-21", "2001-01-02")
)

Unnamed: 0_level_0,content,contentItemId,offset.start,offset.end
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c85900161645-SMZ-1977-02-02-a-i0036@629:2044,werden. Als Reise-Neuheiten präsentiert Popula...,SMZ-1977-02-02-a-i0036,629,2044
c85900161645-OIZ-1977-01-27-a-i0037@627:2010,werden.\nAls Reise-Neuheiten präsentiert die p...,OIZ-1977-01-27-a-i0037,627,2010
c85900161645-VHT-1977-02-09-a-i0022@630:2034,werden.\nAls Reise-Neuheiten präsentiert die p...,VHT-1977-02-09-a-i0022,630,2034


## Newspaper

In [31]:
impresso.text_reuse.passages.find(
    term="banana", 
    newspaper_id=OR("EXP", "GDL"),
)

## Collection

In [32]:
impresso.text_reuse.passages.find(collection_id="12312312")

## Front page

In [33]:
impresso.text_reuse.passages.find(
    term="banana", 
    front_page=True,
)

## Topic

In [34]:
impresso.text_reuse.passages.find(
    term="banana", 
    topic_id=OR("tm-fr-all-v2.0_tp07_fr", "tm-fr-all-v2.0_tp48_fr")
)

## Language

In [35]:
impresso.text_reuse.passages.find(
    term="luxembourg", 
    language=OR("it", "en")
)

Unnamed: 0_level_0,content,contentItemId,offset.start,offset.end
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c8589980257-dunioun-1948-04-01-a-i0009@0:509,The British- Luxembourg Society\nbeg to call t...,dunioun-1948-04-01-a-i0009,0,509
c10636-luxland-1982-07-16-a-i0048@10:631,"Société Anonyme\nHeadoffice:\nLuxembourg,\n47,...",luxland-1982-07-16-a-i0048,10,631
c10636-luxland-1987-10-09-a-i0050@17:566,"Société Anonyme\nLuxembourg,\n37, rue Notre-Da...",luxland-1987-10-09-a-i0050,17,566


## Country

In [36]:
impresso.text_reuse.passages.find(
    term="schengen", 
    country=OR("FR", "CH")
)

Unnamed: 0_level_0,content,contentItemId,offset.start,offset.end
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c68720183505-SDT-1991-02-27-a-i0012@0:9876,Asile en Suisse\nRien ne va plus!\nIl faut en ...,SDT-1991-02-27-a-i0012,0,9876
c68720183505-LLS-1991-02-20-a-i0016@0:9916,Asile en Suisse\nRien ne va plus!\nIl faut en ...,LLS-1991-02-20-a-i0016,0,9916


## Entity mention

In [37]:
impresso.text_reuse.passages.find(
    term="schengen", 
    mention=OR("Belval", "Lausanne")
)

## Entity by ID

In [38]:
impresso.text_reuse.passages.find(
    term="banana",
    entity_id="aida-0001-54-Switzerland"
)

# Text reuse passages facets

## newspaper

In [39]:
impresso.text_reuse.clusters.facet("newspaper")

Unnamed: 0_level_0,count,label
value,Unnamed: 1_level_1,Unnamed: 2_level_1
ACI,1,Almanach pour le commerce
AV,206,Indicateur de Lausanne
BDC,226,Bulletin des séances de la Constituante


### Pagination

In [40]:
result = impresso.text_reuse.clusters.facet("newspaper", limit=50)

print(f"Total items in the result set: {result.total}. Limit: {result.limit}. Offset: {result.offset}. Size: {result.size}.")
for page in result.pages():
    print(
        f"Got page {page.offset} - {page.offset + page.size} of {page.total}. "
        + f"The first title is {page.raw['data'][0]['label']}"
    )

Total items in the result set: 130. Limit: 50. Offset: 0. Size: 50.
Got page 0 - 50 of 130. The first title is Almanach pour le commerce
Got page 50 - 100 of 130. The first title is Le Courrier fribourgeois
Got page 100 - 130 of 130. The first title is Excelsior


## daterange

In [41]:
impresso.text_reuse.passages.facet(
    "daterange",
    term="banana"
)

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
1883-07-01T00:00:00Z,1
1884-07-01T00:00:00Z,1
1885-07-01T00:00:00Z,2


## yearmonth

In [42]:
impresso.text_reuse.passages.facet(
    "yearmonth",
    term="banana"
)

## year

In [43]:
impresso.text_reuse.passages.facet(
    "year",
    term="banana"
)

## connectedClusters

In [44]:
impresso.text_reuse.passages.facet(
    "connectedClusters",
    term="banana"
)

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
tr-all-v1-24-c103079453365,1
tr-all-v1-24-c111669295627,2
tr-all-v1-24-c111669295628,1


## textReuseClusterSize

In [45]:
impresso.text_reuse.passages.facet(
    "textReuseClusterSize",
    term="banana"
)

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
2,16
252,0
502,0


## textReuseClusterLexicalOverlap

In [46]:
impresso.text_reuse.passages.facet(
    "textReuseClusterLexicalOverlap",
    term="banana"
)

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
0,0
1,0
2,0


## textReuseClusterDayDelta

In [47]:
impresso.text_reuse.passages.facet(
    "textReuseClusterDayDelta",
    term="banana"
)

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
0,16
118,0
236,0


## textReuseCluster

In [48]:
impresso.text_reuse.passages.facet(
    "textReuseCluster",
    term="banana"
)

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
tr-all-v1-24-c111669295627,2
tr-all-v1-24-c137439334025,1
tr-all-v1-24-c244588,1


## collection

In [49]:
impresso.text_reuse.passages.facet(
    "collection",
    term="banana"
)

## topic

In [50]:
impresso.text_reuse.passages.facet("topic", term="banana")

## person

In [51]:
impresso.text_reuse.passages.facet("person", term="banana")

## location

In [52]:
impresso.text_reuse.passages.facet("location", term="banana")

## nag

In [53]:
impresso.text_reuse.passages.facet("nag", term="banana")

## language

In [54]:
impresso.text_reuse.passages.facet("language", term="banana")

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
de,6
fr,10


## country

In [55]:
impresso.text_reuse.passages.facet("country", term="banana")

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
CH,16
