# BL : Notebook to assign the mapping title - NLP - alias, and copy the data and images accordingly in a correct structure

We have received the BL data, but we now have to verify what we received and the actual years spanned, as well as decide on the working title we will use for each newspaper and assign a unique alias to each of them.

In addition, we have to ensure that we have a correct title -> NLPs mapping since the files are organized by NLP in the file structure.

This will allow us to reorganise the existing data to reflect this organization, and ensure that the file structure (directory tree) is the one expected by our later processing.

The file `/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/BL/BL-title-alias-mapping.csv` contains the current list of newspapers, NLPs, assigned aliases and actual identified years. 
It can thus be used to check this 1-1 mapping, and can then be imported back into Google Sheets so that all sources hold the correct data.

In [138]:
from impresso_essentials.utils import KNOWN_JOURNALS

import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from datetime import datetime

## 1. Assigning unique aliases for each of the working titles in the collection

The aliases have been generated by querying ChatGPT, after providing it with a list of the existing aliases found in the BL data (which exist for only some of the titles).

### The final mapping of working title to alias for the new BL data

We will of course ensure that all of these aliases are unique, with respect to each other as well as the list of existing Impresso aliases

In [109]:
# Mapping from each normalized BL working title to the alias assigned to it.
# The keys are looked up with the values of the 'Normalized Working Title' column of
# the extended title list in the sanity check further down, so their spelling has to
# match that column exactly.
# NOTE(review): most aliases are 4 characters long, but a few are 3 characters
# (LWC, LWI) or 5 characters (ILT53, LPNGA, AGE52, EVT25, LAGER, TTK22) long --
# confirm that this is acceptable for the downstream processing.
bl_aliases = {
    'Aberdeen Press and Journal': 'ANJO',
    'Alston Herald and East Cumberland Advertiser': 'AHEC',
    "Baldwin's London Weekly Journal": 'BLWJ',
    'Baner ac Amserau Cymru': 'BNER',
    'Bargoed Journal': 'BGJO',
    'Barnsley Telephone': 'BTEP',
    "Bell's Family Newspaper": 'BFNP',
    "Bell's News": 'BELL',
    "Bell's Penny Dispatch": 'BPDH',
    "Berrow's Worcester Journal": 'WOJL',
    "Berthold's Political Handkerchief": 'BPHF',
    'Birmingham Daily Post': 'BDPO',
    'Blandford Weekly News': 'BWNW',
    'Bradford Observer': 'BROR',
    'Bridgend Chronicle': 'BGCH',
    'Bridlington and Quay Gazette': 'BQGA',
    'Bridport, Beaminster, and Lyme Regis Telegram': 'BBLT',
    'Brief': 'BRIF',
    'Brighouse & Rastrick Gazette': 'BRGA',
    'Brighton Patriot': 'BRPT',
    'British Army Despatch': 'BRAD',
    'British Mercury or Wednesday Evening Post': 'BRMW',
    'British Miner and General Newsman': 'BRMG',
    'Caledonian Mercury': 'CNMR',
    "Charles Knight's Town & Country Newspaper": 'CKTC',
    'Chelsea & Pimlico Advertiser': 'CPAD',
    'Cheshire Observer': 'CHOR',
    'Christian Times': 'CHTI',
    'City of London Trade Protection Circular': 'CLTP',
    "Cleave's Weekly Police Gazette": 'CWPG',
    "Cobbett's Evening Post": 'CBEP',
    "Cobbett's Weekly Political Register": 'CWPR',
    'Colored News': 'CLNW',
    'Common Sense': 'CMSN',
    'Cradley Heath & Stourbridge Observer': 'CHSO',
    'Daily Gazette For Middlesbrough': 'DGMH',
    'Daily News': 'DNLN',
    'Daily Politician': 'DPLT',
    'Darlington & Richmond Herald': 'DRHE',
    'Denton and Haughton Examiner': 'DHEX',
    'Derby Mercury': 'DYMR',
    'Dewsbury Chronicle and West Riding Advertiser': 'DCWA',
    'Dorset County Express and Agricultural Gazette': 'DCEA',
    "Douglas Jerrold's Weekly Newspaper": 'DJWN',
    "Duckett's Dispatch": 'DDIS',
    'Dundee Courier': 'DUCR',
    'East London Advertiser': 'ELAD',
    'East Wind': 'EAWN',
    'Exeter Flying Post': 'TEFP',
    'Finsbury Free Press': 'FFPR',
    "Fleming's British Farmers' Chronicle": 'FBFC',
    "Fleming's Weekly Express": 'FWEX',
    'Fonetic Nuz': 'FONU',
    "Francis's Metropolitan News": 'FMNW',
    "Freeman's Journal": 'FRJO',
    'Glasgow Courier': 'GLCO',
    "Glasgow Herald": "GWHD",
    "Glasgow Sentinel": "GLSE",
    "Golden Times": "GOTM",
    "Halifax Comet": "HLCM",
    "Hampshire Telegraph": "HPTE",
    "Haslingden Gazette": "HAGZ",
    "Hetherington's Twopenny Dispatch": "HTWD",
    "High Life in London": "HLLN",
    "Holt's Weekly Chronicle": "HWCH",
    "Hour": "HOUR",
    "Huddersfield Chronicle": "HUCE",
    "Hull Packet": "HLPA",
    "Illustrated Crystal Palace Gazette": "ICPG",
    "Illustrated London Life": "ILOL",
    "Illustrated Midland News": "IMNW",
    "Illustrated Sporting News and Theatrical and Musical Review": "ISNT",
    "Illustrated Times 1853": "ILT53",
    "Illustrated Weekly Times": "ILWT",
    "Irvine Express": "IREX",
    "Isle of Wight Observer": "IWOR",
    "Islington Times": "ISTM",
    "Jewish Record": "JWRC",
    "Johnson's Sunday Monitor": "JSMN",
    "Kenilworth Advertiser": "KEAD",
    "Lancaster Standard and County Advertiser": "LSCA",
    "Leeds Intelligencer": "LSIR",
    "Leicester Chronicle": "LECH",
    "Liverpool Mercury": "LVMR",
    "Liverpool Standard and General Commercial Advertiser": "LSGA",
    "Liverpool Weekly Courier": "LWC",
    "Lloyd's Companion to the Penny Sunday Times and Peoples' Police Gazette": "LCPP",
    "Lloyd's Weekly Newspaper": "LINP",
    "London & Provincial News and General Advertiser": "LPNGA",
    "London Dispatch": "LNDH",
    "London Halfpenny Newspaper": "LHPN",
    "London Journal and General Advertiser for Town and Country": "LJGA",
    "London Life": "LNLF",
    "London Moderator and National Adviser": "LMNA",
    "London Railway Newspaper": "LRNW",
    "The London News Letter and Price Current": "LNPC",
    "Manchester Examiner": "MEXM",
    "Manchester Times": "MRTM",
    "Mirror of the Times": "MRTT",
    "Morning Chronicle": "MCLN",
    "Morning Herald": "MRHD",
    "Morning Post": "MOPT",
    "Nantwich, Sandbach & Crewe Star": "NSCS",
    "National Register": "NTRG",
    "Nelson Chronicle, Colne Observer and Clitheroe Division News": "NCCO",
    "New Court Gazette": "NCGA",
    "New Times": "NWTM",
    "Nonconformist Elector": "NCEF",
    "North London Record": "NLRD",
    "North Wales Chronicle": "NRWC",
    "Northern Echo": "NREC",
    "Northern Liberator": "NRLR",
    "Northern Star and Leeds General Advertiser": "NRSR",
    "Northern Weekly Gazette": "NWGZ",
    "Old England": "OLEN",
    "Orr's Kentish Journal": "OKJL",
    "Oxford Journal": "JOJL",
    "Passing Events": "PSEV",
    "Pen and Pencil": "PNPC",
    "Penistone, Stocksbridge and Hoyland Express": "PSHE",
    "Pictorial Times": "PICT",
    "Picture Times": "PITM",
    "Pierce Egan's Life in London, and Sporting Guide": "PELL",
    "Poole Telegram": "POTG",
    "Preston Pilot": "PRPL",
    "Reynold's Newspaper": "RDNP",
    "Ripon Observer": "RIOB",
    "Royal Cornwall Gazette": "COGE",
    "Royal York": "RYRK",
    "Runcorn Examiner": "RUEX",
    "Sainsbury's Weekly Register and Advertising Journal": "SWRJ",
    "Sheffield Public Advertiser": "SHPA",
    "South London Advertiser": "SLAD",
    "South London Times and Lambeth Observer": "SLTL",
    "Southern Star": "SNSR",
    "Southwark Mercury": "SWME",
    "Sport": "SPRT",
    "Stalybridge Examiner": "STEX",
    "Stockton Herald, South Durham and Cleveland Advertiser": "SHSD",
    "Stretford and Urmston Examiner": "STUE",
    "Sunday Gazette": "SUGA",
    "Sunday News": "SUNW",
    "Surrey & Middlesex Standard": "SMSD",
    "Surrey Herald and County Advertiser": "SHCA",
    "Surrey Mercury": "SURY",
    "Swansea and Glamorgan Herald": "SGHL",
    "Swansea Journal and South Wales Liberal": "SJWL",
    "Thacker's Overland News for India and the Colonies": "TONI",
    "The Age (London)": "TALN",
    "The Age 1852": "AGE52",
    "The Agricultural Advertiser and Tenant-Farmers' Advocate": "AATA",
    "The Albion": "ALBN",
    "The Albion and the Star": "ALST",
    "The Anti-Gallican Monitor": "AGMO",
    "The Argus, or, Broad-sheet of the Empire": "ARGB",
    "The Atherstone, Nuneaton, and Warwickshire Times": "ANWT",
    "The Aurora Borealis": "AUBO",
    "The Ballot": "BLOT",
    "The Barrow Herald and Furness Advertiser": "BHFA",
    "The Bath Chronicle": "BHCH",
    "The Beacon (Edinburgh)": "BCE1",
    "The Beacon (London)": "BCL2",
    "The Bee-Hive": "BEHI",
    "The Belfast News-Letter": "BNWL",
    "The Birkenhead News": "BKNW",
    "The Blackburn Standard": "BLSD",
    "The Blackpool Herald": "BLHD",
    "The Blandford and Wimborne Telegram": "BWTE",
    "The Borough of Greenwich Free Press": "BGFP",
    "The Bristol Mercury": "BLMY",
    "The British Banner": "BRBN",
    "The British Emancipator": "BREM",
    "The British Ensign": "BREN",
    "The British Liberator": "BRLB",
    "The British Luminary": "BRLU",
    "The British Neptune": "BRNP",
    "The British Press": "BRPR",
    "The British Standard": "BRST",
    "The British Statesman": "BRSS",
    "The Brunswick, or, True Blue": "BRTB",
    "The Bury and Norwich Post": "BNPT",
    "The Cannock Chase Examiner": "CCEX",
    "The Censor or Satirical Times": "CSTT",
    "The Central Glamorgan Gazette": "CGGA",
    "The Champion (London)": "CHPL",
    "The Champion": "CHPN",
    "The Charter": "CHTR",
    "The Chartist": "CHTT",
    "The City Chronicle": "CICN",
    "The Civil & Military Gazette": "CMGA",
    "The Clerkenwell Dial and Finsbury Advertiser": "CLDF",
    "The Colonist and Commercial Weekly Advertiser": "CCWA",
    "The Commercial Chronicle": "CMCH",
    "The Constitution": "CNSN",
    "The Cosmopolitan": "CSMP",
    "The Cotton Factory Times": "CFTM",
    "The Courier": "COUR",
    "The Court Gazette and Fashionable Guide": "CGFG",
    "The Crim. Con. Gazette": "CCGZ",
    "The Crown": "CRWN",
    "The Daily Director and Entr'acte": "DDEN",
    "The Day": "TDAY",
    "The Dewsbury Chronicle and West Riding Advertiser": "DCWR",
    "The Dial": "TDIA",
    "The Dissenter": "DSNR",
    "The East Riding Telegraph": "ERTG",
    "The Eastern Star": "EAST",
    "The Emigrant and the Colonial Advocate": "ECLA",
    "The English Chronicle and Whitehall Evening Post": "ECWP",
    "The Englishman": "ENGL",
    "The Era": "ERLN",
    "The Essex Standard": "ESSD",
    "The Evening Star": "EVST",
    "The Evening Times (London)": "EVTL",
    "The Evening Times 1825": "EVT25",
    "The Examiner": "EXLN",
    "The Express": "EXPR",
    "The Forest of Dean Examiner": "FODE",
    "The General Evening Post": "GEVP",
    "The Glasgow Chronicle": "GLCH",
    "The Graphic": "GCLN",
    "The Hammersmith Advertiser": "HMSA",
    "The Hampshire Advertiser": "SOHD",
    "The Hebrew Observer": "HBOV",
    "The Herald of Wales": "HOWL",
    "The Illustrated Newspaper": "ILNP",
    "The Illustrated Police News": "HPNW",
    "The Imperial Weekly Gazette": "IWGZ",
    "The Instructor and Select Weekly Advertiser": "ISWA",
    "The Ipswich Journal": "IPJO",
    "The Isle of Man Times": "IMTS",
    "The Kingsland Times and General Advertiser": "KTGA",
    "The Lady's Newspaper and Pictorial Times": "LNPT",
    "The Lady's Own Paper": "LOPA",
    "The Lancaster Gazette": "LAGER",
    "The Lancaster Herald and Town and County Advertiser": "LHTC",
    "The Leeds Mercury": "LEMR",
    "The Little Times": "LTIM",
    "The Liverpool Albion": "LIAL",
    "The Liverpool Chronicle": "LIVC",
    "The Liverpool Telegraph": "LITG",
    "The London & China Herald": "LCHH",
    "The London and Liverpool Advertiser": "LLAD",
    "The London and Scottish Review": "LSCR",
    "The London Chronicle": "LNCH",
    "The London Chronicle and Country Record": "LCCR",
    "The London Daily Guide and Stranger's Companion": "LDGS",
    "The London Evening Post": "LEVP",
    "The London Free Press": "LFPR",
    "The London Illustrated Weekly": "LIWL",
    "The London Journal and Pioneer Newspaper": "LJPN",
    "The London Mercury": "LNM1",
    "The London Mercury 1836": "LNM2",
    "The London Mercury 1847": "LNM3",
    "The London Mirror": "LONM",
    "The London Packet and New Lloyd's Evening Post": "LPNL",
    "The London Phalanx": "LOPH",
    "The London Scotsman": "LSCT",
    "The London Telegraph": "LTLG",
    "The London Weekly Investigator": "LWI",
    "The Man about Town": "MATN",
    "The Manchester Examiner": "MEXA",
    "The Metropolitan": "MTPN",
    "The Midland Examiner and Wolverhampton Times": "MEWT",
    "The Monthly Times": "MNTM",
    "The Morning Gazette": "MOGA",
    "The Morning Mail": "MOMA",
    "The Nation": "NATN",
    "The National": "NTNL",
    "The National Protector": "NTPR",
    "The National Standard": "NTSD",
    "The New Globe": "NGLB",
    "The New Weekly True Sun": "NWTS",
    "The Newcastle Courant": "NECT",
    "The News": "TNEW",
    "The North Cumberland Reformer": "NCRF",
    "The North Londoner": "NLON",
    "The North-West London Times": "NWLT",
    "The Northern Daily Times": "NDTM",
    "The Northern Guardian": "NOGU",
    "The Nottinghamshire Guardian": "NOGN",
    "The Nuneaton Times": "NUNT",
    "The Observer of the Times": "OBTM",
    "The Odd Fellow": "ODFW",
    "The Operative": "OPTE",
    "The Oracle and the Daily Advertiser": "ORDA",
    "The Paddington Advertiser": "PADV",
    "The Pall Mall Gazette": "PMGZ",
    "The Palladium": "PLDM",
    "The Patriot": "PATR",
    "The People's Hue and Cry or Weekly Police Register": "PHCW",
    "The People's Paper": "PPLP",
    "The Pilot": "PLTO",
    "The Pioneer and Weekly Record of Movements": "PWRM",
    "The Planet": "PLNT",
    "The Political Letter": "POLL",
    "The Political Observer": "PLOB",
    "The Pontypridd District Herald": "PDHD",
    "The Poor Man's Guardian": "PMGU",
    "The Porcupine": "PORC",
    "The Potteries Examiner": "POEX",
    "The Press": "TPRS",
    "The Preston Chronicle and Lancashire Advertiser": "PNCH",
    "The Public Cause": "PUCA",
    "The Radical": "RADL",
    "The Railway Bell and London Advertiser": "RBLD",
    "The Reformer": "REFM",
    "The Representative": "REPR",
    "The Saint James's Chronicle": "SJCH",
    "The Satirist; or, the Censor of the Times": "SATR",
    "The Sheffield Independent": "SHIN",
    "The Shropshire Examiner": "SHRE",
    "The Slaithwaite Guardian and Colne Valley News": "SGCV",
    "The South Staffordshire Examiner": "SSEX",
    "The St. Helens Examiner, and Prescot Weekly News": "SHEP",
    "The Standard": "SDLN",
    "The Standard of Freedom": "SOFR",
    "The Star": "STGY",
    "The Stockton Examiner and South Durham and North Yorkshire Herald": "SESD",
    "The Sun": "TSUN",
    "The Sun & Central Press": "SCPR",
    "The Sunday Evening Globe": "SEGL",
    "The Sunday Morning Herald": "SMHE",
    "The Sussex & Surrey Chronicle": "SSCH",
    "The Tamworth Miners' Examiner and Working Men's Journal": "TMEW",
    "The Tichborne Gazette": "TIGA",
    "The Tichborne News and Anti-Oppression Journal": "TNAJ",
    "The Tower Hamlets Mail": "THML",
    "The Trades' Free Press": "TFPR",
    "The True Briton": "TRBT",
    "The True Sun": "TRSN",
    "The Union": "TUNI",
    "The Universe": "UNIV",
    "The Verulam": "VERL",
    "The Vindicator": "VIND",
    "The Warrington Examiner": "WAEX",
    "The Warwickshire Herald": "WAHD",
    "The Watchman": "WTCH",
    "The Week's News": "WKNW",
    "The Weekly Advertiser": "WKAD",
    "The Weekly Chronicle": "WKCH",
    "The Weekly Echo": "WKEC",
    "The Weekly Globe": "WKGB",
    "The Weekly Independent": "WKIN",
    "The Weekly Intelligence": "WKIT",
    "The Weekly Journal": "WKJL",
    "The Weekly Mail": "WKML",
    "The Weekly Review": "WKRV",
    "The Weekly Star and Bell's News": "WSBN",
    "The Wellington Gazette and Military Chronicle": "WGMC",
    "The West End News": "WENW",
    "The West London Times": "WLTM",
    "The Westminster Times": "WMTM",
    "The Weymouth Telegram": "WMTG",
    "The World": "WRLD",
    "The World and Fashionable Sunday Chronicle": "WFSC",
    "The York Herald": "YOHD",
    "Town & Country Daily Newspaper": "TCDN",
    "Town and Country Advertiser": "TCAA",
    "Town Talk": "TTLK",
    "Town Talk 1822": "TTK22",
    "Trade Protection Record": "TPRD",
    "Weekly Times": "WKTN",
    "Weekly True Sun": "WKTS",
    "West Londoner and Select Advertiser for the Borough of Marylebone": "WLSA",
    "Western Mail": "WMCF",
    "Westminster Journal and Old British Spy": "WJBS",
    "Whitehall Evening Post": "WHEP",
    "Widnes Examiner": "WDEX",
    "Wooler's British Gazette": "WBGZ",
    "Wrexham Advertiser": "WRWA",
    "Y Genedl Gymreig": "GNDL",
    "Y Goleuad": "GLAD",
    "York House Papers": "YOHP"
}

In [106]:
# number of working titles that have an entry (and thus an alias) in the mapping
len(bl_aliases)

374

In [107]:
# The aliases are unique exactly when de-duplicating them leaves as many values
# as there are entries in the mapping.
len(set(bl_aliases.values())) == len(bl_aliases)

True

In [108]:
# True if at least one of the new aliases is already used by a journal of the
# existing collection (the expected result is therefore False).
not set(bl_aliases.values()).isdisjoint(KNOWN_JOURNALS)

False

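The final list below contains 375 unique titles (one entry, `The Sun & Central Press`, appears twice). Note that the `bl_aliases` mapping above only has 374 entries: `Town Talk 1823` is listed below but has no entry in the mapping.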

We will have to reorganize the filestructure so that the input data is organized into these groups

Aberdeen Press and Journal	ANJO
Alston Herald and East Cumberland Advertiser	AHEC
Baldwin's London Weekly Journal	BLWJ
Baner ac Amserau Cymru	BNER
Bargoed Journal	BGJO
Barnsley Telephone	BTEP
Bell's Family Newspaper	BFNP
Bell's News	BELL
Bell's Penny Dispatch	BPDH
Berrow's Worcester Journal	WOJL
Berthold's Political Handkerchief	BPHF
Birmingham Daily Post	BDPO
Blandford Weekly News	BWNW
Bradford Observer	BROR
Bridgend Chronicle	BGCH
Bridlington and Quay Gazette	BQGA
Bridport, Beaminster, and Lyme Regis Telegram	BBLT
Brief	BRIF
Brighouse & Rastrick Gazette	BRGA
Brighton Patriot 	BRPT
British Army Despatch	BRAD
British Mercury or Wednesday Evening Post	BRMW
British Miner and General Newsman	BRMG
Caledonian Mercury	CNMR
Charles Knight's Town & Country Newspaper	CKTC
Chelsea & Pimlico Advertiser	CPAD
Cheshire Observer	CHOR
Christian Times 	CHTI
City of London Trade Protection Circular	CLTP
Cleave's Weekly Police Gazette	CWPG
Cobbett's Evening Post	CBEP
Cobbett's Weekly Political Register 	CWPR
Colored News	CLNW
Common Sense	CMSN
Cradley Heath & Stourbridge Observer	CHSO
Daily Gazette For Middlesbrough	DGMH
Daily News	DNLN
Daily Politician	DPLT
Darlington & Richmond Herald	DRHE
Denton and Haughton Examiner	DHEX
Derby Mercury	DYMR
Dewsbury Chronicle and West Riding Advertiser 	DCWA
Dorset County Express and Agricultural Gazette	DCEA
Douglas Jerrold's Weekly Newspaper	DJWN
Duckett's Dispatch	DDIS
Dundee Courier	DUCR
East London Advertiser	ELAD
East Wind	EAWN
Exeter Flying Post	TEFP
Finsbury Free Press	FFPR
Fleming's British Farmers' Chronicle	FBFC
Fleming's Weekly Express	FWEX
Fonetic Nuz	FONU
Francis's Metropolitan News	FMNW
Freeman's Journal	FRJO
Glasgow Courier	GLCO
Glasgow Herald	GWHD  ----------------
Glasgow Sentinel	GLSE
Golden Times	
Halifax Comet	
Hampshire Telegraph	HPTE
Haslingden Gazette	
Hetherington's Twopenny Dispatch	
High Life in London	
Holt's Weekly Chronicle	
Hour	
Huddersfield Chronicle	HUCE
Hull Packet	HLPA
Illustrated Crystal Palace Gazette	
Illustrated London Life	
Illustrated Midland News	
Illustrated Sporting News and Theatrical and Musical Review	
Illustrated Times 1853	
Illustrated Weekly Times	
Irvine Express	
Isle of Wight Observer	IWOR
Islington Times	
Jewish Record	
Johnson's Sunday Monitor	
Kenilworth Advertiser	
Lancaster Standard and County Advertiser	
Leeds Intelligencer	LSIR
Leicester Chronicle	LECH
Liverpool Mercury	LVMR
Liverpool Standard and General Commercial Advertiser	
Liverpool Weekly Courier	
Lloyd's Companion to the Penny Sunday Times and Peoples' Police Gazette	
Lloyd's Weekly Newspaper	LINP
London & Provincial News and General Advertiser	
London Dispatch	LNDH
London Halfpenny Newspaper	
London Journal and General Advertiser for Town and Country	
London Life	
London Moderator and National Adviser	
London Railway Newspaper	
The London News Letter and Price Current	
Manchester Examiner	
Manchester Times	MRTM
Mirror of the Times	
Morning Chronicle	MCLN
Morning Herald	
Morning Post	MOPT
Nantwich, Sandbach & Crewe Star	
National Register	
Nelson Chronicle, Colne Observer and Clitheroe Division News	
New Court Gazette	
New Times	
Nonconformist Elector	
North London Record	
North Wales Chronicle	NRWC
Northern Echo	NREC
Northern Liberator	NRLR
Northern Star and Leeds General Advertiser	NRSR
Northern Weekly Gazette	
Old England	
Orr's Kentish Journal	
Oxford Journal	JOJL
Passing Events	
Pen and Pencil	
Penistone, Stocksbridge and Hoyland Express	
Pictorial Times	
Picture Times	
Pierce Egan's Life in London, and Sporting Guide	
Poole Telegram	
Preston Pilot	
Reynold's Newspaper	RDNP
Ripon Observer	
Royal Cornwall Gazette	COGE
Royal York	
Runcorn Examiner	
Sainsbury's Weekly Register and Advertising Journal	
Sheffield Public Advertiser	
South London Advertiser	
South London Times and Lambeth Observer	
Southern Star	SNSR
Southwark Mercury	
Sport	
Stalybridge Examiner	
Stockton Herald, South Durham and Cleveland Advertiser	
Stretford and Urmston Examiner	
Sunday Gazette	
Sunday News	
Surrey & Middlesex Standard	
Surrey Herald and County Advertiser	
Surrey Mercury	
Swansea and Glamorgan Herald	
Swansea Journal and South Wales Liberal	
Thacker's Overland News for India and the Colonies	
The Age (London)	
The Age 1852	
The Agricultural Advertiser and Tenant-Farmers' Advocate	AATA
The Albion	
The Albion and the Star 	
The Anti-Gallican Monitor	
The Argus, or, Broad-sheet of the Empire	
The Atherstone, Nuneaton, and Warwickshire Times	
The Aurora Borealis	
The Ballot	
The Barrow Herald and Furness Advertiser	
The Bath Chronicle	BHCH
The Beacon (Edinburgh)	
The Beacon (London)	
The Bee-Hive	
The Belfast News-Letter	BNWL
The Birkenhead News	
The Blackburn Standard	BLSD
The Blackpool Herald	
The Blandford and Wimborne Telegram	
The Borough of Greenwich Free Press	
The Bristol Mercury	BLMY
The British Banner	
The British Emancipator	
The British Ensign	
The British Liberator	
The British Luminary	
The British Neptune	
The British Press	
The British Standard	
The British Statesman	
The Brunswick, or, True Blue	
The Bury and Norwich Post	BNPT
The Cannock Chase Examiner	
The Censor or Satirical Times	
The Central Glamorgan Gazette	----------------
The Champion (London)
The Champion 	CHPN
The Charter	CHTR
The Chartist	CHTT
The City Chronicle	
The Civil & Military Gazette	
The Clerkenwell Dial and Finsbury Advertiser	
The Colonist and Commercial Weekly Advertiser	
The Commercial Chronicle	
The Constitution	
The Cosmopolitan	
The Cotton Factory Times	
The Courier	
The Court Gazette and Fashionable Guide	
The Crim. Con. Gazette
The Crown	
The Daily Director and Entr'acte	
The Day	
The Dewsbury Chronicle and West Riding Advertiser	
The Dial	
The Dissenter	
The East Riding Telegraph	
The Eastern Star	
The Emigrant and the Colonial Advocate	
The English Chronicle and Whitehall Evening Post
The Englishman	
The Era	ERLN
The Essex Standard	ESSD
The Evening Star	
The Evening Times (London)	
The Evening Times 1825	
The Examiner	EXLN
The Express	
The Forest of Dean Examiner	
The General Evening Post	
The Glasgow Chronicle	
The Graphic	GCLN
The Hammersmith Advertiser	
The Hampshire Advertiser	SOHD
The Hebrew Observer	
The Herald of Wales	
The Illustrated Newspaper	
The Illustrated Police News	HPNW
The Imperial Weekly Gazette	
The Instructor and Select Weekly Advertiser	
The Ipswich Journal    IPJO
The Isle of Man Times	IMTS
The Kingsland Times and General Advertiser	
The Lady's Newspaper and Pictorial Times	
The Lady's Own Paper	
The Lancaster Gazette	LAGER
The Lancaster Herald and Town and County Advertiser 	
The Leeds Mercury	LEMR
The Little Times	
The Liverpool Albion	
The Liverpool Chronicle	
The Liverpool Telegraph	
The London & China Herald	
The London and Liverpool Advertiser	
The London and Scottish Review	
The London Chronicle	
The London Chronicle and Country Record	
The London Daily Guide and Stranger's Companion	
The London Evening Post 	
The London Free Press	
The London Illustrated Weekly	
The London Journal and Pioneer Newspaper	
The London Mercury	
The London Mercury 1836	
The London Mercury 1847	
The London Mirror	
The London Packet and New Lloyd's Evening Post	
The London Phalanx	
The London Scotsman	
The London Telegraph	
The London Weekly Investigator	
The Man about Town	
The Manchester Examiner	
The Metropolitan	
The Midland Examiner and Wolverhampton Times	
The Monthly Times	
The Morning Gazette	
The Morning Mail	
The Nation	
The National	
The National Protector	
The National Standard	
The New Globe	
The New Weekly True Sun	
The Newcastle Courant 	NECT
The News	
The North Cumberland Reformer	
The North Londoner	
The North-West London Times	
The Northern Daily Times	
The Northern Guardian	
The Nottinghamshire Guardian	NOGN
The Nuneaton Times	
The Observer of the Times	
The Odd Fellow	ODFW
The Operative	OPTE
The Oracle and the Daily Advertiser	
The Paddington Advertiser	
The Pall Mall Gazette	PMGZ
The Palladium	
The Patriot	
The People's Hue and Cry or Weekly Police Register	
The People's Paper	
The Pilot	
The Pioneer and Weekly Record of Movements	
The Planet	
The Political Letter	
The Political Observer	
The Pontypridd District Herald	
The Poor Man's Guardian	PMGU
The Porcupine	
The Potteries Examiner	
The Press	
The Preston Chronicle and Lancashire Advertiser	PNCH
The Public Cause	
The Radical	
The Railway Bell and London Advertiser	
The Reformer	
The Representative	
The Saint James's Chronicle	
The Satirist; or, the Censor of the Times	
The Sheffield Independent	SHIN
The Shropshire Examiner	
The Slaithwaite Guardian and Colne Valley News	
The South Staffordshire Examiner	
The St. Helens Examiner, and Prescot Weekly News	
The Standard	SDLN
The Standard of Freedom 	
The Star	STGY
The Stockton Examiner and South Durham and North Yorkshire Herald	
The Sun	
The Sun & Central Press	
The Sun & Central Press	
The Sunday Evening Globe	
The Sunday Morning Herald	
The Sussex & Surrey Chronicle	
The Tamworth Miners' Examiner and Working Men's Journal	
The Tichborne Gazette	
The Tichborne News and Anti-Oppression Journal	
The Tower Hamlets Mail	
The Trades' Free Press	
The True Briton	
The True Sun	
The Union	
The Universe	
The Verulam	
The Vindicator	
The Warrington Examiner	
The Warwickshire Herald	
The Watchman	
The Week's News	
The Weekly Advertiser	
The Weekly Chronicle	
The Weekly Echo	
The Weekly Globe	
The Weekly Independent	
The Weekly Intelligence	
The Weekly Journal	
The Weekly Mail	
The Weekly Review	
The Weekly Star and Bell's News	
The Wellington Gazette and Military Chronicle	
The West End News	
The West London Times	
The Westminster Times	
The Weymouth Telegram	
The World	
The World and Fashionable Sunday Chronicle	
The York Herald	YOHD
Town & Country Daily Newspaper	
Town and Country Advertiser	
Town Talk	
Town Talk 1822	
Town Talk 1823	
Trade Protection Record	
Weekly Times	
Weekly True Sun	
West Londoner and Select Advertiser for the Borough of Marylebone	
Western Mail	WMCF
Westminster Journal and Old British Spy	
Whitehall Evening Post	
Widnes Examiner	
Wooler's British Gazette	
Wrexham Advertiser	WRWA
Y Genedl Gymreig	GNDL
Y Goleuad	GLAD
York House Papers	

## Using this mapping and list to get a final list of titles-alias with their corresponding lists of NLPs and effective start/end years in the data

This will allow us to generate the final, updated DSA_access-rights list, which is then used for several internal processes.

In [None]:
# Read the extended BL title list from the csv extracted from the gsheet.
# header=1 takes the column names from the second line of the file, and
# index_col=0 uses the first column as the row index.

bl_media_list_ext_path = '/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/BL/BL_extended_title_list.csv'

bl_media_lst_ext_raw_df = pd.read_csv(bl_media_list_ext_path, header=1, index_col=0)
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
bl_media_lst_ext_raw_df.info()
bl_media_lst_ext_raw_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 647 entries, 1 to 628
Data columns (total 12 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Normalized Working Title           647 non-null    object 
 1   Working title (BL)                 647 non-null    object 
 2   Variant Title                      647 non-null    object 
 3   NLP                                647 non-null    int64  
 4   Alias (in file-syst or generated)  647 non-null    object 
 5   Country                            549 non-null    object 
 6   Start Year                         544 non-null    float64
 7   End Year                           544 non-null    float64
 8   Copy already shared with Impresso  647 non-null    object 
 9   Start year in Impresso local copy  647 non-null    int64  
 10  End year in Impresso local copy    647 non-null    int64  
 11  Notes about local copy             87 non-null     object 
dtyp

Unnamed: 0,Normalized Working Title,Working title (BL),Variant Title,NLP,Alias (in file-syst or generated),Country,Start Year,End Year,Copy already shared with Impresso,Start year in Impresso local copy,End year in Impresso local copy,Notes about local copy
1,Aberdeen Press and Journal,Aberdeen Press and Journal,Aberdeen Journal and General Advertiser,31,ANJO,Scotland,1798.0,1876.0,"Yes, fully",1789,1876,
2,Aberdeen Press and Journal,Aberdeen Press and Journal,Aberdeen Weekly Journal and General Advertiser,32,ANJO,Scotland,1876.0,1900.0,"Yes, fully",1877,1900,There were some small problems in the filenami...
444,Alston Herald and East Cumberland Advertiser,Alston Herald and East Cumberland Advertiser,"Alston Herald, and East Cumberland Advertiser.",3043,AHEC,England,1875.0,1879.0,"Yes, fully",1875,1879,Not separated in the data
492,Alston Herald and East Cumberland Advertiser,Alston Herald and East Cumberland Advertiser,"Alston Herald, and East Cumberland Advertiser",3043,AHEC,England,1880.0,1880.0,"Yes, fully",1880,1880,Not separated in the data
369,Baldwin's London Weekly Journal,Baldwin's London Weekly Journal,"Baldwin's London Weekly Journal, etc",2243,BLWJ,England,1803.0,1836.0,"Yes, fully",1803,1836,


In [111]:
# ensuring that we actually have data for all the titles present in the list:
# count how many rows fall under each "copy shared" status
bl_media_lst_ext_raw_df['Copy already shared with Impresso'].value_counts()

Copy already shared with Impresso
Yes, fully                    505
Yes, not originally lsit      102
Yes, partially                 21
Yes, more than in the list     19
Name: count, dtype: int64

In [112]:
# Build the working copy of the title list: drop the columns that are not needed
# for the title / NLP / alias mapping and normalise the format of the others.

cols_to_remove = ['Country', 'Notes about local copy', 'Start Year', 'End Year', 'Copy already shared with Impresso']

bl_media_lst_ext_df = bl_media_lst_ext_raw_df.drop(columns=cols_to_remove)
# NLP identifiers become strings, left-padded with zeros to 7 characters
bl_media_lst_ext_df['NLP'] = [str(nlp).zfill(7) for nlp in bl_media_lst_ext_df['NLP']]
# remove leading/trailing whitespace from the normalized titles
bl_media_lst_ext_df['Normalized Working Title'] = bl_media_lst_ext_df['Normalized Working Title'].map(str.strip)
bl_media_lst_ext_df.head()

Unnamed: 0,Normalized Working Title,Working title (BL),Variant Title,NLP,Alias (in file-syst or generated),Start year in Impresso local copy,End year in Impresso local copy
1,Aberdeen Press and Journal,Aberdeen Press and Journal,Aberdeen Journal and General Advertiser,31,ANJO,1789,1876
2,Aberdeen Press and Journal,Aberdeen Press and Journal,Aberdeen Weekly Journal and General Advertiser,32,ANJO,1877,1900
444,Alston Herald and East Cumberland Advertiser,Alston Herald and East Cumberland Advertiser,"Alston Herald, and East Cumberland Advertiser.",3043,AHEC,1875,1879
492,Alston Herald and East Cumberland Advertiser,Alston Herald and East Cumberland Advertiser,"Alston Herald, and East Cumberland Advertiser",3043,AHEC,1880,1880
369,Baldwin's London Weekly Journal,Baldwin's London Weekly Journal,"Baldwin's London Weekly Journal, etc",2243,BLWJ,1803,1836


In [113]:
def _single_or_all(values):
    """Return the group's only distinct value, or all its distinct values if there are several."""
    distinct = values.unique()
    if len(distinct) == 1:
        return distinct[0]
    return distinct


def _distinct(values):
    """Return the distinct values found in the group."""
    return values.unique()


# How each column is collapsed when the rows of one normalized title are merged.
_bl_title_aggregations = {
    "Alias (in file-syst or generated)": _single_or_all,
    "Start year in Impresso local copy": "min",
    "End year in Impresso local copy": "max",
    "NLP": _distinct,
    'Working title (BL)': _distinct,
    "Variant Title": _distinct,
}

# Shorter column names for the grouped table.
_bl_title_column_names = {
    "Alias (in file-syst or generated)": "Alias",
    'Working title (BL)': 'BL Working Titles',
    "Variant Title": "Variant Titles",
    "NLP": "NLPs",
    "Start year in Impresso local copy": "Start Year",
    "End year in Impresso local copy": "End Year",
}

# One row per normalized working title.
bl_media_lst_gpd = (
    bl_media_lst_ext_df.groupby('Normalized Working Title')
    .agg(_bl_title_aggregations)
    .reset_index()
    .rename(columns=_bl_title_column_names)
)

# A title with several aliases ends up with an array instead of a string in "Alias",
# so checking the type is enough to assert that each working title has one alias.
assert all(isinstance(alias, str) for alias in bl_media_lst_gpd["Alias"].values), "There are working titles with multiple aliases!"

bl_media_lst_gpd

Unnamed: 0,Normalized Working Title,Alias,Start Year,End Year,NLPs,BL Working Titles,Variant Titles
0,Aberdeen Press and Journal,ANJO,1789,1900,"[0000031, 0000032]",[Aberdeen Press and Journal],"[Aberdeen Journal and General Advertiser, Aber..."
1,Alston Herald and East Cumberland Advertiser,AHEC,1875,1880,[0003043],[Alston Herald and East Cumberland Advertiser],"[Alston Herald, and East Cumberland Advertiser..."
2,Baldwin's London Weekly Journal,BLWJ,1803,1836,[0002243],[Baldwin's London Weekly Journal],"[Baldwin's London Weekly Journal, etc]"
3,Baner ac Amserau Cymru,BNER,1857,1900,"[0000036, 0000037]",[Baner ac Amserau Cymru],"[Baner Cymru, Baner ac Amserau Cymru]"
4,Bargoed Journal,BGJO,1904,1912,"[0003104, 0003548]",[Bargoed Journal],"[Bargoed Journal, New Tredegar, Bargoed & Caer..."
...,...,...,...,...,...,...,...
369,Wooler's British Gazette,WBGZ,1819,1823,[0002762],[Wooler's British Gazette],[Wooler's British Gazette]
370,Wrexham Advertiser,WRWA,1854,1900,"[0000185, 0000496]",[Wrexham Advertiser],"[Wrexham Weekly Advertiser, Wrexham Advertiser]"
371,Y Genedl Gymreig,GNDL,1877,1900,[0000059],[Y Genedl Gymreig],[Y Genedl Gymreig]
372,Y Goleuad,GLAD,1869,1900,[0000058],[Y Goleuad],[Y Goleuad]


Small sanity check that all title-alias mappings are indeed correct

In [115]:
# Compare the alias of every aggregated title against the reference `bl_aliases` mapping.
# NOTE: despite its name, `alias_mismatch` holds True where the alias MATCHES the reference.
alias_mismatch = [
    bl_aliases[t] == a
    for t, a in zip(bl_media_lst_gpd['Normalized Working Title'], bl_media_lst_gpd["Alias"])
]
# positions (row order of bl_media_lst_gpd) where the alias differs from the reference
mismatch_idices = np.flatnonzero(~np.array(alias_mismatch)).tolist()
assert all(alias_mismatch), f"There is a mismatch in the title-aliases mapping!, indices: {mismatch_idices}"
print("It's all good! all IDs are unique and match!")

It's all good! all IDs are unique and match


In [116]:
# Print the details of every mismatching row found by the sanity check (nothing is printed when there is none).
for idx in mismatch_idices:
    row = bl_media_lst_gpd.iloc[idx]
    title = row['Normalized Working Title']
    print(f"Working title: {title}, alias in gsheet: {row['Alias']}, correct alias: {bl_aliases[title]}")

### Now that the list is finalized and compiled, save it

In [118]:
# The mapping is written into the same directory as `bl_media_list_ext_path`.
out_dir = os.path.dirname(bl_media_list_ext_path)

# NOTE(review): the notebook introduction refers to `BL-title-alias-mapping.csv` (hyphens),
# while the file written here uses underscores — confirm which name is the intended one.
out_path = os.path.join(out_dir, "BL_title_alias_mapping.csv")
out_path

'/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/BL/BL_title_alias_mapping.csv'

In [119]:
# Persist the per-title mapping; with the default arguments the DataFrame index is written as the first column.
# NOTE(review): "NLPs", "BL Working Titles" and "Variant Titles" hold arrays, which presumably end up
# as their string representation in the CSV — confirm this is acceptable for re-import.
bl_media_lst_gpd.to_csv(out_path)

## 2. Reorganizing the data on the NAS to fit our requirements

Currently we have under `/mnt/project_impresso/original/BL_old`:
- All the NLPs that were shared with us
- With the substructure `NLP/YYYY/MMDD/files`
- There are some errors with files copied across multiple subdirs
- Images are included in the issues files

We would like to have, under `/mnt/project_impresso/original/BL`:
- A file structure of the following format: `alias/NLP(s)/YYYY/MM/DD/files`
- Only the OCR XML files copied there, not the images

This will be done through mount points to NAS folders for which I have read-write access (the mount under `original` is read-only for security):
- `/mnt/impresso_ocr_BL` for the files
- `/mnt/impresso_images_BL` for the images

In [120]:
# root of the data as currently organised (`NLP/YYYY/MMDD/files`)
source_path = '/mnt/project_impresso/original/BL_old'
# mount point used as destination for the reorganised files
dest_path = '/mnt/impresso_ocr_BL'

In [121]:
# names of all entries directly under the source root (one directory per NLP according to the notes above)
NLPs_in_source = os.listdir(source_path)
# number of entries and a small sample of them
len(NLPs_in_source), NLPs_in_source[:5]

(601, ['0000504', '0002366', '0000491', '0002364', '0002357'])

Create a mapping from NLP to Alias

In [127]:
# one record per title: {'Alias': <alias>, 'NLPs': <array of NLP ids>}
alias_to_nlps = bl_media_lst_gpd[['Alias', 'NLPs']].to_dict(orient='records')
print(alias_to_nlps[:5])

# invert the records into a flat NLP -> alias lookup
# NOTE(review): an NLP listed under several titles would silently keep only the last alias seen
nlp_to_alias = {nlp: record['Alias'] for record in alias_to_nlps for nlp in record['NLPs']}
nlp_to_alias

[{'Alias': 'ANJO', 'NLPs': array(['0000031', '0000032'], dtype=object)}, {'Alias': 'AHEC', 'NLPs': array(['0003043'], dtype=object)}, {'Alias': 'BLWJ', 'NLPs': array(['0002243'], dtype=object)}, {'Alias': 'BNER', 'NLPs': array(['0000036', '0000037'], dtype=object)}, {'Alias': 'BGJO', 'NLPs': array(['0003104', '0003548'], dtype=object)}]


{'0000031': 'ANJO',
 '0000032': 'ANJO',
 '0003043': 'AHEC',
 '0002243': 'BLWJ',
 '0000036': 'BNER',
 '0000037': 'BNER',
 '0003104': 'BGJO',
 '0003548': 'BGJO',
 '0003041': 'BTEP',
 '0002986': 'BFNP',
 '0002789': 'BELL',
 '0002347': 'BPDH',
 '0000150': 'WOJL',
 '0002778': 'BPHF',
 '0000033': 'BDPO',
 '0003052': 'BWNW',
 '0003053': 'BWNW',
 '0000155': 'BROR',
 '0003056': 'BGCH',
 '0003057': 'BGCH',
 '0003059': 'BQGA',
 '0003060': 'BBLT',
 '0002769': 'BRIF',
 '0002770': 'BRIF',
 '0002771': 'BRIF',
 '0003062': 'BRGA',
 '0003061': 'BRGA',
 '0000040': 'BRPT',
 '0002811': 'BRAD',
 '0002812': 'BRAD',
 '0002813': 'BRAD',
 '0002772': 'BRMW',
 '0002773': 'BRMW',
 '0003537': 'BRMG',
 '0003538': 'BRMG',
 '0003539': 'BRMG',
 '0003540': 'BRMG',
 '0003541': 'BRMG',
 '0000045': 'CNMR',
 '0000046': 'CNMR',
 '0000047': 'CNMR',
 '0002984': 'CKTC',
 '0002985': 'CKTC',
 '0003244': 'CPAD',
 '0003245': 'CPAD',
 '0000157': 'CHOR',
 '0000158': 'CHOR',
 '0000485': 'CHOR',
 '0002765': 'CHTI',
 '0002766': 'CHTI',


In [141]:
def extract_date(root_path):
    """Split the trailing `YYYY/MMDD` components of an issue path into year, month and day strings.

    The path is expected to have been format-checked beforehand, i.e. to end with `.../YYYY/MMDD`.

    Args:
        root_path: '/'-separated path of an issue directory.

    Returns:
        A tuple `(is_valid, y, m, d)`: `y`, `m` and `d` are the raw strings cut out of the path,
        and `is_valid` tells whether they form an existing calendar date. When they do not,
        a message describing the problem is printed.
    """
    components = root_path.split('/')
    # the last two components should be 'YYYY' and 'MMDD'
    y = components[-2]
    month_day = components[-1]
    m = month_day[:2]
    d = month_day[2:]

    try:
        # building a datetime is only used as a validity check of the date
        datetime(year=int(y), month=int(m), day=int(d))
    except ValueError as e:
        msg = f"{root_path}: Invalid date! {y, m, d}, error: {e}"
        print(msg)
        return False, y, m, d

    return True, y, m, d

In [136]:
# scratch check: dropping the first two characters of an 'MMDD' string leaves the day part
'MMDD'[2:]

'DD'

In [None]:
def copy_files_for_NLP(nlp, alias, source_dir=source_path, dest_dir=dest_path, xml_ext = '.xml'):
    """Copy the XML files of one NLP into the `alias/NLP/YYYY/MM/DD` structure.

    The source is expected to be laid out as `source_dir/nlp/YYYY/MMDD/files`. For every issue
    directory found, the files whose name contains `xml_ext` are copied to
    `dest_dir/alias/nlp/YYYY/MM/DD/`; all other files (e.g. images) are left behind.
    Existing destination files with the same name are overwritten.

    Directories which contain XML files but do not match the expected layout (they have
    sub-directories, or are not located directly under a year directory), or whose path does
    not yield a valid date, are reported with a printed message and skipped.

    Args:
        nlp: name of the NLP directory, located directly under `source_dir`.
        alias: alias of the title the NLP belongs to, used as top-level destination directory.
        source_dir: root directory containing one sub-directory per NLP.
        dest_dir: root directory under which `alias/nlp/...` is created.
        xml_ext: substring identifying the files to copy.

    Returns:
        None.

    Raises:
        OSError: if the NLP source directory cannot be listed, or a directory/file
            cannot be created or copied.
    """
    # local import: `shutil` is not part of the imports at the top of the notebook
    import shutil

    msg = f"Processing {alias} - NLP {nlp}"
    print(msg)

    nlp_source_dir = os.path.join(source_dir, nlp)
    nlp_dest_dir = os.path.join(dest_dir, alias, nlp)
    # create the sub-directory for the NLP inside the alias directory (both created if missing)
    os.makedirs(nlp_dest_dir, exist_ok=True)

    n_copied = 0
    # sorted so that years are processed (and reported) in a reproducible order
    all_years = sorted(os.listdir(nlp_source_dir))
    for y_idx, year in tqdm(enumerate(all_years)):
        msg = f"  - {alias} - {nlp}: Processing year {year} ({y_idx+1}/{len(all_years)})..."
        print(msg)

        year_dir = os.path.join(nlp_source_dir, year)
        # walking a stray file instead of a year directory simply yields nothing
        for root, dirs, files in os.walk(year_dir):
            xml_files = [f for f in files if xml_ext in f]
            if not xml_files:
                # not an issue directory
                continue

            # standard case: a leaf directory located directly under the year directory;
            # anything else is one of the known copy errors and is not trusted
            if dirs or os.path.dirname(root) != year_dir:
                msg = f"{root}: XML files found outside of the expected NLP/YYYY/MMDD layout, skipping {len(xml_files)} file(s)"
                print(msg)
                continue

            valid_date, y, m, d = extract_date(root)
            if not valid_date:
                # extract_date already reported the problem
                continue

            issue_dest_dir = os.path.join(nlp_dest_dir, y, m, d)
            os.makedirs(issue_dest_dir, exist_ok=True)
            for filename in xml_files:
                # contents only: no attempt to replicate permissions/timestamps on the destination mount
                shutil.copyfile(os.path.join(root, filename), os.path.join(issue_dest_dir, filename))
            n_copied += len(xml_files)

    msg = f"Done with {alias} - NLP {nlp}: copied {n_copied} file(s)"
    print(msg)

In [151]:
# Exploration: walk the whole source tree, print each NLP as it is entered and
# report every issue directory whose path does not yield a valid date.
i = 0
last_nlp = ''
for root, dirs, files in os.walk(source_path):
    curr_nlp = root.split('/')[-1]
    # a 7-character last path component is taken to be an NLP directory
    if len(curr_nlp) == 7 and curr_nlp != last_nlp:
        print(curr_nlp)
        last_nlp = curr_nlp
        i = 0

    # only directories holding at least one XML file are treated as issue directories
    if not any('.xml' in f for f in files):
        continue

    i += 1
    valid_date, y, m, d = extract_date(root)
    if not valid_date:
        print(f"root: {root}, dirs: {dirs}, files: {files}")

    # NOTE(review): as the last statement of the loop body this `continue` has no effect;
    # it looks like a leftover attempt to cap the work done per NLP — confirm the intent.
    if i > 40:
        continue

0000504
0002366
0000491
0002364
0002357
0003064
0000045


KeyboardInterrupt: 

In [146]:
# Exploration: walk a single NLP (0000031) to look at the shape of what os.walk yields.
i = 0
for root,dirs,files in os.walk(os.path.join(source_path, '0000031')):
    # show the first 5 visited directories, then only the directories that contain no files
    if i < 5 or len(files)==0:
        print(f"root: {root}, dirs: {dirs}, files: {files}")
    i+=1
    # stop as soon as the 1813 year directory has been visited
    # NOTE(review): the visiting order of os.walk is not sorted, so how much is walked before this point may vary
    if root == '/mnt/project_impresso/original/BL_old/0000031/1813':
        break

KeyboardInterrupt: 

In [145]:
# quick manual check of extract_date on a well-formed issue path (`.../YYYY/MMDD`)
valid_date, y, m ,d = extract_date('/mnt/project_impresso/original/BL_old/0000031/1860/0125')
valid_date, y, m ,d 

(True, '1860', '01', '25')