In [1]:
import pandas as pd
import numpy as np
from scipy import stats
df = pd.read_csv('Analysis.csv') 
df.head()

Unnamed: 0,Aleph system no._cleaned,Country code_cleaned,Language code (008)_cleaned,Language code (041)_cleaned,Personal author_cleaned,Personal author_QID,Corporate author_cleaned,Title_cleaned,Edition_cleaned,Publisher,...,Publisher_QID,Place,Year,Series_cleaned,Subjects_cleaned,Other personal authors_cleaned,Other personal authors_QID,Other corporate authors_cleaned,Type_cleaned,Genre_cleaned
0,14602840,,,,Campbell Rae Brown,,,Providence Point and Quartremayne's Quarry,,Samuel French,...,Q7411424,London; New York,1889.0,,,,,,recitation,Drama
1,14602858,,,,Elizabeth Inchbald,Q469974,,Next door Neighbours,,,...,,London,1791.0,,,Philippe Néricault Destouches,Q177063,,comedy,Drama
2,14602867,,,,,,,The Queens visit to Birmingham,,W. Cornish,...,,Birmingham,1858.0,,,Victoria,Q9439,,play,Drama
3,14602868,,,,,,,Manipure Tragedy,,Jagot Chandra Das,...,,Chittagong,1893.0,,,,,,tragedy,Drama
4,14602876,,,,Leonard Terry,Q20005050,,To the Lions Christians in the Colosseum,,A. Betterton,...,,London,1889.0,,,,,,recitation,Drama


In [2]:
# Handle Missing Year Values
df_clean = df.dropna(subset=['Year']).copy()
df_clean['Year'] = df_clean['Year'].astype(int)

# Handle Missing Place Values
df_clean['Place'] = df_clean['Place'].fillna('Unknown').astype(str)

# plit Co-publications into Lists
df_clean['Place_List'] = df_clean['Place'].apply(lambda x: x.split(';'))

# Expand the dataframe so each item in 'Place_List' gets its own row
df_final = df_clean.explode('Place_List')

# Remove whitespaces
df_final['Place_Final'] = df_final['Place_List'].str.strip()

# Verification
print(f"Original records: {len(df)}")
print(f"Post-split records: {len(df_final)}") 
co_publications = df_final[df_final['Place'].str.contains(';', na=False)]
co_publications.sort_index()

Original records: 1647
Post-split records: 1676


Unnamed: 0,Aleph system no._cleaned,Country code_cleaned,Language code (008)_cleaned,Language code (041)_cleaned,Personal author_cleaned,Personal author_QID,Corporate author_cleaned,Title_cleaned,Edition_cleaned,Publisher,...,Year,Series_cleaned,Subjects_cleaned,Other personal authors_cleaned,Other personal authors_QID,Other corporate authors_cleaned,Type_cleaned,Genre_cleaned,Place_List,Place_Final
0,14602840,,,,Campbell Rae Brown,,,Providence Point and Quartremayne's Quarry,,Samuel French,...,1889,,,,,,recitation,Drama,London,London
0,14602840,,,,Campbell Rae Brown,,,Providence Point and Quartremayne's Quarry,,Samuel French,...,1889,,,,,,recitation,Drama,New York,New York
40,14610213,,,,,,,Recitations for Infants,,W.; R. Chambers,...,1888,,,,,,recitation,Drama,Edinburgh,Edinburgh
40,14610213,,,,,,,Recitations for Infants,,W.; R. Chambers,...,1888,,,,,,recitation,Drama,London,London
74,14616790,,,,,,,Hodgepodge.,,William Blackwood; Sons,...,1884,,,"Disraeli, Benjamin, Earl Of Beaconsfield, - --...",,,comedy,Drama,London,London
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1550,14847724,,eng,,R. C. H Morison,,,Little Recitations for Little Reciters,,W.; R. Chambers,...,1898,,,,,,recitation,Drama,Edinburgh,Edinburgh
1568,14866494,,eng,,John William Calcraft,,,The Bride of Lammermoor,,"Simpkin, Marshall",...,1823,,,Walter Scott,Q79025,,play,Drama,Edinburgh,Edinburgh
1568,14866494,,eng,,John William Calcraft,,,The Bride of Lammermoor,,"Simpkin, Marshall",...,1823,,,Walter Scott,Q79025,,play,Drama,London,London
1604,14912292,,,,John Delap,Q15989957,,The Royal Suppliants,,J. Bowen,...,1781,,,,,,tragedy,Drama,London,London


In [3]:
from scipy import stats

# Grouping by Year and the cleaned Place column to count annual publications
geo_trends = df_final.groupby(['Year', 'Place_Final']).size().reset_index(name='Publication_Count')
geo_trends.head()

Unnamed: 0,Year,Place_Final,Publication_Count
0,1540,London,1
1,1592,London,1
2,1605,London,1
3,1605,Unknown,1
4,1606,London,3


In [4]:
# Comparison: London vs. Non-London Regions

# Create a binary location category
df_final['Location_Type'] = df_final['Place_Final'].apply(lambda x: 'London' if x == 'London' else 'Non-London')

# Re-aggregate trends based on this new category
binary_trends = df_final.groupby(['Year', 'Location_Type']).size().reset_index(name='Count')

# Regression Analysis for both
def run_binary_regression(category):
    data = binary_trends[binary_trends['Location_Type'] == category]
    if len(data) > 2:
        slope, intercept, r, p, std = stats.linregress(data['Year'], data['Count'])
        print(f" {category} Aggregate Trend")
        print(f"Growth Slope: {slope:.4f}")
        print(f"P-value: {p:.6f}") # Using standard decimal
        print(f"R-squared: {r**2:.4f}\n")

run_binary_regression('London')
run_binary_regression('Non-London')

 London Aggregate Trend
Growth Slope: 0.0163
P-value: 0.000001
R-squared: 0.0907

 Non-London Aggregate Trend
Growth Slope: 0.0123
P-value: 0.000067
R-squared: 0.1519



In [5]:
# Growth Rate Analysis (Log Transformation)
trends_pivot = binary_trends.pivot(index='Year', columns='Location_Type', values='Count').fillna(0)

print(" Relative GROWTH RATE")
for cat in ['London', 'Non-London']:
    y_log = np.log1p(trends_pivot[cat])
    slope, _, _, p, _ = stats.linregress(trends_pivot.index, y_log)
    p_fmt = f"{p:.6f}" if p > 0.001 else "< 0.001"
    print(f"{cat:10} Log Slope: {slope:.6f} (p {p_fmt})")
    
# Temporal Segmentation
def run_period_analysis_refined(year_start, year_end):
    print(f" PERIOD: {year_start} - {year_end}")      
    for cat in ['London', 'Non-London']:
        mask = (trends_pivot.index >= year_start) & (trends_pivot.index <= year_end)
        y = trends_pivot.loc[mask, cat]
        x = y.index
        
        if len(y) > 2:
            slope, intercept, r, p, _ = stats.linregress(x, y)
            p_fmt = f"{p:.6f}" if p > 0.001 else "< 0.001"
            # Here we include R-squared to show the "Coupling" strength
            print(f"{cat:10}: Slope={slope:8.4f}, R²={r**2:.4f}, p {p_fmt}")

run_period_analysis_refined(1800, 1850)
run_period_analysis_refined(1851, 1900)



 Relative GROWTH RATE
London     Log Slope: 0.002458 (p < 0.001)
Non-London Log Slope: 0.004026 (p < 0.001)
 PERIOD: 1800 - 1850
London    : Slope= -0.1811, R²=0.2974, p < 0.001
Non-London: Slope= -0.0349, R²=0.1034, p 0.021368
 PERIOD: 1851 - 1900
London    : Slope=  0.3042, R²=0.5328, p < 0.001
Non-London: Slope=  0.0855, R²=0.2528, p < 0.001


In [7]:
# Co-publishing Linked to London

id_col = 'Aleph system no._cleaned'

# Map each ID to its set of publication places
id_to_places = df_final.groupby(id_col)['Place_Final'].apply(set).to_dict()

def classify_strategy(row):
    all_places = id_to_places.get(row[id_col], {row['Place_Final']})
    if len(all_places) <= 1:
        return 'Purely Local'
    
    # Check if London is in the collaboration network
    other_places = all_places - {row['Place_Final']}
    return 'Linked to London' if 'London' in other_places else 'Regional Collab'

# Filter non-London records and extend to 1900
non_london_view = df_final[(df_final['Place_Final'] != 'London') &
                           (df_final['Year'] <= 1926)].copy()
non_london_view['Strategy'] = non_london_view.apply(classify_strategy, axis=1)

# Aggregate by 25-year periods
non_london_view['Period'] = (non_london_view['Year'] // 25) * 25
strategy_counts = non_london_view.groupby(['Period', 'Strategy']).size().unstack(fill_value=0)

# Calculate percentages
strategy_pct = strategy_counts.div(strategy_counts.sum(axis=1), axis=0) * 100

print(" NON-LONDON PUBLISHING STRATEGIES (%)")
print(strategy_pct.round(2))

 NON-LONDON PUBLISHING STRATEGIES (%)
Strategy  Linked to London  Purely Local
Period                                  
1600                  0.00        100.00
1625                  0.00        100.00
1650                  0.00        100.00
1700                  0.00        100.00
1750                  0.00        100.00
1775                  8.33         91.67
1800                  7.69         92.31
1825                  0.00        100.00
1850                  9.76         90.24
1875                 23.91         76.09


In [8]:
# remove missing value
df_sna = df_final[['Personal author_cleaned', 'Publisher']].dropna()

# Group by Author and Publisher to count how many times they worked together
weighted_edges = df_sna.groupby(['Personal author_cleaned', 'Publisher']).size().reset_index(name='Weight')
weighted_edges.columns = ['Source', 'Target', 'Weight']

# Export to CSV
weighted_edges.to_csv('author_publisher_weighted_edges.csv', index=False)
print(f"Success!Total unique relationships found: {len(weighted_edges)}")


Success!Total unique relationships found: 595
