In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# Load the raw events table; the path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='Dropbox/Careers/Data Analytics/Portfolio/World Important Dates.csv'
)

In [None]:
# Preview the first five rows of the raw table.
display(df.head(n=5))

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
summary_stats = df.describe(include='all')
print(summary_stats)

In [None]:
# Column dtypes of the loaded table.
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Count the missing values in each column (column-wise sum is the default axis).
missing_values_row = df.isna().sum()
print(missing_values_row)

In [None]:
# Convert a raw year value into a signed integer: BC years negative, AD years positive.
def convert_year(year_value):
    """Return the year as a signed int (negative for BC/BCE, positive otherwise).

    Parameters
    ----------
    year_value : str or number
        A year such as ``'2600 BC'``, ``'1947'``, ``'AD 622'`` or ``1969``.
        Era markers are matched case-insensitively and surrounding whitespace
        is ignored.

    Returns
    -------
    int
        The signed year.

    Raises
    ------
    ValueError
        If no integer can be parsed from the value (including NaN).
    """
    # Numeric cells carry no era marker, so they are taken as already-signed years.
    # Testing membership ('BC' in 1947) on them would raise TypeError.
    if not isinstance(year_value, str):
        return int(year_value)

    text = year_value.strip().upper()
    sign = -1 if 'BC' in text else 1
    # Strip 'BCE' before 'BC' and 'CE' so no stray letters are left behind.
    for marker in ('BCE', 'BC', 'AD', 'CE'):
        text = text.replace(marker, ' ')
    # int() tolerates the leftover surrounding whitespace.
    return int(text) * sign

# Derive the signed integer year column from the raw 'Year' values.
df['Year_formatted'] = df['Year'].apply(convert_year)

# Spot-check both ends of the converted column.
for preview in (df['Year_formatted'].head(), df['Year_formatted'].tail()):
    print(preview)

In [None]:
# How many events occurred in each year?
# count() tallies the non-missing incident names per signed year.
events_by_year = (
    df.groupby('Year_formatted')['Name of Incident']
    .count()
    .rename('Number of Events')
    .rename_axis('Year')
    .reset_index()
)

display(events_by_year)

In [None]:
# Histogram of events across the whole timeline, split into 100 equal-width year bins.
fig, ax = plt.subplots()
ax.hist(df['Year_formatted'], bins=100)
ax.set_title('Number of events per year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Events')
plt.show()

In [None]:
# Latest and earliest signed year in the data, one per line.
print(df['Year_formatted'].max(), df['Year_formatted'].min(), sep='\n')

In [None]:
# Histogram restricted to the BC era: 100 bins over the years -3000 to 0.
fig, ax = plt.subplots()
ax.hist(df['Year_formatted'], bins=100, range=(-3000, 0))
ax.set_title('Number of events per year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Events')
plt.show()

In [None]:
# Which event types occur, and how often? (Most frequent first.)
event_type_counts = df['Type of Event'].value_counts()
print(event_type_counts)

In [None]:
# Create a BC subset: rows whose signed year is zero or negative.
# .copy() makes it an independent frame, so later edits to it cannot raise
# SettingWithCopyWarning or be confused with edits to df.
df_BC = df[df['Year_formatted'] <= 0].copy()
print(df_BC.head())
# info() prints its own report and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
df_BC.info()

In [None]:
# Create an AD subset: rows whose signed year is positive.
# .copy() makes it an independent frame, so later edits to it cannot raise
# SettingWithCopyWarning or be confused with edits to df.
# NOTE: this snapshot is taken before the 'Cluster' column is added to df,
# so it does not contain that column.
df_AD = df[df['Year_formatted'] > 0].copy()
print(df_AD.head())
# info() prints its own report and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
df_AD.info()

In [None]:
# Create manual clusters for the type of event.
# Start with an empty label column; rows that match no keyword stay None.
df['Cluster'] = None

# Keywords that map an event type onto a cluster label.
# The dictionary keys are the labels shown in every later table and chart.
keyword_clusters = {
    'Military & Conflict': ['Military', 'War', 'Civil War', 'WWII', 'Secession', 'Naval', 'War Declaration', 'Battle', 'Conquest', 'Coup', 'Conflict', 'Insurgency', 'Offensive', 'Crusade', 'Defense', 'Ceasefire', 'Protest', 'Rebellion', 'Revolt', 'Resistance', 'Dictator', 'Uprising', 'Revolution'],
    'Political & Governance': ['Political', 'Leadership', 'Corruption', 'Assassination', 'Religious', 'Scandal', 'Electoral', 'Civil', 'Election', 'Referendum', 'Politics'],
    'Economic & Labour': ['Economic', 'Trade', 'Banking', 'Finance', 'Labor'],
    'Extremism & Crime': ['Terrorism', 'Criminal Incident', 'Violence', 'Crime', 'Execution'],
    'Human Rights & Social Issues': ['Unrest', 'Social', 'Human Rights', 'National Movement', 'Reconciliation', 'Population'],
    'Environment': ['Environmental', 'Conservation'],
    'Accidents & Disasters': ['Natural', 'Disaster', 'Rescue', 'Accident', 'Famine'],
    'Security & Defence': ['Nuclear', 'Security'],
    'International Relations & Diplomacy': ['International', 'International Crisis', 'International Incident', 'Negotiation', 'Conference', 'Government Agency', 'International Organization', 'Cooperation', 'International Integration', 'Foreign Policy', 'Relations', 'Agreement', 'Treaty', 'International Policy', 'Diplomatic', 'Summit'],
    'Independence & Peace': ['Independence', 'Partition', 'Peace', 'Country Formation', 'Union', 'Unification', 'Formation', 'Annexation', 'State Establishment', 'Territorial', 'Federation', 'Nationalization', 'Protectorate'],
    'Sovereign': ['Monarch', 'Dynastic', 'Coronation', 'Empire', 'Sovereign', 'Colonial'],
    'Genocide': ['Pogrom', 'Holocaust', 'Genocide', 'Massacre'],
    'Legislation & Regulation': ['Administration', 'Investigation', 'Legislation', 'Legal', 'Judiciary', 'Regulatory', 'Justice', 'Constitution', 'Administrative', 'Legislative', 'Judicial'],
    # Label spelling corrected (was 'Science & Techhnology').
    'Science & Technology': ['Aviation', 'Maritime', 'Space', 'Exploration', 'Technology', 'Telecommunication', 'Scientific', 'Engineering', 'Innovation', 'Technological', 'Modernization'],
    'Historical & Cultural Events': ['Historical', 'Mausoleum', 'Monument', 'Media', 'Sports', 'Sport', 'Cultural', 'Literature', 'Era Change', 'Industrial', 'Civilization', 'Historical Period', 'Commemoration', 'Memorial', 'Discovery', 'Event', 'Art', 'Architecture', 'Expo', 'Exhibition', 'Architectural', 'Fair'],
    'Infrastructure & Urbanisation': ['Structural', 'Land', 'Urban Development', 'Housing', 'Infrastructure', 'Reconstruction', 'Rail'],
    'Other': ['Foundation', 'Recognition', 'Found', 'Decline'],
    'Education': ['Education'],
    'Aid & International Relations': ['Humanitarian', 'Aid'],
    'Migration & Displacement': ['Refugee', 'Settlement', 'Relocation'],
    'Pandemic & Health Crises': ['Pandemic', 'Health', 'Medical'],
    # Add more clusters and corresponding keywords as needed
}

# Assign cluster labels based on keyword matching.
# Clusters are applied in dictionary order, so when an event type matches
# several clusters, the cluster defined LAST in keyword_clusters wins.
for cluster, keywords in keyword_clusters.items():
    # \b...\b matches whole words/phrases only. A plain case-insensitive
    # substring search also hit mid-word text, e.g. 'Art' inside 'Partition'
    # or 'Aid' inside 'Raid', silently overriding the intended cluster.
    # Event types that no longer match any keyword stay None and can be
    # reviewed through the null-cluster checks.
    pattern = r'\b(?:' + '|'.join(keywords) + r')\b'
    # na=False treats a missing event type as "no match"; without it the mask
    # contains NaN and .loc refuses it as a boolean indexer.
    mask = df['Type of Event'].str.contains(pattern, case=False, regex=True, na=False)
    df.loc[mask, 'Cluster'] = cluster

# df_BC and df_AD were sliced from df before 'Cluster' existed, so rebuild them
# here; otherwise later df_AD.groupby('Cluster') calls fail on a fresh run.
df_BC = df[df['Year_formatted'] <= 0].copy()
df_AD = df[df['Year_formatted'] > 0].copy()

In [None]:
# Number of events in each cluster, largest first.
# 'Sl. No' looks like a row serial number (TODO confirm), so the rows are
# counted; summing serial numbers does not measure how many events there are.
display(df.groupby('Cluster')['Sl. No'].count().sort_values(ascending=False))

In [None]:
# How many rows did not match any cluster keyword?
print(df['Cluster'].isna().sum())

In [None]:
# Flag the rows that are still unclustered; reused for both views below.
unclustered = df['Cluster'].isnull()

# Cross-tabulate every event type against whether its cluster is missing.
null_cluster_by_event = pd.crosstab(
    index=df['Type of Event'],
    columns=unclustered,
    rownames=['Type of Event'],
    colnames=['Cluster is null'],
)

# Keep only the unclustered rows, showing the (empty) cluster next to the event type.
selected_rows = df.loc[unclustered, ['Cluster', 'Type of Event']]

# Print the event types that still need a keyword.
print(selected_rows)


In [None]:
# What is the most common type of event by year?
# Preview the first five rows of the table before aggregating.
display(df.head(n=5))

In [None]:
# Most common cluster for each year.
# sum() on the label column concatenates the strings (and cannot add None),
# so take the modal label instead. mode() ignores missing labels and returns
# ties in sorted order, so the first entry is a deterministic choice; a year
# with no clustered events yields None.
events_by_year = (
    df.groupby('Year_formatted')['Cluster']
    .agg(lambda labels: labels.mode().iat[0] if labels.notna().any() else None)
    .reset_index()
)
events_by_year.columns = ['Year', 'Type of Event']

display(events_by_year)

In [None]:
import seaborn as sns

In [None]:
# Number of AD events in each cluster.
# 'Sl. No' looks like a row serial number (TODO confirm), so the rows are
# counted; summing serial numbers does not measure how many events there are.
# The 'Sl. No' column name is kept so the result has the same shape as before.
events_by_year_AD = df_AD.groupby('Cluster')['Sl. No'].count().reset_index()

display(events_by_year_AD)

In [None]:
# Bar chart of how many AD events fall in each cluster, largest first.
# 'Cluster' is the x-axis; the y-axis is the number of rows per cluster.
# The rows are counted rather than summing 'Sl. No' (which looks like a serial
# number — TODO confirm), so the bars match the 'Number of Events' axis label.
df_AD_sorted = (
    df_AD.groupby('Cluster')['Sl. No']
    .count()
    .reset_index()
    .sort_values(by='Sl. No', ascending=False)
)

plt.figure(figsize=(14, 10))
plt.bar(df_AD_sorted['Cluster'], df_AD_sorted['Sl. No'])
plt.title('Number of Events by Cluster (AD Years)')
plt.xlabel('Type of Event')
plt.ylabel('Number of Events')
# ha='right' anchors each rotated label at its own bar instead of drifting to the right.
plt.xticks(range(len(df_AD_sorted['Cluster'])), df_AD_sorted['Cluster'], rotation=45, ha='right', fontsize=10)
plt.tight_layout()  # Adjust layout to prevent any clipping of labels
plt.show()
# No plt.clf() here: after show() it only creates and leaves behind a new empty figure.


In [None]:
# Grouped bar chart: x axis is the 100-year window, y axis is the number of
# events, with one bar per cluster inside each window.
ad_years = df_AD['Year_formatted']

# Bin edges every 100 years; +101 guarantees the last edge lies beyond the latest year.
bin_edges = np.arange(ad_years.min(), ad_years.max() + 101, 100)

# Cut the same frame that is being grouped (df_AD, not df).
# include_lowest=True closes the first interval on the left; by default it is
# open there, so events in the earliest year fell outside every bin.
year_bins = pd.cut(ad_years, bin_edges, include_lowest=True)

# Count events per (window, cluster). observed=False is the behaviour the
# original call got by default; spelling it out keeps every window on the axis
# and avoids the default-change warning in recent pandas.
events_by_year_type = (
    df_AD.groupby([year_bins, 'Cluster'], observed=False)
    .size()
    .unstack(fill_value=0)
)

plt.figure(figsize=(14, 10))

n_clusters = len(events_by_year_type.columns)
# Each window owns one unit of the x axis; share 80% of it between the clusters
# so neighbouring windows' bars do not overlap (a fixed 0.1 width overflows
# once there are more than ten clusters).
bar_width = 0.8 / max(n_clusters, 1)
index = np.arange(len(events_by_year_type.index))

for i, cluster in enumerate(events_by_year_type.columns):
    plt.bar(index + i * bar_width, events_by_year_type[cluster], bar_width, label=cluster)

plt.title('Types of Events Over the Years')
plt.xlabel('Year Range')
plt.ylabel('Number of Events')
# Centre each tick under its group of bars and label it with the window's interval.
plt.xticks(index + bar_width * (n_clusters - 1) / 2, events_by_year_type.index.astype(str), rotation=45)
plt.legend(title='Type of Event', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
# 1600s onwards - grouped bar chart: x axis is the 100-year window, y axis is
# the number of events, with one bar per cluster inside each window.
ad_years = df_AD['Year_formatted']

# Evenly spaced bin edges from the earliest AD year in steps of 100;
# +101 guarantees the last edge lies beyond the latest year.
bin_edges = np.arange(ad_years.min(), ad_years.max() + 101, 100)

# Cut the same frame that is being grouped (df_AD, not df).
# include_lowest=True closes the first interval on the left; by default it is
# open there, so events in the earliest year fell outside every bin.
year_bins = pd.cut(ad_years, bin_edges, include_lowest=True)

# Count events per (window, cluster). observed=False is the behaviour the
# original call got by default; spelling it out keeps every window on the axis
# and avoids the default-change warning in recent pandas.
events_by_year_type = (
    df_AD.groupby([year_bins, 'Cluster'], observed=False)
    .size()
    .unstack(fill_value=0)
)

plt.figure(figsize=(14, 12))

n_clusters = len(events_by_year_type.columns)
# Each window owns one unit of the x axis; share 80% of it between the clusters
# so neighbouring windows' bars do not overlap (a fixed 0.1 width overflows
# once there are more than ten clusters).
bar_width = 0.8 / max(n_clusters, 1)
index = np.arange(len(events_by_year_type.index))  # one x position per window

for i, cluster in enumerate(events_by_year_type.columns):  # i offsets each cluster's bar within its window
    plt.bar(index + i * bar_width, events_by_year_type[cluster], bar_width, label=cluster)

plt.title('Types of Events Over the Years')
plt.xlabel('Year Range')
plt.ylabel('Number of Events')
# Centre each tick under its group of bars and label it with the window's interval.
plt.xticks(index + bar_width * (n_clusters - 1) / 2, events_by_year_type.index.astype(str), rotation=45)
# Start the axis at the first window whose left edge is 1600 or later. Looking
# the position up replaces the hard-coded 16, which is only right when the
# earliest AD year puts an edge near 1600 at position 16.
first_window = int(np.searchsorted(bin_edges, 1600))
# Back off by one bar width so the first bar is not cut in half.
plt.xlim(first_window - bar_width, len(events_by_year_type.index))
plt.legend(title='Type of Event', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Save the clustered table as a CSV in the same folder the source CSV is read from.
# The path is relative to the working directory, like the read_csv path, rather
# than an absolute path tied to one user's machine. When the notebook is run
# from the user's home folder, it resolves to the same file as before.
file_path = 'Dropbox/Careers/Data Analytics/Portfolio/world_events.csv'

# index=False leaves the row index out of the file.
df.to_csv(file_path, index=False)


In [None]:
# Boolean mask over df_AD (not a DataFrame, despite the name): True for years 1600 and later.
df_sixteenhundreds = df_AD['Year_formatted'].ge(1600)
print(df_sixteenhundreds.head())

In [None]:
# Events from 1600 onwards, counted per 100-year range and cluster.
# 'Year_formatted' holds plain integers, so compare it directly; the previous
# `x.left` lookup only exists on Interval objects and raised AttributeError.
Year_sixteenhundreds = df_AD[df_AD['Year_formatted'] >= 1600]

# Bin edges every 100 years from 1600; +101 guarantees the last edge lies beyond
# the latest year. right=False gives ranges such as [1600, 1700).
century_edges = np.arange(1600, Year_sixteenhundreds['Year_formatted'].max() + 101, 100)
year_range = pd.cut(
    Year_sixteenhundreds['Year_formatted'], century_edges, right=False
).rename('Year Range')

# Count the events in each (range, cluster) pair. Plotting 'Sl. No' directly made
# seaborn draw the mean serial number per year, not a number of events.
century_counts = (
    Year_sixteenhundreds.groupby([year_range, 'Cluster'], observed=True)
    .size()
    .reset_index(name='Number of Events')
)
# Plain-text range labels for the x axis.
century_counts['Year Range'] = century_counts['Year Range'].astype(str)

plt.figure(figsize=(14, 12))

sns.barplot(data=century_counts, x='Year Range', y='Number of Events', hue='Cluster', palette='viridis')

plt.title('Types of Events Over the Years')
plt.xlabel('Year Range')
plt.ylabel('Number of Events')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()