In [17]:
import pandas as pd

In [18]:
# Load the five raw CSV inputs in one pass. The order of the paths below
# must match the order of the names on the left-hand side.
crimes, education, population, visitors, census = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/crime.csv',               # uses 'GEO', 'REF_DATE', 'VALUE' downstream
        'data/education.csv',           # joined on 'Province' / 'Year'
        'data/population.csv',          # uses 'Province', 'Year', 'Population'
        'data/visitors.csv',            # uses 'Province', 'Year', 'Visitors'
        'data/population factors.csv',  # joined on 'Province' / 'Year'
    )
]

### Cleaning and Combining the Datasets

In [19]:
# Drop 2021 because that year is not included in the other datasets.
crimes = crimes.loc[crimes['REF_DATE'] != 2021]

# Keep only the columns used downstream: 'GEO', 'REF_DATE', and 'VALUE'.
crimes = crimes[['GEO', 'REF_DATE', 'VALUE']]

# Sum 'VALUE' over every row sharing the same ('GEO', 'REF_DATE') pair,
# giving one total per region per year.
crimes_grouped = crimes.groupby(['GEO', 'REF_DATE'])['VALUE'].sum().reset_index()

# Strip the bracketed number (e.g. ' [35]') from each 'GEO' name.
# The pattern is a raw string so that \s, \[ and \d reach the regex engine
# untouched; in a normal string literal they are invalid escape sequences,
# which newer Python versions warn about and will eventually reject.
crimes_grouped['GEO'] = crimes_grouped['GEO'].str.replace(r'\s*\[\d+\]', '', regex=True)

# Drop the rows for this non-provincial 'GEO' entry. This must run after the
# name clean-up above so the comparison sees the stripped name.
crimes_grouped = crimes_grouped.loc[crimes_grouped['GEO'] != 'Canadian Forces Military Police']

# Rename to the column names shared by the other datasets so the frames can
# later be merged on 'Province' and 'Year'.
crimes_grouped = crimes_grouped.rename(columns={'VALUE': 'Crimes', 'GEO': 'Province', 'REF_DATE': 'Year'})

# Combine the three territories into a single 'Territories' entry by summing
# their 'Crimes' for each year.
territories = ['Northwest Territories', 'Nunavut', 'Yukon']
total_territories_crimes = crimes_grouped[crimes_grouped['Province'].isin(territories)].groupby('Year')['Crimes'].sum().reset_index()
total_territories_crimes['Province'] = 'Territories'

# Append the combined 'Territories' rows, then drop the individual territory
# rows so each territory is only counted once (inside the combined entry).
crimes_grouped = pd.concat([crimes_grouped, total_territories_crimes], ignore_index=True)
crimes_grouped = crimes_grouped[~crimes_grouped['Province'].isin(territories)]

# Population dataset: build one combined 'Territories' row per year by
# summing 'Population' across the individual territories.
total_territories_population = (
    population.loc[population['Province'].isin(territories)]
    .groupby('Year', as_index=False)['Population']
    .sum()
    .assign(Province='Territories')
)

# Append the combined rows, then filter out the individual territory rows so
# only the 'Territories' aggregate remains alongside the provinces.
population = (
    pd.concat([population, total_territories_population], ignore_index=True)
    .loc[lambda frame: ~frame['Province'].isin(territories)]
)

# Visitors dataset: build one combined 'Territories' row per year by summing
# 'Visitors' across the individual territories.
total_territories_visitors = (
    visitors.loc[visitors['Province'].isin(territories)]
    .groupby('Year', as_index=False)['Visitors']
    .sum()
    .assign(Province='Territories')
)

# Append the combined rows to the Visitors dataset, then filter out the
# individual territory rows so only the 'Territories' aggregate remains.
visitors = (
    pd.concat([visitors, total_territories_visitors], ignore_index=True)
    .loc[lambda frame: ~frame['Province'].isin(territories)]
)

# Join every dataset onto the crime totals, one at a time. Each step is an
# outer join on ('Province', 'Year'), so a province/year present in any
# single dataset is kept even when the others have no matching row.
join_spec = {'on': ['Province', 'Year'], 'how': 'outer'}

merged_df = pd.merge(crimes_grouped, education, **join_spec)    # + education
merged_df_2 = pd.merge(merged_df, population, **join_spec)      # + population
merged_df_3 = pd.merge(merged_df_2, visitors, **join_spec)      # + visitors
final_merge = pd.merge(merged_df_3, census, **join_spec)        # + census factors

In [21]:
# Persist the combined dataset; index=False leaves the row index out of the file.
final_merge.to_csv(path_or_buf='merged_dataset.csv', index=False)