In [37]:
import pandas as pd

In [38]:
# Load the raw inputs: the crime table and the education table, both CSV
# files under data/ (paths are relative to the notebook's working directory).
crimes = pd.read_csv(filepath_or_buffer='data/crime.csv')
education = pd.read_csv(filepath_or_buffer='data/education.csv')

### Cleaning Crimes Dataset and Joining to Education Dataset

In [39]:
# Prepare the crime table for the join with `education`:
#   1. drop 2021 and keep only GEO / REF_DATE / VALUE,
#   2. total VALUE per province and year,
#   3. fold the three territories into a single 'Territories' row per year,
#   4. inner-join with `education` on ('Province', 'Year').

# Removing the year 2021 as this is not included in the other datasets.
# NOTE(review): this comparison assumes REF_DATE was parsed as an integer year;
# if the column is read as strings the filter keeps every row — confirm.
crimes = crimes.loc[crimes['REF_DATE'] != 2021]

# Remove unwanted columns. Only 'GEO', 'REF_DATE', and 'VALUE' are used below.
crimes = crimes[['GEO', 'REF_DATE', 'VALUE']]

# Clean 'GEO' names, then group by 'GEO' and 'REF_DATE' and sum the values for
# each province for each year.
# The bracketed footnote number (e.g. ' [12]') is stripped BEFORE grouping so
# that differently-annotated spellings of the same province land in one group
# instead of producing duplicate (province, year) rows after the cleanup.
# The pattern is a raw string: '\s', '\[' and '\d' are not valid Python string
# escapes and trigger a warning when written in a plain string literal.
# NOTE(review): every VALUE row for a province/year is summed together; the
# fractional totals in the output suggest VALUE may mix counts with rates —
# confirm that summing is meaningful for this dataset.
crimes_grouped = (
    crimes
    .assign(GEO=crimes['GEO'].str.replace(r'\s*\[\d+\]', '', regex=True))
    .groupby(['GEO', 'REF_DATE'])['VALUE']
    .sum()
    .reset_index()
)

# Filter out unwanted rows: drop the 'Canadian Forces Military Police' entries.
crimes_grouped = crimes_grouped.loc[crimes_grouped['GEO'] != 'Canadian Forces Military Police']

# Rename columns to 'Province', 'Year', and 'Crimes' so the keys match the
# columns used for the merge below.
crimes_grouped = crimes_grouped.rename(columns={'VALUE': 'Crimes', 'GEO': 'Province', 'REF_DATE': 'Year'})

# Group Territories: sum the three territories into one total per year and
# label that total 'Territories'.
territories = ['Northwest Territories', 'Nunavut', 'Yukon']
total_territories_crimes = (
    crimes_grouped[crimes_grouped['Province'].isin(territories)]
    .groupby('Year')['Crimes']
    .sum()
    .reset_index()
)
total_territories_crimes['Province'] = 'Territories'

# Append the combined 'Territories' rows, then drop the individual territory
# rows. The new label is not in `territories`, so the combined rows survive
# the filter.
crimes_grouped = pd.concat([crimes_grouped, total_territories_crimes], ignore_index=True)
crimes_grouped = crimes_grouped[~crimes_grouped['Province'].isin(territories)]

# Merge crimes_grouped with education on 'Province' and 'Year'. The inner join
# keeps only province/year pairs present in both tables.
merged_df = pd.merge(crimes_grouped, education, on=['Province', 'Year'], how='inner')

Unnamed: 0,Province,Year,Crimes,Registrants
0,Alberta,2017,1005325.43,194010
1,Alberta,2018,1043221.45,200391
2,Alberta,2019,1071487.39,203823
3,Alberta,2020,933926.17,203523
4,British Columbia,2017,838288.24,286914
5,British Columbia,2018,852692.63,295320
6,British Columbia,2019,949793.36,297432
7,British Columbia,2020,871957.23,287523
8,Manitoba,2017,361011.29,62802
9,Manitoba,2018,361049.0,63363
