 Import requests and BeautifulSoup to scrape violent crime data for 52 U.S. states and territories from 2011 to 2020 from a website

 Pull the data for all years from the website and save it to violent_crime_rates_all_years.csv

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re  # Regular expression module

# Scrape the state-by-year violent crime table from Wikipedia and write it to
# 'violent_crime_rates_all_years.csv' (one row per state, one column per year).
url = "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_violent_crime_rate"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)


if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # First table on the page that carries the 'wikitable' class.
    table = soup.find('table', {'class': 'wikitable'})
    if table is None:
        # Fail before the output file is opened so an existing CSV is not
        # truncated when the page layout has changed.
        raise ValueError("No table with class 'wikitable' was found on the page.")

    # Use a regular expression to match headers that are exactly four digits (years)
    year_pattern = re.compile(r'^\d{4}$')
    headers = table.find_all('th')
    header_texts = [header.text.strip() for header in headers]
    year_headers = [text for text in header_texts if year_pattern.match(text)]

    header_row = ['State'] + year_headers

    with open('violent_crime_rates_all_years.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(header_row)

        # Skip the first <tr>, which holds the table headers.
        for row in table.find_all('tr')[1:]:
            columns = row.find_all(['th', 'td'])

            if len(columns) > 1:
                state = columns[0].text.strip()
                # The yearly values are read starting at the third cell; take
                # exactly as many cells as there are year headers.
                data_columns = [column.text.strip() for column in columns[2:2 + len(year_headers)]]
                csv_writer.writerow([state] + data_columns)

    print("Data for all years has been successfully saved to 'violent_crime_rates_all_years.csv'.")

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


 Read the CSV file into a DataFrame named violent_crime and display the first 20 rows of the DataFrame

In [None]:
import pandas as pd

# Load the scraped crime-rate table back from disk.
violent_crime = pd.read_csv(filepath_or_buffer='violent_crime_rates_all_years.csv')

# Preview the first 20 rows.
preview = violent_crime.head(20)
print(preview)

 Read the CSV file into a pandas DataFrame and keep only the columns I want in my DataFrame

 Remove the asterisks from the State column of my DataFrame and drop the rows where State is missing

In [None]:

# Strip the asterisks from the state names. regex=False makes '*' a literal
# character on every pandas version; interpreted as a regular expression a
# lone '*' is not a valid pattern.
violent_crime['State'] = violent_crime['State'].str.replace('*', '', regex=False)

# Remove any rows where 'State' is NaN
violent_crime = violent_crime[violent_crime['State'].notna()]

 Display the full DataFrame by setting the display.max_rows and display.max_columns options to None

In [None]:
# Lift both display limits so the notebook renders every row and column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Bare expression: the notebook shows the DataFrame as the cell output.
violent_crime


 Remove the row at position 52 of the index using drop, then display the DataFrame

In [None]:
# Look up the label stored at position 52 of the index, then drop that row.
row_label_to_drop = violent_crime.index[52]
violent_crime = violent_crime.drop(index=row_label_to_drop)

In [None]:
# Bare expression: the notebook renders violent_crime as the cell output.
violent_crime

 Pulled up the state with the highest crime rate for each year, using convert_to_numeric(value) to turn comma-formatted strings (and non-string values) into numbers

In [None]:
import pandas as pd

# Year columns to make numeric, newest first.
columns_to_convert = [
    '2020', '2019', '2018', '2017', '2016',
    '2015', '2014', '2013', '2012', '2011',
]

def convert_to_numeric(value):
    """Return *value* as a number, or NaN when it cannot be parsed.

    Commas are removed first when *value* supports ``.replace`` (strings such
    as ``'1,234.5'``); anything without that method is handed to
    ``pd.to_numeric`` unchanged.
    """
    try:
        cleaned = value.replace(',', '')
    except AttributeError:
        # Not a string (e.g. already a float): nothing to strip.
        cleaned = value
    return pd.to_numeric(cleaned, errors='coerce')

# Turn every year column into numbers, one cell at a time.
for column in columns_to_convert:
    numeric_rates = violent_crime[column].apply(convert_to_numeric)
    violent_crime[column] = numeric_rates

# Anything still missing anywhere in the frame becomes 0.
violent_crime = violent_crime.fillna(0)

# For each year, report the State of the row holding the largest value.
for column in columns_to_convert:
    peak_row_label = violent_crime[column].idxmax()
    highest_crime_state = violent_crime.loc[peak_row_label, 'State']
    print(f"State with the highest crime rate in {column}: {highest_crime_state}")


 Washington D.C. had the highest violent crime rate from 2011 to 2020.



  Pulled up the state with the lowest crime rate for each year, using convert_to_numeric(value) to turn comma-formatted strings (and non-string values) into numbers

In [None]:
import pandas as pd

# Year columns to make numeric, newest first.
columns_to_convert = [
    '2020',
    '2019',
    '2018',
    '2017',
    '2016',
    '2015',
    '2014',
    '2013',
    '2012',
    '2011',
]

def convert_to_numeric(value):
    """Convert one cell to a number; unparseable input becomes NaN.

    Values offering ``.replace`` (strings) have their commas removed before
    parsing. Other values go to ``pd.to_numeric`` as they are.
    """
    try:
        without_commas = value.replace(',', '')
    except AttributeError:
        # No .replace method, so the value is not a string.
        return pd.to_numeric(value, errors='coerce')
    else:
        return pd.to_numeric(without_commas, errors='coerce')

# Make every year column numeric (comma-formatted strings become numbers,
# unparseable cells become NaN).
for column in columns_to_convert:
    violent_crime[column] = violent_crime[column].apply(convert_to_numeric)

# Missing values are stored as 0 from here on.
violent_crime = violent_crime.fillna(0)

for column in columns_to_convert:
    # Because of fillna(0), a 0 means "no data", not "no crime". Leave those
    # rows out so a state without a reported rate is never named the lowest.
    reported_rates = violent_crime.loc[violent_crime[column] > 0, column]
    if reported_rates.empty:
        print(f"No crime rate data available for {column}")
        continue
    lowest_crime_state = violent_crime.loc[reported_rates.idxmin(), 'State']
    print(f"State with the lowest crime rate in {column}: {lowest_crime_state}")


  The states with the lowest crime rate from 2011 to 2020 were Maine and Vermont.


  Saved my violent_crime DataFrame to a new CSV file called new_violent_crimefile.csv

In [None]:
# Write the cleaned table to disk without the index column.
violent_crime.to_csv(
    path_or_buf='new_violent_crimefile.csv',
    index=False,
)

In [None]:
# Bare expression: the notebook renders violent_crime as the cell output.
violent_crime

 Made bar charts that show the states with the highest and the lowest violent crime rates during 2011-2020.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd



# Bar charts of the state with the highest and the lowest violent crime rate
# in each year. Expects violent_crime to hold numeric year columns.
years = ['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']

highest_values = []
lowest_values = []


for year in years:
    rates = violent_crime[year]

    # Row with the largest rate this year; read state and rate from that row
    # directly instead of searching for the state name a second time.
    highest_label = rates.idxmax()
    highest_state = violent_crime.loc[highest_label, 'State']
    highest_rate = rates.loc[highest_label]
    highest_values.append((highest_state, highest_rate))

    # The cleaning cells replace missing values with 0, so a 0 means "no
    # data". Leave those rows out so they cannot win the "lowest" pick.
    reported_rates = rates[rates > 0]
    if reported_rates.empty:
        continue
    lowest_label = reported_rates.idxmin()
    lowest_state = violent_crime.loc[lowest_label, 'State']
    lowest_rate = reported_rates.loc[lowest_label]
    lowest_values.append((lowest_state, lowest_rate))


highest_df = pd.DataFrame(highest_values, columns=['State', 'Rate'])
lowest_df = pd.DataFrame(lowest_values, columns=['State', 'Rate'])


# Bar colour per state; states not listed fall back to blue below.
state_colors = {
    'California': 'blue',
    'Maine': 'red',
    'Washington D.C.': 'blue',
    'Vermont': 'blue'
}

highest_df['Color'] = highest_df['State'].map(state_colors).fillna('blue')
lowest_df['Color'] = lowest_df['State'].map(state_colors).fillna('blue')

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))


# The x axis is categorical: years won by the same state share one x position,
# so their bars overlap and only the tallest one is visible.
axes[0].bar(highest_df['State'], highest_df['Rate'], color=highest_df['Color'], label='Highest Crime Rate')
axes[0].set_title('Top two states with the highest Violent Crime Rates during the years  (2011-2020)')
axes[0].set_ylabel('Crime Rate Per 100,000')


axes[1].bar(lowest_df['State'], lowest_df['Rate'], color=lowest_df['Color'], label='Lowest Crime Rate')
axes[1].set_title('Top two states with the Lowest Violent Crime Rates during the years  (2011-2020)')
axes[1].set_ylabel('Crime Rate Per 100,000')

for ax in axes:
    # 'rotation' and 'labelrotation' are two spellings of the same setting;
    # pass it once. Labels stay horizontal.
    ax.tick_params(axis='x', labelrotation=0)


plt.tight_layout()
plt.show()


 The bar chart above shows the states that had the highest and the lowest violent crime rates from 2011 to 2020. The state with the highest rate was Washington D.C., and the states with the lowest rates were Maine and Vermont.


 Opened my crimes.dataset.csv file and loaded it into the population_crime variable

In [None]:
import pandas as pd
# Load the national crime dataset from disk.
population_crime = pd.read_csv(filepath_or_buffer="crimes.dataset.csv")

# Bare expression: the notebook renders the DataFrame as the cell output.
population_crime

 Kept only the columns that I want in my DataFrame (Year, Population, Violent Crimes)

In [None]:
# The only columns used from here on.
columns_to_keep = [
    'Year',
    'Population',
    'Violent Crimes',
]

# Keep all rows but only the listed columns, then show the result.
population_crime = population_crime.loc[:, columns_to_keep]
population_crime

 Saved the population_crime DataFrame to a new CSV file named population_violent_crime_rate.csv

In [None]:

# Write the trimmed table to disk without the index column.
population_crime.to_csv(
    path_or_buf='population_violent_crime_rate.csv',
    index=False,
)

 Made a small DataFrame for 2020 so that I can save it as a CSV file and merge it with the population_violent_crime_rate.csv data

In [None]:
import pandas as pd


# Single-row table for 2020; the counts are kept as comma-formatted strings.
data = {
    'Year': [2020],
    'Population': ['335,942,111'],
    'Violent Crimes': ['1,313,105'],
}

year_2020_dataframe_for_merge = pd.DataFrame.from_dict(data)

print(year_2020_dataframe_for_merge)

 Save my year_2020_dataframe_for_merge DataFrame to a new CSV file called 2020_population_violent_crime.csv

In [None]:
# Write the one-row 2020 table to disk without the index column.
year_2020_dataframe_for_merge.to_csv(
    path_or_buf='2020_population_violent_crime.csv',
    index=False,
)

 Read the two CSV files, merge them on Year using an outer join, and save the result to merged_file.csv; the merged DataFrame is held in a variable spelled merge_crime_datframes in the code

In [None]:
import pandas as pd

# Reload both inputs from their CSV files.
year_2020_dataframe_for_merge = pd.read_csv('2020_population_violent_crime.csv')
population_crime = pd.read_csv('population_violent_crime_rate.csv')

# Outer join on Year keeps every year that appears in either file.
merge_crime_datframes = year_2020_dataframe_for_merge.merge(
    population_crime,
    how='outer',
    on='Year',
)

merge_crime_datframes.to_csv('merged_file.csv', index=False)


In [None]:
# Bare expression: the notebook renders the merged DataFrame as the cell output.
merge_crime_datframes

 Read merged_file.csv into a DataFrame, fill in the missing values, drop the unnecessary columns, and print the DataFrame

In [None]:
import pandas as pd

Violent_Population = pd.read_csv('merged_file.csv')

# Each shared column exists twice (_x and _y). Take the _x value and fall back
# to the _y value where _x is missing.
population_x = Violent_Population['Population_x']
population_y = Violent_Population['Population_y']
Violent_Population['Population'] = population_x.fillna(population_y)

crimes_x = Violent_Population['Violent Crimes_x']
crimes_y = Violent_Population['Violent Crimes_y']
Violent_Population['Violent Crimes'] = crimes_x.fillna(crimes_y)

# The suffixed columns are no longer needed once they are combined.
Violent_Population = Violent_Population.drop(
    columns=['Population_x', 'Violent Crimes_x', 'Population_y', 'Violent Crimes_y']
)

print(Violent_Population)


 I moved the top row to the bottom using pd.concat, reset the index with reset_index, and displayed the changed DataFrame

In [None]:
import pandas as pd

# Yearly national figures; the 2020 row comes first in this literal.
data = {
    'Year': [2020, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
    'Population': ['335,942,111', '281,421,906', '285,317,559', '287,973,924', '290,788,976', '293,656,842', '296,507,061', '299,398,484', '301,621,157', '304,059,724', '307,006,550', '309,330,219', '311,587,816', '313,873,685', '316,497,531', '318,907,401', '320,896,618', '323,405,935', '325,147,121', '326,687,501', '328,239,523'],
    'Violent Crimes': ['1,313,105', '1,425,486', '1,439,480', '1,423,677', '1,383,676', '1,360,088', '1,390,745', '1,435,123', '1,422,970', '1,394,461', '1,325,896', '1,251,248', '1,206,005', '1,217,057', '1,168,298', '1,153,022', '1,199,310', '1,250,162', '1,247,917', '1,209,997', '1,203,808'],
}

Violent_Population = pd.DataFrame(data)

# Move the first row (2020) behind all the others and renumber the index 0..n-1.
first_row = Violent_Population.iloc[:1]
remaining_rows = Violent_Population.iloc[1:]
Violent_Population = pd.concat([remaining_rows, first_row], ignore_index=True)

print(Violent_Population)


 Save my cleaned and finished DataFrame to the violent_population_cleaned_dataframe.csv file using the Violent_Population variable

In [None]:
# Write the finished table to disk without the index column.
Violent_Population.to_csv(
    path_or_buf='violent_population_cleaned_dataframe.csv',
    index=False,
)

 Made a line chart to show the population and the violent crimes using the Violent_Population DataFrame
 This line chart shows the population growth and the violent crimes from 2000-2020 in the United States.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Strip the thousands separators and convert both count columns to numbers;
# cells that still cannot be parsed become NaN.
population_text = Violent_Population['Population'].astype(str)
Violent_Population['Population'] = pd.to_numeric(population_text.str.replace(',', ''), errors='coerce')

crimes_text = Violent_Population['Violent Crimes'].astype(str)
Violent_Population['Violent Crimes'] = pd.to_numeric(crimes_text.str.replace(',', ''), errors='coerce')

# Rescale the crime counts so their maximum equals the maximum population,
# which lets both series share one y axis.
crimes = Violent_Population['Violent Crimes']
population_peak = Violent_Population['Population'].max()
Violent_Population['Violent Crimes Normalized'] = (crimes / crimes.max()) * population_peak

# One figure, one set of axes, two lines.
fig, ax = plt.subplots(figsize=(10, 6))

year_axis = Violent_Population['Year']
ax.plot(year_axis, Violent_Population['Population'], marker='o', label='Population', color='b')
ax.plot(year_axis, Violent_Population['Violent Crimes Normalized'], marker='o', label='Violent Crimes (Normalized)', color='r')

ax.set_xlabel('Year')
ax.set_ylabel('Population and Normalized Violent Crimes')
ax.tick_params('y')
ax.legend()
ax.set_title('Population and Normalized Violent Crimes Over Time')

plt.show()

 The line chart above shows the population in blue and the violent crimes (normalized to the population scale) in red from 2000 to 2020.

 I calculated the population growth and the violent crimes percentage from 2000 to 2020 using data from my Violent_Population DataFrame

In [None]:
import pandas as pd

# National population and violent crime counts, one entry per year 2000-2020.
data = {'Year': list(range(2000, 2021)),
        'Population': [281421906, 285317559, 287973924, 290788976, 293656842, 296507061, 299398484, 301621157,
                       304059724, 307006550, 309330219, 311587816, 313873685, 316497531, 318907401, 320896618,
                       323405935, 325147121, 326687501, 328239523, 335942111],
        'Violent Crimes': [1425486, 1439480, 1423677, 1383676, 1360088, 1390745, 1435123, 1422970, 1394461,
                           1325896, 1251248, 1206005, 1217057, 1168298, 1153022, 1199310, 1250162, 1247917,
                           1209997, 1203808, 1313105]}

Violent_Population = pd.DataFrame(data)

# Year-over-year population change in percent (NaN for the first year).
Violent_Population['Population Growth'] = Violent_Population['Population'].pct_change() * 100

# Violent crimes as a percentage of that year's population.
Violent_Population['Violent Crimes Percentage'] = Violent_Population['Violent Crimes'] / Violent_Population['Population'] * 100

# Growth compounds from year to year, so the total over the period is the
# last population relative to the first, not the sum of the yearly percentages.
first_population = Violent_Population['Population'].iloc[0]
last_population = Violent_Population['Population'].iloc[-1]
total_population_growth = (last_population / first_population - 1) * 100

# Despite the "total" in its name, this is the mean of the yearly
# crimes-to-population percentages.
total_violent_crimes_percentage = (Violent_Population['Violent Crimes'] / Violent_Population['Population']).mean() * 100

print(f'Total Population Growth from 2000 to 2020: {total_population_growth:.2f}%')
print(f'Total Violent Crimes Percentage from 2000 to 2020: {total_violent_crimes_percentage:.2f}%')


 Made a line chart of the yearly population growth percentage and the violent crimes percentage from 2000 to 2020 in the United States using data from my Violent_Population DataFrame

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# National population and violent crime counts, one entry per year 2000-2020.
data = {
    'Year': list(range(2000, 2021)),
    'Population': [
        281421906, 285317559, 287973924, 290788976, 293656842, 296507061, 299398484,
        301621157, 304059724, 307006550, 309330219, 311587816, 313873685, 316497531,
        318907401, 320896618, 323405935, 325147121, 326687501, 328239523, 335942111,
    ],
    'Violent Crimes': [
        1425486, 1439480, 1423677, 1383676, 1360088, 1390745, 1435123,
        1422970, 1394461, 1325896, 1251248, 1206005, 1217057, 1168298,
        1153022, 1199310, 1250162, 1247917, 1209997, 1203808, 1313105,
    ],
}

Violent_Population = pd.DataFrame(data)

# Year-over-year population change in percent (NaN for the first year).
population = Violent_Population['Population']
Violent_Population['Population Growth'] = population.pct_change() * 100

# Violent crimes as a percentage of that year's population.
Violent_Population['Violent Crimes Percentage'] = Violent_Population['Violent Crimes'] / population * 100

# Draw both percentage series against the year on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 6))

year_axis = Violent_Population['Year']
ax.plot(year_axis, Violent_Population['Population Growth'], label='Population Growth (%)', marker='o')
ax.plot(year_axis, Violent_Population['Violent Crimes Percentage'], label='Violent Crimes Percentage', marker='o')

ax.set_title('Population Growth and Violent Crimes Percentage (2000-2020)')
ax.set_xlabel('Year')
ax.set_ylabel('Percentage')
ax.legend()
ax.grid(True)
plt.show()


 The line chart above shows the yearly population growth and the yearly violent crimes as a percentage of the population from 2000 to 2020. The yearly population growth rates add up to 17.80%,
  and violent crimes averaged 0.43% of the population per year.
