In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as pyo
import statsmodels.api as sm
import ipywidgets as widgets
from ipywidgets import interactive

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# plotting and data libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
# Load the suicide-rates CSV; the path is relative to the working directory.
DATA_PATH = 'suicide_rates_1990-2022.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118560 entries, 0 to 118559
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   RegionCode                    118560 non-null  object 
 1   RegionName                    118560 non-null  object 
 2   CountryCode                   118560 non-null  object 
 3   CountryName                   118560 non-null  object 
 4   Year                          118560 non-null  int64  
 5   Sex                           118560 non-null  object 
 6   AgeGroup                      118560 non-null  object 
 7   Generation                    118560 non-null  object 
 8   SuicideCount                  118096 non-null  float64
 9   CauseSpecificDeathPercentage  114271 non-null  float64
 10  DeathRatePer100K              107896 non-null  float64
 11  Population                    112640 non-null  float64
 12  GDP                           111320 non-nul

In [4]:
def summary(df):
    """Build a per-column overview table for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. Any row index is accepted (it does not have to be
        the default 0..n-1 range) and it may hold fewer than three rows.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with these columns:

        * ``Data Type``, ``Missing``, ``Uniques``, ``Missing (%)``, ``Count``
          - per-column values.
        * ``Duplicates`` - the number of fully duplicated rows in ``df``;
          this is a frame-level figure repeated on every row.
        * ``Min``, ``Max``, ``Average``, ``Standard Deviation`` - taken from
          ``df.describe(include='all')``; NaN wherever describe does not
          report the statistic for a column.
        * ``First Value``, ``Second Value``, ``Third Value`` - the values in
          the first three rows by position; NaN when the row does not exist.
    """
    # One output row per input column
    summ = pd.DataFrame(index=df.columns)

    # Data types, missing values, unique values, and missing percentage
    summ['Data Type'] = df.dtypes
    summ['Missing'] = df.isnull().sum()
    summ['Uniques'] = df.nunique()
    summ['Missing (%)'] = (df.isnull().mean() * 100).round(1)

    # Duplicated-row count (scalar, broadcast to every row) and non-missing count
    summ['Duplicates'] = df.duplicated().sum()
    summ['Count'] = df.count()

    # Descriptive statistics. describe() omits min/max/mean/std entirely when
    # the frame has no column they apply to, so reindex guarantees the four
    # columns exist (filled with NaN) instead of raising KeyError below.
    desc = (
        df.describe(include='all')
        .transpose()
        .reindex(columns=['min', 'max', 'mean', 'std'])
    )
    summ['Min'] = desc['min']
    summ['Max'] = desc['max']
    summ['Average'] = desc['mean']
    summ['Standard Deviation'] = desc['std']

    # First three values, selected by position so a non-default row index
    # (e.g. after filtering) cannot cause a label-lookup KeyError. Each row is
    # assigned in one aligned write rather than cell by cell.
    head = df.head(3)
    for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        summ[label] = head.iloc[position] if position < len(head) else np.nan

    return summ

# Example usage: profile the loaded dataset; the returned table is the cell's displayed output
summary(df)

Unnamed: 0,Data Type,Missing,Uniques,Missing (%),Duplicates,Count,Min,Max,Average,Standard Deviation,First Value,Second Value,Third Value
RegionCode,object,0,6,0.0,22584,118560,,,,,EU,EU,EU
RegionName,object,0,6,0.0,22584,118560,,,,,Europe,Europe,Europe
CountryCode,object,0,117,0.0,22584,118560,,,,,ALB,ALB,ALB
CountryName,object,0,117,0.0,22584,118560,,,,,Albania,Albania,Albania
Year,int64,0,33,0.0,22584,118560,1990.0,2022.0,2005.740047,8.745003,1992,1992,1992
Sex,object,0,3,0.0,22584,118560,,,,,Male,Male,Male
AgeGroup,object,0,7,0.0,22584,118560,,,,,0-14 years,0-14 years,0-14 years
Generation,object,0,7,0.0,22584,118560,,,,,Generation Alpha,Generation Alpha,Generation Alpha
SuicideCount,float64,464,1868,0.4,22584,118096,0.0,6787.0,63.632037,254.095408,0.0,0.0,0.0
CauseSpecificDeathPercentage,float64,4289,46425,3.6,22584,114271,0.0,100.0,3.87369,7.31782,0.0,0.0,0.0


In [5]:
# Total suicide count for every (region, year) pair; regions are presented
# as continents in the chart's title and legend.
grouped_data = (
    df.groupby(['RegionName', 'Year'], as_index=False)['SuicideCount']
    .sum()
)

# One line per region across the years
fig = px.line(
    grouped_data,
    x='Year',
    y='SuicideCount',
    color='RegionName',
    title='Suicide Counts by Year and Continent',
)

# Single layout pass: axis titles, a unified hover box, and the legend title
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Suicide Count',
    hovermode='x unified',
    hoverlabel={
        'bgcolor': 'white',
        'font_size': 14,
        'font_family': 'Arial',
    },
    legend_title='Continent',
)

fig.show()

In [6]:
# Mean death rate per 100K for each value of the Sex column
suicide_rate_by_gender = df['DeathRatePer100K'].groupby(df['Sex']).mean()
colors = ['#eec2c2', '#aaccff', '#c2ffc2']

# Doughnut chart: a pie trace with a hole, one slice per Sex value,
# each slice annotated with its label and share
fig = go.Figure()
fig.add_trace(
    go.Pie(
        labels=suicide_rate_by_gender.index,
        values=suicide_rate_by_gender,
        hole=0.45,
        textinfo='label+percent',
        marker={'colors': colors},
    )
)

# Title plus a horizontal legend anchored just above the plot, right-aligned
fig.update_layout(
    title='Suicide Rate by Gender',
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1.02,
        'xanchor': 'right',
        'x': 1,
    },
)

fig.show()

In [7]:
# Mean death rate per 100K for each generation, as a two-column frame
generation_suicide_rates = (
    df.groupby('Generation', as_index=False)['DeathRatePer100K'].mean()
)

# Bar per generation; the labels mapping controls the axis and hover text
fig = px.bar(
    generation_suicide_rates,
    x='Generation',
    y='DeathRatePer100K',
    title='Suicide Rates by Generation',
    labels={
        'Generation': 'Generation',
        'DeathRatePer100K': 'Suicide Rate per 100K Population',
    },
    color_discrete_sequence=px.colors.qualitative.Pastel1,
)

# Restate the title and axis titles explicitly and hide the legend
layout_options = {
    'title': 'Suicide Rates by Generation',
    'xaxis_title': 'Generation',
    'yaxis_title': 'Suicide Rate per 100K Population',
    'showlegend': False,
}
fig.update_layout(**layout_options)

fig.show()

In [8]:
# Mean death rate per 100K for every (age group, sex) pair, pivoted wide:
# one row per age group, one column per sex.
grouped_data = df.groupby(['AgeGroup', 'Sex'])['DeathRatePer100K'].mean().unstack()

# Colour for each sex. The palette is keyed by lower-cased label and resolved
# against the actual column names, so the mapping still applies when the data
# capitalises the labels (e.g. 'Male').
sex_palette = {
    'male': '#3498db',
    'female': '#e74c3c',
    'unknown': '#f1c40f',
}
color_discrete_map = {
    sex: sex_palette[str(sex).lower()]
    for sex in grouped_data.columns
    if str(sex).lower() in sex_palette
}

# The colour mapping must be passed as its own px.bar argument; placed inside
# `labels` it is treated as a label entry and has no effect on colours.
# In wide-form input the plotted numbers live in a column called 'value',
# which is why that key is relabelled here.
fig = px.bar(
    grouped_data,
    title='Suicide Rates by Age Group and Gender',
    labels={
        'AgeGroup': 'Age Group',
        'value': 'Suicide Rate per 100K Population',
        'Sex': 'Gender',
    },
    color_discrete_map=color_discrete_map,
    # Side-by-side bars: stacking per-sex averages would show a sum of means,
    # which is not a meaningful rate.
    barmode='group',
)

fig.update_layout(
    title='Suicide Rates by Age Group and Gender',
    xaxis_title='Age Group',
    yaxis_title='Suicide Rate per 100K Population',
    legend_title='Gender'
)

fig.show()

In [12]:
# Function to plot suicide rates by age group for a given year
def plot_suicide_rates_by_age(year):
    """Draw a bar chart of the mean death rate per 100K by age group for one year.

    Reads the notebook-level ``df``: rows are restricted to ``Year == year``,
    averaged per ``AgeGroup`` and ordered from the highest rate to the lowest
    before plotting. The figure is shown immediately; nothing is returned.
    """
    # Per-age-group mean rate for the chosen year, highest first
    rates_by_age = (
        df.loc[df['Year'] == year]
        .groupby('AgeGroup', as_index=False)['DeathRatePer100K']
        .mean()
        .sort_values(by='DeathRatePer100K', ascending=False)
    )

    # Explicit figure/axes pair; seaborn draws onto the axes it is handed
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(
        data=rates_by_age,
        x='AgeGroup',
        y='DeathRatePer100K',
        palette='Blues_r',
        ax=ax,
    )

    # Title and axis labels
    ax.set_title(f'Suicide Rates by Age Group ({year})', fontsize=16, color='black', pad=20)
    ax.set_xlabel('Age Group', fontsize=12, color='black')
    ax.set_ylabel('Suicide Rate per 100K Population', fontsize=12, color='black')

    # Tick-label styling (applies to the current axes, i.e. the one created above)
    plt.xticks(fontsize=10, color='black', rotation=0)
    plt.yticks(fontsize=10, color='black')

    # Dashed horizontal guide lines only
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    fig.tight_layout()
    plt.show()

# Distinct years present in the data, in ascending order; these are the slider stops
years_available = sorted(df['Year'].unique())

# Year picker starting at the earliest year; with continuous_update disabled
# the value is only reported once the handle is released
year_slider = widgets.SelectionSlider(
    options=years_available,
    value=years_available[0],
    description='Year:',
    continuous_update=False,
    layout=widgets.Layout(width='400px'),
)

# Bind the slider to the plotting function
interactive_plot = interactive(plot_suicide_rates_by_age, year=year_slider)

# The last child of the interactive container is its output area; give it a fixed height
output = interactive_plot.children[-1]
output.layout.height = '400px'

# Last expression of the cell: renders the slider together with the plot
interactive_plot

interactive(children=(SelectionSlider(continuous_update=False, description='Year:', layout=Layout(width='400px…