In [1]:
# Libraries
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
# Importing the dataset
# The CSV is read directly from its raw GitHub URL into a DataFrame,
# so this cell needs network access to run.
url = 'https://raw.githubusercontent.com/kueyram/DSC540/main/suicide_rates_1990_2022.csv'
suicide_rates = pd.read_csv(url)

In [3]:
# List the column names of the dataset as loaded
suicide_rates.columns

Index(['CodeRegion', 'Nom_Region', 'Code_Pays', 'Nom_Pays', 'Annee', 'Sex',
       'Nombre_Suicide', 'RatioDecesPar100K', 'GDPPerCapita', 'GNIPerCapita',
       'InflationRate', 'EmploymentPopulationRatio'],
      dtype='object')

In [4]:
# Size of the dataset as (number of rows, number of columns)
suicide_rates.shape

(5928, 12)

In [5]:
# Viewing the first 10 rows
suicide_rates.head(10)

Unnamed: 0,CodeRegion,Nom_Region,Code_Pays,Nom_Pays,Annee,Sex,Nombre_Suicide,RatioDecesPar100K,GDPPerCapita,GNIPerCapita,InflationRate,EmploymentPopulationRatio
0,EU,Europe,ALB,Albania,1992,Male,33,2.076386,200.85222,1740.0,226.005421,45.315
1,EU,Europe,ALB,Albania,1992,Female,14,0.874563,200.85222,1740.0,226.005421,45.315
2,EU,Europe,ALB,Albania,1993,Male,46,2.937233,367.279225,2110.0,85.004751,47.798
3,EU,Europe,ALB,Albania,1993,Female,27,1.686025,367.279225,2110.0,85.004751,47.798
4,EU,Europe,ALB,Albania,1994,Male,37,2.332619,586.416135,2300.0,22.565053,50.086
5,EU,Europe,ALB,Albania,1994,Female,15,0.928333,586.416135,2300.0,22.565053,50.086
6,EU,Europe,ALB,Albania,1995,Male,57,3.545217,750.604449,2710.0,7.793219,53.186
7,EU,Europe,ALB,Albania,1995,Female,34,2.071781,750.604449,2710.0,7.793219,53.186
8,EU,Europe,ALB,Albania,1996,Male,53,3.263547,1009.977111,3050.0,12.725478,53.039
9,EU,Europe,ALB,Albania,1996,Female,39,2.350814,1009.977111,3050.0,12.725478,53.039


In [6]:
# Map the source headers (several of them French-named) to consistent
# English names. Columns whose names do not change are listed with identical
# old/new values so the mapping documents every column of the dataset.
new_column_names = {
    'CodeRegion': 'RegionCode',
    'Nom_Region': 'Region',
    'Code_Pays': 'CountryCode',
    'Nom_Pays': 'Country',
    'Annee': 'Year',
    'Sex': 'Gender',
    'Nombre_Suicide': 'SuicideCount',
    'RatioDecesPar100K': 'DeathRatePer100K',
    'GDPPerCapita': 'GDPPerCapita',
    'GNIPerCapita': 'GNIPerCapita',
    'InflationRate': 'InflationRate',
    'EmploymentPopulationRatio': 'EmploymentPopulationRatio'
}

# Rename the columns using the new names.
# Reassign the result instead of using inplace=True so this step follows the
# same "new frame from old frame" pattern as the other transformations.
suicide_rates = suicide_rates.rename(columns=new_column_names)

# View the new columns
suicide_rates.columns

Index(['RegionCode', 'Region', 'CountryCode', 'Country', 'Year', 'Gender',
       'SuicideCount', 'DeathRatePer100K', 'GDPPerCapita', 'GNIPerCapita',
       'InflationRate', 'EmploymentPopulationRatio'],
      dtype='object')

In [7]:
# Viewing the first 10 rows after renaming the columns
suicide_rates.head(10)

Unnamed: 0,RegionCode,Region,CountryCode,Country,Year,Gender,SuicideCount,DeathRatePer100K,GDPPerCapita,GNIPerCapita,InflationRate,EmploymentPopulationRatio
0,EU,Europe,ALB,Albania,1992,Male,33,2.076386,200.85222,1740.0,226.005421,45.315
1,EU,Europe,ALB,Albania,1992,Female,14,0.874563,200.85222,1740.0,226.005421,45.315
2,EU,Europe,ALB,Albania,1993,Male,46,2.937233,367.279225,2110.0,85.004751,47.798
3,EU,Europe,ALB,Albania,1993,Female,27,1.686025,367.279225,2110.0,85.004751,47.798
4,EU,Europe,ALB,Albania,1994,Male,37,2.332619,586.416135,2300.0,22.565053,50.086
5,EU,Europe,ALB,Albania,1994,Female,15,0.928333,586.416135,2300.0,22.565053,50.086
6,EU,Europe,ALB,Albania,1995,Male,57,3.545217,750.604449,2710.0,7.793219,53.186
7,EU,Europe,ALB,Albania,1995,Female,34,2.071781,750.604449,2710.0,7.793219,53.186
8,EU,Europe,ALB,Albania,1996,Male,53,3.263547,1009.977111,3050.0,12.725478,53.039
9,EU,Europe,ALB,Albania,1996,Female,39,2.350814,1009.977111,3050.0,12.725478,53.039


In [8]:
# Code columns to remove (the Region and Country name columns are kept)
columns_to_drop = ['RegionCode', 'CountryCode']

# Build a new frame without those columns
suicide_rates = suicide_rates.drop(labels=columns_to_drop, axis='columns')

# Show which columns remain
suicide_rates.columns

Index(['Region', 'Country', 'Year', 'Gender', 'SuicideCount',
       'DeathRatePer100K', 'GDPPerCapita', 'GNIPerCapita', 'InflationRate',
       'EmploymentPopulationRatio'],
      dtype='object')

In [9]:
# Viewing the first 10 rows after dropping the code columns
suicide_rates.head(10)

Unnamed: 0,Region,Country,Year,Gender,SuicideCount,DeathRatePer100K,GDPPerCapita,GNIPerCapita,InflationRate,EmploymentPopulationRatio
0,Europe,Albania,1992,Male,33,2.076386,200.85222,1740.0,226.005421,45.315
1,Europe,Albania,1992,Female,14,0.874563,200.85222,1740.0,226.005421,45.315
2,Europe,Albania,1993,Male,46,2.937233,367.279225,2110.0,85.004751,47.798
3,Europe,Albania,1993,Female,27,1.686025,367.279225,2110.0,85.004751,47.798
4,Europe,Albania,1994,Male,37,2.332619,586.416135,2300.0,22.565053,50.086
5,Europe,Albania,1994,Female,15,0.928333,586.416135,2300.0,22.565053,50.086
6,Europe,Albania,1995,Male,57,3.545217,750.604449,2710.0,7.793219,53.186
7,Europe,Albania,1995,Female,34,2.071781,750.604449,2710.0,7.793219,53.186
8,Europe,Albania,1996,Male,53,3.263547,1009.977111,3050.0,12.725478,53.039
9,Europe,Albania,1996,Female,39,2.350814,1009.977111,3050.0,12.725478,53.039


In [10]:
# Death-rate and per-capita income columns to remove
columns_to_drop = ['DeathRatePer100K', 'GDPPerCapita', 'GNIPerCapita']

# Build a new frame without those columns
suicide_rates = suicide_rates.drop(labels=columns_to_drop, axis='columns')

# Show which columns remain
suicide_rates.columns

Index(['Region', 'Country', 'Year', 'Gender', 'SuicideCount', 'InflationRate',
       'EmploymentPopulationRatio'],
      dtype='object')

In [11]:
# Inflation and employment columns to remove
columns_to_drop = ['InflationRate', 'EmploymentPopulationRatio']

# Build a new frame without those columns
suicide_rates = suicide_rates.drop(labels=columns_to_drop, axis='columns')

# Show which columns remain
suicide_rates.columns

Index(['Region', 'Country', 'Year', 'Gender', 'SuicideCount'], dtype='object')

In [12]:
# Convert the Region and Country labels to upper case
for text_column in ['Region', 'Country']:
    suicide_rates[text_column] = suicide_rates[text_column].str.upper()

# Viewing the first 10 rows
suicide_rates.head(10)

Unnamed: 0,Region,Country,Year,Gender,SuicideCount
0,EUROPE,ALBANIA,1992,Male,33
1,EUROPE,ALBANIA,1992,Female,14
2,EUROPE,ALBANIA,1993,Male,46
3,EUROPE,ALBANIA,1993,Female,27
4,EUROPE,ALBANIA,1994,Male,37
5,EUROPE,ALBANIA,1994,Female,15
6,EUROPE,ALBANIA,1995,Male,57
7,EUROPE,ALBANIA,1995,Female,34
8,EUROPE,ALBANIA,1996,Male,53
9,EUROPE,ALBANIA,1996,Female,39


In [13]:
# Boolean mask that is True for every row that repeats an earlier row
duplicate_mask = suicide_rates.duplicated()

# Keep only the repeated rows and report how many there are
duplicate_rows = suicide_rates.loc[duplicate_mask]
print("Number of duplicated rows")
print(len(duplicate_rows))

Number of duplicated rows
0


In [14]:
# Per-column flag: True when the column holds at least one missing value
has_missing = suicide_rates.isnull().any()
nan_columns = suicide_rates.columns[has_missing]

# Report the affected columns, or state that there are none
if nan_columns.empty:
    print("No columns with NaN values.")
else:
    print("Columns with NaN values:")
    print(nan_columns)


No columns with NaN values.


In [15]:
# Inspect 25 randomly chosen rows from the dataset as a sanity check.
# A fixed random_state makes the same rows come back on every run, so the
# displayed output is reproducible after Restart & Run All.
suicide_rates.sample(n=25, random_state=42)

Unnamed: 0,Region,Country,Year,Gender,SuicideCount
1047,CENTRAL AND SOUTH AMERICA,CHILE,2017,Female,310
2875,ASIA,MALDIVES,2010,Female,0
5023,EUROPE,UKRAINE,2009,Female,1724
1369,EUROPE,GERMANY,1995,Male,9222
911,CENTRAL AND SOUTH AMERICA,CHILE,2009,Male,1724
4494,EUROPE,SERBIA,2003,Female,383
4900,ASIA,TURKMENISTAN,2006,Male,392
3814,ASIA,MALAYSIA,2012,Male,23
3312,OCEANIA,NEW ZEALAND,2011,Female,116
5305,CENTRAL AND SOUTH AMERICA,VENEZUELA (BOLIVARIAN REPUBLIC OF),2012,Male,602


In [16]:
# Save the cleaned dataset to disk.
# index=False keeps the auto-generated row index out of the file; otherwise it
# is written as an unnamed first column and reappears as 'Unnamed: 0' when the
# CSV is read back with pd.read_csv.
suicide_rates.to_csv('suicide_rates.csv', index=False)