In [106]:
# Imports
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [85]:
# Load Data
raw_df = pd.read_csv('../../data/suicide.csv')
raw_df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [86]:
# Renaming the columns
raw_df.columns = ['country', 'year', 'gender', 'age_group', 'suicide_count', 'population', 'suicide_rate', 'country-year', 'HDI for year', 'gdp_for_year', 'gdp_per_capita', 'generation']
raw_df.columns

Index(['country', 'year', 'gender', 'age_group', 'suicide_count', 'population',
       'suicide_rate', 'country-year', 'HDI for year', 'gdp_for_year',
       'gdp_per_capita', 'generation'],
      dtype='object')

In [87]:
raw_df.age_group.value_counts()

15-24 years    4642
35-54 years    4642
75+ years      4642
25-34 years    4642
55-74 years    4642
5-14 years     4610
Name: age_group, dtype: int64

In [88]:
raw_df.generation.value_counts()

Generation X       6408
Silent             6364
Millenials         5844
Boomers            4990
G.I. Generation    2744
Generation Z       1470
Name: generation, dtype: int64

In [89]:
# Counting Countries
#Listing countries

country = raw_df.country.unique()
print("Number of countries:", len(country))
country

Number of countries: 101


array(['Albania', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Barbados', 'Belarus', 'Belgium', 'Belize',
       'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Cabo Verde',
       'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Croatia', 'Cuba',
       'Cyprus', 'Czech Republic', 'Denmark', 'Dominica', 'Ecuador',
       'El Salvador', 'Estonia', 'Fiji', 'Finland', 'France', 'Georgia',
       'Germany', 'Greece', 'Grenada', 'Guatemala', 'Guyana', 'Hungary',
       'Iceland', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan',
       'Kazakhstan', 'Kiribati', 'Kuwait', 'Kyrgyzstan', 'Latvia',
       'Lithuania', 'Luxembourg', 'Macau', 'Maldives', 'Malta',
       'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Netherlands',
       'New Zealand', 'Nicaragua', 'Norway', 'Oman', 'Panama', 'Paraguay',
       'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar',
       'Republic of Korea', 'Romania', '

# What we noticed about the data

- `HDI for year` column has a lot of NaN values. No other columns in the dataset have missing values or NaN values. Therefore, we will be removing the `HDI for year` column
- There are 6 different age groups in the dataset
    - 15-24 years
    - 35-54 years    
    - 75+ years      
    - 25-34 years    
    - 55-74 years    
    - 5-14 years 
- Due to the fact that there are age groups instead of quantitative values for age and it is in a categorical format, we need to encode this data.
- Gender should also be encoded as it can be a boolean value ex. 1 = Male, 0 = Female
- There are 6 different generations in the dataset
    - Generation X       
    - Silent             
    - Millenials         
    - Boomers            
    - G.I. Generation    
    - Generation Z  
- Generation can be encoded as well as it is in a categorical format

In [90]:
raw_df.describe()

Unnamed: 0,year,suicide_count,population,suicide_rate,HDI for year,gdp_per_capita
count,27820.0,27820.0,27820.0,27820.0,8364.0,27820.0
mean,2001.258375,242.574407,1844794.0,12.816097,0.776601,16866.464414
std,8.469055,902.047917,3911779.0,18.961511,0.093367,18887.576472
min,1985.0,0.0,278.0,0.0,0.483,251.0
25%,1995.0,3.0,97498.5,0.92,0.713,3447.0
50%,2002.0,25.0,430150.0,5.99,0.779,9372.0
75%,2008.0,131.0,1486143.0,16.62,0.855,24874.0
max,2016.0,22338.0,43805210.0,224.97,0.944,126352.0


In [91]:
# Checking for null or missing values

raw_df.isnull().sum()

country               0
year                  0
gender                0
age_group             0
suicide_count         0
population            0
suicide_rate          0
country-year          0
HDI for year      19456
gdp_for_year          0
gdp_per_capita        0
generation            0
dtype: int64

In [92]:
# Dropping the HDI for year column
# The HDI has 19456 NaN values out of the 27820 total, which accounts for approximately 70% of the data in this column.
# This may affect the model's performance, therefore we will drop this column.

raw_df = raw_df.drop(['HDI for year'], axis = 1)
raw_df.shape

(27820, 11)

In [93]:
# Dropping the country-year for year column
# This is just a combination of 2 columns that already exist in the dataset

raw_df = raw_df.drop(['country-year'], axis = 1)
raw_df.shape

(27820, 10)

In [94]:
# Drop any NaN values if there are any, there should be very few if any

raw_df = raw_df.dropna()
raw_df.shape

(27820, 10)

In [95]:
raw_df.columns

Index(['country', 'year', 'gender', 'age_group', 'suicide_count', 'population',
       'suicide_rate', 'gdp_for_year', 'gdp_per_capita', 'generation'],
      dtype='object')

In [96]:
# Encode the categorical data as stated above

categorical_columns = ['country', 'year','age_group', 'gender', 'generation']
encoder = sklearn.preprocessing.LabelEncoder()

for column in categorical_columns:
    raw_df[column] = encoder.fit_transform(raw_df[column]) # Change the categorical columns to their new encoded values

In [97]:
# Copy of dataset for testing

test_df = raw_df.copy()
test_df

Unnamed: 0,country,year,gender,age_group,suicide_count,population,suicide_rate,gdp_for_year,gdp_per_capita,generation
0,0,2,1,0,21,312900,6.71,2156624900,796,2
1,0,2,1,2,16,308000,5.19,2156624900,796,5
2,0,2,0,0,14,289700,4.83,2156624900,796,2
3,0,2,1,5,1,21800,4.59,2156624900,796,1
4,0,2,1,1,9,274300,3.28,2156624900,796,0
...,...,...,...,...,...,...,...,...,...,...
27815,100,29,0,2,107,3620833,2.96,63067077179,2309,2
27816,100,29,0,5,9,348465,2.58,63067077179,2309,5
27817,100,29,1,3,60,2762158,2.17,63067077179,2309,3
27818,100,29,0,3,44,2631600,1.67,63067077179,2309,3


In [98]:
# Convert the column 'gdp_for_year' to float from object

raw_df['gdp_for_year'] = raw_df['gdp_for_year'].str.replace(',','').astype(float)
raw_df.dtypes

country             int64
year                int64
gender              int64
age_group           int64
suicide_count       int64
population          int64
suicide_rate      float64
gdp_for_year      float64
gdp_per_capita      int64
generation          int64
dtype: object

In [99]:
# Scaling the numerical data columns 

numerical = ['suicide_count', 'population', 'suicide_rate', 
              'gdp_for_year','gdp_per_capita']

scalar = RobustScaler()
raw_df[numerical] = scalar.fit_transform(raw_df[numerical])
raw_df

Unnamed: 0,country,year,gender,age_group,suicide_count,population,suicide_rate,gdp_for_year,gdp_per_capita,generation
0,0,2,1,0,-0.031250,-0.084435,0.045860,-0.182942,-0.400243,2
1,0,2,1,2,-0.070312,-0.087963,-0.050955,-0.182942,-0.400243,5
2,0,2,0,0,-0.085938,-0.101142,-0.073885,-0.182942,-0.400243,2
3,0,2,1,5,-0.187500,-0.294064,-0.089172,-0.182942,-0.400243,1
4,0,2,1,1,-0.125000,-0.112232,-0.172611,-0.182942,-0.400243,0
...,...,...,...,...,...,...,...,...,...,...
27815,100,29,0,2,0.640625,2.297696,-0.192994,0.059520,-0.329631,2
27816,100,29,0,5,-0.125000,-0.058824,-0.217197,0.059520,-0.329631,5
27817,100,29,1,3,0.273438,1.679341,-0.243312,0.059520,-0.329631,3
27818,100,29,0,3,0.148438,1.585323,-0.275159,0.059520,-0.329631,3


In [100]:
# Target Column X & Y Assignment

Y = raw_df['suicide_rate']
X = raw_df.drop('suicide_rate', axis=1)
X.shape, Y.shape


((27820, 9), (27820,))

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 12)
X_train.shape, X_test.shape

((22256, 9), (5564, 9))

In [None]:
# Helper functions needed for KNN
def sigmoid(x):
    """Computes the element wise logistic sigmoid of x.

    Inputs:
        x: Either a row vector or a column vector.
    """
    return 1.0 / (1.0 + np.exp(-x))

def euclidean_distance(a, b):
    """ Computes the Euclidean distance between a and b."""
    if a.shape[0] != b.shape[0]:
        raise ValueError("A and B should be of same dimensionality.")

    aa = np.sum(a ** 2, axis=0)
    bb = np.sum(b ** 2, axis=0)
    ab = np.dot(a.T, b)

    return np.sqrt(aa[:, np.newaxis] + bb[np.newaxis, :] - 2 * ab)

In [102]:
# My own KNN function

In [107]:
# Library KNN

# Create model
knn = KNeighborsRegressor()

param_grid = {'n_neighbors':list(range(1, 31)), 'weights': ['uniform', 'distance']}

# Create grid of nodes
knn_grid = GridSearchCV(knn, param_grid , cv=10)

# Fit model with data
knn_grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30],
                         'weights': ['uniform', 'distance']})

In [108]:
# Checking best parameters for model
params = knn_grid.best_params_
print(params)

{'n_neighbors': 1, 'weights': 'uniform'}


In [109]:
# Predicting the target value from the model for the samples
y_train_knn = knn_grid.predict(X_train)
y_test_knn = knn_grid.predict(X_test)

In [115]:
# Compute accuracy of the model
train_acc = knn_grid.score(X_train, y_train) 
test_acc = knn_grid.score(X_test, y_test)

print("KNN: Accuracy on training Data: {:.3f}".format(train_acc))
print("KNN: Accuracy on test Data: {:.3f}".format(test_acc)) # 81% accuracy on the test data

KNN: Accuracy on training Data: 1.000
KNN: Accuracy on test Data: 0.812
