In [15]:
# Importing libraries
import pandas as pd
import re
import category_encoders as ce
import numpy as np

In [64]:
# Load the raw table from the CSV file into df and preview the first rows
csvPath = 'master.csv'
df = pd.read_csv(csvPath)
df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [65]:
# Dropping columns with high numbers of missing values
# errors='ignore' and rebinding (instead of inplace=True) keep this cell
# re-runnable: a second execution no longer fails because the column is gone.
df = df.drop(columns=['HDI for year'], errors='ignore')
# The GDP column is parsed by stripping commas and spaces and casting to a
# 64-bit integer. Going through astype(str) first means the conversion also
# works when the column has already been converted by an earlier run.
df[' gdp_for_year ($) '] = (
    df[' gdp_for_year ($) ']
    .astype(str)
    .str.replace(r'[, ]', '', regex=True)
    .astype('int64')
)
df.dtypes

country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
 gdp_for_year ($)       int64
gdp_per_capita ($)      int64
generation             object
dtype: object

In [66]:
# Calculating modal values for categorical variables:
# for every country-year, keep the (age, sex, generation) combination that
# accounts for the largest population.
groupPopulation = df.groupby(['country', 'year', 'age', 'sex', 'generation'])['population'].sum().reset_index()
# idxmax gives, per country-year, the row label of the most populous
# combination (first one on ties). Selecting those labels with .loc replaces
# groupby.apply with a row-returning lambda, which depends on the grouping
# columns being handed to the function (deprecated in recent pandas).
largestRows = groupPopulation.groupby(['country', 'year'])['population'].idxmax()
modalValues = groupPopulation.loc[largestRows].\
    reset_index(drop=True).\
    drop(columns=['population'])
modalValues.head()

Unnamed: 0,country,year,age,sex,generation
0,Albania,1987,5-14 years,male,Generation X
1,Albania,1988,5-14 years,male,Generation X
2,Albania,1989,5-14 years,male,Generation X
3,Albania,1992,5-14 years,male,Millenials
4,Albania,1993,5-14 years,male,Millenials


In [67]:
# Totalling values per country-year: suicide counts and population are summed
# over the demographic rows, the two GDP figures are averaged over them.
totalValues = (
    df.groupby(['country', 'year'])
    .agg({'suicides_no': 'sum',
          'population': 'sum',
          'gdp_per_capita ($)': 'mean',
          ' gdp_for_year ($) ': 'mean'})
    .reset_index()
)
# The mean comes back as float; cast to an explicit 64-bit integer. Plain
# `int` maps to a platform-dependent width, and the GDP values shown in this
# notebook (e.g. 2156624900) do not fit in 32 bits.
totalValues[' gdp_for_year ($) '] = totalValues[' gdp_for_year ($) '].astype('int64')
totalValues.head()

Unnamed: 0,country,year,suicides_no,population,gdp_per_capita ($),gdp_for_year ($)
0,Albania,1987,73,2709600,796.0,2156624900
1,Albania,1988,63,2764300,769.0,2126000000
2,Albania,1989,68,2803100,833.0,2335124988
3,Albania,1992,47,2822500,251.0,709452584
4,Albania,1993,73,2807300,437.0,1228071038


In [68]:
# Join the per-country-year totals with their modal categories, order the
# rows by country and year, then derive the response variable
# (suicides per 100,000 inhabitants).
mergedTotals = pd.merge(totalValues, modalValues, how='left', on=['country', 'year'])
dfPro = mergedTotals.sort_values(by=['country', 'year'])
dfPro = dfPro.assign(suicideRate=dfPro['suicides_no'] * 100000 / dfPro['population'])
dfPro.head()

Unnamed: 0,country,year,suicides_no,population,gdp_per_capita ($),gdp_for_year ($),age,sex,generation,suicideRate
0,Albania,1987,73,2709600,796.0,2156624900,5-14 years,male,Generation X,2.694125
1,Albania,1988,63,2764300,769.0,2126000000,5-14 years,male,Generation X,2.279058
2,Albania,1989,68,2803100,833.0,2335124988,5-14 years,male,Generation X,2.425886
3,Albania,1992,47,2822500,251.0,709452584,5-14 years,male,Millenials,1.66519
4,Albania,1993,73,2807300,437.0,1228071038,5-14 years,male,Millenials,2.600363


In [69]:
# Countries with a record for each of the five years 2009-2013:
# count distinct years per country inside the window and keep those with 5.
yearsCovered = dfPro.loc[dfPro['year'].between(2009, 2013)].groupby('country')['year'].nunique()
selCountries = yearsCovered.reset_index().loc[lambda counts: counts['year'] == 5]
selCountries.shape

(75, 2)

In [70]:
# Keep only rows from 2009-2013 that belong to one of the selected countries
inYearWindow = (dfPro['year'] >= 2009) & (dfPro['year'] <= 2013)
isSelectedCountry = dfPro['country'].isin(selCountries['country'])
dfPro = dfPro.loc[inYearWindow & isSelectedCountry]
dfPro.shape

(375, 10)

In [71]:
# Count the distinct countries remaining in dfPro
totalCountries = dfPro['country'].drop_duplicates().shape[0]
totalCountries

75

In [72]:
# Encoding categorical variables
# Explicit category -> integer code tables, one entry per encoded column.
# NOTE(review): the raw data preview at the top of the notebook also contains
# '25-34 years', '75+ years' and 'G.I. Generation'; none of them has a code
# here. Confirm they cannot occur in dfPro, or add codes for them.
mapping = [{'col': 'age', 'mapping': {'5-14 years': 0,
                                      '15-24 years': 1,
                                      '35-54 years': 2,
                                      '55-74 years': 3}},
           {'col': 'generation', 'mapping': {'Millenials': 0,
                                             'Generation Z': 1,
                                             'Generation X': 2,
                                             'Boomers': 3,
                                             'Silent': 4}},
           {'col': 'sex', 'mapping': {'male': 0,
                                      'female': 1}}]

# cols names every column that has a mapping table above, so the columns the
# encoder is told to handle agree with the tables it is given (previously
# only 'age' was declared although three columns are mapped).
encoder = ce.OrdinalEncoder(cols=['age', 'generation', 'sex'],
                            return_df=True,
                            mapping=mapping)

encoder.fit(dfPro)

In [73]:
# Train & test division by country preparation:
# give every country a consecutive integer id, in order of first appearance.
# The ids are sized from the de-duplicated frame itself, so they cannot get
# out of step with the number of countries.
countryID = dfPro[['country']].drop_duplicates().assign(id=lambda c: np.arange(len(c)))
# Remove an 'id' column left behind by a previous execution before merging;
# otherwise re-running the cell would produce 'id_x' / 'id_y' columns.
dfPro = dfPro.drop(columns=['id'], errors='ignore').\
    merge(countryID, on=['country'], how='left')

In [74]:
# Number of countries assigned to the training set (70%, rounded down)
# and to the test set (the remainder)
trainFraction = 0.7
trainSize = int(trainFraction * totalCountries)
testSize = totalCountries - trainSize
print(f'{trainSize} {testSize}')

52 23


In [75]:
# Creating random samples of country ids
# A dedicated, seeded generator makes the split reproducible after a kernel
# restart without touching NumPy's global random state.
splitRng = np.random.RandomState(42)
dataSample = np.arange(0, totalCountries)
# trainSize distinct ids drawn without replacement from 0..totalCountries-1
trainSample = splitRng.choice(totalCountries, size=trainSize, replace=False)
# Every id that was not drawn for training is held out for testing; kept as
# an ascending list, as before.
testSample = list(np.setdiff1d(dataSample, trainSample))

In [62]:
# Dividing rows into train and test sets by country id, so that all rows of
# a country land on the same side of the split
TrainDF = dfPro[dfPro['id'].isin(trainSample)]
TestDF = dfPro[dfPro['id'].isin(testSample)]
# Sanity check: the two parts together must account for every row
assert len(TrainDF) + len(TestDF) == len(dfPro)
# Number of countries held out for testing
TestDF['id'].nunique()

23

In [58]:
# Inspecting the full array of country ids that the samples are drawn from
dataSample

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74])

In [63]:
# Inspecting the ids drawn for the training set, in ascending order
np.sort(trainSample)

array([ 2,  3,  4,  5,  7, 10, 11, 12, 13, 14, 15, 16, 17, 21, 22, 26, 27,
       28, 29, 30, 31, 32, 33, 35, 37, 38, 40, 43, 45, 46, 47, 48, 49, 50,
       51, 52, 54, 55, 57, 58, 59, 60, 61, 62, 63, 67, 68, 70, 71, 72, 73,
       74])