In [221]:
# Importing libraries
import pandas as pd
import re
import category_encoders as ce
import numpy as np
from sklearn.preprocessing import StandardScaler

In [222]:
# Importing as df
df = pd.read_csv('master.csv')
df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [223]:
# Droping columns with high numbers of missing values
df.drop(columns=['HDI for year'], inplace=True)
df[' gdp_for_year ($) '] = df[' gdp_for_year ($) '].apply(lambda x: int(re.sub(r'[, ]', '', x)))
df.dtypes

country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
 gdp_for_year ($)       int64
gdp_per_capita ($)      int64
generation             object
dtype: object

In [224]:
# Calculating modal values for categorical variables
modalValues = df.groupby(['country', 'year', 'age', 'sex', 'generation'])['population'].sum().reset_index()
modalValues = modalValues.groupby(['country', 'year']).\
    apply(lambda x: x.loc[x['population'].idxmax()]).reset_index(drop=True).\
    drop(columns=['population'])
modalValues.head()

Unnamed: 0,country,year,age,sex,generation
0,Albania,1987,5-14 years,male,Generation X
1,Albania,1988,5-14 years,male,Generation X
2,Albania,1989,5-14 years,male,Generation X
3,Albania,1992,5-14 years,male,Millenials
4,Albania,1993,5-14 years,male,Millenials


In [225]:
# Totalizing values
totalValues = df.groupby(['country', 'year']).\
    agg({'suicides_no': 'sum',
         'population': 'sum',
         'gdp_per_capita ($)': 'mean',
        ' gdp_for_year ($) ': 'mean'}).reset_index()
totalValues[' gdp_for_year ($) '] = totalValues[' gdp_for_year ($) '].astype(int)
totalValues.head()

Unnamed: 0,country,year,suicides_no,population,gdp_per_capita ($),gdp_for_year ($)
0,Albania,1987,73,2709600,796.0,2156624900
1,Albania,1988,63,2764300,769.0,2126000000
2,Albania,1989,68,2803100,833.0,2335124988
3,Albania,1992,47,2822500,251.0,709452584
4,Albania,1993,73,2807300,437.0,1228071038


In [226]:
# Merging dataframes and creating response variable
dfPro = totalValues.merge(modalValues, on=['country', 'year'], how='left').\
    sort_values(['country', 'year'])
dfPro['suicideRate'] = dfPro['suicides_no']*100000/dfPro['population']
dfPro.head()

Unnamed: 0,country,year,suicides_no,population,gdp_per_capita ($),gdp_for_year ($),age,sex,generation,suicideRate
0,Albania,1987,73,2709600,796.0,2156624900,5-14 years,male,Generation X,2.694125
1,Albania,1988,63,2764300,769.0,2126000000,5-14 years,male,Generation X,2.279058
2,Albania,1989,68,2803100,833.0,2335124988,5-14 years,male,Generation X,2.425886
3,Albania,1992,47,2822500,251.0,709452584,5-14 years,male,Millenials,1.66519
4,Albania,1993,73,2807300,437.0,1228071038,5-14 years,male,Millenials,2.600363


In [227]:
# Selecting countries with information between 2009-2013
selCountries = dfPro[(dfPro['year'] >= 2009) & (dfPro['year'] <= 2013)].\
    groupby('country')['year'].nunique().reset_index().query('year==5')
selCountries.shape

(75, 2)

In [228]:
# Filtering dataframe
dfPro = dfPro[(dfPro['year'] >= 2009) & (dfPro['year'] <= 2013) & (dfPro['country'].isin(selCountries['country']))]
dfPro.shape

(375, 10)

In [229]:
# Number of unique countries
totalCountries = len(dfPro['country'].unique())
totalCountries

75

In [230]:
# Encoding categorical variables
mapping = [{'col': 'age','mapping':{'5-14 years': 0,
                                    '15-24 years': 1,
                                    '35-54 years': 2,
                                    '55-74 years': 3}},
           {'col': 'generation', 'mapping':{'Millenials': 0,
                                            'Generation Z': 1,
                                            'Generation X': 2,
                                            'Boomers': 3,
                                            'Silent': 4}},
          {'col': 'sex', 'mapping': {'male': 0,
                                     'female': 1}}]

encoder = ce.OrdinalEncoder(cols = 'age',
                           return_df=True,
                           mapping=mapping)

encoder.fit(dfPro)

In [231]:
# Train & test division by country preparation
countryID = pd.DataFrame({'country': dfPro['country'].drop_duplicates(),
                          'id': np.arange(0, totalCountries)})
dfPro = dfPro.merge(countryID, on=['country'], how='left')

In [232]:
# Train and test size
trainSize = int(totalCountries*0.7)
testSize = totalCountries - trainSize
print(trainSize, testSize)

52 23


In [233]:
# Creating random samples
dataSample = np.arange(0, totalCountries)
trainSample = np.random.choice(totalCountries, size=trainSize, replace=False)
testSample = [i for i in dataSample if i not in trainSample]

In [234]:
# Dividing
TrainDF = dfPro[dfPro['id'].isin(list(trainSample))].drop(columns='id')
TestDF = dfPro[dfPro['id'].isin(list(testSample))].drop(columns='id')

In [235]:
# Encoding train ant test
TrainDF = encoder.transform(TrainDF)
TestDF = encoder.transform(TestDF)
TrainDF.head(10)

Unnamed: 0,country,year,suicides_no,population,gdp_per_capita ($),gdp_for_year ($),age,sex,generation,suicideRate
0,Argentina,2009,2884,37158001,8961.0,332976484578,2,1,3,7.761451
1,Argentina,2010,2943,37578454,11273.0,423627422092,2,1,2,7.831615
2,Argentina,2011,2912,38015739,13946.0,530163281575,2,1,2,7.659985
3,Argentina,2012,3248,38441778,14203.0,545982375701,2,1,2,8.449141
4,Argentina,2013,2987,38859125,14206.0,552025140252,2,1,2,7.68674
5,Armenia,2009,53,2689695,3215.0,8647936748,2,1,3,1.970484
6,Armenia,2010,73,2676225,3460.0,9260284938,2,1,2,2.727723
7,Armenia,2011,67,2670366,3798.0,10142111334,2,1,2,2.509019
8,Armenia,2012,79,2814300,3773.0,10619320049,2,1,2,2.807092
9,Armenia,2013,67,2810664,3957.0,11121465767,2,1,2,2.383778


Given that suicide rate is the variable to predict and it is a linear relationship between suicides_no and population, they will not be considered, only their lagged values.

In [236]:
# Rolling window function
def create_dataset(data, columns):
    auxDf1 = data.reset_index().copy()
    auxDf2 = auxDf1.copy()
    auxDf2['index'] = auxDf2['index'] + 1
    rezDF = pd.merge(auxDf1.drop(columns=columns),
                     auxDf2[columns + ['index', 'country']], on=['index', 'country'],
                    how='inner')
    return rezDF

In [237]:
# Rolling window
TrainDF = create_dataset(TrainDF, ['population', 'suicides_no'])
TestDF = create_dataset(TestDF, ['population', 'suicides_no'])

In [238]:
# renaming columns & Dividing x & Y matrix
yTrain = TrainDF['suicideRate']
yTest = TestDF['suicideRate']
TrainDF = TrainDF.rename(columns={'population': 'population-1', 'suicides_no': 'suicides_no-1'}).\
               drop(columns=['suicideRate', 'index']).sort_values(['year', 'country']).reset_index(drop=True)
TestDF = TestDF.rename(columns={'population': 'population-1', 'suicides_no': 'suicides_no-1'}).\
              drop(columns=['suicideRate', 'index']).sort_values(['year', 'country']).reset_index(drop=True)
TrainDF.head()

Unnamed: 0,country,year,gdp_per_capita ($),gdp_for_year ($),age,sex,generation,population-1,suicides_no-1
0,Argentina,2010,11273.0,423627422092,2,1,2,37158001,2884
1,Armenia,2010,3460.0,9260284938,2,1,2,2689695,53
2,Austria,2010,49181.0,391892746545,2,0,2,7946894,1278
3,Bahamas,2010,30239.0,10095760000,2,1,2,327813,6
4,Bahrain,2010,22572.0,25713271277,2,0,2,1092922,36


Longitudinal data is a stack of multiple cross sectional datasets, so makes sense we need to standardize the variables for each year individually.

In [239]:
class LongitudinalScaler:
    def __init__(self, train, test):
        self.TrainData = train.copy()
        self.TestData = test.copy()
        self.scalerDict = {}
        self.ScaledTrain = pd.DataFrame()
        self.ScaledTest = pd.DataFrame()
    
    def createScaler(self, years, colDrop):
        for year in years:
            aux = self.TrainData.copy()
            aux = aux[aux['year'] == year].drop(columns=colDrop)
            colNames = aux.columns
            scalerName = 'scaler' + str(year)
            scaler = StandardScaler()
            aux = scaler.fit_transform(aux)
            aux = pd.DataFrame(aux, columns=colNames)
            self.scalerDict[scalerName] = scaler
            self.ScaledTrain = pd.concat([self.ScaledTrain, aux], axis=0)

In [240]:
# Scaling
LSC = LongitudinalScaler(train=XTrain, test=XTest)
LSC.createScaler(years=[2010, 2011, 2012, 2013], colDrop=['country', 'year'])

In [241]:

TrainScaled = LSC.ScaledTrain.copy()
TrainScaled[['country', 'year']] = TrainDF[['country', 'year']]

In [242]:
TrainScaled.head()

Unnamed: 0,gdp_per_capita ($),gdp_for_year ($),age,sex,generation,population-1,suicides_no-1,country,year
0,-0.61696,-0.154253,0.279553,0.96225,0.271563,0.189008,-0.049902,Argentina,2010
1,1.266754,0.185159,0.279553,0.96225,0.271563,-0.138284,-0.123564,Armenia,2010
2,0.202193,-0.349022,0.279553,0.96225,0.271563,-0.535898,-0.437471,Austria,2010
3,-0.368139,-0.351676,0.279553,0.96225,0.271563,-0.537219,-0.437606,Bahamas,2010
4,0.941443,-0.126031,0.279553,-1.03923,0.271563,-0.342114,-0.167196,Bahrain,2010
