In [98]:
# Importing libraries
import pandas as pd
import re
import category_encoders as ce
import numpy as np
from sklearn.preprocessing import StandardScaler

In [99]:
# Importing as df
df = pd.read_csv('master.csv')
df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [100]:
# Droping columns with high numbers of missing values
df.drop(columns=['HDI for year'], inplace=True)
df[' gdp_for_year ($) '] = df[' gdp_for_year ($) '].apply(lambda x: int(re.sub(r'[, ]', '', x)))
df.dtypes

country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
 gdp_for_year ($)       int64
gdp_per_capita ($)      int64
generation             object
dtype: object

In [101]:
# Calculating modal values for categorical variables
modalValues = df.groupby(['country', 'year', 'age', 'sex', 'generation'])['population'].sum().reset_index()
modalValues = modalValues.groupby(['country', 'year']).\
    apply(lambda x: x.loc[x['population'].idxmax()]).reset_index(drop=True).\
    drop(columns=['population'])
modalValues.head()

Unnamed: 0,country,year,age,sex,generation
0,Albania,1987,5-14 years,male,Generation X
1,Albania,1988,5-14 years,male,Generation X
2,Albania,1989,5-14 years,male,Generation X
3,Albania,1992,5-14 years,male,Millenials
4,Albania,1993,5-14 years,male,Millenials


In [102]:
# Totalizing values
totalValues = df.groupby(['country', 'year']).\
    agg({'suicides_no': 'sum',
         'population': 'sum',
         'gdp_per_capita ($)': 'mean',
        ' gdp_for_year ($) ': 'mean'}).reset_index()
totalValues[' gdp_for_year ($) '] = totalValues[' gdp_for_year ($) '].astype(int)
totalValues.head()

Unnamed: 0,country,year,suicides_no,population,gdp_per_capita ($),gdp_for_year ($)
0,Albania,1987,73,2709600,796.0,2156624900
1,Albania,1988,63,2764300,769.0,2126000000
2,Albania,1989,68,2803100,833.0,2335124988
3,Albania,1992,47,2822500,251.0,709452584
4,Albania,1993,73,2807300,437.0,1228071038


In [103]:
# Merging dataframes and creating response variable
dfPro = totalValues.merge(modalValues, on=['country', 'year'], how='left').\
    sort_values(['country', 'year'])
dfPro['suicideRate'] = dfPro['suicides_no']*100000/dfPro['population']
dfPro.head()

Unnamed: 0,country,year,suicides_no,population,gdp_per_capita ($),gdp_for_year ($),age,sex,generation,suicideRate
0,Albania,1987,73,2709600,796.0,2156624900,5-14 years,male,Generation X,2.694125
1,Albania,1988,63,2764300,769.0,2126000000,5-14 years,male,Generation X,2.279058
2,Albania,1989,68,2803100,833.0,2335124988,5-14 years,male,Generation X,2.425886
3,Albania,1992,47,2822500,251.0,709452584,5-14 years,male,Millenials,1.66519
4,Albania,1993,73,2807300,437.0,1228071038,5-14 years,male,Millenials,2.600363


In [104]:
# Selecting countries with information between 2009-2013
selCountries = dfPro[(dfPro['year'] >= 2009) & (dfPro['year'] <= 2013)].\
    groupby('country')['year'].nunique().reset_index().query('year==5')
selCountries.shape

(75, 2)

In [105]:
# Filtering dataframe
dfPro = dfPro[(dfPro['year'] >= 2009) & (dfPro['year'] <= 2013) & (dfPro['country'].isin(selCountries['country']))]
dfPro.shape

(375, 10)

In [106]:
# Number of unique countries
totalCountries = len(dfPro['country'].unique())
totalCountries

75

In [107]:
# Encoding categorical variables
mapping = [{'col': 'age','mapping':{'5-14 years': 0,
                                    '15-24 years': 1,
                                    '35-54 years': 2,
                                    '55-74 years': 3}},
           {'col': 'generation', 'mapping':{'Millenials': 0,
                                            'Generation Z': 1,
                                            'Generation X': 2,
                                            'Boomers': 3,
                                            'Silent': 4}},
          {'col': 'sex', 'mapping': {'male': 0,
                                     'female': 1}}]

encoder = ce.OrdinalEncoder(cols = 'age',
                           return_df=True,
                           mapping=mapping)

encoder.fit(dfPro)

In [108]:
# Train & test division by country preparation
countryID = pd.DataFrame({'country': dfPro['country'].drop_duplicates(),
                          'id': np.arange(0, totalCountries)})
dfPro = dfPro.merge(countryID, on=['country'], how='left')

In [109]:
# Train and test size
trainSize = int(totalCountries*0.7)
testSize = totalCountries - trainSize
print(trainSize, testSize)

52 23


In [110]:
# Creating random samples
dataSample = np.arange(0, totalCountries)
trainSample = np.random.choice(totalCountries, size=trainSize, replace=False)
testSample = [i for i in dataSample if i not in trainSample]

In [111]:
# Dividing
TrainDF = dfPro[dfPro['id'].isin(list(trainSample))].drop(columns='id')
TestDF = dfPro[dfPro['id'].isin(list(testSample))].drop(columns='id')

In [112]:
# Encoding train ant test
TrainDF = encoder.transform(TrainDF)
TestDF = encoder.transform(TestDF)
TrainDF.head(10)

Unnamed: 0,country,year,suicides_no,population,gdp_per_capita ($),gdp_for_year ($),age,sex,generation,suicideRate
20,Bahamas,2009,6,327813,30450.0,9981960000,2,1,3,1.830312
21,Bahamas,2010,10,333869,30239.0,10095760000,2,1,2,2.995187
22,Bahamas,2011,4,338383,29761.0,10070450000,2,1,2,1.182092
23,Bahamas,2012,5,343558,31204.0,10720500000,2,1,2,1.455358
24,Bahamas,2013,5,348959,30455.0,10627600000,2,1,2,1.432833
25,Bahrain,2009,36,1092922,20988.0,22938218085,2,0,3,3.293922
26,Bahrain,2010,19,1139160,22572.0,25713271277,2,0,2,1.667896
27,Bahrain,2011,18,1103462,26078.0,28776595745,2,0,2,1.63123
28,Bahrain,2012,19,1201778,25587.0,30749308511,2,0,2,1.580991
29,Bahrain,2013,5,1214492,26793.0,32539547872,2,0,2,0.411695


Given that suicide rate is the variable to predict and it is a linear relationship between suicides_no and population, they will not be considered, only their lagged values.

In [113]:
# Rolling window function
def create_dataset(data, columns):
    auxDf1 = data.reset_index().copy()
    auxDf2 = auxDf1.copy()
    auxDf2['index'] = auxDf2['index'] + 1
    rezDF = pd.merge(auxDf1.drop(columns=columns),
                     auxDf2[columns + ['index', 'country']], on=['index', 'country'],
                    how='inner')
    return rezDF

In [114]:
# Rolling window
TrainDF = create_dataset(TrainDF, ['population', 'suicides_no'])
TestDF = create_dataset(TestDF, ['population', 'suicides_no'])

In [115]:
# renaming columns & Dividing x & Y matrix
yTrain = TrainDF['suicideRate']
yTest = TestDF['suicideRate']
XTrain = TrainDF.rename(columns={'population': 'population-1', 'suicides_no': 'suicides_no-1'}).\
               drop(columns=['suicideRate', 'index']).sort_values(['year', 'country']).reset_index(drop=True)
XTest = TestDF.rename(columns={'population': 'population-1', 'suicides_no': 'suicides_no-1'}).\
              drop(columns=['suicideRate', 'index']).sort_values(['year', 'country']).reset_index(drop=True)
TrainDF.head()

Unnamed: 0,index,country,year,gdp_per_capita ($),gdp_for_year ($),age,sex,generation,suicideRate,population,suicides_no
0,21,Bahamas,2010,30239.0,10095760000,2,1,2,2.995187,327813,6
1,22,Bahamas,2011,29761.0,10070450000,2,1,2,1.182092,333869,10
2,23,Bahamas,2012,31204.0,10720500000,2,1,2,1.455358,338383,4
3,24,Bahamas,2013,30455.0,10627600000,2,1,2,1.432833,343558,5
4,26,Bahrain,2010,22572.0,25713271277,2,0,2,1.667896,1092922,36


Longitudinal data is a stack of multiple cross sectional datasets, so makes sense we need to standardize the variables for each year individually.

In [116]:
class LongitudinalScaler:
    def __init__(self, train, test):
        self.TrainData = train.copy()
        self.TestData = test.copy()
        self.scalerDict = {}
        self.ScaledTrain = pd.DataFrame()
        self.ScaledTest = pd.DataFrame()
    
    def createScaler(self, years, colDrop):
        for year in years:
            aux = self.TrainData.copy()
            aux = aux[aux['year'] == year].drop(columns=colDrop)
            colNames = aux.columns
            scalerName = 'scaler' + str(year)
            scaler = StandardScaler()
            aux = scaler.fit_transform(aux)
            aux = pd.DataFrame(aux, columns=colNames)
            self.scalerDict[scalerName] = scaler
            self.ScaledTrain = pd.concat([self.ScaledTrain, aux], axis=0)
    
    def scalingTest(self, years, colDrop):
        for year in years:
            aux = self.TestData.copy()
            aux = aux[aux['year'] == year].drop(columns=colDrop)
            colNames = aux.columns
            scalerName = 'scaler' + str(year)
            scaler = self.scalerDict[scalerName]
            aux = scaler.transform(aux)
            aux = pd.DataFrame(aux, columns=colNames)
            self.ScaledTest = pd.concat([self.ScaledTest, aux], axis=0)

In [117]:
# Scaling Train
LSC = LongitudinalScaler(train=XTrain, test=XTest)
LSC.createScaler(years=[2010, 2011, 2012, 2013], colDrop=['country', 'year'])
LSC.scalingTest(years=[2010, 2011, 2012, 2013], colDrop=['country', 'year'])

In [118]:
# Saving Train & test scaled
TrainScaled = LSC.ScaledTrain.copy()
TestScaled = LSC.ScaledTest.copy()

In [119]:
# Adding ids
TrainScaled[['country', 'year']] = TrainDF[['country', 'year']]
TestScaled[['country', 'year']] = TestDF[['country', 'year']]
TestScaled.head()

Unnamed: 0,gdp_per_capita ($),gdp_for_year ($),age,sex,generation,population-1,suicides_no-1,country,year
0,-0.594935,-0.178495,0.228086,0.890564,0.126435,0.173386,-0.075818,Argentina,2010
1,-0.936484,-0.362959,0.228086,0.890564,0.126435,-0.484104,-0.422357,Argentina,2011
2,1.311673,0.142311,0.228086,0.890564,0.126435,-0.143809,-0.142776,Argentina,2012
3,1.062232,-0.192622,0.228086,-1.122884,0.126435,-0.383822,-0.272407,Argentina,2013
4,1.096898,0.351187,0.228086,-1.122884,0.126435,0.073801,0.047325,Armenia,2010


Now it is needed to reshape the data into a numpy array with a (Country, periods, variables) structure. This is required for RNN modeling and its going to be done in the next notebook

In [121]:
# Exporting data
TrainScaled.to_csv('TrainScaled.csv')
TestScaled.to_csv('TestScaled.csv')
yTrain.to_csv('yTrain.csv')
yTest.to_csv('yTest.csv')