# Assignment 6

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.dpi"] = 72

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# Load the raw suicide-rates table and report how many rows and columns it has.
df = pd.read_csv('suicide-rates.csv')

n_rows, n_cols = df.shape
print(f'Number of datapoints: {n_rows} \nNumber of features: {n_cols}\n')

# Preview the first rows as the cell's rich output.
df.head()

Number of datapoints: 27820 
Number of features: 12



Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


# Data Preprocessing

In [3]:
# Remove the 'HDI for year' column (blank in every previewed row), then show
# the resulting (rows, columns) tuple.
df = df.drop(columns=['HDI for year'])
df.shape

(27820, 11)

### Question 1

In [4]:
# Keep only the target ('suicides/100k pop') and the three categorical
# predictors; everything else is removed in a single call.
# The spaces around ' gdp_for_year ($) ' are part of the column name used here.
unused_columns = [
    'country',
    'country-year',
    ' gdp_for_year ($) ',
    'population',
    'suicides_no',
    'gdp_per_capita ($)',
    'year',
]
df = df.drop(columns=unused_columns)

# Expand each categorical predictor into one indicator column per level.
df = pd.get_dummies(df, columns=['sex', 'age', 'generation'])

df.head()

Unnamed: 0,suicides/100k pop,sex_female,sex_male,age_15-24 years,age_25-34 years,age_35-54 years,age_5-14 years,age_55-74 years,age_75+ years,generation_Boomers,generation_G.I. Generation,generation_Generation X,generation_Generation Z,generation_Millenials,generation_Silent
0,6.71,0,1,1,0,0,0,0,0,0,0,1,0,0,0
1,5.19,0,1,0,0,1,0,0,0,0,0,0,0,0,1
2,4.83,1,0,1,0,0,0,0,0,0,0,1,0,0,0
3,4.59,0,1,0,0,0,0,0,1,0,1,0,0,0,0
4,3.28,0,1,0,1,0,0,0,0,1,0,0,0,0,0


In [5]:
%%time

# Separate the regression target from the one-hot encoded predictors.
target = 'suicides/100k pop'
y = df[target]
X = df.drop(columns=target)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training rows and score the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

Mean Squared Error (MSE): 247.46955019583245
Mean Absolute Error (MAE): 10.028320733734724
R-squared (R2): 0.291849902919579
CPU times: user 39.7 ms, sys: 3.11 ms, total: 42.8 ms
Wall time: 11.4 ms


In [6]:
# Indicator columns of the one-hot encoded table, in the order the model saw them.
one_hot_columns = [
    'sex_female',
    'sex_male',
    'age_15-24 years',
    'age_25-34 years',
    'age_35-54 years',
    'age_5-14 years',
    'age_55-74 years',
    'age_75+ years',
    'generation_Boomers',
    'generation_G.I. Generation',
    'generation_Generation X',
    'generation_Generation Z',
    'generation_Millenials',
    'generation_Silent',
]

# The single profile to score: male, 15-24 years, Generation X.
active_columns = {'sex_male', 'age_15-24 years', 'generation_Generation X'}

# One row: 1 where the indicator applies to the profile, 0 everywhere else.
prediction_df = pd.DataFrame(
    [[int(name in active_columns) for name in one_hot_columns]],
    columns=one_hot_columns,
)

prediction_df.head()

Unnamed: 0,sex_female,sex_male,age_15-24 years,age_25-34 years,age_35-54 years,age_5-14 years,age_55-74 years,age_75+ years,generation_Boomers,generation_G.I. Generation,generation_Generation X,generation_Generation Z,generation_Millenials,generation_Silent
0,0,1,1,0,0,0,0,0,0,0,1,0,0,0


We now need to encode the categorical data, of which we have sex, age, and generation. Sex is nominal, whereas generation and age are ordinal values, so those have to be encoded differently.

In [7]:
# Score the single hand-built row with whatever `model` is currently bound;
# when the cells run top to bottom this is the regression fit on the one-hot columns.
y_pred_q1 = model.predict(prediction_df)
print(f'model prediction: {y_pred_q1}')

model prediction: [16.72265625]


In [8]:
# Print the fitted coefficient array of the currently bound `model`.
print(model.coef_)

[-1.10438101e+13 -1.10438101e+13 -6.69736127e+11 -6.69736127e+11
 -6.69736127e+11 -6.69736127e+11 -6.69736127e+11 -6.69736127e+11
 -6.28554032e+10 -6.28554032e+10 -6.28554032e+10 -6.28554032e+10
 -6.28554032e+10 -6.28554032e+10]


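There are 14 coefficients -- one for each feature I used to fit the model (because they were one-hot encoded). Note that their magnitudes (roughly 1e10 to 1e13) and the fact that they are identical within each group of dummies are a symptom of perfect multicollinearity: the indicator columns of each variable always sum to 1, so together with the intercept the individual coefficients are not uniquely determined and should not be interpreted on their own. Dropping one level per variable (e.g. `pd.get_dummies(..., drop_first=True)`) would give interpretable coefficients.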

### Question 2

In [9]:
# Start again from the raw file so this encoding does not depend on the
# one-hot encoded table built earlier.
dropped_columns = [
    'country',
    'country-year',
    ' gdp_for_year ($) ',
    'population',
    'suicides_no',
    'HDI for year',
    'gdp_per_capita ($)',
    'year',
]
df2 = pd.read_csv('suicide-rates.csv').drop(columns=dropped_columns)

# Category orderings: the position in each list becomes the numeric code (0.0, 1.0, ...).
age_order = ['5-14 years', '15-24 years', '25-34 years', '35-54 years', '55-74 years', '75+ years']
generation_order = ['G.I. Generation', 'Silent', 'Boomers', 'Generation X', 'Millenials', 'Generation Z']

# Replace each ordinal column with its numeric codes, one encoder per column.
for column, ordering in (('age', age_order), ('generation', generation_order)):
    df2[column] = OrdinalEncoder(categories=[ordering]).fit_transform(df2[[column]])

# Since this is not ordinal, we need to one-hot encode
df2 = pd.get_dummies(df2, columns=['sex'])

df2.head()

Unnamed: 0,age,suicides/100k pop,generation,sex_female,sex_male
0,1.0,6.71,3.0,0,1
1,3.0,5.19,1.0,0,1
2,1.0,4.83,3.0,1,0
3,5.0,4.59,0.0,0,1
4,2.0,3.28,2.0,0,1


In [10]:
%%time

# Separate the regression target from the ordinal / one-hot predictors in df2.
target = 'suicides/100k pop'
y = df2[target]
X = df2.drop(columns=target)

# Same 80/20 split settings and seed as the one-hot experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Rebinds `model`: from here on it refers to the regression fit on df2's features.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

Mean Squared Error (MSE): 250.64226684249996
Mean Absolute Error (MAE): 10.180977152228612
R-squared (R2): 0.2827709693717211
CPU times: user 49.1 ms, sys: 23.5 ms, total: 72.6 ms
Wall time: 8.8 ms


In [11]:
# Single profile in the ordinal encoding: age code 1.0 ('15-24 years'),
# generation code 3.0 ('Generation X'), male.
profile = {
    'age': 1.0,
    'generation': 3.0,
    'sex_female': 0,
    'sex_male': 1,
}
prediction_df2 = pd.DataFrame([profile])

prediction_df2.head()

Unnamed: 0,age,generation,sex_female,sex_male
0,1.0,3.0,0,1


In [12]:
# Score the ordinal-encoded profile with the currently bound `model`;
# when the cells run top to bottom this is the regression fit on df2's columns.
y_pred_q2 = model.predict(prediction_df2)
print(f'model prediction: {y_pred_q2}')

model prediction: [14.41992188]


In [13]:
# Print the fitted coefficient array of the currently bound `model`.
print(model.coef_)

[ 3.75274647e+00 -4.02616116e-01  1.56169525e+13  1.56169525e+13]


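There are 4 coefficients -- one for each feature I used to fit the model. The `age` and `generation` coefficients are on a sensible scale, but the two `sex_*` coefficients are huge and identical (about 1.56e13): `sex_female + sex_male` is always 1, which is perfectly collinear with the intercept, so those two values are not uniquely determined and only their difference is meaningful.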

### Question 3

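Yes, there was a small change in the model performance. On the held-out test set the one-hot encoded model did slightly better than the numerically encoded one (MSE 247.47 vs. 250.64, MAE 10.03 vs. 10.18, R² 0.292 vs. 0.283). The predictions for the same input also differed: the one-hot encoded model predicted a 16.72265625 suicide rate while the numerically encoded one predicted a 14.41992188 suicide rate. The run times were not meaningfully different: the numerically encoded model had a slightly lower wall time (8.8 ms vs. 11.4 ms) but a higher total CPU time (72.6 ms vs. 42.8 ms), and differences of a few milliseconds on a single run are within noise.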

### Question 4

In [14]:
# Single profile in the ordinal encoding: age code 2.0 ('25-34 years'), male.
# NOTE(review): generation 6.0 is one step beyond the largest code the encoder
# produces (5.0, 'Generation Z'); presumably a deliberate extrapolation to a
# generation absent from the data -- confirm against the question text.
profile = {
    'age': 2.0,
    'generation': 6.0,
    'sex_female': 0,
    'sex_male': 1,
}
prediction_df3 = pd.DataFrame([profile])

prediction_df3.head()

Unnamed: 0,age,generation,sex_female,sex_male
0,2.0,6.0,0,1


In [15]:
# Score the Question 4 profile with the currently bound `model`;
# when the cells run top to bottom this is still the regression fit on df2's columns.
y_pred_q4 = model.predict(prediction_df3)
print(f'model prediction: {y_pred_q4}')

model prediction: [16.96484375]


### Question 5

One advantage of regression over classification with nominal features is its ability to handle a wider range of independent variables, such as continuous, discrete, and ordinal variables. This allows you to include variables like age, income, temperature, time, or any other measurable quantity. Classification, on the other hand, typically deals only with categorical variables. While there are ways to handle nominal features, like one-hot encoding for example, analysis is generally limited to the presence or absence of a particular category, rather than considering the precise numeric relationship or magnitude of the feature.

### Question 6

For starters, regression will predict values outside the range [0, 1]. But, going further than that, since regression is able to predict discrete variables, numerical encoding will allow you to preserve the inherent order and magnitude of your feature variables, which will allow it to more aptly capture the relationships and patterns in your data.

### Question 7

I would say definitely the regression model in this case. The reason being that in the previous assignment we created a derived column of low/high suicide rate in order to turn this into a binary classification problem. That low/high suicide rate is basically useless, in my opinion, as it does not provide much information in the context of this problem.

If my customer was some kind of organization which was trying to reduce global suicide rates, the most valuable information to them would be a prediction of what the suicide rate is going to be, not whether it was "high" or "low".

Furthermore, let's assume that this organization was implementing various strategies to reduce the suicide rate. If we went with a classification model, the only feedback they would receive as to whether their actions were having the desired effect is when their cumulative actions exceeded the threshold for the low/high suicide rate. This means they could potentially waste a lot of effort doing the wrong things before they got feedback in the form of a positive confirmation.