In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# ignore warnings
# NOTE(review): this filter silences every warning for the rest of the session,
# so warnings raised by the cells below (e.g. deprecations or solver
# convergence messages) will not be shown.
import warnings
warnings.filterwarnings('ignore')

# Load the dataset from a CSV file in the working directory (presumably the
# already-cleaned data, judging by the file name) and preview its first rows.
df = pd.read_csv('datosLimpios.csv')
df.head()

Unnamed: 0,year,suicides_no,population,suicides/100k pop,gdp_for_year,gdp_per_capita,age_15-24,age_25-34,age_35-54,age_5-14,...,country_Thailand,country_Trinidad and Tobago,country_Turkey,country_Turkmenistan,country_Ukraine,country_United Arab Emirates,country_United Kingdom,country_United States,country_Uruguay,country_Uzbekistan
0,1987,21,312900,6.71,2156625000.0,796.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1987,16,308000,5.19,2156625000.0,796.0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,1987,14,289700,4.83,2156625000.0,796.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1987,1,21800,4.59,2156625000.0,796.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1987,9,274300,3.28,2156625000.0,796.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


## Probabilidad de Suicidio

Se usará suicides/100k pop como variable dependiente (objetivo) y todas las demás columnas del dataset como variables independientes (predictoras). Se usará un modelo de regresión logística para predecir la probabilidad de suicidio en un país.

In [25]:
# Target: the suicide rate column. Features: every remaining column of df.
# NOTE(review): X still contains suicides_no and population, from which the
# rate can presumably be derived -- confirm this is not target leakage.
y = df.loc[:, "suicides/100k pop"]
X = df.drop(columns=["suicides/100k pop"])

In [26]:
# Discretise the suicide rate into 20 equal-width classes so that it can be
# used as a classification target.
# 21 evenly spaced edges over [0, 224.97] -> 20 bins.
# NOTE(review): 224.97 is presumably the maximum rate in the dataset -- confirm;
# any rate above this edge would fall outside the bins and become NaN.
bins = np.linspace(0, 224.97, 21)

# One ordered label per bin: "5%", "10%", ..., "100%". A label denotes the
# bin's upper edge as a percentage of 224.97; it is not a probability.
labels = [f"{pct}%" for pct in range(5, 101, 5)]

# Assign the binned Series to y directly. The previous version stored the
# result inside the old y under a new label and read it back, which depended
# on Series enlargement semantics and was not idempotent when the cell was
# re-run. include_lowest=True keeps rates equal to 0 in the first bin.
y = pd.cut(df["suicides/100k pop"], bins, labels=labels, include_lowest=True)
y.head()

0    5%
1    5%
2    5%
3    5%
4    5%
Name: suicides/100k pop, dtype: category
Categories (20, object): ['5%' < '10%' < '15%' < '20%' ... '85%' < '90%' < '95%' < '100%']

In [27]:
# Split into train and test sets: 30% of the rows are held out for testing and
# the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Show both feature partitions, one after the other.
print(X_train, X_test, sep="\n")

       year  suicides_no  population  gdp_for_year  gdp_per_capita  age_15-24  \
8280   2008            5      711888  1.798689e+10          3271.0      False   
3801   2015          529     1567729  4.550396e+11         42830.0      False   
5614   2014          536     2408469  2.605841e+11         15883.0      False   
10408  1988            1       11900  2.363575e+08          2707.0       True   
19371  1986           36      298812  3.874972e+10          4145.0      False   
...     ...          ...         ...           ...             ...        ...   
21575  2010            0       13885  1.381968e+09          8557.0      False   
5390   1995           32     1219884  7.344706e+10          5695.0       True   
860    2007          155     4601874  2.875305e+11          7918.0      False   
15795  2010            0       30688  2.588176e+09          7806.0      False   
23654  1995          273     4106700  6.129397e+11         16444.0      False   

       age_25-34  age_35-54

In [28]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to both splits, so the test rows do
# not influence the fitted statistics.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Show the scaled arrays, train first and then test.
print(X_train, X_test, sep="\n")

[[ 0.7923205  -0.25736299 -0.29012401 ... -0.11541597 -0.11334248
  -0.09658843]
 [ 1.61786429  0.30929564 -0.07192141 ... -0.11541597 -0.11334248
  -0.09658843]
 [ 1.49992946  0.3168655   0.14243109 ... -0.11541597 -0.11334248
  -0.09658843]
 ...
 [ 0.67438567 -0.09515155  0.70165492 ... -0.11541597 -0.11334248
  -0.09658843]
 [ 1.02819015 -0.26277004 -0.46380068 ... -0.11541597 -0.11334248
  -0.09658843]
 [-0.74083226  0.03245478  0.57540687 ... -0.11541597 -0.11334248
  -0.09658843]]
[[ 1.61786429 -0.26168863 -0.21979108 ... -0.11541597 -0.11334248
  -0.09658843]
 [-0.0332233  -0.22924634 -0.41483442 ... -0.11541597 -0.11334248
  -0.09658843]
 [ 1.38199464 -0.24222325 -0.280878   ... -0.11541597 -0.11334248
  -0.09658843]
 ...
 [-0.97670192 -0.2584444  -0.11670689 ... -0.11541597 -0.11334248
  -0.09658843]
 [ 1.26405981 -0.14057075  1.96057468 ... -0.11541597 -0.11334248
  -0.09658843]
 [ 1.38199464 -0.1459778  -0.32199797 ... -0.11541597 -0.11334248
  -0.09658843]]


In [29]:
from sklearn.linear_model import LogisticRegression

# Instantiate the model with n_jobs=-1 and fit it on the training split
# (fit returns the estimator itself, so the two steps can be chained).
logisticRegr = LogisticRegression(n_jobs=-1).fit(X_train, y_train)

# Evaluate on the held-out split and show the result.
score = logisticRegr.score(X_test, y_test)
print(score)

0.8291397076443805


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for LogisticRegression.
#
# A single flat grid would cross every penalty with every solver, but most of
# those pairs are rejected by scikit-learn (e.g. 'l1' with lbfgs, or
# 'elasticnet' with anything other than saga), so those candidates would only
# produce failed fits. The space is therefore given as a list of sub-grids that
# contain valid combinations only.
param_grid = [
    # L2 penalty: supported by every non-liblinear solver in the search.
    {
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'penalty': ['l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    },
    # No penalty: C has no effect here, so it is not varied. The value is None
    # rather than the string 'none', which current scikit-learn releases
    # reject (requires scikit-learn >= 1.2).
    {
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'penalty': [None],
    },
    # liblinear: supports l1 and l2 only.
    # NOTE(review): recent scikit-learn releases deprecate liblinear for
    # multiclass targets -- confirm against the installed version.
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    },
    # L1 penalty with saga.
    {
        'solver': ['saga'],
        'penalty': ['l1'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    },
    # Elastic net is saga-only and needs an explicit l1_ratio to be valid.
    {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.5],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    },
]

# Build the search: 5-fold cross-validation, with the candidate fits (rather
# than the individual estimator) spread across all available cores.
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, n_jobs=-1)

# Fit the search on the training data.
grid_search.fit(X_train, y_train)

# Print the best parameters found.
print(grid_search.best_params_)

# Evaluate the best model (refitted on the whole training split) on the test set.
best_model = grid_search.best_estimator_
score = best_model.score(X_test, y_test)
print(score)