# Proyecto

## Librerías
Descripción general de las librerías y para qué se usan en el proyecto.

In [181]:
# importing libraries
import pandas as pd #excel sofisticado
import numpy as np #matlab
import matplotlib.pyplot as plt #plots
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

## Base de datos
Descripción general de la base de datos.

In [182]:
# Load the survey dataset from the CSV file at DATA_PATH
DATA_PATH = "https://raw.githubusercontent.com/jduchit/MusicAndMentalHealth/main/Dataset/mxmh_survey_results.csv"
df = pd.read_csv(DATA_PATH)
# Show the first 5 rows
df.head()

Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,...,Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Permissions
0,8/27/2022 19:29:02,18.0,Spotify,3.0,Yes,Yes,Yes,Latin,Yes,Yes,...,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,,I understand.
1,8/27/2022 19:57:31,63.0,Pandora,1.5,Yes,No,No,Rock,Yes,No,...,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,2.0,1.0,,I understand.
2,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,...,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect,I understand.
3,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,...,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,Improve,I understand.
4,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,...,Very frequently,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve,I understand.


### *Detalles de la base de datos*

### *Limpieza del Dataset*

In [183]:
def get_df_size(df, header='Dataset dimensions'):
    """Print a short size report for a DataFrame.

    Args:
        df: Object exposing a ``shape`` whose first entry is the number of
            rows and whose second entry is the number of columns.
        header: Text printed on the first line of the report.

    Returns:
        None. The report (header, number of attributes, number of entries,
        followed by a blank line) is written to standard output.
    """
    n_attributes = df.shape[1]
    n_entries = df.shape[0]
    # The doubled spaces reproduce the separator print() would insert
    # between the label and the number.
    report = f"{header} \n# Attributes:  {n_attributes} \n# Entries:  {n_entries} \n"
    print(report)

get_df_size(df)

# Remove the columns that are not used in the analysis BEFORE discarding
# incomplete rows, so that a row is not thrown away only because of a missing
# value in a column that is dropped anyway.
unused_cols = ['Timestamp', 'Primary streaming service', 'Permissions']
df_clean = df.drop(columns=unused_cols).dropna()

# Classify the remaining columns as binary, numeric or categorical.
# A column is binary when it holds exactly two distinct values.
n_unique = df_clean.nunique()
binary_feat = n_unique[n_unique == 2].index.tolist()
numeric_feat = [col for col in df_clean.select_dtypes(['float', 'int']).columns if col not in binary_feat]
categorical_feat = [col for col in df_clean.select_dtypes('object').columns if col not in binary_feat]

df_proc = df_clean.copy()
# Encode each binary column as 0/1 and print the resulting codes as a check.
le = LabelEncoder()
for feat in binary_feat:
    df_proc[feat] = le.fit_transform(df_proc[feat])
    print(feat, '\n', np.unique(df_proc[feat].values))

# One-hot encode the categorical columns. The dtype is pinned so the dummies
# are 0/1 integers on every pandas version (pandas 2.0 changed the default
# dummy dtype from uint8 to bool).
df_proc = pd.get_dummies(df_proc, columns=categorical_feat, dtype=np.uint8)

# The cleaning above must leave no missing values behind.
assert not df_proc.isna().any().any(), "df_proc still contains missing values"

get_df_size(df_proc, header='Processed dataset:')
df_proc.head()

Dataset dimensions 
# Attributes:  33 
# Entries:  736 

While working 
 [0 1]
Instrumentalist 
 [0 1]
Composer 
 [0 1]
Exploratory 
 [0 1]
Foreign languages 
 [0 1]
Processed dataset: 
# Attributes:  95 
# Entries:  616 



Unnamed: 0,Age,Hours per day,While working,Instrumentalist,Composer,Exploratory,Foreign languages,BPM,Anxiety,Depression,...,Frequency [Rock]_Rarely,Frequency [Rock]_Sometimes,Frequency [Rock]_Very frequently,Frequency [Video game music]_Never,Frequency [Video game music]_Rarely,Frequency [Video game music]_Sometimes,Frequency [Video game music]_Very frequently,Music effects_Improve,Music effects_No effect,Music effects_Worsen
2,18.0,4.0,0,0,0,0,1,132.0,7.0,7.0,...,1,0,0,0,0,0,1,0,1,0
3,61.0,2.5,1,0,1,1,1,84.0,9.0,7.0,...,0,0,0,1,0,0,0,1,0,0
4,18.0,4.0,1,0,0,1,0,107.0,7.0,2.0,...,0,0,0,0,1,0,0,1,0,0
5,18.0,5.0,1,1,1,1,1,86.0,8.0,8.0,...,0,0,1,1,0,0,0,1,0,0
6,18.0,3.0,1,1,0,1,1,66.0,4.0,8.0,...,0,0,0,0,0,1,0,1,0,0


## *Feature Selection Regresión*

Como algunas features son categóricas y otras numéricas, aplicamos una técnica de feature selection adecuada para esta situación. En este caso se seleccionó mutual information, que nos da un ranking de qué tan informativa es cada variable de entrada con respecto a la salida del modelo.

In [184]:
from sklearn.feature_selection import mutual_info_regression

# Build one (features, target) pair per mental-health score from df_proc and
# hold out 20% of each pair as a test set.

# Every outcome column (the four scores and the three "Music effects" dummies)
# is removed from the feature matrix.
_outcome_cols = ['Anxiety', 'Depression', 'Insomnia', 'OCD',
                 'Music effects_Improve', 'Music effects_No effect', 'Music effects_Worsen']

_reg_splits = {}
for _score in ('Anxiety', 'Depression', 'Insomnia', 'OCD'):
    _features = df_proc.drop(columns=_outcome_cols)
    _target = df_proc[_score]
    # Same test_size and random_state for every score.
    _reg_splits[_score] = (
        _features,
        _target,
        *train_test_split(_features, _target, test_size=0.2, random_state=42),
    )

# Anxiety
(X_anxiety, y_anxiety,
 X_anxiety_train, X_anxiety_test,
 y_anxiety_train, y_anxiety_test) = _reg_splits['Anxiety']

# Depression
(X_depression, y_depression,
 X_depression_train, X_depression_test,
 y_depression_train, y_depression_test) = _reg_splits['Depression']

# Insomnia
(X_insomnia, y_insomnia,
 X_insomnia_train, X_insomnia_test,
 y_insomnia_train, y_insomnia_test) = _reg_splits['Insomnia']

# OCD
(X_ocd, y_ocd,
 X_ocd_train, X_ocd_test,
 y_ocd_train, y_ocd_test) = _reg_splits['OCD']

In [185]:
# Feature selection: rank the features by mutual information (MI) with each
# of the four mental-health scores, using the training rows only.

# Number of best-ranked features to keep.
N_TOP_FEATURES_REG = 50

# mutual_info_regression treats every column of a dense X as continuous unless
# told otherwise, so flag the discrete columns explicitly: every column that is
# not in numeric_feat (i.e. the label-encoded binary columns and the one-hot
# dummies).
discrete_mask = ~X_anxiety_train.columns.isin(numeric_feat)

# A fixed random_state makes the MI estimates, and therefore the ranking,
# repeatable between runs.
# Anxiety
mi_anxiety = mutual_info_regression(X_anxiety_train, y_anxiety_train,
                                    discrete_features=discrete_mask, random_state=42)

# Depression
mi_depression = mutual_info_regression(X_depression_train, y_depression_train,
                                       discrete_features=discrete_mask, random_state=42)

# Insomnia
mi_insomnia = mutual_info_regression(X_insomnia_train, y_insomnia_train,
                                     discrete_features=discrete_mask, random_state=42)

# OCD
mi_ocd = mutual_info_regression(X_ocd_train, y_ocd_train,
                                discrete_features=discrete_mask, random_state=42)

# Collect the per-target MI scores in one DataFrame indexed by feature name.
mi_scores = pd.DataFrame({'Anxiety': mi_anxiety, 'Depression': mi_depression, 'Insomnia': mi_insomnia, 'OCD': mi_ocd}, index=X_anxiety_train.columns)

# Sum the MI scores of each feature across all target variables.
mi_scores['Overall'] = mi_scores.sum(axis=1)

# Sort the features from most to least informative overall.
mi_scores = mi_scores.sort_values(by='Overall', ascending=False)

# Copy of df_proc without the features ranked below the top N; the outcome
# columns are not part of the ranking, so they are kept.
df_proc_reg = df_proc.drop(columns=mi_scores.index[N_TOP_FEATURES_REG:])
df_proc_reg.head()



Unnamed: 0,Age,While working,Composer,Foreign languages,BPM,Anxiety,Depression,Insomnia,OCD,Fav genre_Classical,...,Frequency [Rap]_Never,Frequency [Rap]_Rarely,Frequency [Rap]_Sometimes,Frequency [Rock]_Never,Frequency [Video game music]_Never,Frequency [Video game music]_Sometimes,Frequency [Video game music]_Very frequently,Music effects_Improve,Music effects_No effect,Music effects_Worsen
2,18.0,0,0,1,132.0,7.0,7.0,10.0,2.0,0,...,0,1,0,0,0,0,1,0,1,0
3,61.0,1,1,1,84.0,9.0,7.0,3.0,3.0,0,...,1,0,0,1,1,0,0,1,0,0
4,18.0,1,0,0,107.0,7.0,2.0,5.0,9.0,0,...,0,0,0,1,0,0,0,1,0,0
5,18.0,1,1,1,86.0,8.0,8.0,7.0,7.0,0,...,0,0,0,0,1,0,0,1,0,0
6,18.0,1,0,1,66.0,4.0,8.0,6.0,0.0,0,...,1,0,0,1,0,1,0,1,0,0


### *División en conjunto de entrenamiento y test para regresión*

In [187]:
# Separate the feature-selected regression frame into predictors and targets.
_reg_targets = ['Anxiety', 'Depression', 'Insomnia', 'OCD']
_effect_dummies = ['Music effects_Improve', 'Music effects_No effect', 'Music effects_Worsen']

# Predictors: everything except the four scores and the "Music effects" dummies.
X_reg = df_proc_reg.drop(columns=_reg_targets + _effect_dummies)

# Targets: the four mental-health scores, predicted jointly.
y_reg = df_proc_reg[_reg_targets]

# Hold out 20% of the rows for testing.
(X_reg_train, X_reg_test,
 y_reg_train, y_reg_test) = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

## *Feature Selection Clasificación*

In [189]:
from sklearn.feature_selection import mutual_info_classif

# Build one (features, target) pair per "Music effects" dummy from df_proc and
# hold out 20% of each pair as a test set.

# Every outcome column (the four scores and the three "Music effects" dummies)
# is removed from the feature matrix.
_outcome_cols = ['Anxiety', 'Depression', 'Insomnia', 'OCD',
                 'Music effects_Improve', 'Music effects_No effect', 'Music effects_Worsen']

_effect_splits = {}
for _effect in ('Music effects_Improve', 'Music effects_No effect', 'Music effects_Worsen'):
    _features = df_proc.drop(columns=_outcome_cols)
    _target = df_proc[_effect]
    # Same test_size and random_state for every target.
    _effect_splits[_effect] = (
        _features,
        _target,
        *train_test_split(_features, _target, test_size=0.2, random_state=42),
    )

# Music effects_Improve
(X_improve, y_improve,
 X_improve_train, X_improve_test,
 y_improve_train, y_improve_test) = _effect_splits['Music effects_Improve']

# Music effects_No effect
(X_no_effect, y_no_effect,
 X_no_effect_train, X_no_effect_test,
 y_no_effect_train, y_no_effect_test) = _effect_splits['Music effects_No effect']

# Music effects_Worsen
(X_worsen, y_worsen,
 X_worsen_train, X_worsen_test,
 y_worsen_train, y_worsen_test) = _effect_splits['Music effects_Worsen']

In [190]:
# Feature selection: rank the features by mutual information (MI) with each
# of the three "Music effects" dummies, using the training rows only.

# Number of best-ranked features to keep.
N_TOP_FEATURES_CLASSIF = 50

# mutual_info_classif treats every column of a dense X as continuous unless
# told otherwise, so flag the discrete columns explicitly: every column that is
# not in numeric_feat (i.e. the label-encoded binary columns and the one-hot
# dummies).
discrete_mask = ~X_improve_train.columns.isin(numeric_feat)

# A fixed random_state makes the MI estimates, and therefore the ranking,
# repeatable between runs.
# Music effects_Improve
mi_improve = mutual_info_classif(X_improve_train, y_improve_train,
                                 discrete_features=discrete_mask, random_state=42)

# Music effects_No effect
mi_no_effect = mutual_info_classif(X_no_effect_train, y_no_effect_train,
                                   discrete_features=discrete_mask, random_state=42)

# Music effects_Worsen
mi_worsen = mutual_info_classif(X_worsen_train, y_worsen_train,
                                discrete_features=discrete_mask, random_state=42)

# Collect the per-target MI scores in one DataFrame indexed by feature name.
mi_scores = pd.DataFrame({'Music effects_Improve': mi_improve, 'Music effects_No effect': mi_no_effect, 'Music effects_Worsen': mi_worsen}, index=X_improve_train.columns)

# Sum the MI scores of each feature across all target variables.
mi_scores['Overall'] = mi_scores.sum(axis=1)

# Sort the features from most to least informative overall.
mi_scores = mi_scores.sort_values(by='Overall', ascending=False)

# Copy of df_proc without the features ranked below the top N; the outcome
# columns are not part of the ranking, so they are kept.
df_proc_classif = df_proc.drop(columns=mi_scores.index[N_TOP_FEATURES_CLASSIF:])
df_proc_classif.head()

Unnamed: 0,Age,While working,Composer,Foreign languages,BPM,Anxiety,Depression,Insomnia,OCD,Fav genre_Classical,...,Frequency [Rap]_Never,Frequency [Rap]_Rarely,Frequency [Rap]_Sometimes,Frequency [Rock]_Never,Frequency [Video game music]_Never,Frequency [Video game music]_Sometimes,Frequency [Video game music]_Very frequently,Music effects_Improve,Music effects_No effect,Music effects_Worsen
2,18.0,0,0,1,132.0,7.0,7.0,10.0,2.0,0,...,0,1,0,0,0,0,1,0,1,0
3,61.0,1,1,1,84.0,9.0,7.0,3.0,3.0,0,...,1,0,0,1,1,0,0,1,0,0
4,18.0,1,0,0,107.0,7.0,2.0,5.0,9.0,0,...,0,0,0,1,0,0,0,1,0,0
5,18.0,1,1,1,86.0,8.0,8.0,7.0,7.0,0,...,0,0,0,0,1,0,0,1,0,0
6,18.0,1,0,1,66.0,4.0,8.0,6.0,0.0,0,...,1,0,0,1,0,1,0,1,0,0


### *División en conjunto de entrenamiento y test para clasificación*

In [None]:
# Separate the feature-selected classification frame into predictors and targets.
_score_cols = ['Anxiety', 'Depression', 'Insomnia', 'OCD']
_effect_targets = ['Music effects_Improve', 'Music effects_No effect', 'Music effects_Worsen']

# Predictors: everything except the four scores and the "Music effects" dummies.
X_class = df_proc_classif.drop(columns=_score_cols + _effect_targets)

# Targets: the three one-hot "Music effects" columns.
y_class = df_proc_classif[_effect_targets]

# Hold out 20% of the rows for testing.
(X_class_train, X_class_test,
 y_class_train, y_class_test) = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

### *Balanceo de datos*

## Modelo de Machine Learning
