change na subset (#362)

linogaliana · github-actions[bot] · web-flow · commit 58c712876686 · 2023-06-11T21:32:03.000+02:00
* change na subset

* deprecated

* deprecated

* change method

* Automated changes

* Automated changes

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
diff --git a/content/course/modelisation/3_regression/index.qmd b/content/course/modelisation/3_regression/index.qmd
@@ -223,10 +223,10 @@ un problème de spécification ?
 # 1. Régression linéaire de per_gop sur différentes variables explicatives
 xvars = ['Unemployment_rate_2019', 'Median_Household_Income_2019', 'Percent of adults with less than a high school diploma, 2015-19', "Percent of adults with a bachelor's degree or higher, 2015-19"]
 
-df2 = votes[["per_gop"] + xvars]
+df2 = votes[["per_gop"] + xvars].copy()
 df2['log_income'] = np.log(df2["Median_Household_Income_2019"])
-indices_to_keep = ~df2.isin([np.nan, np.inf, -np.inf]).any(1)
-df2 = df2[indices_to_keep].astype(np.float64)
+df2 = df2.dropna().astype(np.float64)
+
 
 X_train, X_test, y_train, y_test = train_test_split(
     df2.drop(["Median_Household_Income_2019","per_gop"], axis = 1),
@@ -335,10 +335,11 @@ en `log` sinon son échelle risque d'écraser tout effet.
 # 1. Régression linéaire de per_gop sur différentes variables explicatives
 xvars = ['Unemployment_rate_2019', 'Median_Household_Income_2019', 'Percent of adults with less than a high school diploma, 2015-19', "Percent of adults with a bachelor's degree or higher, 2015-19"]
 
-df2 = votes[["per_gop"] + xvars]
+xvars = ['Unemployment_rate_2019', 'Median_Household_Income_2019', 'Percent of adults with less than a high school diploma, 2015-19', "Percent of adults with a bachelor's degree or higher, 2015-19"]
+
+df2 = votes[["per_gop"] + xvars].copy()
 df2['log_income'] = np.log(df2["Median_Household_Income_2019"])
-indices_to_keep = ~df2.isin([np.nan, np.inf, -np.inf]).any(1)
-df2 = df2[indices_to_keep].astype(np.float64)
+df2 = df2.dropna().astype(np.float64)
 
 X = sm.add_constant(df2.drop(["Median_Household_Income_2019","per_gop"], axis = 1))
 results = sm.OLS(df2[['per_gop']], X).fit()
@@ -487,10 +488,10 @@ une mesure de qualité du modèle.
 #1. Modèle logit avec les mêmes variables que précédemment
 xvars = ['Unemployment_rate_2019', 'Median_Household_Income_2019', 'Percent of adults with less than a high school diploma, 2015-19', "Percent of adults with a bachelor's degree or higher, 2015-19"]
 
-df2 = votes[["per_gop"] + xvars]
+df2 = votes[["per_gop"] + xvars].copy()
 df2['log_income'] = np.log(df2["Median_Household_Income_2019"])
-indices_to_keep = ~df2.isin([np.nan, np.inf, -np.inf]).any(1)
-df2 = df2[indices_to_keep].astype(np.float64)
+df2 = df2.dropna().astype(np.float64)
+
 
 df2['y'] = (df2['per_gop']>0.5).astype(int)
 
@@ -511,8 +512,10 @@ print(clf.intercept_, clf.coef_)
 #| include: false
 #| echo: false
 
+from sklearn.metrics import ConfusionMatrixDisplay
+
 # 2. Matrice de confusion
-sklearn.metrics.plot_confusion_matrix(clf, X_test, y_test)
+ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
 
 sc_accuracy = sklearn.metrics.accuracy_score(y_pred, y_test)
 sc_f1 = sklearn.metrics.f1_score(y_pred, y_test)
@@ -564,6 +567,7 @@ En utilisant échantillons d'apprentissage et d'estimation :
 de gagner.
 2. Faire un test de ratio de vraisemblance concernant l'inclusion de la variable de (log)-revenu.
 
+
 ```{python}
 #| include: false
 #| echo: false
@@ -576,8 +580,7 @@ xvars = [
 
 df2 = votes[["per_gop"] + xvars]
 df2['log_income'] = np.log(df2["Median_Household_Income_2019"])
-indices_to_keep = ~df2.isin([np.nan, np.inf, -np.inf]).any(1)
-df2 = df2[indices_to_keep].astype(np.float64)
+df2 = df2.dropna().astype(np.float64)
 
 df2['y'] = (df2['per_gop']>0.5).astype(int)