Un petit coup de neuf sur les consignes et corrections pandas related

linogaliana · linogaliana · commit d56f6e9e4c85 · 2025-11-25T08:30:59.000Z
diff --git a/content/modelisation/01_preprocessing/_exo5.qmd b/content/modelisation/01_preprocessing/_exo5.qmd
@@ -8,10 +8,10 @@
 *Note : Le résultat du label encoding est relativement intuitif, notamment quand on le met en relation avec le vecteur initial.*
 
 3. Regarder la *dummy expansion* de `state_name`
-4. Appliquer un `OrdinalEncoder` à `df[['state_name', 'county_name']]`
+4. Appliquer un `OrdinalEncoder` à `df.loc[:, ['state_name', 'county_name']]`
 *Note : Le résultat du _ordinal encoding_ est cohérent avec celui du label encoding*
 
-5. Appliquer un `OneHotEncoder` à `df[['state_name', 'county_name']]`
+5. Appliquer un `OneHotEncoder` à `df.loc[:, ['state_name', 'county_name']]`
 
 *Note : `scikit` optimise l'objet nécessaire pour stocker le résultat d'un modèle de transformation. Par exemple, le résultat de l'encoding One Hot est un objet très volumineux. Dans ce cas, `scikit` utilise une matrice Sparse.*
 
@@ -28,10 +28,10 @@
 *Note: The label encoding result is relatively intuitive, especially when related to the initial vector.*
 
 3. Observe the dummy expansion of `state_name`
-4. Apply an `OrdinalEncoder` to `df[['state_name', 'county_name']]`
+4. Apply an `OrdinalEncoder` to `df.loc[:, ['state_name', 'county_name']]`
 *Note: The ordinal encoding result is consistent with the label encoding*
 
-5. Apply a `OneHotEncoder` to `df[['state_name', 'county_name']]`
+5. Apply a `OneHotEncoder` to `df.loc[:, ['state_name', 'county_name']]`
 
 *Note: `scikit` optimizes the object required to store the result of a transformation model. For example, the One Hot encoding result is a very large object. In this case, `scikit` uses a Sparse matrix.*
 
diff --git a/content/modelisation/0_preprocessing.qmd b/content/modelisation/0_preprocessing.qmd
@@ -339,7 +339,7 @@ centroids["winner"] =  np.where(centroids['votes_gop'] > centroids['votes_dem'],
 
 centroids['lon'] = centroids['geometry'].x
 centroids['lat'] = centroids['geometry'].y
-centroids = pd.DataFrame(centroids[["county_name",'lon','lat','winner', 'CENSUS_2020_POP',"state_name"]])
+centroids = pd.DataFrame(centroids.loc[:, ["county_name", 'lon', 'lat', 'winner', 'CENSUS_2020_POP', "state_name"]])
 groups = centroids.groupby('winner')
 
 df = centroids.copy()
diff --git a/content/modelisation/3_regression.qmd b/content/modelisation/3_regression.qmd
@@ -271,7 +271,7 @@ df2 = df2.dropna().astype(np.float64)
 
 X_train, X_test, y_train, y_test = train_test_split(
     df2.drop(["Median_Household_Income_2021","per_gop"], axis = 1),
-    100*df2[['per_gop']].values.ravel(), test_size=0.2, random_state=0
+    100*df2['per_gop'], test_size=0.2, random_state=0
 )
 
 ols = LinearRegression().fit(X_train, y_train)
@@ -628,7 +628,7 @@ df2['y'] = (df2['per_gop']>0.5).astype(int)
 
 X_train, X_test, y_train, y_test = train_test_split(
     df2.drop(["Median_Household_Income_2021","y"], axis = 1),
-    1*df2[['y']].values.ravel(), test_size=0.2, random_state=0
+    1*df2['y'], test_size=0.2, random_state=0
 )
 
 clf = LogisticRegression().fit(X_train, y_train)
diff --git a/content/modelisation/4_featureselection.qmd b/content/modelisation/4_featureselection.qmd
@@ -421,7 +421,7 @@ df2.replace([np.inf, -np.inf], np.nan, inplace=True)
 #2. Echantillon d'entraînement et échantillon test
 X_train, X_test, y_train, y_test = train_test_split(
     df2.drop(["per_gop"], axis = 1),
-    100*df2[['per_gop']], test_size=0.2, random_state=0
+    100*df2['per_gop'], test_size=0.2, random_state=0
 )
 ```
 
@@ -688,7 +688,7 @@ from sklearn.preprocessing import StandardScaler
 df2.replace([np.inf, -np.inf], np.nan, inplace=True)
 X_train, X_test, y_train, y_test = train_test_split(
     df2.drop(["per_gop"], axis = 1),
-    100*df2[['per_gop']], test_size=0.2, random_state=0
+    100*df2['per_gop'], test_size=0.2, random_state=0
 )
 
 numerical_features = X_train.select_dtypes(include='number').columns.tolist()
diff --git a/content/modelisation/6_pipeline.qmd b/content/modelisation/6_pipeline.qmd
@@ -520,7 +520,7 @@ mutations2 = mutations2.groupby('dep').sample(frac = 0.1, random_state = 123)
 X_train, X_test, y_train, y_test = train_test_split(
     mutations2.drop("Valeur_fonciere", axis = 1),
     mutations2["Valeur_fonciere"],
-    test_size = 0.2, random_state = 123, stratify=mutations2[['dep']]
+    test_size = 0.2, random_state = 123, stratify=mutations2['dep']
 )
 ```
 
diff --git a/content/modelisation/_import_data_ml.qmd b/content/modelisation/_import_data_ml.qmd
@@ -1,16 +1,10 @@
-Ce chapitre utilise toujours le même jeu de données, présenté dans l'[introduction
-de cette partie](index.qmd) : les données de vote aux élections présidentielles américaines
-croisées à des variables sociodémographiques.
-Le code 
-est disponible [sur Github](https://github.com/linogaliana/python-datascientist/blob/main/content/modelisation/get_data.py).
+L'ensemble de la partie _machine learning_ utilise le même jeu de données, présenté dans l'[introduction de cette partie](index.qmd) : les données de vote aux élections présidentielles américaines croisées à des variables sociodémographiques. Le code est disponible [sur Github](https://github.com/linogaliana/python-datascientist/blob/main/content/modelisation/get_data.py).
 
 
 ```{python}
 #| eval: false
 #| echo: true
-!pip install --upgrade xlrd #colab bug verson xlrd
-!pip install geopandas
-!pip install openpyxl
+!pip install geopandas openpyxl plotnine plotly
 ```
 
 ```{python}

Original file line number	Diff line number	Diff line change
`@@ -271,7 +271,7 @@ df2 = df2.dropna().astype(np.float64)`
`271`	`271`
`272`	`272`	`X_train, X_test, y_train, y_test = train_test_split(`
`273`	`273`	`df2.drop(["Median_Household_Income_2021","per_gop"], axis = 1),`
`274`		`- 100*df2[['per_gop']].values.ravel(), test_size=0.2, random_state=0`
	`274`	`+ 100*df2['per_gop'], test_size=0.2, random_state=0`
`275`	`275`	`)`
`276`	`276`
`277`	`277`	`ols = LinearRegression().fit(X_train, y_train)`
`@@ -628,7 +628,7 @@ df2['y'] = (df2['per_gop']>0.5).astype(int)`
`628`	`628`
`629`	`629`	`X_train, X_test, y_train, y_test = train_test_split(`
`630`	`630`	`df2.drop(["Median_Household_Income_2021","y"], axis = 1),`
`631`		`- 1*df2[['y']].values.ravel(), test_size=0.2, random_state=0`
	`631`	`+ 1*df2['y'], test_size=0.2, random_state=0`
`632`	`632`	`)`
`633`	`633`
`634`	`634`	`clf = LogisticRegression().fit(X_train, y_train)`
Original file line number	Diff line number	Diff line change
`@@ -520,7 +520,7 @@ mutations2 = mutations2.groupby('dep').sample(frac = 0.1, random_state = 123)`
`520`	`520`	`X_train, X_test, y_train, y_test = train_test_split(`
`521`	`521`	`mutations2.drop("Valeur_fonciere", axis = 1),`
`522`	`522`	`mutations2["Valeur_fonciere"],`
`523`		`- test_size = 0.2, random_state = 123, stratify=mutations2[['dep']]`
	`523`	`+ test_size = 0.2, random_state = 123, stratify=mutations2['dep']`
`524`	`524`	`)`
`525`	`525`	```
`526`	`526`