tl;dr : *Inférence du transfert des voix entre candidats.*

On peut obtenir librement les résultats des [premier](https://www.data.gouv.fr/fr/datasets/election-presidentielle-des-10-et-24-avril-2022-resultats-definitifs-du-1er-tour/) et [second](https://www.data.gouv.fr/fr/datasets/election-presidentielle-des-10-et-24-avril-2022-resultats-definitifs-du-2nd-tour/) tours.

Inspirés par cet [article du Monde](https://www.lemonde.fr/les-decodeurs/article/2022/05/04/election-presidentielle-2022-quels-reports-de-voix-entre-les-deux-tours_6124672_4355770.html)
Le but est d'ici obtenir une estimation 
<!-- TEASER_END -->


## Collecte des données

On va utiliser les données disponibles sur https://www.data.gouv.fr en se concentrant sur les résultats définitifs par bureau de vote.

In [1]:
import numpy as np
import pandas as pd

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html

In [2]:
import os

In [3]:
fname = '/tmp/T1.xlsx'

if not os.path.isfile(fname):
    url = 'https://static.data.gouv.fr/resources/election-presidentielle-des-10-et-24-avril-2022-resultats-definitifs-du-1er-tour/20220414-152612/resultats-par-niveau-burvot-t1-france-entiere.xlsx' # XLSX
    T1 = pd.read_excel(url)
    T1.to_excel(fname)
else:
    T1 = pd.read_excel(fname)#, low_memory=False)

T1.tail()

Unnamed: 0.1,Unnamed: 0,Code du département,Libellé du département,Code de la circonscription,Libellé de la circonscription,Code de la commune,Libellé de la commune,Code du b.vote,Inscrits,Abstentions,...,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104
69677,69677,ZZ,Français établis hors de France,6,6ème circonscription,229,Zurich,1,24868,14101,...,47,0.19,0.44,12,M,DUPONT-AIGNAN,Nicolas,189,0.76,1.77
69678,69678,ZZ,Français établis hors de France,11,11ème circonscription,231,Taipei,1,1709,942,...,10,0.59,1.32,12,M,DUPONT-AIGNAN,Nicolas,10,0.59,1.32
69679,69679,ZZ,Français établis hors de France,11,11ème circonscription,233,Nour-Soultan,1,117,64,...,0,0.0,0.0,12,M,DUPONT-AIGNAN,Nicolas,1,0.85,1.96
69680,69680,ZZ,Français établis hors de France,2,2ème circonscription,234,Monterrey,1,713,553,...,1,0.14,0.63,12,M,DUPONT-AIGNAN,Nicolas,1,0.14,0.63
69681,69681,ZZ,Français établis hors de France,2,2ème circonscription,235,Bahamas (Nassau),1,136,78,...,0,0.0,0.0,12,M,DUPONT-AIGNAN,Nicolas,1,0.74,1.82


In [4]:
T1.to_csv('/tmp/T1.csv')

In [5]:
T1.columns

Index(['Unnamed: 0', 'Code du département', 'Libellé du département',
       'Code de la circonscription', 'Libellé de la circonscription',
       'Code de la commune', 'Libellé de la commune', 'Code du b.vote',
       'Inscrits', 'Abstentions',
       ...
       'Unnamed: 95', 'Unnamed: 96', 'Unnamed: 97', 'Unnamed: 98',
       'Unnamed: 99', 'Unnamed: 100', 'Unnamed: 101', 'Unnamed: 102',
       'Unnamed: 103', 'Unnamed: 104'],
      dtype='object', length=106)

In [6]:
T1.columns[:23]

Index(['Unnamed: 0', 'Code du département', 'Libellé du département',
       'Code de la circonscription', 'Libellé de la circonscription',
       'Code de la commune', 'Libellé de la commune', 'Code du b.vote',
       'Inscrits', 'Abstentions', '% Abs/Ins', 'Votants', '% Vot/Ins',
       'Blancs', '% Blancs/Ins', '% Blancs/Vot', 'Nuls', '% Nuls/Ins',
       '% Nuls/Vot', 'Exprimés', '% Exp/Ins', '% Exp/Vot', 'N°Panneau'],
      dtype='object')

In [7]:
df = T1[['Nuls', 'Blancs', 'Abstentions', 'Code de la commune', 'Code du b.vote']].copy()
df.head()

Unnamed: 0,Nuls,Blancs,Abstentions,Code de la commune,Code du b.vote
0,1,16,108,1,1
1,1,3,38,2,1
2,5,18,266,4,1
3,6,15,265,4,2
4,8,10,246,4,3


In [8]:
df_1 = T1[['Nuls', 'Blancs', 'Abstentions']].copy()
#df_1 = df_1.rename(columns={"Nuls": "1_Nuls", "Abstentions": "1_Abstentions"})
df_1.head()

Unnamed: 0,Nuls,Blancs,Abstentions
0,1,16,108
1,1,3,38
2,5,18,266
3,6,15,265
4,8,10,246


In [9]:
df_1

Unnamed: 0,Nuls,Blancs,Abstentions
0,1,16,108
1,1,3,38
2,5,18,266
3,6,15,265
4,8,10,246
...,...,...,...
69677,31,40,14101
69678,2,8,942
69679,0,2,64
69680,2,0,553


In [10]:
code_commune =  T1['Code de la commune'].values.astype(str).copy()
code_commune

array(['1', '2', '4', ..., '233', '234', '235'], dtype='<U21')

In [11]:
code_bdv =  T1['Code du b.vote'].values.astype(str).copy()
code_bdv

array(['0001', '0001', '0001', ..., '0001', '0001', '0001'], dtype='<U4')

In [12]:
code_bdv.astype(str)

array(['0001', '0001', '0001', ..., '0001', '0001', '0001'], dtype='<U4')

In [13]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69682 entries, 0 to 69681
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Nuls                69682 non-null  int64 
 1   Blancs              69682 non-null  int64 
 2   Abstentions         69682 non-null  int64 
 3   Code de la commune  69682 non-null  int64 
 4   Code du b.vote      69682 non-null  object
dtypes: int64(4), object(1)
memory usage: 2.7+ MB
None


## extraction des résultats

In [14]:
col_start = 23
col_par_cdt = 7
candidats = T1.iloc[0][col_start::col_par_cdt]
candidats

Sexe           F
Unnamed: 29    M
Unnamed: 36    M
Unnamed: 43    M
Unnamed: 50    F
Unnamed: 57    M
Unnamed: 64    M
Unnamed: 71    F
Unnamed: 78    M
Unnamed: 85    F
Unnamed: 92    M
Unnamed: 99    M
Name: 0, dtype: object

In [15]:
résultats = T1.iloc[0][(col_start+2)::col_par_cdt]
résultats

Prénom          Nathalie
Unnamed: 31       Fabien
Unnamed: 38     Emmanuel
Unnamed: 45         Jean
Unnamed: 52       Marine
Unnamed: 59         Éric
Unnamed: 66     Jean-Luc
Unnamed: 73         Anne
Unnamed: 80      Yannick
Unnamed: 87      Valérie
Unnamed: 94     Philippe
Unnamed: 101     Nicolas
Name: 0, dtype: object

In [16]:
for i_candidat, candidat in enumerate(candidats):
    i_col = col_start + i_candidat*col_par_cdt + 2
    print(i_col, T1.iloc[:, i_col].values)
    df_1[candidat] = T1.iloc[:, i_col].values

25 ['Nathalie' 'Nathalie' 'Nathalie' ... 'Nathalie' 'Nathalie' 'Nathalie']
32 ['Fabien' 'Fabien' 'Fabien' ... 'Fabien' 'Fabien' 'Fabien']
39 ['Emmanuel' 'Emmanuel' 'Emmanuel' ... 'Emmanuel' 'Emmanuel' 'Emmanuel']
46 ['Jean' 'Jean' 'Jean' ... 'Jean' 'Jean' 'Jean']
53 ['Marine' 'Marine' 'Marine' ... 'Marine' 'Marine' 'Marine']
60 ['Éric' 'Éric' 'Éric' ... 'Éric' 'Éric' 'Éric']
67 ['Jean-Luc' 'Jean-Luc' 'Jean-Luc' ... 'Jean-Luc' 'Jean-Luc' 'Jean-Luc']
74 ['Anne' 'Anne' 'Anne' ... 'Anne' 'Anne' 'Anne']
81 ['Yannick' 'Yannick' 'Yannick' ... 'Yannick' 'Yannick' 'Yannick']
88 ['Valérie' 'Valérie' 'Valérie' ... 'Valérie' 'Valérie' 'Valérie']
95 ['Philippe' 'Philippe' 'Philippe' ... 'Philippe' 'Philippe' 'Philippe']
102 ['Nicolas' 'Nicolas' 'Nicolas' ... 'Nicolas' 'Nicolas' 'Nicolas']


In [17]:
T1.iloc[:, i_col]

0        Nicolas
1        Nicolas
2        Nicolas
3        Nicolas
4        Nicolas
          ...   
69677    Nicolas
69678    Nicolas
69679    Nicolas
69680    Nicolas
69681    Nicolas
Name: Unnamed: 101, Length: 69682, dtype: object

In [18]:
T1.iloc[:, i_col].values

array(['Nicolas', 'Nicolas', 'Nicolas', ..., 'Nicolas', 'Nicolas',
       'Nicolas'], dtype=object)

In [19]:
print(df_1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69682 entries, 0 to 69681
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Nuls         69682 non-null  int64 
 1   Blancs       69682 non-null  int64 
 2   Abstentions  69682 non-null  int64 
 3   F            69682 non-null  object
 4   M            69682 non-null  object
dtypes: int64(3), object(2)
memory usage: 2.7+ MB
None


In [20]:
df_1['POUTOU']

KeyError: 'POUTOU'

In [None]:
len(df_1), df_1.sum(), df_1.sum().sum()

pandas barplot of sum of columns

In [None]:
import matplotlib.pyplot as plt

In [None]:
fig, ax = plt.subplots(figsize=(13, 5))
k = df_1.sum()/df_1.sum().sum()
ax = k.plot.bar(ax=ax)
ax.set_xlabel('Reviewer score (from 1 to 10)')
#ax.set_xlim(1, 10)
#ax.set_xticks(np.arange(1, 10)+.5)
#ax.set_xticklabels(np.arange(1, 10))
ax.set_ylabel('pourcentage');

## 2ème tour

In [None]:
fname = '/tmp/T2.xlsx'

if not os.path.isfile(fname):
    url = 'https://static.data.gouv.fr/resources/election-presidentielle-des-10-et-24-avril-2022-resultats-definitifs-du-2nd-tour/20220428-142301/resultats-par-niveau-burvot-t2-france-entiere.xlsx' # XLSX
    T2 = pd.read_excel(url)
    T2.to_excel(fname)
else:
    T2 = pd.read_excel(fname)

T2.tail()

In [None]:
T2.columns

In [None]:
T2.columns[:23]

In [None]:
T2.columns[23:]

In [None]:
T2.iloc[0, 23:]

In [None]:
col_start = 23
col_par_cdt = 7
candidats = T2.iloc[0][col_start::col_par_cdt]
candidats

In [None]:
df_2 = T2[['Nuls', 'Blancs', 'Abstentions']].copy()
#df_1 = df_1.rename(columns={"Nuls": "1_Nuls", "Abstentions": "1_Abstentions"})
df_2.head()

In [None]:
len(df_2)

In [None]:
for i_candidat, candidat in enumerate(candidats):
    i_col = col_start + i_candidat*col_par_cdt + 2
    print(i_col, T2.iloc[:, i_col].values)
    df_2[candidat] = T2.iloc[:, i_col].values

In [None]:
df_2.head()

In [None]:
fig, ax = plt.subplots(figsize=(13, 5))
k = df_2.sum()/df_2.sum().sum()
ax = k.plot.bar(ax=ax)
ax.set_xlabel('Candidat')
#ax.set_xlim(1, 10)
#ax.set_xticks(np.arange(1, 10)+.5)
#ax.set_xticklabels(np.arange(1, 10)) , rotation=45
ax.set_ylabel('pourcentage');

In [None]:
df_2

## non aux nans

Certains bureaux de vote n'ont pas de votants au premier ou au deuxieme = on les ignore:

In [None]:
(df_1.sum(axis=1)==0).sum(), (df_2.sum(axis=1)==0).sum()

In [None]:
df_1.drop(df_1.loc[df_1.sum(axis=1)==0].index, inplace=True)
df_2.drop(df_1.loc[df_1.sum(axis=1)==0].index, inplace=True)

In [None]:
(df_1.sum(axis=1)==0).sum(), (df_2.sum(axis=1)==0).sum()

In [None]:
df_1.drop(df_2.loc[df_2.sum(axis=1)==0].index, inplace=True)
df_2.drop(df_2.loc[df_2.sum(axis=1)==0].index, inplace=True)

In [None]:
(df_1.sum(axis=1)==0).sum(), (df_2.sum(axis=1)==0).sum()

## statistiques de second ordre

In [None]:
df_12 = pd.DataFrame()

In [None]:
df_12['1_MÉLENCHON'] = df_1['MÉLENCHON'].copy()
df_12['MACRON'] = df_2['MACRON'].copy()

In [None]:
df_12.info()

In [None]:
df_12['1_MÉLENCHON'] = df_12['1_MÉLENCHON']/df_1.sum(axis=1)
df_12['MACRON'] = df_12['MACRON']/df_2.sum(axis=1)

In [None]:
fig, ax = plt.subplots(figsize=(13, 13))
ax = df_12.plot.scatter(x='1_MÉLENCHON', y ='MACRON', ax=ax)
ax.set_xlabel('MACRON (second tour)')
#ax.set_xlim(1, 10)
#ax.set_xticks(np.arange(1, 10)+.5)
#ax.set_xticklabels(np.arange(1, 10))
ax.set_ylabel('MÉLENCHON (premier tour)');

In [None]:
df_2.head()

In [None]:
import seaborn as sns
fig, ax = plt.subplots(figsize=(13, 13))
ax = sns.kdeplot(df_12['1_MÉLENCHON'], df_12['MACRON'], thresh=0, levels=10, cbar=False, clip=(0, 1), ax=ax)

In [None]:
#fig, ax = plt.subplots(figsize=(13, 13))
sns.displot(df_12, x='1_MÉLENCHON', y='MACRON')

In [None]:
fig, ax = plt.subplots(figsize=(13, 13))
sns.histplot(df_12, x='1_MÉLENCHON', y='MACRON', ax=ax) # bins=np.linspace(0, 1, 32), 

In [None]:
fig, ax = plt.subplots(figsize=(13, 13))
sns.jointplot(df_12['1_MÉLENCHON'], df_12['MACRON'], ax=ax) # bins=np.linspace(0, 1, 32), 

https://laurentperrinet.github.io/sciblog/posts/2022-02-11-cosyne-reviewer-feedback.html

## fit model de transfert des voix

https://laurentperrinet.github.io/sciblog/posts/2020-04-08-fitting-a-psychometric-curve-using-pytorch.html

In [None]:
df_1.head()

In [None]:
len(df_1.columns)

In [None]:
df_1.values.shape, df_2.values.shape

In [None]:
X_1, X_2 = df_1.values, df_2.values
x_1, x_2 = torch.Tensor(X_1), torch.Tensor(X_2)


In [None]:
x_1.shape

In [None]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(x_1, x_2)

# Random split
train_set_size = int(len(dataset) * 0.8)
test_set_size = len(dataset) - train_set_size
train_set, test_set = torch.utils.data.random_split(dataset, [train_set_size, test_set_size])


In [None]:
train_set

In [None]:
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)

In [None]:
for n_1, n_2 in train_loader:
    break

In [None]:
n_1.shape, n_2.shape

In [None]:
sum_1, sum_2 = n_1.sum(axis=1), n_2.sum(axis=1)
sum_1, sum_2

In [None]:
(n_1/sum_1[:, None]).sum(axis=1)

In [None]:
n_1, n_2 = dataset[test_set.indices]
n_1.shape, n_2.shape

In [None]:
n_1.sum(axis=1), n_2.sum(axis=1)

In [None]:
(n_1.sum(axis=1) == n_2.sum(axis=1)).all()

In [None]:
((n_1.sum(axis=1) == n_2.sum(axis=1))*1.).argmin()

In [None]:
criterion = torch.nn.BCELoss(reduction="sum")
criterion?

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split

#torch.set_default_tensor_type("torch.DoubleTensor")
torch.set_default_tensor_type("torch.FloatTensor")
# https://pytorch.org/docs/master/generated/torch.nn.BCELoss.html
criterion = torch.nn.BCELoss(reduction="mean")

class TransfertVoix(torch.nn.Module):
    def __init__(self, N_1er, N_2eme):
        super(TransfertVoix, self).__init__()
        #self.linear = torch.nn.Linear(N_1er, N_2eme, bias=False)
        #self.linear.weight = torch.nn.Parameter(torch.rand_like(self.linear.weight))
        #self.linear.weight = torch.nn.Parameter(self.linear.weight / self.linear.weight.sum(axis=0, keepdim=True))
        M = torch.rand((N_1er, N_2eme))
        M = M / M.sum(axis=0, keepdim=True)
        self.M = torch.nn.Parameter(M)

    def forward(self, p_1):
        p_2_pred = torch.matmul(p_1, self.M) # self.linear(resultats_1er)
        return p_2_pred

In [None]:
N_1er, N_2eme = len(df_1.columns), len(df_2.columns)
N_1er, N_2eme

In [None]:
trans = TransfertVoix(N_1er, N_2eme)

In [None]:
trans.M.sum(axis=0)

In [None]:
sum_1.min(), sum_1.max()

In [None]:
sum_1, sum_2 = n_1.sum(axis=1), n_2.sum(axis=1)
p_1 = n_1/sum_1[:, None]

In [None]:
plt.plot(p_1.sum(axis=1))

In [None]:
p_1.shape, p_1.sum(axis=1)

In [None]:
p_1.shape, trans.M.shape

In [None]:
torch.matmul(p_1, trans.M).shape

In [None]:
torch.matmul(p_1, trans.M).shape, torch.matmul(p_1, trans.M).sum(1)

In [None]:
p_2_pred = trans(n_1/sum_1[:, None])
p_2_pred


In [None]:
learning_rate = 0.005
beta1, beta2 = 0.9, 0.999
betas = (beta1, beta2)
num_epochs = 2 ** 9 + 1
batch_size = 32
amsgrad = False # gives similar results
amsgrad = True  # gives similar results

def fit_data(
    df_1,
    df_2,
    learning_rate=learning_rate,
    batch_size=batch_size,  # gamma=gamma,
    num_epochs=num_epochs,
    betas=betas,
    split_ratio=.8, seed=42,
    verbose=False, **kwargs
):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    N_1er, N_2eme = len(df_1.columns), len(df_2.columns)
    trans = TransfertVoix(N_1er, N_2eme)
    trans = trans.to(device)

    X_1, X_2 = df_1.values, df_2.values
    x_1, x_2 = torch.Tensor(X_1), torch.Tensor(X_2)

    # split train and test
    from torch.utils.data import TensorDataset, DataLoader
    dataset = TensorDataset(x_1, x_2)
    # Random split
    train_set_size = int(len(dataset) * split_ratio)
    test_set_size = len(dataset) - train_set_size
    train_set, valid_set = random_split(dataset, [train_set_size, test_set_size], generator=torch.Generator().manual_seed(seed))
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

    # apprentissage
    trans.train()
    optimizer = torch.optim.Adam(
        trans.parameters(), lr=learning_rate, betas=betas, amsgrad=amsgrad
    )
    for epoch in range(int(num_epochs)):
        losses = []
        for n_1, n_2 in train_loader:
            n_1, n_2 = n_1.to(device), n_2.to(device)
            
            sum_1, sum_2 = n_1.sum(axis=1), n_2.sum(axis=1)
            
            p_2_pred = trans(n_1/sum_1[:, None])

            print(p_2_pred.shape, p_2_pred.min(), p_2_pred.max())
            loss = criterion(p_2_pred, n_2/sum_2[:, None])#, weights=sum_1/sum_1.sum())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        
            with torch.no_grad():
                trans.M = torch.nn.Parameter(trans.M / trans.M.sum(axis=0, keepdim=True))


        if verbose and (epoch % (num_epochs // 32) == 0):
            print(f"Iteration: {epoch} - Loss: {np.sum(losses)/len(theta):.5f}")

    return trans, losses

In [None]:
trans, losses = fit_data(df_1, df_2)

In [None]:
n_1, n_2
# Test
trans.eval()
n_1, n_2 = dataset[test_set.indices]
p_2_pred = trans(n_1/sum_1[:, None])
loss = criterion(p_2_pred, n_2/sum_2[:, None], weights=sum_1/sum_1.sum())
