# Étude de l'article 2

In [1]:
import pandas as pd
import numpy as np
import csv 
import matplotlib.pyplot as plt
import spacy
import math
import os
import statsmodels.api as sm

from src.utils import *
from src.data_preprocessing import *
from src.data_processing import *
from src.feature_selection import *
from src.modelisation_arcticle_1 import *
from src.modelisation_arcticle_2 import *

# Move the working directory one level up (presumably to the project root) so
# that relative paths such as '01. output/...' resolve.
# The guard makes this cell idempotent: without it, every re-run of the cell
# would climb one more directory and break the relative paths used below.
if '_NOTEBOOK_CWD_INITIALISED' not in globals():
    os.chdir('../')
    _NOTEBOOK_CWD_INITIALISED = True

# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ismailakrout/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ismailakrout/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# pour ne pas charger le NB avec des warnings 
import warnings 
warnings.filterwarnings('ignore')

In [3]:
# Load the speaker/word frequency table from the '01. output' folder
# (path is relative to the current working directory).
df = pd.read_csv(filepath_or_buffer='01. output/df_freqs_speaker_word.csv')

In [4]:
# Build the modelling table from the raw frequency frame with the project
# helper. Judging by the display in the next cell, the result has the columns
# Speaker, party, variable and c_ijt.
df_modelisation = read_and_prepare_df_of_the_model(df)

In [5]:
# Display the prepared table (pandas shortens the rendering to the first and
# last rows).
df_modelisation

Unnamed: 0,Speaker,party,variable,c_ijt
0,John Denham,Lab,abil,0
1,George Howarth,Lab,abil,7
2,George Hollingbery,Con,abil,3
3,George Freeman,Con,abil,12
4,George Eustice,Con,abil,13
...,...,...,...,...
434995,Martin Caton,Lab,â£,0
434996,Marsha De Cordova,Lab,â£,1
434997,Mark Todd,Lab,â£,0
434998,Matt Hancock,Con,â£,40


In [6]:
# m_it: sum of c_ijt over all rows of each (Speaker, party) pair, broadcast
# back onto every row of that pair by `transform`.
df_modelisation['m_it'] = (
    df_modelisation
    .groupby(['Speaker', 'party'])['c_ijt']
    .transform('sum')
)

In [7]:
# Terms inside the sum (sigma): q_hat_it = c_ijt / m_it, i.e. the share of
# each row's count in the total count of its (Speaker, party) pair.
# NOTE(review): rows whose m_it is 0 produce NaN here -- confirm this is intended.
df_modelisation['q_hat_it'] = df_modelisation['c_ijt'].div(df_modelisation['m_it'])

In [8]:
# Split the modelling table by party.
# `.copy()` makes each subset an independent DataFrame, so that helpers which
# add columns to it never work on an ambiguous view/copy of `df_modelisation`
# (pandas would raise SettingWithCopyWarning, which this notebook silences).
df_modelisation_Lab = df_modelisation.loc[df_modelisation['party'] == 'Lab'].copy()
df_modelisation_Con = df_modelisation.loc[df_modelisation['party'] == 'Con'].copy()

## plug-in estimator (biased)

In [9]:
# Plug-in quantities for each party, computed by the project helper
# (Lab is processed first, then Con).
df_modelisation_Lab_plugin, df_modelisation_Con_plugin = (
    plugin_estimator_by_party(party_frame, party_label)
    for party_frame, party_label in (
        (df_modelisation_Lab, 'Lab'),
        (df_modelisation_Con, 'Con'),
    )
)

In [10]:
# Combine the two per-party plug-in tables (Lab first, Con second) with the project helper.
df_modelisation_plugin = plugin_estimator(df_modelisation_Lab_plugin, df_modelisation_Con_plugin)

In [11]:
# Plug-in estimate of pi, computed by the project helper from the combined
# plug-in table; the bare `pi` on the last line displays the value.
pi = compute_pi_plugin(df_modelisation_plugin)
pi

0.5055121348240272

## leave-out estimator (unbiased)

In [12]:
# Add the q_ijt column (later used to compute rho_hat_-ijt) to each party's
# table with the project helper: Con first, then Lab.
df_modelisation_Con_leave_out, df_modelisation_Lab_leave_out = map(
    compute_q_ijt_column,
    (df_modelisation_Con, df_modelisation_Lab),
    ('Con', 'Lab'),
)

In [13]:
# Compute the rho_hat_-ijt column for each party. Each call receives the
# party's own leave-out table together with the OTHER party's plug-in table.
# Both argument triples are built before any call runs, so the second call
# still sees the Lab table as it was before this cell.
df_modelisation_Con_leave_out, df_modelisation_Lab_leave_out = (
    compute_rho_hat_ijt_column(own_leave_out, other_plugin, party_label)
    for own_leave_out, other_plugin, party_label in (
        (df_modelisation_Con_leave_out, df_modelisation_Lab_plugin, 'Con'),
        (df_modelisation_Lab_leave_out, df_modelisation_Con_plugin, 'Lab'),
    )
)

In [14]:
# Build one pi table per party with the project helper (Con first, then Lab).
df_pi_word_Con_leave_out, df_pi_word_Lab_leave_out = (
    create_df_pi_word_party(leave_out_frame, party_label)
    for leave_out_frame, party_label in (
        (df_modelisation_Con_leave_out, "Con"),
        (df_modelisation_Lab_leave_out, "Lab"),
    )
)

In [15]:
# Put the Con and Lab tables side by side, matched on the `variable` column.
# The left join keeps every row of the Con table.
# NOTE(review): a `variable` value absent from the Lab table would get NaN in
# the Lab columns, and one absent from the Con table would be dropped --
# confirm both tables cover the same values.
df_modelisation_leave_out = df_pi_word_Con_leave_out.merge(
    df_pi_word_Lab_leave_out,
    on=['variable'],
    how='left',
)

In [16]:
# Row-wise pi = Con part + Lab part (columns `moitie_Con_pi` and `moitie_Lab_pi`).
df_modelisation_leave_out['pi'] = df_modelisation_leave_out['moitie_Con_pi'].add(
    df_modelisation_leave_out['moitie_Lab_pi']
)

In [17]:
# Leave-out estimate of pi: total of the row-wise `pi` values
# (`Series.sum` skips NaN entries).
df_modelisation_leave_out['pi'].sum()

0.5028617617943952

## Test au niveau des speakers

In [18]:
# Sum `produit_q_rho` per speaker of the Con leave-out table.
# NOTE(review): this statement is repeated verbatim at the start of the next
# cell, so this cell is redundant and can be deleted.
pi_speaker_Con = df_modelisation_Con_leave_out[['Speaker','produit_q_rho']].groupby(by=['Speaker']).sum().reset_index()

In [19]:
def _pi_by_speaker(leave_out_frame, party_label):
    """Sum `produit_q_rho` per speaker and tag every row with its party.

    Args:
        leave_out_frame: DataFrame containing at least the columns `Speaker`
            and `produit_q_rho`.
        party_label: Value written to the new `party` column.

    Returns:
        DataFrame with the columns `Speaker`, `produit_q_rho` (summed per
        speaker) and `party`.
    """
    speaker_totals = (
        leave_out_frame[['Speaker', 'produit_q_rho']]
        .groupby(by=['Speaker'])
        .sum()
        .reset_index()
    )
    speaker_totals['party'] = party_label
    return speaker_totals


# Con speakers
pi_speaker_Con = _pi_by_speaker(df_modelisation_Con_leave_out, 'Con')

# Lab speakers
pi_speaker_Lab = _pi_by_speaker(df_modelisation_Lab_leave_out, 'Lab')


In [20]:
# Stack the Con and Lab speaker tables, rename the summed column to
# `speaker_pi`, and keep only rows with a strictly positive value
# (rows equal to 0 or NaN are dropped by the comparison).
pi_speaker = (
    pd.concat([pi_speaker_Con, pi_speaker_Lab])
    .rename(columns={'produit_q_rho': 'speaker_pi'})
    .loc[lambda stacked: stacked['speaker_pi'] > 0]
)

In [21]:
import plotly.express as px

# One marker per speaker: x = speaker-level pi, y / colour / symbol = party.
fig = px.scatter(
    pi_speaker,
    x="speaker_pi",
    y="party",
    color="party",
    symbol="party",
)
fig.update_traces(marker={"size": 10})  # enlarge the markers
fig.show()