# Notebook to play around with data and models 

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_pickle('Data/conclave_participants_features.pkl')

In [3]:
df_living = pd.read_pickle('Data/living_participants.pkl')

In [4]:
def fit_model_cv(data, model, split_col, scale_data=False):
    
    # Copy data and exclude retired cardinals.
    df = data.copy()
    
    for conclave in df[split_col].unique():
        print(conclave)
        # Get data for one conclave.
        conclave_df = df.loc[df[split_col]==conclave].copy()
        
        # Select elected pope record and separate it from train set.
        elected_pope = conclave_df.loc[df['Elected']==1] 
        conclave_df.drop(elected_pope.index, inplace=True)
        
        # Split folds.
        spliter = ShuffleSplit(n_splits=10, test_size=0.7)
        scaler = StandardScaler()
        
        for train_index, test_index in spliter.split(conclave_df):
#             print("TRAIN:", train_index, "TEST:", test_index)
            
            train_df = conclave_df.iloc[train_index, :]
            train_df = train_df.append(elected_pope)
                        
            X = train_df.drop(columns=['Elected', split_col, 'Name']).values
            y = train_df['Elected'].values
            
            if scale_data == True:
                X_scaled = scaler.fit_transform(X)
                model.fit(X_scaled, y)
            else:
                model.fit(X, y)
        
    return model

### Prepare test data, as if conclave were to take place in July 2020

In [5]:
from datetime import datetime, timedelta

datetime(2020, 7, 1)

datetime.datetime(2020, 7, 1, 0, 0)

In [6]:
assumed_conclave_date = datetime(2020, 7, 1)
df_living['Age_at_conclave'] = ((assumed_conclave_date - df_living['Born']) / timedelta(days=365)).astype(int)
df_living['Card_seniority'] = ((assumed_conclave_date - df_living['Born']) / timedelta(days=30)).astype(int)

df_living = df_living.loc[df_living['Age_at_conclave'] < 80]

df_living.head(1)

Unnamed: 0,Name,Born,Consistory,CB,CD,CP,Italian,Emeritus,Archbishop,Other_curia,Prefect,Age_at_conclave,Card_seniority
0,Leonardo Sandri,1943-11-18,2007-11-24,1,0,0,0,0,0,0,1,76,932


In [7]:
cols_to_take = df.drop(columns=['Elected', 'Date', 'Name']).columns

df_test = df_living[cols_to_take]
df_test.head(1)

Unnamed: 0,Italian,Emeritus,Age_at_conclave,Card_seniority,Archbishop,Other_curia,Prefect,CB,CD,CP
0,0,0,76,932,0,0,1,1,0,0


# Prediction

a) Logistic Regression

In [8]:
df.loc[df['Elected']==1]

Unnamed: 0,Name,Date,Elected,Italian,Emeritus,Age_at_conclave,Card_seniority,Archbishop,Other_curia,Prefect,CB,CD,CP
21,Achille Ratti,1922-02-02,1,1,0,64,5,1,0,0,0,0,1
71,Eugenio Pacelli,1939-03-01,1,1,0,63,118,0,1,0,0,0,1
143,Angelo Giuseppe Roncalli,1958-10-25,1,1,0,76,50,1,0,0,0,0,1
217,Giovanni Battista Montini,1963-06-19,1,1,0,65,54,1,0,0,0,0,1
242,Albino Luciani,1978-08-25,1,1,0,65,69,1,0,0,0,0,1
402,Karol Wojtyła,1978-10-14,1,0,0,58,141,1,0,0,0,0,1
469,Joseph Ratzinger,2005-04-19,1,0,0,78,327,0,0,1,1,0,0
650,Jorge Mario Bergoglio,2013-03-12,1,0,0,76,156,1,0,0,0,0,1


In [9]:
logreg = LogisticRegression(warm_start=True, class_weight='balanced')
logreg_fitted = fit_model_cv(df, model=logreg, split_col='Date', scale_data=True)

df_test_scaled = StandardScaler().fit_transform(df_test)

y_pred_logreg = logreg_fitted.predict_proba(df_test_scaled)[:, 1]

df_living['prediction_logreg'] = y_pred_logreg
df_living[['Name', 'prediction_logreg']].sort_values(by=['prediction_logreg'], ascending=False).head(30)

1922-02-02T00:00:00.000000000






1939-03-01T00:00:00.000000000




1958-10-25T00:00:00.000000000




1963-06-19T00:00:00.000000000




1978-08-25T00:00:00.000000000






1978-10-14T00:00:00.000000000




2005-04-19T00:00:00.000000000




2013-03-12T00:00:00.000000000




Unnamed: 0,Name,prediction_logreg
78,Maurice Piat CSSp,0.827213
12,Wilfrid Fox Napier OFM,0.826569
67,Ricardo Blázquez Pérez,0.793666
41,Dominik Duka OP,0.756362
13,Óscar Andrés Rodríguez Maradiaga SDB,0.755531
86,Gregorio Rosa Chávez,0.754698
25,Seán Patrick O'Malley OFMCap,0.714966
90,Pedro Ricardo Barreto Jimeno SJ,0.714047
68,José Luis Lacunza Maestrojuán OAR,0.714047
82,Jean Zerbo,0.713587


b) Gradient Boosting Classifier

In [10]:
GBC = GradientBoostingClassifier(warm_start=True)
GBC_fitted = fit_model_cv(df, model=GBC, split_col='Date')

y_pred_GBC = GBC_fitted.predict_proba(df_test)[:, 1]

df_living['prediction_GBC'] = y_pred_GBC
df_living[['Name', 'prediction_GBC']].sort_values(by=['prediction_GBC'], ascending=False).head()

1922-02-02T00:00:00.000000000
1939-03-01T00:00:00.000000000
1958-10-25T00:00:00.000000000
1963-06-19T00:00:00.000000000
1978-08-25T00:00:00.000000000
1978-10-14T00:00:00.000000000
2005-04-19T00:00:00.000000000
2013-03-12T00:00:00.000000000


Unnamed: 0,Name,prediction_GBC
0,Leonardo Sandri,7.7e-05
91,Désiré Tsarahazana,7.7e-05
89,António Augusto dos Santos Marto,7.7e-05
88,Joseph Coutts,7.7e-05
87,Angelo De Donatis,7.7e-05
