# Notebook to play around with data and models 

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_pickle('Data/conclave_participants_features.pkl')

In [3]:
df_living = pd.read_pickle('Data/living_participants.pkl')

In [12]:
def fit_model_cv(data, model, split_col, scale_data=False):
    
    # Copy data and exclude retired cardinals.
    df = data.copy()
    df = df.loc[df['Emeritus']==0]
    
    for conclave in df[split_col].unique():
        print(conclave)
        # Get data for one conclave.
        conclave_df = df.loc[df[split_col]==conclave].copy()
        
        # Select elected pope record and separate it from train set.
        elected_pope = conclave_df.loc[df['Elected']==1] 
        conclave_df.drop(elected_pope.index, inplace=True)
        
        # Split folds.
        spliter = ShuffleSplit(n_splits=100, test_size=0.7)
        scaler = StandardScaler()
        
        for train_index, test_index in spliter.split(conclave_df):
#             print("TRAIN:", train_index, "TEST:", test_index)
            
            train_df = conclave_df.iloc[train_index, :]
            train_df = train_df.append(elected_pope)
                        
            X = train_df.drop(columns=['Elected', split_col, 'Name']).values
            y = train_df['Elected'].values
            
            if scale_data == True:
                X_scaled = scaler.fit_transform(X)
                model.fit(X_scaled, y)
            else:
                model.fit(X, y)
        
    return model

### Prepare test data, as if conclave were to take place in July 2020

In [6]:
from datetime import datetime, timedelta

datetime(2020, 7, 1)

datetime.datetime(2020, 7, 1, 0, 0)

In [7]:
assumed_conclave_date = datetime(2020, 7, 1)
df_living['Age_at_conclave'] = ((assumed_conclave_date - df_living['Born']) / timedelta(days=365)).astype(int)
df_living['Card_seniority'] = ((assumed_conclave_date - df_living['Born']) / timedelta(days=30)).astype(int)
df_living.head(1)

Unnamed: 0,Name,Born,Consistory,CB,CD,CP,Italian,Emeritus,Archbishop,Other curia,Prefect,Age_at_conclave,Card_seniority
0,Leonardo Sandri,1943-11-18,2007-11-24,1,0,0,0,0,0,0,1,76,932


In [8]:
cols_to_take = df.drop(columns=['Elected', 'conclave_year', 'Name']).columns

df_living.rename(columns={'Other curia': 'Other_curia'}, inplace=True)
df_test = df_living[cols_to_take]
df_test.head(1)

Unnamed: 0,Italian,Emeritus,Age_at_conclave,Card_seniority,Archbishop,Other_curia,Prefect,CB,CD,CP
0,0,0,76,932,0,0,1,1,0,0


# Prediction

a) Logistic Regression

In [15]:
logreg = LogisticRegression(warm_start=True, class_weight='balanced')
logreg_fitted = fit_model_cv(df, model=logreg, split_col='conclave_year', scale_data=True)

df_test_scaled = StandardScaler().fit_transform(df_test)

y_pred_logreg = logreg_fitted.predict_proba(df_test_scaled)[:, 1]

df_living['prediction_logreg'] = y_pred_logreg
df_living[['Name', 'prediction_logreg']].sort_values(by=['prediction_logreg'], ascending=False).head()

1922
1939
1958
1978
2005
2013


Unnamed: 0,Name,prediction_logreg
33,Donald William Wuerl,0.770874
10,Wilfrid Fox Napier OFM,0.765739
14,Gabriel Zubeir Wako,0.765739
77,Maurice Piat CSSp,0.760525
56,Ricardo Ezzati Andrello SDB,0.711747


b) Gradient Boosting Classifier

In [16]:
GBC = GradientBoostingClassifier(warm_start=True)
GBC_fitted = fit_model_cv(df, model=GBC, split_col='conclave_year')

y_pred_GBC = GBC_fitted.predict_proba(df_test)[:, 1]

df_living['prediction_GBC'] = y_pred_GBC
df_living[['Name', 'prediction_GBC']].sort_values(by=['prediction_GBC'], ascending=False).head()

1922
1939
1958
1978
2005
2013


Unnamed: 0,Name,prediction_GBC
0,Leonardo Sandri,8.3e-05
91,Giuseppe Petrocchi,8.3e-05
89,Pedro Ricardo Barreto Jimeno SJ,8.3e-05
88,António Augusto dos Santos Marto,8.3e-05
87,Joseph Coutts,8.3e-05
