# Feature analysis - Correlations and regressions

This notebooks contains an analysis of the features computed for each candidate. We compare each feature to the votes obtained using different measures.

- Input: Candidates twitter features
- Output: Correlation results, linear regression results

In [1]:
!pip install pandas pyathena numpy statsmodels sklearn scipy python-dotenv plotly SQLAlchemy pyarrow kaleido nbformat matplotlib



## Imports

In [2]:
import pandas as pd
import logging
import os
import pyathena
import dotenv
import os
import sys
import numpy as np
import plotly.express as px
from itertools import permutations
#from scipy import stats
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
import scipy
from collections import defaultdict
import tqdm
from sklearn.preprocessing import PowerTransformer
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# from utils.mongodb import *

  from pandas import Int64Index as NumericIndex


In [3]:
logging.basicConfig(format='[%(asctime)s] - %(name)s - %(funcName)s - %(levelname)s : %(message)s', level=logging.INFO)
log = logging.getLogger(__name__)

dotenv.load_dotenv(os.path.join(module_path, '.env'))

conn = pyathena.connect(s3_staging_dir=os.environ["AWS_ATHENA_S3_STAGING_DIR"], 
        region_name=os.environ["AWS_REGION"])

query = """
SELECT * FROM "twitter-constituyente"."constituyentes_full";
"""
candidates_df = pd.read_sql(query, conn)
candidates_df["electoral_district"] = candidates_df["electoral_district"].astype("str")
candidates_ids = candidates_df["user__id_str"].dropna().to_list()


candidates_data_df = candidates_df.dropna(subset=["user__id_str"]).set_index("user__id_str")
candidates_data_df["rm"] = candidates_data_df["electoral_district"].isin(list(map(str, range(8, 15)))).astype("int")
candidates_data_df["district_percentage"] /= 100

district_df = candidates_data_df[["electoral_district"]]
percentage_df = candidates_data_df[["district_percentage"]]

[2022-01-28 16:19:34,114] - botocore.credentials - load - INFO : Found credentials in environment variables.


## Load data

In [4]:
path = "/Users/jose/Library/Mobile Documents/com~apple~CloudDocs/Proyectos/Memoria/code/twitterconstituyente/processing/{}.parquet"

features_df = pd.read_parquet(path.format("final_features"))
features_agg_df = pd.read_parquet(path.format("final_features_list_agg"))
features_first16_df = pd.read_parquet(path.format("final_features_first16weeks"))
features_last2_df = pd.read_parquet(path.format("final_features_last2weeks"))

In [5]:
features_types_df = pd.read_csv("features_types.csv")

all_cols = features_types_df["feature"].to_list()

In [6]:
def normalize_by_activity(df, normalize_cols, activity_col="tweets_made"):  
  for col in normalize_cols:
      if (col != activity_col) and col in df.columns:
        df[col] /= df[activity_col]
        # if division by zero
        df[col].replace([np.inf, -np.inf], 0, inplace=True)


def get_base_votes(df, perc_df=percentage_df, index="user__id_str"):
  base_df = df.set_index(index)[all_cols]
  base_df = base_df[base_df["tweets_made"]>base_df["retweets_made"]]

  normalize_by_activity(base_df, features_types_df[
    features_types_df["type"]!="Network feature"]["feature"].to_list())

  return base_df.join(perc_df)

base_votes_df = get_base_votes(features_df)
base_votes_df.head()

Unnamed: 0_level_0,out_degree,degree,in_degree,eigenvector_centrality,pagerank,harmonic_centrality,tweets_made,retweet_count,favorite_count,user_mentions,...,replies_made,quotes_made,videos,hashtags,pos,neg,sadness,anger,joy,district_percentage
user__id_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100366067,128.666667,170.333333,41.666667,0.007319,0.000122,31991.366673,32.555556,0.324232,0.459044,0.406143,...,0.087031,0.151877,0.001706,0.943686,0.054608,0.095563,0.018771,0.023891,0.046075,0.048
1004762296932069377,110.833333,316.555556,205.722222,0.013303,0.001414,65512.58644,75.5,1.041943,1.42752,1.394408,...,0.52465,0.074319,0.029433,0.14496,0.060338,0.28624,0.01766,0.124356,0.043414,0.0552
1004858015017783302,13.222222,43.055556,29.833333,0.012676,6.5e-05,25144.905673,4.055556,1.136986,4.287671,0.39726,...,0.109589,0.082192,0.0,0.136986,0.191781,0.0,0.0,0.0,0.09589,0.0241
1007614099,16.777778,19.0,2.222222,7.6e-05,3.5e-05,3859.242295,3.888889,0.071429,0.142857,0.042857,...,0.028571,0.0,0.0,0.271429,0.014286,0.014286,0.0,0.0,0.0,0.0072
101614836,151.833333,3750.777778,3598.944444,0.056503,0.008407,78382.970894,44.944444,28.030902,95.969098,0.222497,...,0.097651,0.086527,0.018541,0.055624,0.095179,0.100124,0.013597,0.024722,0.049444,0.0278


## Correlation between features

In [7]:
fig = px.imshow(
    np.abs(base_votes_df[base_votes_df.columns[:-1]].corr(method="spearman")), 
    title="Spearman correlation between Twitter influence features",
    width=800, height=800)
fig.update_layout(
    xaxis = dict(tickmode='linear', tickangle=-45),
    yaxis = dict(tickmode='linear')
)
fig.write_image("spearman_corr.pdf")
fig

In [8]:
px.scatter_matrix(base_votes_df[["degree", "pagerank", "harmonic_centrality"]], width=800, height=800)

## Base vote analysis

### Correlation

In [9]:
def normalize_df_by_district(norm_df, col, drop=True):
  if "electoral_district" not in norm_df.columns:
    # raise Exception("Must have district col")
    norm_df = norm_df.join(district_df)
  norm_df[col] /= norm_df.groupby('electoral_district')[col].transform('sum')
  if drop:
    return norm_df.drop(columns=["electoral_district"])
  return norm_df

def custom_log(df, col):
  df[col] = np.log(df[col] + 1)
  return df

def get_vote_correlation_df(df):
  data = []

  base_columns = [col for col in df.columns if col not in ["district_percentage", "electoral_district"]]
  transformations = {
    #"log": lambda df, col: np.log(df+1),
    "log": custom_log,
    "district_normalize": normalize_df_by_district
  }.items()
  for col in tqdm.tqdm(base_columns):
    for i in range(0, len(transformations) + 1):
      for perm in permutations(transformations, i):
        d = df.copy()
        # try:
        for name, step in perm:
          d = step(d, col=col)
        spearman, pspearman = scipy.stats.spearmanr(d[col], d["district_percentage"])
        pearson, ppearson = scipy.stats.pearsonr(d[col], d["district_percentage"])
        data.append({
            "permutation": ', '.join([name for name, step in perm]),
            "column": col,
            "spearman": spearman,
            "spearman_pvalue": pspearman,
            "pearson": pearson,
            "pearson_pvalue": ppearson
        })
        # except Exception as e:
        #   log.error(e)
        #   data.append({
        #       "permutation": ', '.join([name for name, step in perm]),
        #       "column": col,
        #       "error": True
        #   })
    
  df = pd.DataFrame(data)
  df["permutation"] = df["permutation"].apply(lambda u: "raw" if u == "" else u)
  df = df[df["permutation"].isin(["raw", "district_normalize", "log, district_normalize"])]
  df = df.merge(features_types_df, left_on="column", right_on="feature", how="left")
  return df



In [10]:
base_corr_df = get_vote_correlation_df(base_votes_df)
base_corr_df.head()

100%|██████████| 21/21 [00:00<00:00, 146.32it/s]


Unnamed: 0,permutation,column,spearman,spearman_pvalue,pearson,pearson_pvalue,feature,type
0,raw,out_degree,0.183127,4.274982e-07,0.068135,0.06183168,out_degree,Network feature
1,district_normalize,out_degree,0.334836,3.703956e-21,0.13384,0.0002325934,out_degree,Network feature
2,"log, district_normalize",out_degree,0.421911,8.158362e-34,0.280721,4.380077e-15,out_degree,Network feature
3,raw,degree,0.242342,1.636519e-11,0.270721,4.25276e-14,degree,Network feature
4,district_normalize,degree,0.477451,4.450092e-44,0.314174,1.08914e-18,degree,Network feature


#### Spearman

In [11]:
for group, df in base_corr_df.groupby("type"):
  px.bar(df.sort_values("spearman", ascending=False), 
    x="column", y="spearman", color="permutation", 
    barmode="group", title=f"Spearman correlation between candidates percentage<br>of votes and {group}s", labels={"column": "Feature", "spearman":"Spearman", "permutation":"Pre-processing"}).show()

#### Pearson

In [12]:
for group, df in base_corr_df.groupby("type"):
  px.bar(df.sort_values("pearson", ascending=False), 
    x="column", y="pearson", color="permutation", 
    barmode="group", title=f"Pearson correlation between candidates percentage<br>of votes and {group}s", 
    labels={"column": "Feature", "pearson":"Pearson", "permutation":"Pre-processing"}).show()

In [13]:
plot_df = base_corr_df[base_corr_df["permutation"]=="log, district_normalize"].sort_values("spearman")

positions = defaultdict(lambda: 'top left')
positions.update({
    "tweets_made": "top right",
    "photos": "top right",
    "eigenvector_centrality": "middle left",
    "quotes_made": "middle left"
})

plot_df["position"] = plot_df["column"].map(positions)
fig = px.scatter(plot_df,"spearman", "pearson", text="column", width=1000, height=1000)
fig.update_traces(textposition=plot_df["position"])
fig.write_image("spearman_pearson_cols.pdf")
fig

### Regression

In [14]:
def col_to_dummy(df, col):
  df = pd.concat([df, pd.get_dummies(df[col])], axis=1)
  return df.drop(columns=[col])

ops = ["raw", "log", "dn", "logdn", "dnlog", "logdnlog" ,"dnpt", "logdnpt"]

pt = PowerTransformer(method="yeo-johnson")

def preprocess_feature(df, col, prep) -> pd.Series:
  if prep == "raw": 
    return df[col]
  if prep == "log": 
    return  np.log(df[col] + 1)
  if prep == "dn": 
    return normalize_df_by_district(df.copy(), col)[col]
  if prep == "dnpt": 
    return pd.Series(pt.fit_transform(
      normalize_df_by_district(df.copy(), col)[[col]]
    ).flatten())
  if prep == "logdn":
    df[col] = np.log(df[col] + 1)
    return normalize_df_by_district(df.copy(), col)[col]
  if prep == "logdnpt":
    df[col] = np.log(df[col] + 1)
    df[col] = normalize_df_by_district(df.copy(), col)[col]
    return pd.Series(pt.fit_transform(df[[col]]).flatten())
  if prep == "logdnlog": 
    df[col] = np.log(df[col] + 1)
    df[col] = normalize_df_by_district(df.copy(), col)[col]
    return np.log(df[col]+1)
  if prep == "dnlog": 
    return np.log(normalize_df_by_district(df.copy(), col)[col]+1)


scaler = StandardScaler()

def get_vote_regression_df(df, reg_type, col, y, interaction=None, scaling=True):
  if scaling:
    df[col] = pd.Series(scaler.fit_transform(df[[col]]).flatten())
  if interaction:
    df[f"{col}*{interaction}"] = df[col] * df[interaction]

  X2 = sm.add_constant(df)
  est = sm.OLS(y, X2)
  est2 = est.fit()
  return {
    "col": col,
    "features": df.columns.to_list(),
    "model": est2,
    "pvalues": est2.pvalues,
    "params": est2.params,
    "errors": est2.bse,
    "r2": est2.rsquared,
    "type": reg_type,
    "interaction": interaction,
  }
 # single, base, base2, interaction (rm, gender, ind)

# univariada (single)
# con variables de control
# con otras variables de control
# con interaccion con rm, gender e IND



In [15]:
base_notwitter_features = candidates_data_df[["list_number", "party_clean", "gender", "rm", "electoral_district", "coalicion"]]

# en promedio, a mayor cantidad de candidatos menor porcentaje
candidates_count_district = candidates_df["electoral_district"].value_counts().to_dict()
base_notwitter_features["n_candidates"] = base_notwitter_features["electoral_district"].map(candidates_count_district)
scaler = StandardScaler()
base_notwitter_features[["n_candidates"]] = scaler.fit_transform(base_notwitter_features[["n_candidates"]])

base_notwitter_features["gender"] = (base_notwitter_features["gender"]=="M").astype("int")

base_notwitter_features["list_number"] = base_notwitter_features["list_number"].astype("str")

for col in ["list_number", "party_clean", "coalicion"]:
  base_notwitter_features = col_to_dummy(base_notwitter_features, col)

base_notwitter_features["<=2"] = base_notwitter_features["1"] + base_notwitter_features["2"]


base_1_cols = [
  "gender", "rm", "n_candidates",
  "1", "2", "3", "4",
  "CIUDADANOS", "COMUNES", "CONVER.", "EVOPOLI",
  "FREVS", "IGUALDAD", "PCC", "PCCH", "PDC", "PEV",
  "PH", "PL", "PNC", "PPD", "PR", "PRO", "PS", 
  "PTR", "RD", "REPUBL.", "RN", "UDI", "UPA"
]

base_2_cols = [
  "rm", "n_candidates", "<=2",
  "Apruebo Dignidad",
  "Independientes No Neutrales",
  "Lista del Apruebo",
  "Lista del Pueblo",
  "Vamos por Chile"
]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

#### Regression input construction

In [16]:

input_regression_df = base_votes_df.join(base_notwitter_features[list(set(base_1_cols+base_2_cols))]).join(district_df).reset_index()
y = np.log(input_regression_df["district_percentage"])

# all preprocessing
def preprocess_columns(df, cols):
    for col in cols:
        df[f"{col}__raw"] = df[col]
        df[f"{col}__log"] = np.log(df[col] + 1)
        df[f"{col}__dn"] = normalize_df_by_district(df.copy(), col)[col]
        df[f"{col}__dnpt"] = pd.Series(pt.fit_transform(df[[f"{col}__dn"]]).flatten())
        df[f"{col}__logdn"] = normalize_df_by_district(df.copy(), f"{col}__log")[f"{col}__log"]
        df[f"{col}__logdnpt"] = pd.Series(pt.fit_transform(df[[f"{col}__logdn"]]).flatten())
        df[f"{col}__dnlog"] = np.log(df[f"{col}__dn"]+1)
        df[f"{col}__logdnlog"] =  np.log(df[f"{col}__logdn"]+1)

preprocess_columns(input_regression_df, all_cols)

#### Run different models

In [17]:
def run_regressions(df, preps=ops, interactions=(None, "rm", "gender"), custom_base_cols=None, y=y, cols=all_cols):
    regression_results = []

    for prep in tqdm.tqdm(preps):
        for interaction in interactions:
            for col in cols:
                col = f"{col}__{prep}"
                try:
                    if interaction is None:
                        regression_results.append(get_vote_regression_df(df[[col]], 
                        "single", col, y=y, interaction=interaction))
                    if (interaction is None) or (interaction in base_1_cols): 
                        regression_results.append(get_vote_regression_df(df[base_1_cols + [col]], 
                        "base_1", col, y=y, interaction=interaction))
                    if (interaction is None) or (interaction in base_2_cols):
                        regression_results.append(get_vote_regression_df(df[base_2_cols + [col]], 
                        "base_2", col, y=y, interaction=interaction))
                    if (custom_base_cols is not None) and (interaction in custom_base_cols):
                        regression_results.append(get_vote_regression_df(df[custom_base_cols + [col]], 
                        "custom_base", col, y=y, interaction=interaction))
                except Exception as e:
                    log.info(prep)
                    log.info(interaction)
                    log.info(col)
                    raise e

    
    base_results = {
        "base_1": get_vote_regression_df(df[base_1_cols], "base_1", None, y=y, scaling=False),
        "base_2": get_vote_regression_df(df[base_2_cols], "base_2", None, y=y, scaling=False)
    }

    for result in regression_results:
        result["pvalue_col"] = result["pvalues"].loc[result["col"]]
        result["param_col"] = result["params"].loc[result["col"]]
        result["se_col"] = result["errors"].loc[result["col"]]
        result["conf95_col"] = result["se_col"]*1.96
        result["prep"] = result["col"].split("__")[-1]
        result["clean_col"] = result["col"].split("__")[0]
        if result["interaction"]:
            result["param_interaction"] = result["params"].loc[f"{result['col']}*{result['interaction']}"]
            result["pvalue_interaction"] = result["pvalues"].loc[f"{result['col']}*{result['interaction']}"]
            result["se_interaction"] = result["errors"].loc[f"{result['col']}*{result['interaction']}"]
            result["conf95_interaction"] = result["se_interaction"]*1.96
        if result["type"] in ["base_1", "base_2"]:
            result["delta_r2"] = result["r2"] - base_results[result["type"]]["r2"]
    return pd.DataFrame(regression_results)
 
reg_results_df = run_regressions(input_regression_df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

#### Weights

In [18]:
weights_multivariate_df = reg_results_df[
    (reg_results_df["interaction"].isnull()) 
    & (reg_results_df["prep"]=="logdnpt")
    & (reg_results_df["type"]=="base_1")
].sort_values("param_col", ascending=False)
fig = px.bar(weights_multivariate_df, "clean_col", "param_col", hover_data=["clean_col", "col", "pvalue_col"], 
    error_y="conf95_col", labels={"param_col": "Regression coefficient", "clean_col": "Feature"},
    title="Linear multivariate regression weights for Twitter features")
fig.update_xaxes(tickangle=-45)
fig

In [124]:
weights_multivariate_df.iloc[0]["model"].summary()

0,1,2,3
Dep. Variable:,district_percentage,R-squared:,0.582
Model:,OLS,Adj. R-squared:,0.564
Method:,Least Squares,F-statistic:,32.32
Date:,"Mon, 24 Jan 2022",Prob (F-statistic):,9.1e-115
Time:,12:09:30,Log-Likelihood:,-735.65
No. Observations:,752,AIC:,1535.0
Df Residuals:,720,BIC:,1683.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4.3518,0.069,-63.195,0.000,-4.487,-4.217
gender,-0.0365,0.078,-0.470,0.639,-0.189,0.116
rm,-0.1289,0.055,-2.360,0.019,-0.236,-0.022
n_candidates,-0.1839,0.028,-6.462,0.000,-0.240,-0.128
1,0.8017,0.078,10.247,0.000,0.648,0.955
2,0.4488,0.083,5.436,0.000,0.287,0.611
3,0.3006,0.085,3.555,0.000,0.135,0.467
4,0.0104,0.089,0.117,0.907,-0.165,0.186
CIUDADANOS,-0.2968,0.334,-0.890,0.374,-0.952,0.358

0,1,2,3
Omnibus:,4.004,Durbin-Watson:,2.02
Prob(Omnibus):,0.135,Jarque-Bera (JB):,3.845
Skew:,0.168,Prob(JB):,0.146
Kurtosis:,3.1,Cond. No.,25.0


In [20]:
weights_univariate_df = reg_results_df[
    (reg_results_df["interaction"].isnull()) 
    & (reg_results_df["prep"]=="logdnpt")
    & (reg_results_df["type"]=="single")
].sort_values("param_col", ascending=False)
fig = px.bar(weights_univariate_df, "clean_col", "param_col", hover_data=["clean_col", "col", "pvalue_col"], 
    error_y="conf95_col", labels={"param_col": "Regression coefficient", "clean_col": "Feature"},
    title="Linear univariate regression weights for Twitter features", color="prep")
fig.update_xaxes(tickangle=-45)
fig

In [125]:
weights_univariate_df.iloc[0]["model"].summary()

0,1,2,3
Dep. Variable:,district_percentage,R-squared:,0.297
Model:,OLS,Adj. R-squared:,0.296
Method:,Least Squares,F-statistic:,316.9
Date:,"Mon, 24 Jan 2022",Prob (F-statistic):,2.15e-59
Time:,12:10:36,Log-Likelihood:,-930.97
No. Observations:,752,AIC:,1866.0
Df Residuals:,750,BIC:,1875.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4.1660,0.030,-136.718,0.000,-4.226,-4.106
favorite_count__logdnpt,0.5424,0.030,17.801,0.000,0.483,0.602

0,1,2,3
Omnibus:,2.382,Durbin-Watson:,1.89
Prob(Omnibus):,0.304,Jarque-Bera (JB):,2.329
Skew:,-0.136,Prob(JB):,0.312
Kurtosis:,3.007,Cond. No.,1.0


In [21]:
merged_weights_df = weights_univariate_df.set_index("clean_col").add_suffix("__uni").join(weights_multivariate_df.set_index("clean_col").add_suffix("__multi")).reset_index()

px.scatter(merged_weights_df, 'param_col__uni', 'param_col__multi', text="clean_col", 
labels={"param_col__uni":"Univariate regression coefficient", 
"param_col__multi":"Multivariate regression coefficient"}, width=800, height=800)

#### R2

In [22]:
positions = defaultdict(lambda: 'top left')
# positions.update({
#     "tweets_made": "top right",
#     "photos": "top right",
#     "eigenvector_centrality": "middle left",
#     "quotes_made": "middle left"
# })

merged_weights_df["position"] = merged_weights_df["clean_col"].map(positions)

fig = px.scatter(merged_weights_df.reset_index(), 'r2__uni', 'delta_r2__multi', text="clean_col", 
labels={'r2__uni': 'R2 univariate regression', 'delta_r2__multi': 'Delta R2 multivariate regression'}, 
title="R2", width=800, height=800)
fig.update_traces(textposition=merged_weights_df["position"])
#fig.write_image("test.pdf")
fig

#### Interactions

In [23]:
reg_results_df[
        (reg_results_df["interaction"]=="rm") 
        & (reg_results_df["prep"]=="logdnpt")
        & (reg_results_df["type"]=="base_1")
    ].sort_values("param_interaction", ascending=False)

Unnamed: 0,col,features,model,pvalues,params,errors,r2,type,interaction,pvalue_col,param_col,se_col,conf95_col,prep,clean_col,delta_r2,param_interaction,pvalue_interaction,se_interaction,conf95_interaction
947,degree__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 5.551458e-289 gender ...,const -4.335626 gender ...,const 0.070394 gender ...,0.565285,base_1,rm,3.449608e-11,0.219786,0.032654,0.064001,logdnpt,degree,0.060774,0.294986,5e-06,0.064086,0.125608
949,in_degree__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 1.193786e-288 gender ...,const -4.338884 gender ...,const 0.070536 gender ...,0.563243,base_1,rm,1.233783e-10,0.204603,0.031327,0.061401,logdnpt,in_degree,0.058732,0.277824,1.2e-05,0.062924,0.123331
945,out_degree__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 1.140299e-281 gender...,const -4.359197 gender ...,const 0.072785 gender ...,0.534237,base_1,rm,4.976108e-05,0.130369,0.031942,0.062605,logdnpt,out_degree,0.029726,0.239539,0.000375,0.06702,0.13136
961,favorite_count__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 1.114554e-296 ge...,const -4.330462 gender ...,const 0.068288 gender ...,0.591673,base_1,rm,2.49195e-17,0.279478,0.032172,0.063057,logdnpt,favorite_count,0.087162,0.236085,3.6e-05,0.056751,0.111232
957,tweets_made__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 6.534999e-283 gende...,const -4.371782 gender ...,const 0.072648 gender ...,0.53481,base_1,rm,5.214465e-05,0.135495,0.033288,0.065245,logdnpt,tweets_made,0.0303,0.233999,0.000367,0.065363,0.128111
959,retweet_count__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 4.238531e-292 gen...,const -4.314597 gender ...,const 0.069227 gender ...,0.581113,base_1,rm,6.211313e-15,0.252569,0.031688,0.062109,logdnpt,retweet_count,0.076602,0.213601,0.000167,0.056436,0.110614
953,pagerank__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 2.116981e-293 gender ...,const -4.332791 gender ...,const 0.069177 gender ...,0.579852,base_1,rm,1.802142e-12,0.227547,0.031713,0.062158,logdnpt,pagerank,0.075341,0.201141,0.00015,0.052761,0.103413
951,eigenvector_centrality__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 4.994094...,const -4.319995...,const 0.070396...,0.567007,base_1,rm,1.358421e-10,0.212943,0.03268,0.064053,logdnpt,eigenvector_centrality,0.062496,0.20079,0.000204,0.05379,0.105429
955,harmonic_centrality__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 1.414610e-2...,const -4.375843 ge...,const 0.073371 ge...,0.525591,base_1,rm,1.884215e-05,0.136893,0.031783,0.062295,logdnpt,harmonic_centrality,0.02108,0.178703,0.021786,0.077728,0.152347
967,retweets_made__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 1.823507e-277 gen...,const -4.374686 gender ...,const 0.074236 gender ...,0.514948,base_1,rm,0.008676411,0.085163,0.03236,0.063425,logdnpt,retweets_made,0.010437,0.107073,0.110576,0.067022,0.131364


In [24]:
for interaction in ["rm", "gender"]:
    weights_interaction_df = reg_results_df[
        (reg_results_df["interaction"]==interaction) 
        & (reg_results_df["prep"]=="logdnpt")
        & (reg_results_df["type"]=="base_1")
    ].sort_values("param_interaction", ascending=False)
    fig = px.bar(weights_interaction_df, "clean_col", "param_interaction", hover_data=["clean_col", "col", "pvalue_col"], 
        error_y="conf95_interaction", labels={"param_col": "Regression coefficient", "clean_col": "Feature"},
        title=f"Regression coefficient for interaction term with {interaction}")
    fig.update_xaxes(tickangle=-45)
    fig.show()

In [127]:
for interaction in ["rm", "gender"]:
    weights_interaction_df = reg_results_df[
        (reg_results_df["interaction"]==interaction) 
        & (reg_results_df["prep"]=="logdnpt")
        & (reg_results_df["type"]=="base_1")
    ].sort_values("param_interaction", ascending=False)
    fig = px.bar(weights_interaction_df, "clean_col", "param_interaction", hover_data=["clean_col", "col", "pvalue_col"], 
        error_y="conf95_interaction", labels={"param_col": "Regression coefficient", "clean_col": "Feature"},
        title=f"Regression coefficient for interaction term with {interaction}")
    fig.update_xaxes(tickangle=-45)
    fig.show()

In [136]:
reg_results_df[
        (reg_results_df["interaction"]=="rm") 
        & (reg_results_df["prep"]=="logdnpt")
        & (reg_results_df["type"]=="base_2")
].iloc[0]["model"].summary()

0,1,2,3
Dep. Variable:,district_percentage,R-squared:,0.465
Model:,OLS,Adj. R-squared:,0.457
Method:,Least Squares,F-statistic:,64.3
Date:,"Mon, 24 Jan 2022",Prob (F-statistic):,1.13e-93
Time:,12:40:02,Log-Likelihood:,-828.57
No. Observations:,752,AIC:,1679.0
Df Residuals:,741,BIC:,1730.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4.7758,0.061,-78.347,0.000,-4.895,-4.656
rm,-0.1277,0.061,-2.084,0.037,-0.248,-0.007
n_candidates,-0.2366,0.031,-7.639,0.000,-0.297,-0.176
<=2,0.6199,0.056,11.081,0.000,0.510,0.730
Apruebo Dignidad,0.6211,0.087,7.169,0.000,0.451,0.791
Independientes No Neutrales,0.2842,0.099,2.870,0.004,0.090,0.479
Lista del Apruebo,0.5485,0.086,6.395,0.000,0.380,0.717
Lista del Pueblo,0.9094,0.097,9.364,0.000,0.719,1.100
Vamos por Chile,0.8356,0.083,10.028,0.000,0.672,0.999

0,1,2,3
Omnibus:,15.86,Durbin-Watson:,1.933
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.032
Skew:,0.308,Prob(JB):,0.0002
Kurtosis:,3.404,Cond. No.,6.73


In [135]:
reg_results_df[
        (reg_results_df["interaction"]=="gender") 
        & (reg_results_df["prep"]=="logdnpt")
        & (reg_results_df["type"]=="base_1")
    ].sort_values("r2", ascending=False).tail(1).iloc[0]["model"].summary()

0,1,2,3
Dep. Variable:,district_percentage,R-squared:,0.505
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,22.93
Date:,"Mon, 24 Jan 2022",Prob (F-statistic):,1.35e-88
Time:,12:39:59,Log-Likelihood:,-799.05
No. Observations:,752,AIC:,1664.0
Df Residuals:,719,BIC:,1817.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4.3878,0.075,-58.502,0.000,-4.535,-4.241
gender,-0.0799,0.085,-0.943,0.346,-0.246,0.086
rm,-0.1877,0.059,-3.161,0.002,-0.304,-0.071
n_candidates,-0.2897,0.031,-9.444,0.000,-0.350,-0.229
1,0.8930,0.085,10.523,0.000,0.726,1.060
2,0.4769,0.090,5.282,0.000,0.300,0.654
3,0.2953,0.092,3.205,0.001,0.114,0.476
4,-0.0041,0.097,-0.042,0.966,-0.195,0.187
CIUDADANOS,0.0119,0.363,0.033,0.974,-0.700,0.724

0,1,2,3
Omnibus:,15.558,Durbin-Watson:,2.026
Prob(Omnibus):,0.0,Jarque-Bera (JB):,18.312
Skew:,0.266,Prob(JB):,0.000106
Kurtosis:,3.549,Cond. No.,24.9


## List aggregation

In [42]:
candidates_data_df["list_agg"] = "D" + candidates_data_df["electoral_district"].astype("str") + "&" + candidates_data_df["list"]
list_percentage_df = candidates_data_df.groupby(["electoral_district", "list", "list_agg"])["district_percentage"].sum().reset_index().set_index("list_agg")
list_base_votes_df = get_base_votes(features_agg_df, list_percentage_df, "list")

### Correlation

In [43]:
list_base_corr_df = get_vote_correlation_df(list_base_votes_df.drop(columns=["list"]))
list_base_corr_df.head()

100%|██████████| 21/21 [00:00<00:00, 263.81it/s]


Unnamed: 0,permutation,column,spearman,spearman_pvalue,pearson,pearson_pvalue,feature,type
0,raw,out_degree,0.53613,5.976427e-17,0.313792,3.716627e-06,out_degree,Network feature
1,district_normalize,out_degree,0.596472,1.582267e-21,0.356755,1.144643e-07,out_degree,Network feature
2,"log, district_normalize",out_degree,0.578932,4.2449899999999997e-20,0.476478,3.043316e-13,out_degree,Network feature
3,raw,degree,0.576192,6.972126e-20,0.415786,3.83331e-10,degree,Network feature
4,district_normalize,degree,0.658167,2.533381e-27,0.463351,1.60966e-12,degree,Network feature


In [140]:
candidates_data_df["list_agg"].value_counts()

D8&YQ. APRUEBO DIGNIDAD                                               8
D7&XP. VAMOS POR CHILE                                                8
D10&YQ. APRUEBO DIGNIDAD                                              8
D10&YB. LISTA DEL APRUEBO                                             8
D6&XP. VAMOS POR CHILE                                                8
                                                                     ..
D22&ZK. ORGANIZACIONES SOCIALES Y TERRITORIALES DEL WALLMAPU (D22)    1
D7&XG. PARTIDO HUMANISTA                                              1
D6&CANDIDATURA INDEPENDIENTE                                          1
D5&WB. INDEPENDIENTES POR LA REGION DE COQUIMBO (D5)                  1
D1&WE. SUMATE AHORA (D1)                                              1
Name: list_agg, Length: 210, dtype: int64

In [47]:
best_base_corr_df = base_corr_df[base_corr_df["permutation"]=="log, district_normalize"]
best_base_corr_df["corr_type"] = "candidate"

best_list_base_corr_df = list_base_corr_df[list_base_corr_df["permutation"]=="log, district_normalize"]
best_list_base_corr_df["corr_type"] = "agg list"

plot_df = pd.concat([best_base_corr_df, best_list_base_corr_df])
px.bar(plot_df, "column", "spearman", color="corr_type", barmode="group")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Regression

## Last 2 weeks

In [58]:
base_votes_first16_df = get_base_votes(features_first16_df)
base_votes_last2_df = get_base_votes(features_last2_df)

### Correlation

In [61]:
base_corr_first16_df = get_vote_correlation_df(base_votes_first16_df)
base_corr_last2_df = get_vote_correlation_df(base_votes_last2_df.drop(columns=["sadness"]))


100%|██████████| 21/21 [00:00<00:00, 149.10it/s]
100%|██████████| 20/20 [00:00<00:00, 168.89it/s]


In [66]:
corr_dfs = {
"all": base_corr_df.copy(),
"first 16 weeks": base_corr_first16_df.copy(),
"last 2 weeks": base_corr_last2_df.copy()
}
permutation = "log, district_normalize"
for time_range, df in corr_dfs.items():
    df["time_range"] = time_range
    corr_dfs[time_range] = df[df["permutation"]==permutation]

plot_df = pd.concat(corr_dfs.values())
plot_df.head()

Unnamed: 0,permutation,column,spearman,spearman_pvalue,pearson,pearson_pvalue,feature,type,time_range
2,"log, district_normalize",out_degree,0.421911,8.158362e-34,0.280721,4.380077e-15,out_degree,Network feature,all
5,"log, district_normalize",degree,0.517349,1.0431250000000001e-52,0.392133,4.74537e-29,degree,Network feature,all
8,"log, district_normalize",in_degree,0.463117,3.0271699999999996e-41,0.397101,8.211435e-30,in_degree,Network feature,all
11,"log, district_normalize",eigenvector_centrality,0.324875,6.059219999999999e-20,0.273405,2.331899e-14,eigenvector_centrality,Network feature,all
14,"log, district_normalize",pagerank,0.480441,1.097755e-44,0.323359,9.189365e-20,pagerank,Network feature,all


In [67]:
px.bar(plot_df, "column", "spearman", color="time_range", barmode="group")

### Regression

In [92]:

input_regression_first16_df = base_votes_last2_df.join(base_notwitter_features[list(set(base_1_cols+base_2_cols))]).join(district_df).reset_index()
y_first16 = np.log(input_regression_first16_df["district_percentage"])
preprocess_columns(input_regression_first16_df, all_cols)
input_regression_first16_df["last2weeks"] = 0

input_regression_last2_df = base_votes_last2_df.join(base_notwitter_features[list(set(base_1_cols+base_2_cols))]).join(district_df).reset_index()
y_last2 = np.log(input_regression_last2_df["district_percentage"])
preprocess_columns(input_regression_last2_df, all_cols)
input_regression_last2_df["last2weeks"] = 1

input_regression_temp_df = pd.concat([input_regression_first16_df, input_regression_last2_df]).reset_index(drop=True)
y = pd.concat([y_first16, y_last2]).reset_index(drop=True)

In [98]:
input_regression_temp_df.isnull().sum().sort_values().tail(10)

degree__logdnpt       0
in_degree__dnlog      0
degree__logdnlog      0
in_degree__raw        0
sadness__dnlog       94
sadness__dnpt        94
sadness__logdn       94
sadness__logdnpt     94
sadness__dn          94
sadness__logdnlog    94
dtype: int64

In [111]:
temp_base_columns = base_1_cols + ["last2weeks"]

In [141]:
reg_result_df = run_regressions(input_regression_temp_df, ["logdnpt"], ["last2weeks"], temp_base_columns, y=y, cols=[col for col in all_cols if col not in ["sadness"]])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [118]:
reg_result_df[
    (reg_result_df["interaction"]=="last2weeks") 
    & (reg_result_df["prep"]=="logdnpt")
    & (reg_result_df["type"]=="custom_base")
]

Unnamed: 0,col,features,model,pvalues,params,errors,r2,type,interaction,pvalue_col,param_col,se_col,conf95_col,prep,clean_col,param_interaction,pvalue_interaction,se_interaction,conf95_interaction
0,out_degree__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 0.000000e+00...,const -4.419570e+00...,const 6.780063e-02...,0.489973,custom_base,last2weeks,1.373137e-07,0.179284,0.033797,0.066243,logdnpt,out_degree,2.749335e-16,1.0,0.042961,0.084204
1,degree__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 0.000000e+00 gen...,const -4.435345e+00 gen...,const 6.509075e-02 gen...,0.530116,custom_base,last2weeks,5.002434e-21,0.335044,0.034861,0.068328,logdnpt,degree,-5.493338e-16,1.0,0.041236,0.080822
2,in_degree__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 0.000000e+00 ...,const -4.423762e+00 ...,const 6.527990e-02 ...,0.526892,custom_base,last2weeks,2.69048e-18,0.286963,0.032299,0.063307,logdnpt,in_degree,-1.591239e-16,1.0,0.041377,0.081099
3,eigenvector_centrality__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const ...,const -...,const ...,0.51907,custom_base,last2weeks,3.000718e-15,0.252136,0.031479,0.061698,logdnpt,eigenvector_centrality,-1.793209e-15,1.0,0.041718,0.081767
4,pagerank__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 0.000000e+00 g...,const -4.403989e+00 g...,const 6.401371e-02 g...,0.544677,custom_base,last2weeks,8.106413e-23,0.30819,0.030625,0.060024,logdnpt,pagerank,1.74285e-15,1.0,0.040592,0.07956
5,harmonic_centrality__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 0.0...,const -4.4...,const 6.7...,0.49844,custom_base,last2weeks,1.025543e-09,0.201423,0.032695,0.064081,logdnpt,harmonic_centrality,-3.491759e-16,1.0,0.042603,0.083502
6,tweets_made__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 0.000000e+0...,const -4.452774e+0...,const 6.720319e-0...,0.502099,custom_base,last2weeks,3.003734e-12,0.261346,0.037018,0.072556,logdnpt,tweets_made,-1.434887e-15,1.0,0.042447,0.083197
7,retweet_count__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 0.000000e...,const -4.427930e...,const 6.437242e...,0.540016,custom_base,last2weeks,2.162484e-21,0.298636,0.030774,0.060316,logdnpt,retweet_count,1.263412e-15,1.0,0.040799,0.079966
8,favorite_count__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 0.000000...,const -4.426414...,const 6.371725...,0.549237,custom_base,last2weeks,2.361848e-25,0.33685,0.031547,0.061831,logdnpt,favorite_count,5.58032e-17,1.0,0.040388,0.079161
9,user_mentions__logdnpt,"[gender, rm, n_candidates, 1, 2, 3, 4, CIUDADA...",<statsmodels.regression.linear_model.Regressio...,const 0.000000e...,const -4.416915e...,const 6.926197e...,0.470759,custom_base,last2weeks,0.05079077,0.064056,0.032758,0.064205,logdnpt,user_mentions,5.30821e-18,1.0,0.043763,0.085775


In [150]:
weights_interaction_df = reg_result_df[
    (reg_result_df["interaction"]=="last2weeks") 
    & (reg_result_df["prep"]=="logdnpt")
    & (reg_result_df["type"]=="custom_base")
].sort_values("param_interaction", ascending=False)
fig = px.bar(weights_interaction_df, "clean_col", "param_interaction", hover_data=["clean_col", "col", "pvalue_col"], 
    error_y="conf95_interaction", labels={"param_col": "Regression coefficient", "clean_col": "Feature"},
    title=f"Regression coefficient for interaction term with last2weeks")
fig.update_xaxes(tickangle=-45)
fig.show()

In [149]:
input_regression_temp_df["last2weeks"].value_counts()

0    548
1    548
Name: last2weeks, dtype: int64

In [151]:
weights_interaction_df.iloc[1]["model"].summary()

0,1,2,3
Dep. Variable:,district_percentage,R-squared:,0.54
Model:,OLS,Adj. R-squared:,0.526
Method:,Least Squares,F-statistic:,39.0
Date:,"Mon, 24 Jan 2022",Prob (F-statistic):,4.05e-155
Time:,13:22:10,Log-Likelihood:,-1108.2
No. Observations:,1096,AIC:,2282.0
Df Residuals:,1063,BIC:,2447.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4.4279,0.064,-68.786,0.000,-4.554,-4.302
gender,0.0283,0.066,0.425,0.671,-0.102,0.159
rm,-0.0879,0.046,-1.922,0.055,-0.178,0.002
n_candidates,-0.2151,0.023,-9.269,0.000,-0.261,-0.170
1,0.8405,0.065,12.875,0.000,0.712,0.969
2,0.5459,0.072,7.568,0.000,0.404,0.687
3,0.3315,0.073,4.515,0.000,0.187,0.476
4,0.0412,0.076,0.539,0.590,-0.109,0.191
CIUDADANOS,-0.1192,0.242,-0.492,0.623,-0.595,0.356

0,1,2,3
Omnibus:,12.127,Durbin-Watson:,1.96
Prob(Omnibus):,0.002,Jarque-Bera (JB):,12.521
Skew:,0.225,Prob(JB):,0.00191
Kurtosis:,3.269,Cond. No.,1.41e+16


In [115]:
reg_result_df["interaction"]

0     last2weeks
1     last2weeks
2     last2weeks
3     last2weeks
4     last2weeks
5     last2weeks
6     last2weeks
7     last2weeks
8     last2weeks
9     last2weeks
10    last2weeks
11    last2weeks
12    last2weeks
13    last2weeks
14    last2weeks
15    last2weeks
16    last2weeks
17    last2weeks
18    last2weeks
19    last2weeks
Name: interaction, dtype: object

## Political Pagerank

### Correlation

### Regression