In [2]:
import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
from statsmodels.othermod.betareg import BetaModel
from statsmodels.genmod.families.links import cloglog,log,logit,probit,nbinom
import statsmodels.formula.api as smf

import itertools
import statsmodels.api as sm
import pandas as pd

In [9]:
# Load the simulation results and derive the regression features.
# Rows with num_edges == 0 are dropped: we assume the friendship graph has at
# least one connection, and log(num_edges) is undefined at 0.
# Each derived column is computed exactly once (log_num_edges used to be
# assigned twice); column order matches the original cell.
df = (
    pd.read_csv('mafia_win_rate_heatmap_data.csv')
    .loc[lambda d: d['num_edges'] > 0]
    .assign(
        log_num_edges=lambda d: np.log(d['num_edges']),
        red_ratio_sq=lambda d: d['red_ratio'] ** 2,
        num_edges_sq=lambda d: d['num_edges'] ** 2,
        interaction_term=lambda d: d['red_ratio'] * d['num_edges'],
        num_edges_sqrt=lambda d: np.sqrt(d['num_edges']),
    )
)



In [11]:
# Sanity check on the log transform: count of missing values, then count of
# infinite values, in log_num_edges (printed one per line).
print(
    df['log_num_edges'].isna().sum(),
    np.isinf(df['log_num_edges']).sum(),
    sep='\n',
)

0
0


In [18]:
# Model 1: OLS with linear and squared terms in red_ratio and num_edges plus
# their interaction (written as red_ratio*num_edges in the formula).
model1 = smf.ols(
    formula='mafia_win_rate ~ red_ratio + num_edges + red_ratio*num_edges + red_ratio_sq + num_edges_sq',
    data=df,
).fit()

print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:         mafia_win_rate   R-squared:                       0.779
Model:                            OLS   Adj. R-squared:                  0.777
Method:                 Least Squares   F-statistic:                     660.2
Date:                Thu, 08 May 2025   Prob (F-statistic):          2.23e-304
Time:                        22:00:29   Log-Likelihood:                 2186.5
No. Observations:                 945   AIC:                            -4361.
Df Residuals:                     939   BIC:                            -4332.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               0.4863    

In [None]:
# Model 2: OLS that is quadratic in red_ratio and uses log(num_edges) as the
# only edge-count term (no interaction).
model2 = smf.ols(
    formula='mafia_win_rate ~ red_ratio + red_ratio_sq + log_num_edges',
    data=df,
).fit()

print(model2.summary())

Model Summary:
                            OLS Regression Results                            
Dep. Variable:         mafia_win_rate   R-squared:                       0.721
Model:                            OLS   Adj. R-squared:                  0.721
Method:                 Least Squares   F-statistic:                     812.3
Date:                Thu, 08 May 2025   Prob (F-statistic):          1.43e-260
Time:                        21:46:02   Log-Likelihood:                 2078.1
No. Observations:                 945   AIC:                            -4148.
Df Residuals:                     941   BIC:                            -4129.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.3601      0

In [19]:
# Model 3: OLS with linear and squared terms in both predictors, the
# precomputed red_ratio * num_edges column (interaction_term), and
# sqrt(num_edges).
model3 = smf.ols(
    formula='mafia_win_rate ~ num_edges + red_ratio + red_ratio_sq + num_edges_sq +interaction_term + num_edges_sqrt',
    data=df,
).fit()

print(model3.summary())

                            OLS Regression Results                            
Dep. Variable:         mafia_win_rate   R-squared:                       0.780
Model:                            OLS   Adj. R-squared:                  0.778
Method:                 Least Squares   F-statistic:                     553.2
Date:                Thu, 08 May 2025   Prob (F-statistic):          5.50e-304
Time:                        22:01:15   Log-Likelihood:                 2188.9
No. Observations:                 945   AIC:                            -4364.
Df Residuals:                     938   BIC:                            -4330.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.4668      0.010  

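Model 3 has the lowest AIC (-4364) and the highest R-squared (0.780) of the three models, so we build our prediction function from its fitted coefficients. The gain over model 1 is small, though (AIC -4361, R-squared 0.779), and BIC marginally favors model 1 (-4332 vs. -4330).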

In [25]:
# Pull the fitted model3 coefficients into named scalars. The order of the
# targets matches the order of the term names below:
#   beta_0 Intercept      beta_1 num_edges      beta_2 red_ratio
#   beta_3 red_ratio_sq   beta_4 num_edges_sq   beta_5 interaction_term
#   beta_6 num_edges_sqrt
(beta_0, beta_1, beta_2, beta_3, beta_4, beta_5, beta_6) = (
    model3.params[term]
    for term in (
        'Intercept',
        'num_edges',
        'red_ratio',
        'red_ratio_sq',
        'num_edges_sq',
        'interaction_term',
        'num_edges_sqrt',
    )
)

# Goodness of fit of model3, kept alongside its coefficients.
r_squared = model3.rsquared


def predict(red_ratio, num_edges, coefs=None):
    """Evaluate the fitted win-rate polynomial at the given inputs.

    Computes, with r = red_ratio and E = num_edges::

        b0 + b1*E + b2*r + b3*r**2 + b4*E**2 + b5*E*r + b6*sqrt(E)

    Both inputs may be scalars or arrays (they are broadcast against each
    other), so the function can be evaluated on a whole grid at once. The
    square root uses ``np.sqrt`` because ``math.sqrt`` only accepts scalars
    and raises ``TypeError`` on an array of edge counts.

    Parameters
    ----------
    red_ratio : float or array-like
        Red-edge ratio(s).
    num_edges : float or array-like
        Edge count(s). Negative values produce ``nan`` from the square root.
    coefs : mapping, optional
        Coefficients keyed by 'Intercept', 'num_edges', 'red_ratio',
        'red_ratio_sq', 'num_edges_sq', 'interaction_term' and
        'num_edges_sqrt' (for example a fitted ``params`` Series). When
        omitted, the module-level ``beta_0`` ... ``beta_6`` are used.

    Returns
    -------
    numpy scalar or numpy.ndarray
        Predicted value(s), shaped like the broadcast of the two inputs.
    """
    if coefs is None:
        b0, b1, b2, b3, b4, b5, b6 = (
            beta_0, beta_1, beta_2, beta_3, beta_4, beta_5, beta_6
        )
    else:
        b0 = coefs['Intercept']
        b1 = coefs['num_edges']
        b2 = coefs['red_ratio']
        b3 = coefs['red_ratio_sq']
        b4 = coefs['num_edges_sq']
        b5 = coefs['interaction_term']
        b6 = coefs['num_edges_sqrt']

    # Convert once so lists, scalars and arrays all take the same code path.
    red_ratio = np.asarray(red_ratio, dtype=float)
    num_edges = np.asarray(num_edges, dtype=float)

    return (
        b0
        + b1 * num_edges
        + b2 * red_ratio
        + b3 * red_ratio**2
        + b4 * num_edges**2
        + b5 * num_edges * red_ratio
        + b6 * np.sqrt(num_edges)
    )

def plot_vs_red_ratio(fixed_edges):
    """Plot predicted vs. simulated mafia win rate across red ratios.

    The model curve is ``predict`` evaluated on 100 evenly spaced red ratios
    in [0, 1] at the given edge count. Rows of ``df`` whose ``num_edges`` is
    within 2 of ``fixed_edges`` are overlaid as a scatter for comparison.

    Parameters
    ----------
    fixed_edges : number
        Edge count held constant along the curve.
    """
    ratio_grid = np.linspace(0, 1, 100)
    model_curve = predict(ratio_grid, fixed_edges)

    # Simulated rows close enough (+/- 2 edges) to the requested edge count.
    nearby = df.loc[(df['num_edges'] - fixed_edges).abs() <= 2]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(ratio_grid, model_curve, label='Predicted (Model)', color='blue')
    ax.scatter(
        nearby['red_ratio'],
        nearby['mafia_win_rate'],
        color='orange',
        alpha=0.6,
        label='Simulated Data',
    )
    ax.set_title(f'Mafia Win Rate vs Red Ratio (Edges = {fixed_edges})')
    ax.set_xlabel('Red Edge Ratio')
    ax.set_ylabel('Mafia Win Rate')
    ax.set_ylim(0, 1)  # win rate is plotted on a fixed 0-1 axis
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()

def plot_vs_num_edges(fixed_red_ratio):
    """Plot predicted vs. simulated mafia win rate across edge counts.

    The model curve is ``predict`` evaluated on 100 evenly spaced edge counts
    spanning the observed ``num_edges`` range in ``df``, at the given red
    ratio. ``predict`` therefore receives an array for ``num_edges``. Rows of
    ``df`` whose ``red_ratio`` is within 0.01 of ``fixed_red_ratio`` are
    overlaid as a scatter for comparison.

    Parameters
    ----------
    fixed_red_ratio : float
        Red ratio held constant along the curve.
    """
    observed_edges = df['num_edges']
    edge_grid = np.linspace(observed_edges.min(), observed_edges.max(), 100)
    model_curve = predict(fixed_red_ratio, edge_grid)

    # Simulated rows close enough (+/- 0.01) to the requested red ratio.
    nearby = df.loc[(df['red_ratio'] - fixed_red_ratio).abs() <= 0.01]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(edge_grid, model_curve, label='Predicted (Model)', color='green')
    ax.scatter(
        nearby['num_edges'],
        nearby['mafia_win_rate'],
        color='orange',
        alpha=0.6,
        label='Simulated Data',
    )
    ax.set_title(f'Mafia Win Rate vs Number of Edges (Red Ratio = {fixed_red_ratio})')
    ax.set_xlabel('Number of Edges')
    ax.set_ylabel('Mafia Win Rate')
    ax.set_ylim(0, 1)  # win rate is plotted on a fixed 0-1 axis
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()

In [26]:
# Example: model prediction for red_ratio=0.7 and num_edges=37.
predict(0.7, 37)

0.5608354697901756