## 심화통계

In [16]:
# Import Libraries
#초기 설정및 시스템 라이브러리
import platform
import warnings

# 데이터 시각화 라이브러리
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
print(platform.system())
warnings.filterwarnings('ignore')

# 행,열,결과값 생략 없이 보기,세팅
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)
%matplotlib inline

# 시각화 OS별 한글폰트 설정
if platform.system() == 'Windows':
    plt.rcParams['font.family'] = 'Malgun Gothic'  # Windows 폰트 설정
elif platform.system() == 'Mac':
    plt.rcParams['font.family'] = 'AppleGothic'  # Mac 폰트 설정
    
print("="*60)
print("라이브러리 로드 완료!")
print("한글 폰트 설정 완료!")
print("="*60)

Windows
라이브러리 로드 완료!
한글 폰트 설정 완료!


In [17]:
# Load Dataset
sf_master   = pd.read_csv("./data/stat/sf_master.csv")              # sf_master
print("="*60)
print("데이터셋 로드 완료!")
print("="*60)

데이터셋 로드 완료!


In [18]:
print("[sf_master]", sf_master.duplicated().sum())
print("[sf_master]", sf_master.shape)
print(sf_master.columns)

[sf_master] 0
[sf_master] (483539, 42)
Index(['objects_cfpr_id', 'founded_at', 'closed_at', 'description',
       'country_code', 'obj_city_fixed', 'first_investment_at',
       'last_investment_at', 'investment_rounds', 'invested_companies',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'relationships', 'cat_obj_status',
       'obj_category_filled', 'cat_obj_overview', 'obj_state_filled',
       'is_obj_funding_total_usd_private', 'offices_c_id', 'office_id',
       'office_city', 'rel_cf_id_x', 'relationship_growth', 'cat_fr_type',
       'num_fr_type', 'raised_amount_usd', 'is_fr_raised_private',
       'rel_cf_id_y', 'rel_p_id', 'n_founding_max', 'degree_max', 'stem_ratio',
       'co_founders', 'top_university', 'us_born_ratio', 'title_diversity',
       'funding_round_id', 'success_flag', 'round_tempo_months', 'open_rate'],
      dtype='object')


### 여러변수로 성공률 설명하기

##### 심화통계 
<span style = "font-size: 15px">
<b>성공 vs 실패 다변량 로지스틱</b><br>
</span>

<span style = "font-size:  13px;">
표본 단위(GRAIN)검증<br>
각 변수의 독립적 영향이 남는가<br>
* grain: 1행 1스타트업<br>
</span>

In [19]:
# 1) 타깃(성공여부) 정리: 0/1만 사용
sf_master = sf_master[sf_master["success_flag"].isin([0, 1])].copy()

# 2) 라운드-스타트업 유일성 확인(사용자 확인 사항)
dup_obj_fr = sf_master[["objects_cfpr_id", "funding_round_id"]].duplicated().sum()
print("dup (objects_cfpr_id, funding_round_id) =", dup_obj_fr)

# 3) 스타트업 중복(= 여러 라운드 존재) 확인
dup_obj = sf_master["objects_cfpr_id"].duplicated().sum()
print("dup objects_cfpr_id =", dup_obj)

# 4) 성공여부가 스타트업 내에서 변하는지 확인
flag_var = sf_master.groupby("objects_cfpr_id")["success_flag"].nunique().value_counts().sort_index()
print("success_flag nunique per startup:\n", flag_var)
# success_flag nunique가 전부 1이면: 스타트업 단위 집계가 합리적

dup (objects_cfpr_id, funding_round_id) = 0
dup objects_cfpr_id = 20919
success_flag nunique per startup:
 success_flag
1    31707
Name: count, dtype: int64


<span style = "font-size:  13px;">
스타트업 단위로 집계<br>
<b>의사결정 근거</b><br>
* 동일 스타트업의 여러 라운드 중 어떤 행을 고르느냐에 따라 결측이 달라질 수 있음<br>
* 첫 행/마지막 행을 임의로 택하기보다 해당 스타트업에서 관측 가능한 값(비결측)을 최대한 살리는 coalesce 집계가 안전<br>
* 성공여부는 스타트업에서 고정이므로 max로 집계해도 동일<br>
</span>

In [20]:
def coalesce_first_nonnull(s: pd.Series):
    s2 = s.dropna()
    return s2.iloc[0] if len(s2) else np.nan

# 이번 분석에 필요한 컬럼만 선별(속도/메모리 절약)
USE_COLS = [
    "objects_cfpr_id",
    "success_flag",

    # 스타트업 특성
    "obj_category_filled",
    "obj_state_filled",
    "round_tempo_months",
    "is_obj_funding_total_usd_private",
    "title_diversity",
    "relationships",

    # 창업자 특성
    "degree_max",
    "stem_ratio",
    "top_university",
    "n_founding_max"
]

missing_cols = [c for c in USE_COLS if c not in sf_master.columns]
if missing_cols:
    print("[경고] 데이터에 없는 컬럼:", missing_cols)

use_cols_exist = [c for c in USE_COLS if c in sf_master.columns]
df = sf_master[use_cols_exist].copy()

# 스타트업 단위 집계
agg_dict = {c: coalesce_first_nonnull for c in df.columns if c not in ["objects_cfpr_id"]}
agg_dict["success_flag"] = "max"

df_s = (
    df.groupby("objects_cfpr_id", as_index=False)
      .agg(agg_dict)
)

print("startup-level shape:", df_s.shape)
print("dup objects_cfpr_id in df_s:", df_s["objects_cfpr_id"].duplicated().sum())


startup-level shape: (31707, 12)
dup objects_cfpr_id in df_s: 0


<span style = "font-size: 15px">
범주 축약<br>
</span>
<span style = "font-size: 13px">
<b>1. 산업/지역: Top-K + Others + Unknown</b><br>
* 범주가 너무 많으면 계수가 폭증하고 일부 범주는 표본이 희박해 분산이 커짐<br>
* Top-K를 유지하고 나머지는 Others로 묶는 것이 안정적<br>
* 지역(state)은 결측이 많을 수 있으므로 Unknown을 명시<br>
</span>

In [21]:
def topk_with_other_and_unknown(s: pd.Series, k: int, other="Others", unknown="Unknown"):
    s2 = s.fillna(unknown).astype(str)
    topk = s2.value_counts().head(k).index
    return s2.where(s2.isin(topk), other)

df_s["cat_top10"]   = topk_with_other_and_unknown(df_s["obj_category_filled"], k=10)
df_s["state_top10"] = topk_with_other_and_unknown(df_s["obj_state_filled"], k=10)


<span style = "font-size: 15px">
 결측 처리<br>
</span>
<span style = "font-size: 13px">
<b>2. 결측이 많은 연속형: 분위수 bin + Unknown</b><br>
* round_tempo_months, title_diversity, stem_ratio, top_university는 결측이 많을 수 있음<br>
* 단순 대치보다 “Unknown”을 분리하면 (1) 표본 유지 (2) 결측 집단의 구조적 차이를 모델이 흡수 가능<br>
* 분위수 bin은 OR 해석이 직관적입니다(“상위 25% vs 하위 25%”)<br>
</span>

In [22]:
def qbin_with_unknown(x: pd.Series, q=4, unknown="Unknown"):
    out = pd.Series([unknown]*len(x), index=x.index, dtype="object")
    mask = x.notna()
    if mask.sum() == 0:
        return out
    # duplicates='drop': 값이 너무 중복이면 bin 수가 줄어들 수 있음
    out.loc[mask] = pd.qcut(x.loc[mask], q=q, duplicates="drop").astype(str)
    return out

# 결측 많은 변수들은 bin 처리
if "round_tempo_months" in df_s.columns:
    df_s["tempo_q"] = qbin_with_unknown(df_s["round_tempo_months"], q=4)

if "title_diversity" in df_s.columns:
    df_s["titlediv_q"] = qbin_with_unknown(df_s["title_diversity"], q=4)

if "stem_ratio" in df_s.columns:
    df_s["stem_q"] = qbin_with_unknown(df_s["stem_ratio"], q=4)

if "top_university" in df_s.columns:
    df_s["topuni_q"] = qbin_with_unknown(df_s["top_university"], q=4)

# 창업경험: 분포가 치우치면 구간화가 해석에 유리
if "n_founding_max" in df_s.columns:
    nf = df_s["n_founding_max"]
    df_s["founding_bin"] = pd.Series(["Unknown"]*len(nf), index=nf.index, dtype="object")
    m = nf.notna()
    df_s.loc[m, "founding_bin"] = pd.cut(
        nf.loc[m],
        bins=[0, 1, 2, 4, np.inf],
        labels=["1", "2", "3-4", "5+"],
        include_lowest=True
    ).astype(str)

# 학력: 0~4 + Unknown (범주형 처리)
if "degree_max" in df_s.columns:
    df_s["degree_cat"] = df_s["degree_max"].map(
        {0:"0", 1:"1", 2:"2", 3:"3", 4:"4"}
    ).astype("object")
    df_s.loc[df_s["degree_max"].isna(), "degree_cat"] = "Unknown"


<span style = "font-size: 15px">
스케일링 함수<br>
</span>
<span style = "font-size: 13px">
<b>3. 관계규모(relationships): log1p + 표준화(1SD OR)</b><br>
* 관계수는 긴 꼬리(대형 outlier)가 많을 가능성이 큼 → log1p로 완화<br>
* 표준화하면 OR이 “1 SD 증가당”으로 비교 가능<br>
</span>

In [23]:
def zscore(s: pd.Series):
    return (s - s.mean()) / s.std(ddof=0)

df_s["log_rel"] = np.log1p(df_s["relationships"].clip(lower=0))
df_s["log_rel_z"] = zscore(df_s["log_rel"])


<span style = "font-size: 15px">
로지스틱 회귀 실행 + OR 테이블 생성 유틸<br>
</span>

In [24]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

ALPHA = 0.05

def fit_logit(df_in: pd.DataFrame, formula: str):
    # # GLM(Binomial)을 쓰면 robust covariance 지정
    # model = smf.glm(formula=formula, data=df_in, family=sm.families.Binomial()).fit(cov_type="HC1")
    # return model

     # 1) 먼저 non-robust로 적합(여기서는 거의 안 터짐)
    m = smf.glm(formula=formula, data=df_in, family=sm.families.Binomial()).fit()

    # 2) robust(=HC1) 표준오차는 사후에 시도
    try:
        m = m.get_robustcov_results(cov_type="HC1")
    except Exception as e:
        print("[경고] HC1 robust SE 계산 실패 → nonrobust SE로 진행합니다.")
        print("원인:", type(e).__name__, str(e)[:120])

    # “Singular matrix”가 나와도 코드가 멈추지 않고, 어떤 변수가 문제인지 후속 조치를 할 수 있는 상태가 됨

    return m

def or_table(model, alpha=0.05):
    coef = model.params
    se = model.bse
    p = model.pvalues
    ci = model.conf_int(alpha=alpha)
    out = pd.DataFrame({
        "coef(logit)": coef,
        "OR": np.exp(coef),
        "CI_low": np.exp(ci[0]),
        "CI_high": np.exp(ci[1]),
        "p_value": p,
    })
    out["결론(alpha=0.05)"] = np.where(out["p_value"] < alpha, "[귀무가설 기각]", "[귀무가설 기각 못함]")
    return out.sort_values("p_value")


### Model A

<span style = "font-size: 15px;">
Model A: 스타트업 특성이 성공률에 미치는 영향<br>
스타트업 특성: 산업, 지역, 사무실, 성장템포, 금액정보 공개율, 직무다양성, 규모<br>
</span>

<span style = "font-size: 13px;">
모델 A(core): 표본을 최대한 보존<br>
</span>

In [25]:
formula_A_core = """
success_flag
~ C(cat_top10)
+ C(state_top10)
+ is_obj_funding_total_usd_private
+ log_rel_z
"""

mA_core = fit_logit(df_s, formula_A_core)
print(mA_core.summary())

tab_A_core = or_table(mA_core, alpha=ALPHA)
tab_A_core.head(30)


[경고] HC1 robust SE 계산 실패 → nonrobust SE로 진행합니다.
원인: AttributeError 'GLMResults' object has no attribute 'get_robustcov_results'
                 Generalized Linear Model Regression Results                  
Dep. Variable:           success_flag   No. Observations:                31707
Model:                            GLM   Df Residuals:                    31684
Model Family:                Binomial   Df Model:                           22
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -12849.
Date:                    월, 19 1 2026   Deviance:                       25698.
Time:                        09:50:17   Pearson chi2:                 3.22e+04
No. Iterations:                     6   Pseudo R-squ. (CS):            0.08863
Covariance Type:            nonrobust                                         
                                       coef    std err          z      P>|z|      

Unnamed: 0,coef(logit),OR,CI_low,CI_high,p_value,결론(alpha=0.05)
Intercept,-1.548683,0.212528,0.196126,0.230301,0.0,[귀무가설 기각]
log_rel_z,0.696509,2.006735,1.940125,2.075631,0.0,[귀무가설 기각]
C(state_top10)[T.Others],-0.651318,0.521358,0.470289,0.577974,3.23772e-35,[귀무가설 기각]
is_obj_funding_total_usd_private,-0.64185,0.526318,0.465382,0.595232,1.551108e-24,[귀무가설 기각]
C(state_top10)[T.Unknown],-0.418117,0.658285,0.606234,0.714805,2.5551730000000003e-23,[귀무가설 기각]
C(cat_top10)[T.web],0.323346,1.381743,1.233572,1.547712,2.309926e-08,[귀무가설 기각]
C(state_top10)[T.PA],-0.731756,0.481063,0.362112,0.639089,4.435451e-07,[귀무가설 기각]
C(cat_top10)[T.software],0.23525,1.265225,1.153158,1.388184,6.645478e-07,[귀무가설 기각]
C(state_top10)[T.FL],-0.652048,0.520978,0.391524,0.693234,7.683774e-06,[귀무가설 기각]
C(cat_top10)[T.enterprise],0.281702,1.325384,1.156553,1.51886,5.076694e-05,[귀무가설 기각]


<span style = "font-size: 15px;">
모델 A(extended): 성장템포/직무다양성까지 포함(결측은 bin+Unknown으로 유지)<br>
</span>

In [26]:
terms = [
    "C(cat_top10)",
    "C(state_top10)",
    "is_obj_funding_total_usd_private",
    "log_rel_z",
]

if "tempo_q" in df_s.columns:
    terms.append("C(tempo_q)")
if "titlediv_q" in df_s.columns:
    terms.append("C(titlediv_q)")

formula_A_ext = "success_flag ~ " + " + ".join(terms)

mA_ext = fit_logit(df_s, formula_A_ext)
tab_A_ext = or_table(mA_ext, alpha=ALPHA)
tab_A_ext.head(40)

# 해석
# OR > 1: 해당 특성이 있을수록 성공 odds 증가
# OR < 1: 성공 odds 감소
# Unknown 범주의 OR은 “정보가 관측되는 집단과 다른 구조적 차이”일 수 있음(인과 해석 금지)


[경고] HC1 robust SE 계산 실패 → nonrobust SE로 진행합니다.
원인: AttributeError 'GLMResults' object has no attribute 'get_robustcov_results'


Unnamed: 0,coef(logit),OR,CI_low,CI_high,p_value,결론(alpha=0.05)
log_rel_z,0.660061,1.93491,1.864445,2.008038,1.9053409999999997e-266,[귀무가설 기각]
Intercept,-1.536183,0.215201,0.168015,0.27564,4.833364e-34,[귀무가설 기각]
C(state_top10)[T.Others],-0.638788,0.527932,0.476044,0.585476,1.036392e-33,[귀무가설 기각]
is_obj_funding_total_usd_private,-0.628234,0.533533,0.471603,0.603596,1.8733520000000003e-23,[귀무가설 기각]
C(state_top10)[T.Unknown],-0.410774,0.663137,0.610591,0.720206,1.803399e-22,[귀무가설 기각]
C(cat_top10)[T.web],0.332241,1.394089,1.244212,1.56202,1.03284e-08,[귀무가설 기각]
C(cat_top10)[T.software],0.244517,1.277005,1.163567,1.401502,2.581633e-07,[귀무가설 기각]
C(state_top10)[T.PA],-0.726501,0.483598,0.36394,0.642598,5.468317e-07,[귀무가설 기각]
C(state_top10)[T.FL],-0.637336,0.528699,0.397184,0.703761,1.257499e-05,[귀무가설 기각]
C(cat_top10)[T.enterprise],0.270875,1.311111,1.143532,1.503248,0.000103508,[귀무가설 기각]


### Model B

<span style = "font-size: 15px;">
Model B: 창업자 특성이 성공률에 미치는 영향<br>
창업자 특성: 학력, 전공, 학교, 창업경험, 마일스톤<br>
→ “창업자 변수들이 결측이 많더라도 표본을 버리지 않도록” bin/Unknown을 사용<br>
</span>

<span style = "font-size : 13px;">
* “창업자 특성의 순수 효과”를 보려면, 스타트업 규모/네트워크(relationships) 같은 교란을 일부 통제하는 편이 합리적<br>
* “창업자 정보만으로 설명력”을 보려면 통제를 빼는 변형도 가능<br>
</span>

In [None]:
terms_B = ["log_rel_z"]  # (선택) 관계규모를 통제변수로 두고 창업자 영향만 보려면 포함
# 통제 제외하고 싶으면 위 줄을 지우세요.

if "degree_cat" in df_s.columns:
    terms_B.append("C(degree_cat)")
if "stem_q" in df_s.columns:
    terms_B.append("C(stem_q)")
if "topuni_q" in df_s.columns:
    terms_B.append("C(topuni_q)")
if "founding_bin" in df_s.columns:
    terms_B.append("C(founding_bin)")

formula_B = "success_flag ~ " + " + ".join(terms_B)

mB = fit_logit(df_s, formula_B)
tab_B = or_table(mB, alpha=ALPHA)

print(mB.summary())
tab_B.head(40)


[경고] HC1 robust SE 계산 실패 → nonrobust SE로 진행합니다.
원인: AttributeError 'GLMResults' object has no attribute 'get_robustcov_results'
                 Generalized Linear Model Regression Results                  
Dep. Variable:           success_flag   No. Observations:                31707
Model:                            GLM   Df Residuals:                    31695
Model Family:                Binomial   Df Model:                           11
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -13033.
Date:                    월, 19 1 2026   Deviance:                       26067.
Time:                        09:50:19   Pearson chi2:                 3.24e+04
No. Iterations:                   100   Pseudo R-squ. (CS):            0.07798
Covariance Type:            nonrobust                                         
                                 coef    std err          z      P>|z|      [0.025

Unnamed: 0,coef(logit),OR,CI_low,CI_high,p_value,결론(alpha=0.05)
log_rel_z,0.761829,2.142191,2.072263,2.214479,0.0,[귀무가설 기각]
Intercept,-2.118664,0.120192,0.066369,0.217665,2.704099e-12,[귀무가설 기각]
C(founding_bin)[T.2],0.173659,1.18965,1.020919,1.386268,0.02606449,[귀무가설 기각]
C(stem_q)[T.Unknown],-0.034633,0.965959,0.935248,0.997679,0.03564971,[귀무가설 기각]
C(founding_bin)[T.Unknown],-0.034633,0.965959,0.935248,0.997679,0.03564971,[귀무가설 기각]
C(topuni_q)[T.Unknown],-0.034633,0.965959,0.935248,0.997679,0.03564971,[귀무가설 기각]
C(degree_cat)[T.3],0.463645,1.589858,0.869875,2.905761,0.1318404,[귀무가설 기각 못함]
C(degree_cat)[T.1],0.630634,1.878802,0.768774,4.591592,0.1666024,[귀무가설 기각 못함]
C(degree_cat)[T.Unknown],0.400364,1.492368,0.820843,2.713261,0.1892932,[귀무가설 기각 못함]
C(degree_cat)[T.2],0.338635,1.403031,0.765809,2.570482,0.2729845,[귀무가설 기각 못함]


In [30]:
terms_B = []  

if "degree_cat" in df_s.columns:
    terms_B.append("C(degree_cat)")
if "stem_q" in df_s.columns:
    terms_B.append("C(stem_q)")
if "topuni_q" in df_s.columns:
    terms_B.append("C(topuni_q)")
if "founding_bin" in df_s.columns:
    terms_B.append("C(founding_bin)")

formula_B = "success_flag ~ " + " + ".join(terms_B)

mB = fit_logit(df_s, formula_B)
tab_B = or_table(mB, alpha=ALPHA)

print(mB.summary())
tab_B.head(40)


[경고] HC1 robust SE 계산 실패 → nonrobust SE로 진행합니다.
원인: AttributeError 'GLMResults' object has no attribute 'get_robustcov_results'
                 Generalized Linear Model Regression Results                  
Dep. Variable:           success_flag   No. Observations:                31707
Model:                            GLM   Df Residuals:                    31696
Model Family:                Binomial   Df Model:                           10
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -14136.
Date:                    월, 19 1 2026   Deviance:                       28272.
Time:                        10:34:15   Pearson chi2:                 3.15e+04
No. Iterations:                   100   Pseudo R-squ. (CS):            0.01157
Covariance Type:            nonrobust                                         
                                 coef    std err          z      P>|z|      [0.025

Unnamed: 0,coef(logit),OR,CI_low,CI_high,p_value,결론(alpha=0.05)
Intercept,-1.670422,0.188168,0.105206,0.336551,1.791476e-08,[귀무가설 기각]
C(founding_bin)[T.2],0.2193479,1.245264,1.075624,1.44166,0.003328736,[귀무가설 기각]
C(degree_cat)[T.4],0.6696323,1.953519,1.061025,3.596746,0.0315421,[귀무가설 기각]
C(degree_cat)[T.3],0.6375393,1.89182,1.048282,3.414142,0.0343023,[귀무가설 기각]
C(degree_cat)[T.1],0.7837737,2.18972,0.930082,5.155323,0.07280481,[귀무가설 기각 못함]
C(degree_cat)[T.2],0.5157644,1.674918,0.926748,3.027092,0.08763001,[귀무가설 기각 못함]
C(founding_bin)[T.3-4],0.1658141,1.180354,0.972296,1.432933,0.09374058,[귀무가설 기각 못함]
C(founding_bin)[T.5+],0.2682847,1.307719,0.909959,1.879348,0.1470587,[귀무가설 기각 못함]
C(degree_cat)[T.Unknown],0.3337826,1.39624,0.773341,2.520862,0.2681724,[귀무가설 기각 못함]
C(stem_q)[T.Unknown],-150488100000.0,0.0,0.0,inf,0.6536019,[귀무가설 기각 못함]


### Model C

<span style = "font-size: 15px">
Model C: 스타트업 + 창업자의 상호작용(시너지)<br>
</span>
<span style="font-size: 13px;">
<b>시너지의 통계적 정의</b><br>
* “함께 적용될 때 더 큰 시너지”는 회귀에서 상호작용항(interaction) 으로 정의하는 것이 표준<br>
</span>

In [28]:
# A 확장 + B 변수를 모두 포함(상호작용 없이)
terms_C_base = [
    "C(cat_top10)",
    "C(state_top10)",
    "is_obj_funding_total_usd_private",
    "log_rel_z",
]

if "tempo_q" in df_s.columns:
    terms_C_base.append("C(tempo_q)")
if "titlediv_q" in df_s.columns:
    terms_C_base.append("C(titlediv_q)")

if "degree_cat" in df_s.columns:
    terms_C_base.append("C(degree_cat)")
if "stem_q" in df_s.columns:
    terms_C_base.append("C(stem_q)")
if "topuni_q" in df_s.columns:
    terms_C_base.append("C(topuni_q)")
if "founding_bin" in df_s.columns:
    terms_C_base.append("C(founding_bin)")

formula_C_base = "success_flag ~ " + " + ".join(terms_C_base)

mC_base = fit_logit(df_s, formula_C_base)
tab_C_base = or_table(mC_base, alpha=ALPHA)
tab_C_base.head(40)


[경고] HC1 robust SE 계산 실패 → nonrobust SE로 진행합니다.
원인: AttributeError 'GLMResults' object has no attribute 'get_robustcov_results'


Unnamed: 0,coef(logit),OR,CI_low,CI_high,p_value,결론(alpha=0.05)
log_rel_z,0.6610129,1.936753,1.866096,2.010085,2.958087e-266,[귀무가설 기각]
C(state_top10)[T.Others],-0.6404601,0.52705,0.475089,0.584693,1.1340020000000001e-33,[귀무가설 기각]
is_obj_funding_total_usd_private,-0.6269736,0.534206,0.471534,0.605208,7.046194e-23,[귀무가설 기각]
C(state_top10)[T.Unknown],-0.4131018,0.661595,0.608987,0.718748,1.487583e-22,[귀무가설 기각]
Intercept,-1.925611,0.145787,0.075693,0.280788,8.510558e-09,[귀무가설 기각]
C(cat_top10)[T.web],0.3293966,1.390129,1.240442,1.55788,1.455499e-08,[귀무가설 기각]
C(cat_top10)[T.software],0.244432,1.276896,1.163245,1.401651,2.757701e-07,[귀무가설 기각]
C(state_top10)[T.PA],-0.7226078,0.485485,0.366641,0.642851,4.550037e-07,[귀무가설 기각]
C(state_top10)[T.FL],-0.6423553,0.526052,0.396414,0.698085,8.600024e-06,[귀무가설 기각]
C(cat_top10)[T.enterprise],0.2705787,1.310723,1.142438,1.503797,0.0001137073,[귀무가설 기각]


<span style = "font-size: 15px;">
시너지 후보(상호작용) 3개부터 시작: 과적합/폭발 방지<br>
</span>
<span style = "font-size: 13px;">
* 산업(Top10) × 학력(6범주) 같은 상호작용을 한 번에 넣으면 파라미터가 급증하고 희소셀로 불안정해질 수 있음<br>
* “가설적으로 가장 의미 있는” 상호작용부터 소수만 넣고, 우도비(LR) 검정으로 추가 효용을 확인<br>
</span>

In [29]:
from scipy import stats

def lr_test(m_small, m_big):
    # GLM은 llf 제공
    LR = 2 * (m_big.llf - m_small.llf)
    df = m_big.df_model - m_small.df_model
    p = stats.chi2.sf(LR, df)
    return LR, df, p

# 상호작용 포함 모델
interaction_terms = [
    "is_obj_funding_total_usd_private:C(topuni_q)" if "topuni_q" in df_s.columns else None,
    "is_obj_funding_total_usd_private:C(degree_cat)" if "degree_cat" in df_s.columns else None,
    "log_rel_z:C(founding_bin)" if "founding_bin" in df_s.columns else None,
]
interaction_terms = [t for t in interaction_terms if t is not None]

formula_C_int = formula_C_base + " + " + " + ".join(interaction_terms)

mC_int = fit_logit(df_s, formula_C_int)
tab_C_int = or_table(mC_int, alpha=ALPHA)

LR, ddf, p_lr = lr_test(mC_base, mC_int)
print(f"[LR test] base vs interaction: LR={LR:.3f}, df={ddf}, p={p_lr:.4g}")

tab_C_int.head(60)

# LR-test p < 0.05이면: “상호작용(시너지) 추가”가 모델 적합을 유의미하게 개선
# 상호작용항 개별 p-value가 유의하면: 어떤 조합이 시너지를 만드는지 OR로 보고 가능

[경고] HC1 robust SE 계산 실패 → nonrobust SE로 진행합니다.
원인: AttributeError 'GLMResults' object has no attribute 'get_robustcov_results'
[LR test] base vs interaction: LR=12.092, df=9, p=0.2082


Unnamed: 0,coef(logit),OR,CI_low,CI_high,p_value,결론(alpha=0.05)
C(state_top10)[T.Others],-0.6394836,0.527565,0.475701,0.585083,9.144605999999999e-34,[귀무가설 기각]
C(state_top10)[T.Unknown],-0.4156796,0.659892,0.607461,0.716847,7.489041e-23,[귀무가설 기각]
log_rel_z,0.6237438,1.865901,1.608215,2.164876,1.938526e-16,[귀무가설 기각]
Intercept,-2.016057,0.13318,0.066908,0.265091,9.457217e-09,[귀무가설 기각]
C(cat_top10)[T.web],0.3265161,1.38613,1.236761,1.55354,1.991907e-08,[귀무가설 기각]
C(cat_top10)[T.software],0.2458671,1.27873,1.165062,1.403487,2.261212e-07,[귀무가설 기각]
C(state_top10)[T.PA],-0.7214448,0.48605,0.365888,0.645674,6.385459e-07,[귀무가설 기각]
C(state_top10)[T.FL],-0.6419825,0.526248,0.395396,0.700404,1.076074e-05,[귀무가설 기각]
C(cat_top10)[T.enterprise],0.2703375,1.310407,1.143036,1.502285,0.0001055611,[귀무가설 기각]
C(state_top10)[T.NY],-0.2471599,0.781016,0.68887,0.885487,0.0001140185,[귀무가설 기각]
