## 심화통계

In [18]:
# Import Libraries
#초기 설정및 시스템 라이브러리
import platform
import warnings

# 데이터 시각화 라이브러리
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
# Report the host OS; the font selection further down branches on this value
print(platform.system())
# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

# Show wide / long frames and long cell contents without truncation
pd.options.display.max_columns = 50
pd.options.display.max_rows = 500
pd.options.display.max_colwidth = None
%matplotlib inline

# Pick a Korean-capable plot font for the current operating system
if platform.system() == 'Windows':
    plt.rcParams['font.family'] = 'Malgun Gothic'  # Windows font
elif platform.system() == 'Darwin':
    # platform.system() reports macOS as 'Darwin' (never 'Mac'), so the
    # previous comparison against 'Mac' could not match on a Mac.
    plt.rcParams['font.family'] = 'AppleGothic'  # macOS font
    
print("="*60)
print("라이브러리 로드 완료!")
print("한글 폰트 설정 완료!")
print("="*60)

Windows
라이브러리 로드 완료!
한글 폰트 설정 완료!


In [19]:
# Load the dataset; the path is relative to the notebook's working directory
sf_master   = pd.read_csv("./data/stat/sf_master.csv")              # master table used by every model below
# Banner confirming the load finished
print("="*60)
print("데이터셋 로드 완료!")
print("="*60)

데이터셋 로드 완료!


In [20]:
# Quick sanity check: number of fully duplicated rows, frame shape, column names
print(f"[sf_master] {sf_master.duplicated().sum()}")
print(f"[sf_master] {sf_master.shape}")
print(sf_master.columns)

[sf_master] 0
[sf_master] (483539, 42)
Index(['objects_cfpr_id', 'founded_at', 'closed_at', 'description',
       'country_code', 'obj_city_fixed', 'first_investment_at',
       'last_investment_at', 'investment_rounds', 'invested_companies',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'relationships', 'cat_obj_status',
       'obj_category_filled', 'cat_obj_overview', 'obj_state_filled',
       'is_obj_funding_total_usd_private', 'offices_c_id', 'office_id',
       'office_city', 'rel_cf_id_x', 'relationship_growth', 'cat_fr_type',
       'num_fr_type', 'raised_amount_usd', 'is_fr_raised_private',
       'rel_cf_id_y', 'rel_p_id', 'n_founding_max', 'degree_max', 'stem_ratio',
       'co_founders', 'top_university', 'us_born_ratio', 'title_diversity',
       'funding_round_id', 'success_flag', 'round_tempo_months', 'open_rate'],
      dtype='object')


### 여러변수로 성공률 설명하기

##### 심화통계 
<span style = "font-size: 15px">
<b>성공 vs 실패 다변량 로지스틱</b><br>
</span>

<span style = "font-size:  13px;">
스타트업 특성: 산업, 지역, 사무실, 성장템포, 금액정보 공개율 <br>
창업자 특성: 학력, 전공, 학교, 창업경험, 직무다양성, 규모, 마일스톤<br>
각 변수의 독립적 영향이 남는가<br>
grain: 1행 = 1스타트업 × 1펀딩 라운드 (<code>objects_cfpr_id</code>, <code>funding_round_id</code> 조합 기준으로 중복 없음)<br>
</span>

In [21]:
# Grain check: count rows that repeat an (objects_cfpr_id, funding_round_id) pair.
# The printed value is the number of duplicates, so 0 means the pair is unique -> OK.
print(
    "grain unique:",
    sf_master.duplicated(subset=["objects_cfpr_id", "funding_round_id"]).sum(),
)

grain unique: 0


In [24]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# -----------------------
# 1) 전처리 도우미
# -----------------------
def prep_base(sf_master: pd.DataFrame) -> pd.DataFrame:
    """Build the startup-level modelling frame used by the logistic models.

    Works on a copy of ``sf_master``. Rows whose ``success_flag`` is not a
    clean 0/1 value are dropped; all other gaps are filled rather than dropped.

    Args:
        sf_master: Raw frame. Must contain ``success_flag``,
            ``obj_category_filled``, ``obj_state_filled``,
            ``is_obj_funding_total_usd_private``, ``relationships``,
            ``round_tempo_months`` and the columns read by
            ``build_office_features``.

    Returns:
        The cleaned frame with these columns added, in this order:
        ``log_relationships``, ``round_tempo_isna``, ``round_tempo_filled``,
        ``log_round_tempo``, ``n_offices``, ``office_city_rep``, ``has_office``.
    """
    frame = sf_master.copy()

    # 1. Target: coerce to numeric, blank out anything that is not a finite
    #    0/1, then drop those rows (the only rows this function removes).
    frame["success_flag"] = pd.to_numeric(frame["success_flag"], errors="coerce")
    not_finite = ~np.isfinite(frame["success_flag"])
    frame.loc[not_finite, "success_flag"] = np.nan
    not_binary = ~frame["success_flag"].isin([0, 1])
    frame.loc[not_binary, "success_flag"] = np.nan
    frame = frame.dropna(subset=["success_flag"]).copy()
    frame["success_flag"] = frame["success_flag"].astype(int)

    # 2. Categoricals: a missing value becomes the explicit level "Unknown"
    for cat_col in ("obj_category_filled", "obj_state_filled"):
        frame[cat_col] = frame[cat_col].astype("object").fillna("Unknown")

    # 3. Binary indicator stored as int
    private_flag = frame["is_obj_funding_total_usd_private"]
    frame["is_obj_funding_total_usd_private"] = private_flag.astype(int)

    # 4. Relationship count: missing -> 0, plus a log1p-transformed copy
    rel_count = pd.to_numeric(frame["relationships"], errors="coerce").fillna(0)
    frame["relationships"] = rel_count
    frame["log_relationships"] = np.log1p(rel_count)

    # 5. Round tempo: keep a missing-value flag, then impute with the median
    tempo = pd.to_numeric(frame["round_tempo_months"], errors="coerce")
    frame["round_tempo_months"] = tempo
    frame["round_tempo_isna"] = tempo.isna().astype(int)
    fill_value = tempo.median(skipna=True)
    if np.isnan(fill_value):
        # every value is missing, so there is no median to impute with
        fill_value = 0.0
    tempo_filled = tempo.fillna(fill_value)
    frame["round_tempo_filled"] = tempo_filled
    frame["log_round_tempo"] = np.log1p(tempo_filled)

    # 6. Company-level office features, computed from the unfiltered input
    offices = build_office_features(sf_master)
    offices["has_office"] = (offices["n_offices"] >= 1).astype(int)
    office_cols = ["objects_cfpr_id", "n_offices", "office_city_rep", "has_office"]
    frame = pd.merge(frame, offices[office_cols], on="objects_cfpr_id", how="left")

    # Rows without a match in the office table get neutral defaults
    frame["office_city_rep"] = frame["office_city_rep"].fillna("Unknown").astype("object")
    for count_col in ("n_offices", "has_office"):
        frame[count_col] = frame[count_col].fillna(0).astype(int)

    return frame

# 스타트업 특성(사무실)
def build_office_features(sf_master: pd.DataFrame) -> pd.DataFrame:
    """Aggregate office information to one row per company.

    Args:
        sf_master: Frame containing ``objects_cfpr_id``, ``office_id`` and
            ``office_city``.

    Returns:
        A frame with one row per ``objects_cfpr_id`` and the columns
        ``n_offices`` (number of distinct numeric office ids),
        ``office_city_rep`` (most frequent city, ``"Unknown"`` when the
        company has no usable city) and ``has_office`` (1 when
        ``n_offices >= 1``, else 0).
    """
    x = sf_master[["objects_cfpr_id", "office_id", "office_city"]].copy()

    # office_id may mix numbers and strings; non-numeric ids become NaN
    x["office_id"] = pd.to_numeric(x["office_id"], errors="coerce")

    def _clean_city(value):
        # Trim surrounding whitespace and treat empty / whitespace-only names
        # as missing, so a blank string can never be picked as the
        # representative city and " SF" / "SF" count as the same city.
        if isinstance(value, str):
            value = value.strip()
            return value if value else np.nan
        return value

    x["office_city"] = x["office_city"].astype("object").map(_clean_city)

    # (1) Office count: distinct office_id values per company
    office_cnt = (
        x.groupby("objects_cfpr_id")["office_id"]
         .nunique(dropna=True)
         .rename("n_offices")
         .reset_index()
    )

    # (2) Representative city: the per-company mode of office_city
    def _mode_city(s: pd.Series):
        s = s.dropna()
        if s.empty:
            return "Unknown"
        m = s.mode()
        # mode() returns every tied value; take the first one
        return m.iloc[0] if len(m) > 0 else "Unknown"

    office_city_rep = (
        x.groupby("objects_cfpr_id")["office_city"]
         .apply(_mode_city)
         .rename("office_city_rep")
         .reset_index()
    )

    out = office_cnt.merge(office_city_rep, on="objects_cfpr_id", how="outer")

    # Derived columns: integer office count and a has-any-office indicator
    out["n_offices"] = out["n_offices"].fillna(0).astype(int)
    out["has_office"] = (out["n_offices"] >= 1).astype(int)

    return out


# 창업자 특성 전처리
def prep_founder_subset(sf_master: pd.DataFrame) -> pd.DataFrame:
    """Build the founder-level subsample used by Models B and C.

    Keeps only rows where all five required founder traits are present, adds
    log-transformed founder features, then applies ``prep_base`` so the
    startup-level control variables are available on the same rows.

    Args:
        sf_master: Raw frame containing the founder columns (``degree_max``,
            ``stem_ratio``, ``top_university``, ``n_founding_max``,
            ``title_diversity``, ``relationships``) plus everything
            ``prep_base`` needs.

    Returns:
        The ``prep_base`` output for the founder subsample, with
        ``log_title_diversity`` and ``log_n_founding_max`` added.
    """
    d = sf_master.copy()

    # Traits that must all be present; ``relationships`` is numeric too but a
    # missing value is treated as 0 instead of dropping the row.
    required_cols = ["degree_max", "stem_ratio", "top_university", "n_founding_max", "title_diversity"]
    for c in required_cols + ["relationships"]:
        d[c] = pd.to_numeric(d[c], errors="coerce")

    # Keep only cases with every required founder trait (Models B / C)
    d = d.dropna(subset=required_cols).copy()

    # Transformations
    d["log_relationships"] = np.log1p(d["relationships"].fillna(0))
    d["log_title_diversity"] = np.log1p(d["title_diversity"])
    d["log_n_founding_max"] = np.log1p(d["n_founding_max"])

    d["top_university"] = d["top_university"].astype(int)

    # success_flag is deliberately NOT cast to int here. Casting first would
    # raise on missing values and would silently truncate non-binary values
    # (e.g. 0.7 -> 0) before prep_base gets the chance to validate them.
    # prep_base coerces, validates, drops invalid rows and casts the target.
    d = prep_base(d)

    return d

# GLM(Binomial) 적합 함수
def fit_glm_binom(formula: str, data: pd.DataFrame):
    """Fit a binomial GLM from a formula and return the fitted result.

    The fit requests the HC3 covariance type (``cov_type="HC3"``).

    Args:
        formula: Model formula string passed to ``smf.glm``.
        data: Frame holding every variable named in ``formula``.

    Returns:
        The results object returned by ``.fit``.
    """
    binomial_glm = smf.glm(
        formula=formula,
        data=data,
        family=sm.families.Binomial(),
    )
    return binomial_glm.fit(cov_type="HC3")

# 결과 테이블
def result_table(model) -> pd.DataFrame:
    """Summarise a fitted model as an odds-ratio table sorted by p-value.

    Args:
        model: Fitted results object exposing ``params``, ``conf_int()`` and
            ``pvalues``; ``conf_int()`` is indexed with ``[0]`` (lower) and
            ``[1]`` (upper).

    Returns:
        One row per coefficient with the coefficient, its exponentiated value
        (odds ratio), the exponentiated interval bounds, the p-value and a
        reject / fail-to-reject label at alpha = 0.05, ordered by ascending
        p-value.
    """
    coefs = model.params
    bounds = model.conf_int()
    p_vals = model.pvalues

    table = pd.DataFrame(
        {
            "coef": coefs,
            "OR": np.exp(coefs),
            "CI_low(OR)": np.exp(bounds[0]),
            "CI_high(OR)": np.exp(bounds[1]),
            "p_value": p_vals,
        }
    )

    # Label each coefficient by whether its p-value clears the 5% threshold
    is_significant = table["p_value"] < 0.05
    table["판정(alpha=0.05)"] = np.where(is_significant, "귀무가설 기각", "귀무가설 기각 못함")

    return table.sort_values(by="p_value")

In [25]:
# -----------------------
# 2) Model A: full sample (startup characteristics)
# Effect of the company characteristics themselves, on the full sample
# -----------------------
df_A = prep_base(sf_master)

# Number of distinct levels per categorical, then the ten most frequent levels of each
print(df_A["obj_category_filled"].nunique(), df_A["obj_state_filled"].nunique())
display(df_A["obj_category_filled"].value_counts().head(10))
display(df_A["obj_state_filled"].value_counts().head(10))

46 55


obj_category_filled
software       9544
biotech        5981
web            4347
mobile         3666
enterprise     3232
ecommerce      2549
advertising    2316
games_video    2155
cleantech      1961
hardware       1734
Name: count, dtype: int64

obj_state_filled
Unknown    16350
CA         14392
NY          3559
MA          3303
TX          1609
WA          1394
CO           926
PA           916
FL           817
IL           752
Name: count, dtype: int64

In [26]:
df_A.shape

(52626, 49)

In [27]:
# Top-K + Others helper
def topk_with_other(s: pd.Series, k=10, other="Others"):
    """Keep the ``k`` most frequent labels of ``s`` and pool the rest.

    Missing values are first replaced by the label ``"Unknown"`` and every
    value is converted to ``str``; ``"Unknown"`` competes for a top-k slot
    like any other label.

    Args:
        s: Categorical-like series.
        k: Number of most frequent labels to keep.
        other: Replacement label for everything outside the top ``k``.

    Returns:
        A string series with the same index as ``s``.
    """
    labels = s.fillna("Unknown").astype(str)
    keep = labels.value_counts().nlargest(k).index
    outside_top = ~labels.isin(keep)
    return labels.mask(outside_top, other)

In [28]:
# Odds-ratio (OR) summary
def short_or_table(result, topn=30):
    """Return the ``topn`` coefficients with the smallest p-values as an OR table.

    Args:
        result: Fitted results object exposing ``params``, ``conf_int()`` and
            ``pvalues``; ``conf_int()`` is indexed with ``[0]`` (lower) and
            ``[1]`` (upper).
        topn: Number of rows to keep after sorting by ascending p-value.

    Returns:
        A frame with the coefficient, its exponentiated value (odds ratio),
        the exponentiated interval bounds, the p-value and a reject /
        fail-to-reject label at alpha = 0.05.
    """
    coefs = result.params
    bounds = result.conf_int()
    p_vals = result.pvalues

    # Label each coefficient by whether its p-value clears the 5% threshold
    verdict = np.where(p_vals < 0.05, "귀무가설 기각", "귀무가설 기각 못함")

    columns = {
        "coef": coefs,
        "OR": np.exp(coefs),
        "CI_low": np.exp(bounds[0]),
        "CI_high": np.exp(bounds[1]),
        "p_value": p_vals,
        "판정(alpha=0.05)": verdict,
    }
    ranked = pd.DataFrame(columns).sort_values(by="p_value")
    return ranked.head(topn)

In [29]:
# Memory saving:
# keep only the columns the model needs so the formula machinery works on a smaller frame
use_cols = [
    "success_flag",
    "obj_category_filled",
    "obj_state_filled",
    "log_round_tempo",
    "round_tempo_isna",
    "is_obj_funding_total_usd_private",
    "log_relationships",
    "office_city_rep",
    "has_office",
]
df_A = df_A.loc[:, use_cols].copy()

# 1) Collapse each high-cardinality categorical (category, state, office city)
#    to its five most frequent levels plus "Others"
df_A = df_A.assign(
    cat_top5=topk_with_other(df_A["obj_category_filled"], k=5, other="Others"),
    state_top5=topk_with_other(df_A["obj_state_filled"], k=5, other="Others"),
    office_city_top5=topk_with_other(df_A["office_city_rep"], k=5, other="Others"),
)

# 2) Store the collapsed columns as category dtype (memory / speed)
for c in ["cat_top5", "state_top5", "office_city_top5"]:
    df_A[c] = df_A[c].astype("category")

# 3) Reduced model that uses only the Top-5 versions of the categoricals
formula_A = (
    "success_flag ~ C(cat_top5) + C(state_top5) + C(office_city_top5) "
    "+ has_office "
    "+ log_round_tempo + round_tempo_isna "
    "+ is_obj_funding_total_usd_private "
    "+ log_relationships"
)

model_A = smf.logit(formula=formula_A, data=df_A).fit(
    method="lbfgs",
    maxiter=200,
    disp=False,
)
print(model_A.summary())

                           Logit Regression Results                           
Dep. Variable:           success_flag   No. Observations:                52626
Model:                          Logit   Df Residuals:                    52605
Method:                           MLE   Df Model:                           20
Date:                   수, 24 12 2025   Pseudo R-squ.:                 0.08573
Time:                        09:10:15   Log-Likelihood:                -25861.
converged:                       True   LL-Null:                       -28285.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                           coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
Intercept                               -2.3182      0.194    -11.948      0.000      -2.699      -1.938
C(cat_top5)[T.biotech]                  -0.1528      

In [30]:
df_A.columns

Index(['success_flag', 'obj_category_filled', 'obj_state_filled',
       'log_round_tempo', 'round_tempo_isna',
       'is_obj_funding_total_usd_private', 'log_relationships',
       'office_city_rep', 'has_office', 'cat_top5', 'state_top5',
       'office_city_top5'],
      dtype='object')

In [None]:
# Inspect the five most frequent levels of each collapsed categorical
display(df_A["cat_top5"].value_counts().head())
display(df_A["state_top5"].value_counts().head())
display(df_A["office_city_top5"].value_counts().head())

cat_top5
Others      25856
software     9544
biotech      5981
web          4347
mobile       3666
Name: count, dtype: int64

state_top5
Unknown    16350
CA         14392
Others     13413
NY          3559
MA          3303
Name: count, dtype: int64

office_city_top5
Others           40409
Unknown           3701
San Francisco     3626
New York          2744
London            1257
Name: count, dtype: int64

In [31]:
# -----------------------
# 3) Model B: subsample with founder information (founder characteristics only)
# "Apparent" effect of founder traits (may be confounded by company traits) -> screening
# -----------------------
df_founder = prep_founder_subset(sf_master)

use_cols_B = [
    "success_flag",
    "degree_max",
    "stem_ratio",
    "top_university",
    "log_n_founding_max",
    "log_title_diversity",
    "log_relationships",
]
df_founder = df_founder.loc[:, use_cols_B].copy()

# Dependent variable (target): success_flag
# Independent variables (predictors): founder characteristics
formula_B = (
    "success_flag ~ degree_max + stem_ratio + top_university "
    "+ log_n_founding_max + log_title_diversity + log_relationships"
)

# First fit: no cov_type passed
model_B = smf.logit(formula=formula_B, data=df_founder).fit(
    method="lbfgs",
    maxiter=200,
    disp=False,
)
print(model_B.summary())

# Second fit: same specification, HC3 covariance requested
model_B_hc3 = smf.logit(formula=formula_B, data=df_founder).fit(
    method="lbfgs",
    maxiter=200,
    disp=False,
    cov_type="HC3",
)
print(model_B_hc3.summary())
display(short_or_table(model_B_hc3, topn=30))

                           Logit Regression Results                           
Dep. Variable:           success_flag   No. Observations:                 7158
Model:                          Logit   Df Residuals:                     7151
Method:                           MLE   Df Model:                            6
Date:                   수, 24 12 2025   Pseudo R-squ.:                 0.05988
Time:                        09:10:19   Log-Likelihood:                -4272.2
converged:                       True   LL-Null:                       -4544.3
Covariance Type:            nonrobust   LLR p-value:                2.428e-114
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -2.7502      0.137    -20.023      0.000      -3.019      -2.481
degree_max              0.0172      0.031      0.559      0.576      -0.043       0.077
stem_ratio      

Unnamed: 0,coef,OR,CI_low,CI_high,p_value,판정(alpha=0.05)
Intercept,-2.750237,0.063913,0.048998,0.083367,1.7505519999999998e-91,귀무가설 기각
log_title_diversity,0.88302,2.418192,1.934507,3.022813,8.827043e-15,귀무가설 기각
log_relationships,0.204651,1.227096,1.05791,1.423339,0.006857149,귀무가설 기각
log_n_founding_max,0.130209,1.139067,1.002657,1.294035,0.04542012,귀무가설 기각
top_university,-0.09098,0.913036,0.795357,1.048126,0.196251,귀무가설 기각 못함
stem_ratio,-0.049622,0.951589,0.841737,1.075778,0.4278593,귀무가설 기각 못함
degree_max,0.017171,1.017319,0.958487,1.079762,0.5721038,귀무가설 기각 못함


In [32]:
# -----------------------
# 4) Model C: startup + founder traits on the same subsample -> test whether founder traits stay independent
# Independent effect of founder traits that remains after controlling for company traits
# -----------------------
df_both = prep_founder_subset(sf_master)

# Collapse the company categoricals to Top-5 (recomputed on this subsample)
# and store them as category dtype
df_both = df_both.assign(
    cat_top5=topk_with_other(df_both["obj_category_filled"], k=5, other="Others"),
    state_top5=topk_with_other(df_both["obj_state_filled"], k=5, other="Others"),
    office_city_top5=topk_with_other(df_both["office_city_rep"], k=5, other="Others"),
).astype(
    {
        "cat_top5": "category",
        "state_top5": "category",
        "office_city_top5": "category",
    }
)

use_cols_C = [
    "success_flag",
    # startup characteristics (controls): category, state, office city, has-office flag,
    # round tempo (+ missing flag), funding-amount privacy flag
    "cat_top5",
    "state_top5",
    "office_city_top5",
    "has_office",
    "log_round_tempo",
    "round_tempo_isna",
    "is_obj_funding_total_usd_private",
    # founder characteristics
    "degree_max",
    "stem_ratio",
    "top_university",
    "log_n_founding_max",
    "log_title_diversity",
    "log_relationships",
]
df_both = df_both.loc[:, use_cols_C].copy()

# Dependent variable (target): success_flag
# Independent variables (predictors): startup characteristics + founder characteristics
formula_C = (
    "success_flag ~ C(cat_top5) + C(state_top5) + C(office_city_top5) "
    "+ has_office "
    "+ log_round_tempo + round_tempo_isna "
    "+ is_obj_funding_total_usd_private "
    "+ degree_max + stem_ratio + top_university "
    "+ log_n_founding_max + log_title_diversity + log_relationships"
)

model_C = smf.logit(formula=formula_C, data=df_both).fit(
    method="lbfgs",
    maxiter=200,
    disp=False,
)
print(model_C.summary())

                           Logit Regression Results                           
Dep. Variable:           success_flag   No. Observations:                 7158
Model:                          Logit   Df Residuals:                     7132
Method:                           MLE   Df Model:                           25
Date:                   수, 24 12 2025   Pseudo R-squ.:                 0.08059
Time:                        09:10:25   Log-Likelihood:                -4178.1
converged:                       True   LL-Null:                       -4544.3
Covariance Type:            nonrobust   LLR p-value:                2.052e-138
                                           coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
Intercept                               -2.5066      0.457     -5.488      0.000      -3.402      -1.611
C(cat_top5)[T.advertising]               0.3636      