# Pipeline 간단 버전 - 무엇을 하는가?

**목표**: AI/ML 스타트업의 약속 애매모호함(vagueness)이 펀딩 성공에 미치는 영향 분석

In [1]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Setup paths
BASE_DIR = Path("..").resolve()  # Go up from 'data' to 'empirics'
RAW_DATA_DIR = BASE_DIR / "data" / "raw"
PROCESSED_DATA_DIR = BASE_DIR / "data" / "processed"

print(f"Base directory: {BASE_DIR}")
print(f"Raw data directory: {RAW_DATA_DIR}")
print(f"Processed data directory: {PROCESSED_DATA_DIR}")

# Check if processed data exists (from pipeline)
PANEL_FILE = PROCESSED_DATA_DIR / "analysis_panel.csv"
if PANEL_FILE.exists():
    print(f"\n✅ Found processed panel data: {PANEL_FILE}")
    USE_PROCESSED = True
else:
    print(f"\n⚠️  No processed panel data found")
    print(f"   Run pipeline first: python ../code/pipeline_xarray.py")
    USE_PROCESSED = False

Base directory: /Users/hyunjimoon/MIT Dropbox/Angie.H Moon/tolzul/Front/On/💌찰리스캇 러브레터 플젝/strategic ambiguity/empirics
Raw data directory: /Users/hyunjimoon/MIT Dropbox/Angie.H Moon/tolzul/Front/On/💌찰리스캇 러브레터 플젝/strategic ambiguity/empirics/data/raw
Processed data directory: /Users/hyunjimoon/MIT Dropbox/Angie.H Moon/tolzul/Front/On/💌찰리스캇 러브레터 플젝/strategic ambiguity/empirics/data/processed

✅ Found processed panel data: /Users/hyunjimoon/MIT Dropbox/Angie.H Moon/tolzul/Front/On/💌찰리스캇 러브레터 플젝/strategic ambiguity/empirics/data/processed/analysis_panel.csv


## 데이터 로드 옵션

**Option 1**: Pipeline으로 생성된 processed data 사용 (권장)  
**Option 2**: Raw data에서 직접 로드 (파일이 있을 경우만)

## Option 1: Pipeline으로 생성된 Analysis Panel 사용 (권장)

이미 모든 전처리가 완료된 데이터를 사용합니다.

In [2]:
if USE_PROCESSED:
    # Load processed analysis panel from pipeline
    panel = pd.read_csv(PANEL_FILE)
    
    print(f"✅ Loaded analysis panel: {len(panel)} observations")
    print(f"   Columns: {list(panel.columns)}")
    print(f"\n   Unique companies: {panel['company_id'].nunique()}")
    print(f"   Series A: {sum(panel['round'] == 'Series A')}")
    print(f"   Series B: {sum(panel['round'] == 'Series B')}")
    
    # Display sample
    display_cols = ['company_id', 'round', 'vagueness', 'high_integration_cost', 
                    'funding_success', 'deal_size']
    if 'company_name' in panel.columns:
        display_cols.insert(1, 'company_name')
    
    print("\n   Sample data:")
    print(panel[display_cols].head(10))
else:
    print("⚠️  No processed data available. Please run pipeline first:")
    print("   cd ../code")
    print("   python pipeline_xarray.py")

✅ Loaded analysis panel: 0 observations
   Columns: ['company_id', 'round', 'series_b_dummy', 'vagueness', 'vagueness_category', 'high_integration_cost', 'integration_cost_label', 'funding_success', 'deal_size', 'deal_date', 'series_a_amount', 'log_series_a_amount', 'employees', 'year_founded', 'total_raised', 'investors', 'post_valuation']

   Unique companies: 0
   Series A: 0
   Series B: 0

   Sample data:
Empty DataFrame
Columns: [company_id, round, vagueness, high_integration_cost, funding_success, deal_size]
Index: []


## Option 2: Raw Data에서 직접 로드 (참고용)

⚠️ **이 방법은 Deal*.dat 파일이 있어야 합니다**  
현재 데이터가 없으면 작동하지 않습니다. Pipeline 사용을 권장합니다.

---

### 데이터 파일 확인

In [3]:
# Check for raw data files
import os

print("📂 Checking raw data directory:")
print(f"   Location: {RAW_DATA_DIR}\n")

company_files = list(RAW_DATA_DIR.glob("Company*.dat"))
deal_files = list(RAW_DATA_DIR.glob("Deal*.dat"))

print(f"Company files: {len(company_files)}")
for f in company_files:
    print(f"  - {f.name}")

print(f"\nDeal files: {len(deal_files)}")
for f in deal_files:
    print(f"  - {f.name}")

if len(deal_files) == 0:
    print("\n❌ No Deal*.dat files found!")
    print("   See DATA_REQUIREMENTS.md for instructions on getting deal data")
else:
    print(f"\n✅ Found {len(deal_files)} Deal file(s)")

📂 Checking raw data directory:
   Location: /Users/hyunjimoon/MIT Dropbox/Angie.H Moon/tolzul/Front/On/💌찰리스캇 러브레터 플젝/strategic ambiguity/empirics/data/raw

Company files: 5
  - Company20251001.dat
  - Company20220101.dat
  - Company20230501.dat
  - Company20220501.dat
  - Company20211201.dat

Deal files: 1
  - Deal20230501.dat

✅ Found 1 Deal file(s)


In [8]:
import pandas as pd
import numpy as np
import xarray as xr
import re
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from pathlib import Path

# ------------------------------------------------
# 1. Load & clean PitchBook raw data
# ------------------------------------------------
# 💡 실제 데이터 경로에 맞게 수정
file_path = Path("/Users/hyunjimoon/MIT Dropbox/Angie.H Moon/tolzul/Front/On/💌찰리스캇 러브레터 플젝/strategic ambiguity/empirics/data/raw/Company20211201.dat")

df = pd.read_csv(file_path, sep="|", low_memory=False)
print(f"✅ Loaded data with shape: {df.shape}")

# Select relevant columns safely
cols = [
    "CompanyID","CompanyName","Description","Keywords","Employees",
    "YearFounded","Universe","FirstFinancingSize","FirstFinancingDate",
    "LastFinancingDealType","LastFinancingDate","TotalRaised"
]
df = df[[c for c in cols if c in df.columns]].copy()

# Basic cleaning
df["Employees"] = pd.to_numeric(df["Employees"], errors="coerce")
df["FirstFinancingSize"] = pd.to_numeric(df["FirstFinancingSize"], errors="coerce")
df["TotalRaised"] = pd.to_numeric(df["TotalRaised"], errors="coerce")
df = df.dropna(subset=["CompanyID","Description"]).reset_index(drop=True)

# ------------------------------------------------
# 2. Convert to xarray.Dataset
# ------------------------------------------------
ds = xr.Dataset.from_dataframe(df.set_index("CompanyID"))
ds = ds.assign_coords(
    company=("CompanyID", ds["CompanyName"].values),
    sector=("CompanyID", ds["Keywords"].values),
    universe=("CompanyID", ds["Universe"].values),
    year_founded=("CompanyID", ds["YearFounded"].values)
)


✅ Loaded data with shape: (420661, 94)


  ds["prior_exit"] = ("CompanyID", np.zeros(ds.dims["CompanyID"]))



                            OLS Regression Results                            
Dep. Variable:     early_funding_musd   R-squared:                       0.050
Model:                            OLS   Adj. R-squared:                  0.050
Method:                 Least Squares   F-statistic:                     1878.
Date:                Mon, 27 Oct 2025   Prob (F-statistic):               0.00
Time:                        07:54:21   Log-Likelihood:             6.9825e+05
No. Observations:              106453   AIC:                        -1.396e+06
Df Residuals:                  106449   BIC:                        -1.396e+06
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.0038      0.000     30.16

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


         Current function value: inf
         Iterations: 35


LinAlgError: Singular matrix

In [9]:
ds

In [11]:

# ------------------------------------------------
# 3. Feature engineering
# ------------------------------------------------
def compute_vagueness(text):
    if not isinstance(text, str): 
        return np.nan
    hedge_words = ["maybe","approximately","somewhat","likely","possibly","potential","around","roughly","hope","aim"]
    total = len(text.split())
    hits = sum(text.lower().count(h) for h in hedge_words)
    return hits / total if total > 0 else np.nan

ds["vagueness"] = ("CompanyID", [compute_vagueness(t) for t in ds["Description"].values])
ds["early_funding_musd"] = ds["FirstFinancingSize"] / 1e6

# Later success flag
patterns = ["series b", "series c", "later"]
later_success = [
    int(any(p in str(x).lower() for p in patterns))
    for x in ds["LastFinancingDealType"].values
]
ds["later_success"] = ("CompanyID", later_success)

# later_success = ds["LastFinancingDealType"].astype(str).str.contains("Series B|Series C|Later", case=False, na=False).astype(int)
ds["later_success"] = ("CompanyID", later_success)

# Controls
ds["employees_log"] = np.log1p(ds["Employees"])
ds["prior_exit"] = ("CompanyID", np.zeros(ds.dims["CompanyID"]))

# Industry integration (hardware/robotics, etc.)
high_integrated = ["hardware","robot","device","chip","biotech","instrument","sensor"]
def tag_integration(k):
    if not isinstance(k, str): return np.nan
    return int(any(w in k.lower() for w in high_integrated))
ds["high_integration_cost"] = ("CompanyID", [tag_integration(k) for k in ds["Keywords"].values])

# ------------------------------------------------
# 4. Convert to tidy frame
# ------------------------------------------------
dfm = ds[[
    "vagueness","early_funding_musd","later_success","high_integration_cost",
    "employees_log","year_founded"
]].to_dataframe().dropna(subset=["vagueness","early_funding_musd"]).reset_index()

# ------------------------------------------------
# 5. Hypothesis 1 – OLS
# ------------------------------------------------
model1 = smf.ols("early_funding_musd ~ vagueness + employees_log + year_founded", data=dfm).fit()
print("\n=====================  Model H1  =====================")
print(model1.summary())


  ds["prior_exit"] = ("CompanyID", np.zeros(ds.dims["CompanyID"]))



                            OLS Regression Results                            
Dep. Variable:     early_funding_musd   R-squared:                       0.050
Model:                            OLS   Adj. R-squared:                  0.050
Method:                 Least Squares   F-statistic:                     1878.
Date:                Mon, 27 Oct 2025   Prob (F-statistic):               0.00
Time:                        07:59:05   Log-Likelihood:             6.9825e+05
No. Observations:              106453   AIC:                        -1.396e+06
Df Residuals:                  106449   BIC:                        -1.396e+06
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.0038      0.000     30.16

In [None]:
# Notebook or script에서
from 05_create_deliverable import diagnose_h1

fig, table, tests = diagnose_h1(df_h1, output_path='results/h1_diagnostic.png')
print(f"H1 {tests['verdict']}: slope={tests['trend_slope']:.4f}, p={tests['trend_p']:.4f}")
print(table)

SyntaxError: invalid syntax (2023517669.py, line 2)

In [12]:

# ------------------------------------------------
# 6. Hypothesis 2 – Logit with interaction
# ------------------------------------------------
df_h2 = dfm.dropna(subset=["later_success","high_integration_cost"])
model2 = smf.logit(
    "later_success ~ vagueness * high_integration_cost + early_funding_musd + employees_log + year_founded",
    data=df_h2
).fit()
print("\n=====================  Model H2  =====================")
print(model2.summary())

# ------------------------------------------------
# 7. Plot interaction (optional)
# ------------------------------------------------
v = np.linspace(df_h2["vagueness"].min(), df_h2["vagueness"].max(), 100)
X_low = pd.DataFrame({
    "vagueness": v,
    "high_integration_cost": 0,
    "early_funding_musd": df_h2["early_funding_musd"].mean(),
    "employees_log": df_h2["employees_log"].mean(),
    "year_founded": df_h2["year_founded"].mean()
})
X_high = X_low.copy()
X_high["high_integration_cost"] = 1

pred_low = model2.predict(X_low)
pred_high = model2.predict(X_high)

plt.figure(figsize=(6,4))
plt.plot(v, pred_low, label="Modular (low integration cost)")
plt.plot(v, pred_high, '--', label="Integrated (high integration cost)")
plt.xlabel("Vagueness")
plt.ylabel("Predicted Later Success (prob.)")
plt.title("H2 Interaction: Vagueness × Integration cost")
plt.legend()
plt.tight_layout()
plt.show()

# ------------------------------------------------
# 8. Save results
# ------------------------------------------------
results_ds = xr.Dataset(
    {
        "coef_H1": (["var"], model1.params.values),
        "p_H1": (["var"], model1.pvalues.values),
        "coef_H2": (["var"], model2.params.values),
        "p_H2": (["var"], model2.pvalues.values)
    },
    coords={"var": list(model1.params.index)}
)
out_path = Path(file_path).parent / "model_results.nc"
results_ds.to_netcdf(out_path)
print(f"\n✅ Saved regression coefficients to {out_path}")

  return 1/(1+np.exp(-X))


         Current function value: inf
         Iterations: 35


  return np.sum(np.log(self.cdf(q * linpred)))


LinAlgError: Singular matrix

In [13]:
df_h2 = dfm.dropna(subset=[
    "later_success", "high_integration_cost",
    "vagueness", "early_funding_musd",
    "employees_log", "year_founded"
])

print(df_h2[["high_integration_cost",
             "vagueness",
             "early_funding_musd",
             "employees_log",
             "year_founded"]].nunique())


high_integration_cost        2
vagueness                  205
early_funding_musd       50039
employees_log             3124
year_founded               215
dtype: int64


In [20]:
# 1) 유일값 전체 보기 (정렬 포함)
sorted(df_h2["year_founded"].unique())

[1800.0,
 1801.0,
 1803.0,
 1804.0,
 1805.0,
 1807.0,
 1809.0,
 1810.0,
 1811.0,
 1813.0,
 1814.0,
 1815.0,
 1816.0,
 1817.0,
 1819.0,
 1820.0,
 1821.0,
 1822.0,
 1824.0,
 1825.0,
 1826.0,
 1827.0,
 1828.0,
 1829.0,
 1830.0,
 1831.0,
 1832.0,
 1833.0,
 1834.0,
 1835.0,
 1836.0,
 1837.0,
 1838.0,
 1839.0,
 1840.0,
 1841.0,
 1842.0,
 1843.0,
 1844.0,
 1845.0,
 1846.0,
 1847.0,
 1848.0,
 1849.0,
 1850.0,
 1851.0,
 1852.0,
 1853.0,
 1854.0,
 1855.0,
 1856.0,
 1857.0,
 1859.0,
 1860.0,
 1861.0,
 1862.0,
 1863.0,
 1864.0,
 1865.0,
 1866.0,
 1867.0,
 1868.0,
 1869.0,
 1870.0,
 1871.0,
 1872.0,
 1873.0,
 1874.0,
 1875.0,
 1876.0,
 1877.0,
 1878.0,
 1879.0,
 1880.0,
 1881.0,
 1882.0,
 1883.0,
 1884.0,
 1885.0,
 1886.0,
 1887.0,
 1888.0,
 1889.0,
 1890.0,
 1891.0,
 1892.0,
 1893.0,
 1894.0,
 1895.0,
 1896.0,
 1897.0,
 1898.0,
 1899.0,
 1900.0,
 1901.0,
 1902.0,
 1903.0,
 1904.0,
 1905.0,
 1906.0,
 1907.0,
 1908.0,
 1909.0,
 1910.0,
 1911.0,
 1912.0,
 1913.0,
 1914.0,
 1915.0,
 1916.0,
 1917.0,
 

In [22]:

suspect = df_h2[df_h2["year_founded"] <1850]
print(suspect.head())         # 일부만 보고 싶을 때
# 또는
suspect[["company_id", "company_name", "year_founded"]]

      CompanyID  vagueness  early_funding_musd  later_success  \
310    10025-29   0.000000        7.000000e-03              0   
430    10036-99   0.011111        8.545791e-04              0   
525    10043-29   0.000000        3.030000e-05              0   
536   100440-28   0.000000        8.785168e-07              0   
1070   10083-61   0.000000        2.500000e-05              0   

      high_integration_cost  employees_log  year_founded  \
310                     0.0      10.375520        1844.0   
430                     0.0       8.039480        1847.0   
525                     0.0       5.817111        1829.0   
536                     0.0       7.550135        1828.0   
1070                    0.0       4.189655        1838.0   

                         company  \
310                National City   
430                       Lazard   
525                   Woodstream   
536   Martin Belaysoud Expansion   
1070    1838 Investment Advisors   

                               

KeyError: "['company_id', 'company_name'] not in index"

In [None]:

# ------------------------------------------------
# 6. Hypothesis 2 – Logit with interaction
# ------------------------------------------------
df_h2 = dfm.dropna(subset=["later_success","high_integration_cost"])
model2 = smf.logit(
    "later_success ~ vagueness * high_integration_cost + early_funding_musd + employees_log + year_founded",
    data=df_h2
).fit()
print("\n=====================  Model H2  =====================")
print(model2.summary())

# ------------------------------------------------
# 7. Plot interaction (optional)
# ------------------------------------------------
v = np.linspace(df_h2["vagueness"].min(), df_h2["vagueness"].max(), 100)
X_low = pd.DataFrame({
    "vagueness": v,
    "high_integration_cost": 0,
    "early_funding_musd": df_h2["early_funding_musd"].mean(),
    "employees_log": df_h2["employees_log"].mean(),
    "year_founded": df_h2["year_founded"].mean()
})
X_high = X_low.copy()
X_high["high_integration_cost"] = 1

pred_low = model2.predict(X_low)
pred_high = model2.predict(X_high)

plt.figure(figsize=(6,4))
plt.plot(v, pred_low, label="Modular (low integration cost)")
plt.plot(v, pred_high, '--', label="Integrated (high integration cost)")
plt.xlabel("Vagueness")
plt.ylabel("Predicted Later Success (prob.)")
plt.title("H2 Interaction: Vagueness × Integration cost")
plt.legend()
plt.tight_layout()
plt.show()

# ------------------------------------------------
# 8. Save results
# ------------------------------------------------
results_ds = xr.Dataset(
    {
        "coef_H1": (["var"], model1.params.values),
        "p_H1": (["var"], model1.pvalues.values),
        "coef_H2": (["var"], model2.params.values),
        "p_H2": (["var"], model2.pvalues.values)
    },
    coords={"var": list(model1.params.index)}
)
out_path = Path(file_path).parent / "model_results.nc"
results_ds.to_netcdf(out_path)
print(f"\n✅ Saved regression coefficients to {out_path}")

Unnamed: 0,CompanyID,vagueness,early_funding_musd,later_success,high_integration_cost,employees_log,year_founded,company,sector,universe
0,100002-52,0.0,4.000000e-06,0,0.0,5.455321,1988.0,Inova (Product Marketing),"jewelry, nutritional supplements, product mark...",Private Equity
1,100003-15,0.0,2.500000e-08,0,0.0,2.302585,2011.0,Premama,"fertility care, maternity food, postnatal care...","Debt Financed, Private Equity, Venture Capital"
2,100004-77,0.0,1.884050e-06,0,1.0,,1999.0,World Energy Labs,"diagnostic technology, electrochemical device,...",Venture Capital
3,100006-12,0.0,5.000000e-07,0,0.0,2.484907,2012.0,ChainSync,"franchise operations technology, franchisee ma...",Pre-venture
4,100006-57,0.0,1.400000e-06,0,0.0,,2012.0,IzumoBASE,"software defined storage, storage management a...",Venture Capital
...,...,...,...,...,...,...,...,...,...,...
169843,99998-56,0.0,2.292950e-07,0,1.0,1.791759,2003.0,Imiplex,"biosensor, dna technology, nanostructure tools...",Venture Capital
169844,99999-01,0.0,1.800000e-06,0,0.0,,2005.0,Life Mist Technologies,"aerosol technology, atomization system, bacter...",Venture Capital
169845,99999-19,0.0,1.500000e-07,1,0.0,2.564949,2010.0,Innovative Supply Solutions,"clinical drug supply chain management, clinica...",Venture Capital
169846,99999-64,0.0,2.500000e-07,1,0.0,,2011.0,RiboNova,"precision medicine, rare disease treatment, ri...",Venture Capital


In [5]:
# 회사 데이터 로드
company = pd.read_csv(f"{RAW_DATA_DIR}/Company20230501.dat", sep='|', nrows=SAMPLE, low_memory=False)
print(f"전체 회사: {len(company)}개")

# AI/ML 회사만 필터링
ai_keywords = ['AI', 'ML', 'machine learning', 'artificial intelligence']
company['is_ai_ml'] = company.apply(
    lambda row: any(kw.lower() in str(row.get('Description', '')).lower() 
                    for kw in ai_keywords), axis=1
)
ai_companies = company[company['is_ai_ml']].copy()

print(f"AI/ML 회사: {len(ai_companies)}개")
ai_companies[['CompanyID', 'CompanyName', 'Description']].head()

NameError: name 'SAMPLE' is not defined

## Step 2: 애매모호함(Vagueness) 계산

Description에서 애매한 단어 vs 명확한 단어 비율

In [11]:
def calc_vagueness(desc):
    if pd.isna(desc):
        return 50
    text = str(desc).lower()
    vague_words = ['approximately', 'around', 'flexible', 'scalable']
    precise_words = ['precisely', 'exactly', 'guaranteed', 'specific']
    
    vague_count = sum(text.count(w) for w in vague_words)
    precise_count = sum(text.count(w) for w in precise_words)
    
    return max(0, min(100, 50 + 10 * (vague_count - precise_count)))

ai_companies['vagueness'] = ai_companies['Description'].apply(calc_vagueness)

print(f"평균 vagueness: {ai_companies['vagueness'].mean():.1f}")
ai_companies[['CompanyName', 'vagueness']].head(10)

평균 vagueness: 50.1


Unnamed: 0,CompanyName,vagueness
4,Pollo Regio,50
6,Pequeno Mexico Operating Company,50
7,Yogurtland Franchising,50
8,Inova US,50
11,Career Educational Services,50
16,G2See,50
18,ChainSync,50
25,Craft Equipment Company,50
26,Woodham Mortimer,50
28,ImPress Systems,50


## Step 3: Deal 데이터 - VC 투자 찾기

In [12]:
# Deal 데이터 로드
deal = pd.read_csv(f"{DATA_DIR}/Deal20230501.dat", sep='|', nrows=SAMPLE, low_memory=False)
print(f"전체 딜: {len(deal)}개")

# VC 투자만 필터링
vc_deals = deal[deal['DealType'].str.contains('VC', case=False, na=False)].copy()
print(f"VC 딜: {len(vc_deals)}개")

# Series A (초기) vs Series B (후기) 분류
early_rounds = ['1st Round', 'Seed Round', 'Angel']
later_rounds = ['2nd Round', '3rd Round', '4th Round']

vc_deals['round'] = np.where(
    vc_deals['VCRound'].isin(early_rounds), 'Series A',
    np.where(vc_deals['VCRound'].isin(later_rounds), 'Series B', None)
)

vc_deals = vc_deals[vc_deals['round'].notna()]
print(f"Series A: {sum(vc_deals['round'] == 'Series A')}개")
print(f"Series B: {sum(vc_deals['round'] == 'Series B')}개")

vc_deals[['CompanyID', 'CompanyName', 'round', 'DealSize']].head()

전체 딜: 1000개
VC 딜: 258개
Series A: 127개
Series B: 100개


Unnamed: 0,CompanyID,CompanyName,round,DealSize
3,100001-08,Zana,Series A,
26,100003-15,Premama,Series A,1.399999
27,100003-15,Premama,Series B,3.25
29,100003-15,Premama,Series B,3.500001
30,100003-15,Premama,Series B,5.9


## Step 4: 펀딩 성공 정의

In [13]:
# 성공 = 딜 완료 & 금액 > 0
vc_deals['DealSize'] = pd.to_numeric(vc_deals['DealSize'], errors='coerce').fillna(0)
vc_deals['funding_success'] = (
    (vc_deals['DealSize'] > 0) & 
    (vc_deals['DealStatus'].str.contains('Completed', case=False, na=False))
).astype(int)

print(f"성공한 딜: {vc_deals['funding_success'].sum()}개")
print(f"성공률: {vc_deals['funding_success'].mean():.1%}")

성공한 딜: 123개
성공률: 54.2%


## Step 5: Company + Deal 조인 - 분석 패널 생성

In [14]:
# CompanyID로 조인
ai_companies_indexed = ai_companies.set_index('CompanyID')

panel = vc_deals.merge(
    ai_companies_indexed[['vagueness']],
    left_on='CompanyID',
    right_index=True,
    how='inner'
)

print(f"\n분석 패널: {len(panel)}개 관측치")
print(f"Series A: {sum(panel['round'] == 'Series A')}개")
print(f"Series B: {sum(panel['round'] == 'Series B')}개")

panel[['CompanyName', 'round', 'vagueness', 'funding_success']].head(10)


분석 패널: 65개 관측치
Series A: 28개
Series B: 37개


Unnamed: 0,CompanyName,round,vagueness,funding_success
39,G2See,Series A,50,0
67,ImPress Systems,Series A,50,1
127,UOKO,Series A,50,1
128,UOKO,Series B,50,1
129,UOKO,Series B,50,1
130,UOKO,Series B,50,1
194,Hyakusenrenma,Series B,50,0
195,Hyakusenrenma,Series B,50,1
196,Hyakusenrenma,Series B,50,1
207,Maven,Series B,50,1


## Step 6: 핵심 질문 - Vagueness가 펀딩 성공에 미치는 영향은?

**가설**: 초기(Series A)에는 애매한 약속이 좋지만, 후기(Series B)에는 명확한 약속이 좋다

In [15]:
# 라운드별 성공률 비교
summary = panel.groupby(['round', pd.cut(panel['vagueness'], bins=[0, 50, 100], labels=['Low', 'High'])]).agg({
    'funding_success': ['count', 'sum', 'mean']
}).round(3)

summary.columns = ['Count', 'Successes', 'Success_Rate']
print("\n성공률 by Round & Vagueness:")
print(summary)


성공률 by Round & Vagueness:
                    Count  Successes  Success_Rate
round    vagueness                                
Series A Low           28         12         0.429
         High           0          0           NaN
Series B Low           37         27         0.730
         High           0          0           NaN


## Step 7: 회귀 분석 - 통계적 검증

In [16]:
import statsmodels.formula.api as smf

# 변수 준비
panel['vagueness_scaled'] = panel['vagueness'] / 100
panel['series_b_dummy'] = (panel['round'] == 'Series B').astype(int)

# 회귀식: funding_success ~ vagueness + round + interaction
formula = 'funding_success ~ vagueness_scaled + series_b_dummy + vagueness_scaled:series_b_dummy'

try:
    model = smf.logit(formula, data=panel).fit(disp=False)
except:
    model = smf.ols(formula, data=panel).fit()

print("\n회귀 분석 결과:")
print(model.summary())


회귀 분석 결과:
                            OLS Regression Results                            
Dep. Variable:        funding_success   R-squared:                       0.093
Model:                            OLS   Adj. R-squared:                  0.078
Method:                 Least Squares   F-statistic:                     6.434
Date:                Wed, 22 Oct 2025   Prob (F-statistic):             0.0137
Time:                        14:17:56   Log-Likelihood:                -42.689
No. Observations:                  65   AIC:                             89.38
Df Residuals:                      63   BIC:                             93.73
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------

## 핵심 해석

- **vagueness_scaled**: Series A에서 vagueness 1단위 증가 시 성공 확률 변화
- **vagueness_scaled:series_b_dummy**: Series B에서는 효과가 반대로 (reversal)
- p < 0.05이면 통계적으로 유의미