## JOIN

In [39]:
# Import Libraries
#초기 설정및 시스템 라이브러리
import platform
import warnings

# 데이터 시각화 라이브러리
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
print(platform.system())
warnings.filterwarnings('ignore')

# 행,열,결과값 생략 없이 보기,세팅
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)
%matplotlib inline

# 시각화 OS별 한글폰트 설정
if platform.system() == 'Windows':
    plt.rcParams['font.family'] = 'Malgun Gothic'  # Windows 폰트 설정
elif platform.system() == 'Mac':
    plt.rcParams['font.family'] = 'AppleGothic'  # Mac 폰트 설정
    
print("="*60)
print("라이브러리 로드 완료!")
print("한글 폰트 설정 완료!")
print("="*60)

Windows
라이브러리 로드 완료!
한글 폰트 설정 완료!


In [40]:
# Load Dataset
obj   = pd.read_csv("./data/clean/clean_objects_final.csv")         # objects
frs   = pd.read_csv("./data/clean/clean_fr_final.csv")              # funding_rounds
inv   = pd.read_csv("./data/clean/clean_investments_final.csv")     # investments
acq   = pd.read_csv("./data/clean/clean_acquisitions_final.csv")    # acquisitions
ipo   = pd.read_csv("./data/clean/clean_ipos_final.csv")            # ipos
rel   = pd.read_csv("./data/clean/clean_relationships_final.csv")   # relationships
off   = pd.read_csv("./data/clean/clean_offices_final.csv")         # offices
print("="*60)
print("데이터셋 로드 완료!")
print("="*60)

데이터셋 로드 완료!


In [41]:
print("obj: ", obj.shape)
print("frs: ", frs.shape)
print("inv: ", inv.shape)
print("acq: ", acq.shape)
print("ipo: ", ipo.shape)
print("rel: ", rel.shape)
print("off: ", off.shape)

obj:  (462620, 45)
frs:  (52928, 17)
inv:  (80902, 4)
acq:  (9562, 11)
ipo:  (1259, 16)
rel:  (402412, 10)
off:  (112718, 11)


### START-UP

#### 투자 성공률 (핵심 테이블: FRS,INV, ACQ, IPO, OFF)
<span style="font-size: 15px;">
grain: 스타트업 id 
</span>

In [42]:
cols_use = [
    #objects
    "objects_cfpr_id"       , "founded_at"          , "closed_at"           ,
    "description"           , "country_code"        , "obj_city_fixed"      , 
    "first_investment_at"   , "last_investment_at"  , "investment_rounds"   ,
    "invested_companies"    , "first_funding_at"    , "last_funding_at"     ,
    "funding_rounds"        , "funding_total_usd"   , "relationships"       ,
    "cat_obj_status"        , "obj_category_filled" , "cat_obj_overview"    ,
    "obj_state_filled"      , "is_obj_funding_total_usd_private"  
]
obj = obj[cols_use].copy()
print(obj.columns)

Index(['objects_cfpr_id', 'founded_at', 'closed_at', 'description',
       'country_code', 'obj_city_fixed', 'first_investment_at',
       'last_investment_at', 'investment_rounds', 'invested_companies',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'relationships', 'cat_obj_status',
       'obj_category_filled', 'cat_obj_overview', 'obj_state_filled',
       'is_obj_funding_total_usd_private'],
      dtype='object')


##### ◼ 파생변수 생성

<span style="font-size: 15px;">
<code>success_flag</code> : 성공적인 EXIT을 달성 플래그
</span>

In [43]:
cond1 = obj["objects_cfpr_id"].astype(str).str.startswith("c:")     # 회사 조건
cond2 = obj["objects_cfpr_id"].isin(ipo["ipos_c_id"].dropna())      # IPO 성공 조건
cond3 = obj["objects_cfpr_id"].isin(acq["acquired_c_id"].dropna())  # 인수(M&A) 성공 조건

In [44]:
# cond4: funding_rounds의 num_fr_type 조건
frs["funded_at"] = pd.to_datetime(frs["funded_at"], errors="coerce") # 날짜 변환

# 회사별 num_fr_type 범위
fr_type_range = (
    frs.dropna(subset=["fr_c_id", "num_fr_type"])
       .groupby("fr_c_id")["num_fr_type"]
       .agg(num_fr_type_min="min", num_fr_type_max="max")
)

# cond4 해석:
# 라운드 타입이 최소 1단계라도 진행(= max - min >= 1)
# max_type < 5 
cond4_1 = (fr_type_range["num_fr_type_max"] - fr_type_range["num_fr_type_min"]) >= 1
cond4_2 = fr_type_range["num_fr_type_max"] < 5
fr_type_range["is_round_on"] = cond4_1 & cond4_2 # cond4 라운드 진행 조건
fr_type_range

Unnamed: 0_level_0,num_fr_type_min,num_fr_type_max,is_round_on
fr_c_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c:1,1,3,True
c:1001,1,1,False
c:10014,0,0,False
c:10015,1,99,False
c:100155,2,99,False
...,...,...,...
c:99853,0,0,False
c:9989,0,0,False
c:9994,0,0,False
c:9995,0,0,False


In [45]:
# obj에 회사별 cond4 붙이기
obj_tmp = obj.merge(fr_type_range[["is_round_on"]], left_on="objects_cfpr_id", right_index=True, how="left")
obj_tmp["is_round_on"] = obj_tmp["is_round_on"].fillna(False)

# 성공 플래그 (회사면서, IPO or 인수 or 라운드)
obj["success_flag"] = (cond1 & (cond2 | cond3 | obj_tmp["is_round_on"])).astype(int)
obj

Unnamed: 0,objects_cfpr_id,founded_at,closed_at,description,country_code,obj_city_fixed,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,relationships,cat_obj_status,obj_category_filled,cat_obj_overview,obj_state_filled,is_obj_funding_total_usd_private,success_flag
0,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1
1,c:10,,,,USA,culver city,,,0,0,,,,,6,acquired,games_video,Software & Technology,CA,1,1
2,c:100,,,,USA,san mateo,,,0,0,,,,,12,acquired,games_video,Software & Technology,CA,1,1
3,c:10000,2008-07-26,,,,,,,0,0,,,0.0,0.0,0,operating,network_hosting,Software & Technology,,0,0
4,c:10001,2008-07-26,,,,,,,0,0,,,0.0,0.0,0,operating,games_video,Software & Technology,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462615,r:9995,,,,,,,,0,0,,,0.0,0.0,0,operating,software,Software & Technology,,0,0
462616,r:9996,,,,,,,,0,0,,,0.0,0.0,0,operating,software,Software & Technology,,0,0
462617,r:9997,,,,,,,,0,0,,,0.0,0.0,0,operating,software,Software & Technology,,0,0
462618,r:9998,,,,,,,,0,0,,,0.0,0.0,0,operating,software,Software & Technology,,0,0


In [46]:
obj[["objects_cfpr_id", "success_flag"]].head()

Unnamed: 0,objects_cfpr_id,success_flag
0,c:1,1
1,c:10,1
2,c:100,1
3,c:10000,0
4,c:10001,0


<span style="font-size: 15px;">
<code>size_bin</code> : 기업의 규모 구간
</span>

In [47]:
# 회사 정보만 필터링
cond1 = obj["objects_cfpr_id"].astype(str).str.startswith("c:")
obj_size_bin = obj.loc[cond1].copy()
display(obj_size_bin["objects_cfpr_id"].str[0].value_counts())

# quantile 기반 bins
q = obj_size_bin["relationships"].quantile([0.25, 0.5, 0.75])
print("bin 기준\n", q)

bins = [-1, q[0.25], q[0.5], q[0.75], obj_size_bin["relationships"].max()]
labels = ["초소형팀", "소형팀", "중형팀", "대형팀"]

obj_size_bin["size_bin"] = pd.cut(
    obj_size_bin["relationships"],
    bins=bins,
    labels=labels,
    include_lowest=True  # 최소값도 포함
)

display(obj_size_bin["size_bin"].value_counts())
# 초소형 66885 | 소형 70569 | 중형 23258 | 대형 35837

# obj에 size_bin 병합 → 회사가 아닌 row는 size_bin이 NaN으로 남음
obj = obj.merge(
    obj_size_bin[["objects_cfpr_id", "size_bin"]],
    on="objects_cfpr_id",
    how="left"
)
display(obj[["objects_cfpr_id", "relationships", "size_bin"]].head())

objects_cfpr_id
c    196549
Name: count, dtype: int64

bin 기준
 0.25    0.0
0.50    1.0
0.75    2.0
Name: relationships, dtype: float64


size_bin
소형팀     70569
초소형팀    66885
대형팀     35837
중형팀     23258
Name: count, dtype: int64

Unnamed: 0,objects_cfpr_id,relationships,size_bin
0,c:1,17,대형팀
1,c:10,6,대형팀
2,c:100,12,대형팀
3,c:10000,0,초소형팀
4,c:10001,0,초소형팀


<span style="font-size: 15px;">
<code>round_tempo_months</code> : 투자 템포(개월)
</span>

In [48]:
# 회사별 라운드 간격 계산
round_tempo = (frs.sort_values(['fr_c_id','funded_at']))
round_tempo['prev_round_date'] = round_tempo.groupby('fr_c_id')['funded_at'].shift(1)

round_tempo['round_tempo_days'] = (round_tempo['funded_at'] - round_tempo['prev_round_date']).dt.days
round_tempo["round_tempo_days"] = round_tempo["round_tempo_days"].astype("Int64") # 정수형으로 변경

round_tempo['round_tempo_months'] = round(round_tempo['round_tempo_days'] / 30) 
display(round_tempo[["fr_c_id","funded_at", "round_tempo_days","round_tempo_months"]])

Unnamed: 0,fr_c_id,funded_at,round_tempo_days,round_tempo_months
818,c:1,2005-10-01,,
819,c:1,NaT,,
2109,c:1,NaT,,
1517,c:1001,NaT,,
6048,c:10014,NaT,,
...,...,...,...,...
5207,c:9989,NaT,,
2958,c:9994,NaT,,
5527,c:9994,NaT,,
2967,c:9995,NaT,,


In [49]:
# 회사별 median round_tempo_months 계산
tempo_by_company = (
    round_tempo
    .groupby("fr_c_id")
    .agg(    
        round_tempo_months=("round_tempo_months", "median") 
    )
    .reset_index()
    .rename(columns={"fr_c_id": "objects_cfpr_id"})
)
tempo_by_company["round_tempo_months"] = tempo_by_company["round_tempo_months"].astype("Int64") # 정수형으로 변경
tempo_by_company[tempo_by_company["round_tempo_months"].notna()].head()

Unnamed: 0,objects_cfpr_id,round_tempo_months
19,c:10054,10
121,c:103,13
141,c:1038,19
166,c:10424,12
216,c:10512,48


In [50]:
tempo_by_company.columns

Index(['objects_cfpr_id', 'round_tempo_months'], dtype='object')

In [51]:
# obj에 회사별 median round_tempo_months 병합
obj = obj.merge(
    tempo_by_company,
    on="objects_cfpr_id",
    how="left"
)

In [52]:
obj.columns

Index(['objects_cfpr_id', 'founded_at', 'closed_at', 'description',
       'country_code', 'obj_city_fixed', 'first_investment_at',
       'last_investment_at', 'investment_rounds', 'invested_companies',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'relationships', 'cat_obj_status',
       'obj_category_filled', 'cat_obj_overview', 'obj_state_filled',
       'is_obj_funding_total_usd_private', 'success_flag', 'size_bin',
       'round_tempo_months'],
      dtype='object')

In [53]:
cond1 = (obj["round_tempo_months"].notna())
obj.loc[cond1, ["objects_cfpr_id", "round_tempo_months"]].head()

Unnamed: 0,objects_cfpr_id,round_tempo_months
66,c:10054,10
485,c:103,13
620,c:1038,19
748,c:10424,12
1015,c:10512,48


<span style="font-size: 15px;">
<code>time_to_last_round</code> : 첫 투자 → 마지막 투자 기간(개월)
</span>

In [54]:
obj["first_funding_at"] = pd.to_datetime(obj["first_funding_at"], errors="coerce")
obj["last_funding_at"]  = pd.to_datetime(obj["last_funding_at"],  errors="coerce")

# object에 있는 first_funding_at, last_funding_at 사용
obj["time_to_last_round_months"] = (
    (obj["last_funding_at"] - obj["first_funding_at"]).dt.days / 30
)
obj["time_to_last_round_months"] = obj["time_to_last_round_months"].round().astype("Int64") # 정수형으로 변경

# 유효하지 않은 값(NaT, 음수)을 NaN으로 변경
cond1 = obj["first_funding_at"].notna()
cond2 = obj["last_funding_at"].notna()
cond3 = (obj["last_funding_at"] >= obj["first_funding_at"]) # 마지막 < 첫 번째
mask_valid = ( 
    cond1 & cond2 & cond3 
)

obj.loc[~mask_valid, "time_to_last_round_months"] = np.nan
display(obj.loc[mask_valid, ["first_funding_at", "last_funding_at", "time_to_last_round_months"]].head())

Unnamed: 0,first_funding_at,last_funding_at,time_to_last_round_months
0,2005-10-01,2008-05-19,32
19,2008-10-10,2013-08-13,59
26,2003-11-01,2003-11-01,0
48,2007-10-01,2007-10-01,0
74,2011-11-04,2011-11-04,0


##### ◼ funding_rounds

In [55]:
frs.columns

Index(['funding_round_id', 'fr_c_id', 'funded_at', 'funding_round_type',
       'funding_round_code', 'raised_amount_usd', 'pre_money_valuation_usd',
       'post_money_valuation_usd', 'participants', 'is_first_round',
       'is_last_round', 'funded_year', 'funded_quarter', 'cat_fr_type',
       'num_fr_type', 'log_participants', 'is_fr_raised_private'],
      dtype='object')

In [56]:
cols_use = [
    #funding_rounds
    "funding_round_id"  , "fr_c_id"         , "funded_at"           , 
    "raised_amount_usd" , "participants"    , "is_first_round"      , 
    "is_last_round"     , "num_fr_type"     , "is_fr_raised_private",
    "cat_fr_type"  
]
frs = frs[cols_use].copy()
print(frs.columns)

Index(['funding_round_id', 'fr_c_id', 'funded_at', 'raised_amount_usd',
       'participants', 'is_first_round', 'is_last_round', 'num_fr_type',
       'is_fr_raised_private', 'cat_fr_type'],
      dtype='object')


##### ◼ investments

In [57]:
inv.columns

Index(['investments_id', 'funding_round_id', 'invested_c_id',
       'investor_cfp_id'],
      dtype='object')

In [58]:
cols_use = [
    #investments
    "investments_id", "funding_round_id", "invested_c_id", "investor_cfp_id"
]
inv = inv[cols_use].copy()
print(inv.columns)

Index(['investments_id', 'funding_round_id', 'invested_c_id',
       'investor_cfp_id'],
      dtype='object')


##### ◼ acquisitions

In [59]:
acq.columns

Index(['acquisition_id', 'acquiring_c_id', 'acquired_c_id', 'term_code',
       'price_amount', 'price_currency_code', 'acquired_at',
       'is_acq_price_private', 'is_acquisitions_acq_at_missing',
       'price_amount_usd', 'acqusition_currency_rate'],
      dtype='object')

In [60]:
cols_use = [
    #acquisitions
    "acquisition_id"    , "acquired_c_id"       , "acquiring_c_id"  , 
    "acquired_at"       , "is_acq_price_private", "price_amount_usd" 
]
acq = acq[cols_use].copy()
print(acq.columns)

Index(['acquisition_id', 'acquired_c_id', 'acquiring_c_id', 'acquired_at',
       'is_acq_price_private', 'price_amount_usd'],
      dtype='object')


In [61]:
acq["acquired_at"] = pd.to_datetime(acq["acquired_at"], errors='coerce')
acq_by_company = (
    acq
    .groupby("acquired_c_id")
    .agg(
            acquisition_id        = ("acquisition_id","first"),
            acquiring_c_id        = ("acquiring_c_id","first"),
            acquired_at           = ("acquired_at","min"), # 첫번째 이벤트
            is_acq_price_private  = ("is_acq_price_private","max"),
            price_amount_usd      = ("price_amount_usd","sum")
        )
)
acq_by_company

Unnamed: 0_level_0,acquisition_id,acquiring_c_id,acquired_at,is_acq_price_private,price_amount_usd
acquired_c_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c:10,1,c:11,2007-05-30,0,20000000.0
c:100,20,c:377,2005-05-29,1,0.0
c:1001,1901,c:5,2009-08-10,0,47500000.0
c:10014,3878,c:23054,2010-09-30,1,0.0
c:100265,6106,c:38215,2011-09-06,1,0.0
...,...,...,...,...,...
c:9949,9692,c:267149,2013-09-25,0,400000000.0
c:99685,7240,c:161312,2011-08-01,0,4800000.0
c:997,85,c:29,2007-09-01,1,350000000.0
c:99737,6463,c:67724,2011-11-30,1,0.0


##### ◼ ipos

In [62]:
ipo.columns

Index(['ipo_id', 'ipos_c_id', 'valuation_amount', 'valuation_currency_code',
       'raised_amount', 'raised_currency_code', 'public_at', 'stock_symbol',
       'is_ipos_public_at_missing', 'stock_normalized',
       'is_ipos_valuation_private', 'valuation_amount_usd',
       'valuation_currency_rate', 'is_ipos_raised_private',
       'ipo_raised_amount_usd', 'ipo_raised_currency_rate'],
      dtype='object')

In [63]:
cols_use = [
    #ipos
    "ipo_id"                , "ipos_c_id"                   , "public_at"               , 
    "valuation_amount_usd"  , "is_ipos_valuation_private"   , "is_ipos_raised_private"  , 
    "ipo_raised_amount_usd" 
]
ipo = ipo[cols_use].copy()
print(ipo.columns)

Index(['ipo_id', 'ipos_c_id', 'public_at', 'valuation_amount_usd',
       'is_ipos_valuation_private', 'is_ipos_raised_private',
       'ipo_raised_amount_usd'],
      dtype='object')


In [64]:
ipo["public_at"] = pd.to_datetime(ipo["public_at"], errors='coerce')
ipo_by_company = (
    ipo
    .groupby("ipos_c_id")
    .agg(
            ipo_id                      = ("ipo_id","first"),
            first_public_at             = ("public_at","min"),   # 첫번째 이벤트
            valuation_amount_usd        = ("valuation_amount_usd","sum"),
            ipo_raised_amount_usd       = ("ipo_raised_amount_usd","sum"),
            is_ipos_valuation_private   = ("is_ipos_valuation_private","max"),
            is_ipos_raised_private      = ("is_ipos_raised_private","max")
        )
)
ipo_by_company

Unnamed: 0_level_0,ipo_id,first_public_at,valuation_amount_usd,ipo_raised_amount_usd,is_ipos_valuation_private,is_ipos_raised_private
ipos_c_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
c:100844,1022,NaT,0.0,0.0,1,1
c:10106,1110,NaT,0.0,0.0,1,1
c:10166,840,NaT,0.0,0.0,1,1
c:10222,804,NaT,0.0,0.0,1,1
c:10241,513,NaT,0.0,0.0,1,1
...,...,...,...,...,...,...
c:966,28,NaT,0.0,0.0,1,1
c:9743,1153,NaT,0.0,0.0,1,1
c:9786,528,NaT,0.0,0.0,1,1
c:988,1146,NaT,0.0,0.0,1,1


##### ◼ offices

In [65]:
off.columns

Index(['offices_c_id', 'office_id', 'description', 'city', 'state_code',
       'country_code', 'latitude', 'longitude', 'offices_description_fixed',
       'cat_offices_description', 'offices_state_filled'],
      dtype='object')

In [66]:
cols_use = [
    #offices
    "offices_c_id"  , "office_id"  , "city"       , 
    "country_code"  , "latitude"   , "longitude"  , 
    "cat_offices_description" 
]
off = off[cols_use].copy()
print(off.columns)

Index(['offices_c_id', 'office_id', 'city', 'country_code', 'latitude',
       'longitude', 'cat_offices_description'],
      dtype='object')


In [67]:
off_by_company = (
    off
    .groupby("offices_c_id")
    .agg(
            n_offices        = ("office_id","nunique")
        )
)
off_by_company

Unnamed: 0_level_0,n_offices
offices_c_id,Unnamed: 1_level_1
c:1,2
c:10,1
c:100,1
c:10002,2
c:10003,1
...,...
f:9994,1
f:9995,1
f:9997,1
f:9998,1


##### 📌 join (start-up main)
<table style="font-size: 12px;">
  <tr>
    <th>구분</th>
    <th>설명</th>
  </tr>
  <tr>
    <td><b>grain</b></td>
    <td>스타트업 id</td>
  </tr>
  <tr>
    <td><b>contents</b></td>
    <td>스타트업 성공률 분석의 기준이 되는 테이블</td>
  </tr>
  <tr>
    <td><b>tables</b></td>
    <td>objects, funding_rounds, investments, acquisitions, ipos</td>
  </tr>
</table>


In [68]:
print(obj.shape)
print(obj.columns)

(462620, 24)
Index(['objects_cfpr_id', 'founded_at', 'closed_at', 'description',
       'country_code', 'obj_city_fixed', 'first_investment_at',
       'last_investment_at', 'investment_rounds', 'invested_companies',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'relationships', 'cat_obj_status',
       'obj_category_filled', 'cat_obj_overview', 'obj_state_filled',
       'is_obj_funding_total_usd_private', 'success_flag', 'size_bin',
       'round_tempo_months', 'time_to_last_round_months'],
      dtype='object')


In [69]:
# frs, inv
# acq_by_company
# ipo_by_company

merge_tmp = (
    obj
    # 1) obj ← frs
    .merge(
        frs,
        left_on="objects_cfpr_id",
        right_on="fr_c_id",
        how="left"
    )
    # 2) frs ← inv
    .merge(
        inv,
        on="funding_round_id",
        how="left"
    )
    # 3) inv ← acq
    .merge(
        acq_by_company,
        left_on="invested_c_id",
        right_on="acquired_c_id",
        how="left"
    )
    # 4) inv ← ipo
    .merge(
        ipo_by_company,
        left_on="invested_c_id",
        right_on="ipos_c_id",
        how="left"
    )
)
display(merge_tmp.head())
print("merge_tmp shape:", merge_tmp.shape)
print("merge_tmp columns:", merge_tmp.columns)

Unnamed: 0,objects_cfpr_id,founded_at,closed_at,description,country_code,obj_city_fixed,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,relationships,cat_obj_status,obj_category_filled,cat_obj_overview,obj_state_filled,is_obj_funding_total_usd_private,success_flag,size_bin,round_tempo_months,time_to_last_round_months,funding_round_id,fr_c_id,funded_at,raised_amount_usd,participants,is_first_round,is_last_round,num_fr_type,is_fr_raised_private,cat_fr_type,investments_id,invested_c_id,investor_cfp_id,acquisition_id,acquiring_c_id,acquired_at,is_acq_price_private,price_amount_usd,ipo_id,first_public_at,valuation_amount_usd,ipo_raised_amount_usd,is_ipos_valuation_private,is_ipos_raised_private
0,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1,대형팀,,32,888.0,c:1,2005-10-01,5250000.0,2.0,0.0,1.0,1.0,0.0,series-a,1289.0,c:1,f:430,,,NaT,,,,NaT,,,,
1,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1,대형팀,,32,888.0,c:1,2005-10-01,5250000.0,2.0,0.0,1.0,1.0,0.0,series-a,1290.0,c:1,f:3,,,NaT,,,,NaT,,,,
2,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1,대형팀,,32,889.0,c:1,NaT,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1291.0,c:1,f:4,,,NaT,,,,NaT,,,,
3,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1,대형팀,,32,889.0,c:1,NaT,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1292.0,c:1,f:430,,,NaT,,,,NaT,,,,
4,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1,대형팀,,32,889.0,c:1,NaT,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1293.0,c:1,f:3,,,NaT,,,,NaT,,,,


merge_tmp shape: (531081, 48)
merge_tmp columns: Index(['objects_cfpr_id', 'founded_at', 'closed_at', 'description',
       'country_code', 'obj_city_fixed', 'first_investment_at',
       'last_investment_at', 'investment_rounds', 'invested_companies',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'relationships', 'cat_obj_status',
       'obj_category_filled', 'cat_obj_overview', 'obj_state_filled',
       'is_obj_funding_total_usd_private', 'success_flag', 'size_bin',
       'round_tempo_months', 'time_to_last_round_months', 'funding_round_id',
       'fr_c_id', 'funded_at', 'raised_amount_usd', 'participants',
       'is_first_round', 'is_last_round', 'num_fr_type',
       'is_fr_raised_private', 'cat_fr_type', 'investments_id',
       'invested_c_id', 'investor_cfp_id', 'acquisition_id', 'acquiring_c_id',
       'acquired_at', 'is_acq_price_private', 'price_amount_usd', 'ipo_id',
       'first_public_at', 'valuation_amount_usd', 'ipo_ra

<span style="font-size: 15px;">
<code>reinvest_flag</code> : 재투자여부
</span>

In [None]:
# 투자자가 동일한 회사에 동일한 라운드 타입으로 재투자했는지 여부 확인
reinvest_check = (
    merge_tmp
    .groupby(["investor_cfp_id", "cat_fr_type"])["invested_c_id"]
    .nunique()
    .gt(1)   # gt → greater than 1
    .rename("reinvest_flag")
    .reset_index()
)

# merge_tmp에 reinvest_flag 병합
merge_tmp = merge_tmp.merge(
    reinvest_check,
    on=["investor_cfp_id", "cat_fr_type"],
    how="left"
)

In [71]:
display(merge_tmp[["objects_cfpr_id", "investor_cfp_id", "cat_fr_type", "invested_c_id", "reinvest_flag"]])

Unnamed: 0,objects_cfpr_id,investor_cfp_id,cat_fr_type,invested_c_id,reinvest_flag
0,c:1,f:430,series-a,c:1,True
1,c:1,f:3,series-a,c:1,True
2,c:1,f:4,series-b,c:1,True
3,c:1,f:430,series-b,c:1,True
4,c:1,f:3,series-b,c:1,True
...,...,...,...,...,...
531076,r:9995,,,,
531077,r:9996,,,,
531078,r:9997,,,,
531079,r:9998,,,,


In [72]:
print(merge_tmp.shape)
print(merge_tmp.columns)

(531081, 49)
Index(['objects_cfpr_id', 'founded_at', 'closed_at', 'description',
       'country_code', 'obj_city_fixed', 'first_investment_at',
       'last_investment_at', 'investment_rounds', 'invested_companies',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'relationships', 'cat_obj_status',
       'obj_category_filled', 'cat_obj_overview', 'obj_state_filled',
       'is_obj_funding_total_usd_private', 'success_flag', 'size_bin',
       'round_tempo_months', 'time_to_last_round_months', 'funding_round_id',
       'fr_c_id', 'funded_at', 'raised_amount_usd', 'participants',
       'is_first_round', 'is_last_round', 'num_fr_type',
       'is_fr_raised_private', 'cat_fr_type', 'investments_id',
       'invested_c_id', 'investor_cfp_id', 'acquisition_id', 'acquiring_c_id',
       'acquired_at', 'is_acq_price_private', 'price_amount_usd', 'ipo_id',
       'first_public_at', 'valuation_amount_usd', 'ipo_raised_amount_usd',
       'is_ipos_va

In [73]:
merge_tmp.to_csv("./data/join/success_master.csv", encoding="utf-8", index=False)
print("="*60)
print("csv 추출 완료!")
print("="*60)

csv 추출 완료!


In [74]:
# final_merge (off_by_company)
startup_info = (
    # 5) merge_tmp ← off
    merge_tmp
    .merge(
        off_by_company,
        left_on="objects_cfpr_id",
        right_on="offices_c_id",
        how="left"
    )  
)
display(startup_info.head())
print("startup_info shape:", startup_info.shape)
print("startup_info columns:", startup_info.columns)

Unnamed: 0,objects_cfpr_id,founded_at,closed_at,description,country_code,obj_city_fixed,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,relationships,cat_obj_status,obj_category_filled,cat_obj_overview,obj_state_filled,is_obj_funding_total_usd_private,success_flag,size_bin,round_tempo_months,time_to_last_round_months,funding_round_id,fr_c_id,funded_at,raised_amount_usd,participants,is_first_round,is_last_round,num_fr_type,is_fr_raised_private,cat_fr_type,investments_id,invested_c_id,investor_cfp_id,acquisition_id,acquiring_c_id,acquired_at,is_acq_price_private,price_amount_usd,ipo_id,first_public_at,valuation_amount_usd,ipo_raised_amount_usd,is_ipos_valuation_private,is_ipos_raised_private,reinvest_flag,n_offices
0,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1,대형팀,,32,888.0,c:1,2005-10-01,5250000.0,2.0,0.0,1.0,1.0,0.0,series-a,1289.0,c:1,f:430,,,NaT,,,,NaT,,,,,True,2.0
1,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1,대형팀,,32,888.0,c:1,2005-10-01,5250000.0,2.0,0.0,1.0,1.0,0.0,series-a,1290.0,c:1,f:3,,,NaT,,,,NaT,,,,,True,2.0
2,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1,대형팀,,32,889.0,c:1,NaT,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1291.0,c:1,f:4,,,NaT,,,,NaT,,,,,True,2.0
3,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1,대형팀,,32,889.0,c:1,NaT,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1292.0,c:1,f:430,,,NaT,,,,NaT,,,,,True,2.0
4,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,17,operating,web,Software & Technology,WA,0,1,대형팀,,32,889.0,c:1,NaT,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1293.0,c:1,f:3,,,NaT,,,,NaT,,,,,True,2.0


startup_info shape: (531081, 50)
startup_info columns: Index(['objects_cfpr_id', 'founded_at', 'closed_at', 'description',
       'country_code', 'obj_city_fixed', 'first_investment_at',
       'last_investment_at', 'investment_rounds', 'invested_companies',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'relationships', 'cat_obj_status',
       'obj_category_filled', 'cat_obj_overview', 'obj_state_filled',
       'is_obj_funding_total_usd_private', 'success_flag', 'size_bin',
       'round_tempo_months', 'time_to_last_round_months', 'funding_round_id',
       'fr_c_id', 'funded_at', 'raised_amount_usd', 'participants',
       'is_first_round', 'is_last_round', 'num_fr_type',
       'is_fr_raised_private', 'cat_fr_type', 'investments_id',
       'invested_c_id', 'investor_cfp_id', 'acquisition_id', 'acquiring_c_id',
       'acquired_at', 'is_acq_price_private', 'price_amount_usd', 'ipo_id',
       'first_public_at', 'valuation_amount_usd', '

In [75]:
startup_info.to_csv("./data/join/startup_info.csv", encoding="utf-8", index=False)
print("="*60)
print("csv 추출 완료!")
print("="*60)

csv 추출 완료!


##### ◼ 추가 파생변수 생성

<span style="font-size: 15px;">
<code>diff_tot_cur_rel</code> : 관계 변화량
</span>

In [76]:
# # 전체에서 현재 관계 수가 얼마나 바뀌었나
# startup_info["diff_tot_cur_rel"] = (
#     mna_size_rel["total_rel_count"] - mna_size_rel["current_rel_count"]
# )

# mna_size_rel_sorted = mna_size_rel.sort_values(
#     by="diff_tot_cur_rel",
#     ascending=False
# )

# display(mna_size_rel_sorted.sort_values(by="diff_tot_cur_rel", ascending=False))