## JOIN

In [2]:
# Import Libraries
#초기 설정및 시스템 라이브러리
import platform
import warnings

# 데이터 시각화 라이브러리
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
print(platform.system())
warnings.filterwarnings('ignore')

# 행,열,결과값 생략 없이 보기,세팅
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)
%matplotlib inline

# 시각화 OS별 한글폰트 설정
if platform.system() == 'Windows':
    plt.rcParams['font.family'] = 'Malgun Gothic'  # Windows 폰트 설정
elif platform.system() == 'Mac':
    plt.rcParams['font.family'] = 'AppleGothic'  # Mac 폰트 설정
    
print("="*60)
print("라이브러리 로드 완료!")
print("한글 폰트 설정 완료!")
print("="*60)

Windows
라이브러리 로드 완료!
한글 폰트 설정 완료!


In [3]:
# Load Dataset
obj   = pd.read_csv("./data/clean/clean_objects_final.csv")         # objects
frs   = pd.read_csv("./data/clean/clean_fr_final.csv")              # funding_rounds
inv   = pd.read_csv("./data/clean/clean_investments_final.csv")     # investments
acq   = pd.read_csv("./data/clean/clean_acquisitions_final.csv")    # acquisitions
ipo   = pd.read_csv("./data/clean/clean_ipos_final.csv")            # ipos
rel   = pd.read_csv("./data/clean/clean_relationships_final.csv")   # relationships
off   = pd.read_csv("./data/clean/clean_offices_final.csv")         # offices
print("="*60)
print("데이터셋 로드 완료!")
print("="*60)

데이터셋 로드 완료!


In [4]:
print("obj: ", obj.shape)
print("frs: ", frs.shape)
print("inv: ", inv.shape)
print("acq: ", acq.shape)
print("ipo: ", ipo.shape)
print("rel: ", rel.shape)
print("off: ", off.shape)

obj:  (462620, 45)
frs:  (52928, 17)
inv:  (80902, 4)
acq:  (9562, 11)
ipo:  (1259, 16)
rel:  (402412, 10)
off:  (112718, 11)


### START-UP

#### 투자 성공률 (핵심 테이블: FRS,INV, ACQ, IPO, OFF)
<span style="font-size: 15px;">
grain: 스타트업 id 
</span>

##### ◼ 파생변수 생성

<span style="font-size: 15px;">
<code>success_flag</code> : 성공적인 EXIT을 달성 플래그
</span>

In [5]:
cond1 = obj["objects_cfpr_id"].astype(str).str.startswith("c:")     # 회사 조건
cond2 = obj["objects_cfpr_id"].isin(ipo["ipos_c_id"].dropna())      # IPO 성공 조건
cond3 = obj["objects_cfpr_id"].isin(acq["acquired_c_id"].dropna())  # 인수(M&A) 성공 조건

In [6]:
# cond4: funding_rounds의 num_fr_type 조건
frs["funded_at"] = pd.to_datetime(frs["funded_at"], errors="coerce") # 날짜 변환

# 회사별 num_fr_type 범위
fr_type_range = (
    frs.dropna(subset=["fr_c_id", "num_fr_type"])
       .groupby("fr_c_id")["num_fr_type"]
       .agg(num_fr_type_min="min", num_fr_type_max="max")
)

# cond4 해석:
# 라운드 타입이 최소 1단계라도 진행(= max - min >= 1)
# max_type < 5 
cond4_1 = (fr_type_range["num_fr_type_max"] - fr_type_range["num_fr_type_min"]) >= 1
cond4_2 = fr_type_range["num_fr_type_max"] < 5
fr_type_range["is_round_on"] = cond4_1 & cond4_2 # cond4 라운드 진행 조건
fr_type_range

Unnamed: 0_level_0,num_fr_type_min,num_fr_type_max,is_round_on
fr_c_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c:1,1,3,True
c:1001,1,1,False
c:10014,0,0,False
c:10015,1,99,False
c:100155,2,99,False
...,...,...,...
c:99853,0,0,False
c:9989,0,0,False
c:9994,0,0,False
c:9995,0,0,False


In [7]:
# obj에 회사별 cond4 붙이기
obj_tmp = obj.merge(fr_type_range[["is_round_on"]], left_on="objects_cfpr_id", right_index=True, how="left")
obj_tmp["is_round_on"] = obj_tmp["is_round_on"].fillna(False)

# 성공 플래그 (회사면서, IPO or 인수 or 라운드)
obj["success_flag"] = (cond1 & (cond2 | cond3 | obj_tmp["is_round_on"])).astype(int)
obj

Unnamed: 0,objects_cfpr_id,entity_type,parent_c_id,normalized_name,category_code,status,founded_at,closed_at,description,overview,tag_list,country_code,state_code,city,region,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,is_obj_parent_id_missing,is_obj_category_missing,is_obj_founded_missing,is_obj_closed_missing,is_obj_overview_missing,is_obj_state_missing,is_obj_inv_rounds_missing,is_obj_inv_comp_missing,cat_obj_status,obj_overview_fixed,cat_obj_overview,obj_region_fixed,cat_obj_region,obj_state_filled,obj_city_fixed,obj_category_filled,is_obj_funding_total_usd_private,is_obj_funding_rounds_private,success_flag
0,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,5,17,1,0,0,1,0,0,0,0,operating,"wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. wetpaints own online property, wetpaint entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million facebook fans, is a proof point to the companys success in building and engaging audiences. media companies can license wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. founded by internet pioneer ben elowitz, and with offices in new york and seattle, wetpaint is backed by accel partners, the investors behind facebook",Software & Technology,seattle,seattle,WA,seattle,web,0,0,1
1,c:10,Company,,flektor,games_video,acquired,,,,"Flektor is a rich-media mash-up platform that enables consumers to create, remix and share photos and videos on the internet without the need for advanced video-editing skills or software.\n\nFox Interactive Media, a division of News Corporation, announced that it had completed the purchase of Flektor on May 30, 2007. The estimated puchase price is $15-20 million.","flektor, photo, video",USA,CA,Culver City,Los Angeles,,,0,0,,,,,,,0,6,1,0,1,1,0,0,0,0,acquired,"flektor is a rich-media mash-up platform that enables consumers to create, remix and share photos and videos on the internet without the need for advanced video-editing skills or software. fox interactive media, a division of news corporation, announced that it had completed the purchase of flektor on may 30, 2007. the estimated puchase price is $15-20 million",Software & Technology,los angeles,los angeles,CA,culver city,games_video,1,1,1
2,c:100,Company,,there,games_video,acquired,,,,"There.com is an online virtual world where anyone can explore, meet friends and play games. It was founded in 1998 by Will Harvey, a Stanford computer science Ph.D. and game developer, and Jeffrey Ventrella, an expert on artificial life from MIT's Media Lab. The duo raised approximately $37 million - including $20 million from employees, $11 million from angel investors and $6 million from Sutter Hill Ventures. In 2005 the company was spun off under Makena Technologies, and in March 2010 There closed to the public. In May 2011, There announced it would reopen as a 18+ Cloud-based service. As of Nov 2013, There is open.\n\nThere.com is a subscription service with a monthly fee of $10.00. Additional in-game accessories can be purchased for separate fees.\n\nOther online virtual worlds include [Kaneva](http://www.crunchbase.com/company/kaneva), [Second Life](http://www.crunchbase.com/company/secondlife) and [Cyworld](http://www.crunchbase.com/company/cyworld).","virtualworld, there, teens",USA,CA,San Mateo,SF Bay,,,0,0,,,,,2003-02-01,2011-09-23,4,12,1,0,1,1,0,0,0,0,acquired,"there.com is an online virtual world where anyone can explore, meet friends and play games. it was founded in 1998 by will harvey, a stanford computer science ph.d. and game developer, and jeffrey ventrella, an expert on artificial life from mit's media lab. the duo raised approximately $37 million - including $20 million from employees, $11 million from angel investors and $6 million from sutter hill ventures. in 2005 the company was spun off under makena technologies, and in march 2010 there closed to the public. in may 2011, there announced it would reopen as a 18+ cloud-based service. as of nov 2013, there is open. there.com is a subscription service with a monthly fee of $10.00. additional in-game accessories can be purchased for separate fees. other online virtual worlds include [kaneva](http://www.crunchbase.com/company/kaneva), [second life](http://www.crunchbase.com/company/secondlife) and [cyworld](http://www.crunchbase.com/company/cyworld)",Software & Technology,sf bay area,sf bay area,CA,san mateo,games_video,1,1,1
3,c:10000,Company,,mywebbo,network_hosting,operating,2008-07-26,,,"BRAND NEW ONLINE SOCIAL NETWORKING WEBSITE,FOR MAKING NEW FRIENDS OR CHATTING TO OLD ONE'S.\n\nPACKED WITH NEW FEATURES SUCH AS RATING PROFILES , RATING MUSIC,VIDEO'S AND PICTURES ,UPLOADING MUSIC ,VIDEO'S PICTURES , CREATING CLASSIFIED ADS ,SHOUTOUT BOX!, AND ONLINE CHAT AREA FOR MAKING NEW FRIENDS OR SIMPLY CHATTING TO YOUR OLD ONE'S ,THERE ARE LOADS OF GREAT FEATURES FOR ANYONE TO TRY .. PLUS MANY MORE TO COME .","social-network, new, website, web, friends, chat, people",,,,unknown,,,0,0,,,0.0,0.0,,,0,0,1,0,0,1,0,1,0,0,operating,"brand new online social networking website,for making new friends or chatting to old one's. packed with new features such as rating profiles , rating music,video's and pictures ,uploading music ,video's pictures , creating classified ads ,shoutout box!, and online chat area for making new friends or simply chatting to your old one's ,there are loads of great features for anyone to try .. plus many more to come",Software & Technology,,,,,network_hosting,0,0,0
4,c:10001,Company,,the movie streamer,games_video,operating,2008-07-26,,,"This company shows free movies online on their website which, in fact, is not illegal since they are not the ones hosting the videos.","watch, full-length, moives, online, for, free, streaming, videos, tv-shows",,,,unknown,,,0,0,,,0.0,0.0,,,0,0,1,0,0,1,0,1,0,0,operating,"this company shows free movies online on their website which, in fact, is not illegal since they are not the ones hosting the videos",Software & Technology,,,,,games_video,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462615,r:9995,Product,c:14164,sitelink listing feed for brokerages,,operating,,,,,,,,,unknown,,,0,0,,,0.0,0.0,,,0,0,0,1,1,1,1,1,0,0,operating,,Software & Technology,,,,,software,0,0,0
462616,r:9996,Product,c:14164,edclink listing feed for economic development groups,,operating,,,,,,,,,unknown,,,0,0,,,0.0,0.0,,,0,0,0,1,1,1,1,1,0,0,operating,,Software & Technology,,,,,software,0,0,0
462617,r:9997,Product,c:14164,cmail broadcast email marketing,,operating,,,,,,,,,unknown,,,0,0,,,0.0,0.0,,,0,0,0,1,1,1,1,1,0,0,operating,,Software & Technology,,,,,software,0,0,0
462618,r:9998,Product,c:14164,catylistcrm contact database,,operating,,,,,,,,,unknown,,,0,0,,,0.0,0.0,,,0,0,0,1,1,1,1,1,0,0,operating,,Software & Technology,,,,,software,0,0,0


In [8]:
obj[["objects_cfpr_id", "success_flag"]].head()

Unnamed: 0,objects_cfpr_id,success_flag
0,c:1,1
1,c:10,1
2,c:100,1
3,c:10000,0
4,c:10001,0


<span style="font-size: 15px;">
<code>size_bin</code> : 기업의 규모 구간
</span>

In [9]:
# 회사 정보만 필터링
cond1 = obj["objects_cfpr_id"].astype(str).str.startswith("c:")
obj_size_bin = obj.loc[cond1].copy()
display(obj_size_bin["objects_cfpr_id"].str[0].value_counts())

# quantile 기반 bins
q = obj_size_bin["relationships"].quantile([0.25, 0.5, 0.75])
print("bin 기준\n", q)

bins = [-1, q[0.25], q[0.5], q[0.75], obj_size_bin["relationships"].max()]
labels = ["초소형팀", "소형팀", "중형팀", "대형팀"]

obj_size_bin["size_bin"] = pd.cut(
    obj_size_bin["relationships"],
    bins=bins,
    labels=labels,
    include_lowest=True  # 최소값도 포함
)

display(obj_size_bin["size_bin"].value_counts())
# 초소형 66885 | 소형 70569 | 중형 23258 | 대형 35837

# obj에 size_bin 병합 → 회사가 아닌 row는 size_bin이 NaN으로 남음
obj = obj.merge(
    obj_size_bin[["objects_cfpr_id", "size_bin"]],
    on="objects_cfpr_id",
    how="left"
)
display(obj[["objects_cfpr_id", "relationships", "size_bin"]].head())

objects_cfpr_id
c    196549
Name: count, dtype: int64

bin 기준
 0.25    0.0
0.50    1.0
0.75    2.0
Name: relationships, dtype: float64


size_bin
소형팀     70569
초소형팀    66885
대형팀     35837
중형팀     23258
Name: count, dtype: int64

Unnamed: 0,objects_cfpr_id,relationships,size_bin
0,c:1,17,대형팀
1,c:10,6,대형팀
2,c:100,12,대형팀
3,c:10000,0,초소형팀
4,c:10001,0,초소형팀


<span style="font-size: 15px;">
<code>round_tempo_months</code> : 투자 템포(개월)
</span>

In [10]:
# 회사별 라운드 간격 계산
round_tempo = (frs.sort_values(['fr_c_id','funded_at']))
round_tempo['prev_round_date'] = round_tempo.groupby('fr_c_id')['funded_at'].shift(1)

round_tempo['round_tempo_days'] = (round_tempo['funded_at'] - round_tempo['prev_round_date']).dt.days
round_tempo["round_tempo_days"] = round_tempo["round_tempo_days"].astype("Int64") # 정수형으로 변경

round_tempo['round_tempo_months'] = round(round_tempo['round_tempo_days'] / 30) 
display(round_tempo[["fr_c_id","funded_at", "round_tempo_days","round_tempo_months"]])

Unnamed: 0,fr_c_id,funded_at,round_tempo_days,round_tempo_months
818,c:1,2005-10-01,,
819,c:1,2007-01-01,457,15.0
2109,c:1,2008-05-19,504,17.0
1517,c:1001,2008-02-26,,
6048,c:10014,2008-09-01,,
...,...,...,...,...
5207,c:9989,2009-02-01,326,11.0
2958,c:9994,2007-01-01,,
5527,c:9994,2008-05-01,486,16.0
2967,c:9995,2008-08-25,,


In [11]:
# 회사별 median round_tempo_months 계산
tempo_by_company = (
    round_tempo
    .groupby("fr_c_id")
    .agg(    
        round_tempo_months=("round_tempo_months", "median") 
    )
    .reset_index()
    .rename(columns={"fr_c_id": "objects_cfpr_id"})
)
tempo_by_company["round_tempo_months"] = tempo_by_company["round_tempo_months"].astype("Int64") # 정수형으로 변경
tempo_by_company[tempo_by_company["round_tempo_months"].notna()].head()

Unnamed: 0,objects_cfpr_id,round_tempo_months
0,c:1,16
3,c:10015,17
4,c:100155,5
5,c:10018,28
9,c:100238,10


In [12]:
tempo_by_company.columns

Index(['objects_cfpr_id', 'round_tempo_months'], dtype='object')

In [13]:
# obj에 회사별 median round_tempo_months 병합
obj = obj.merge(
    tempo_by_company,
    on="objects_cfpr_id",
    how="left"
)

In [14]:
obj.columns

Index(['objects_cfpr_id', 'entity_type', 'parent_c_id', 'normalized_name',
       'category_code', 'status', 'founded_at', 'closed_at', 'description',
       'overview', 'tag_list', 'country_code', 'state_code', 'city', 'region',
       'first_investment_at', 'last_investment_at', 'investment_rounds',
       'invested_companies', 'first_funding_at', 'last_funding_at',
       'funding_rounds', 'funding_total_usd', 'first_milestone_at',
       'last_milestone_at', 'milestones', 'relationships',
       'is_obj_parent_id_missing', 'is_obj_category_missing',
       'is_obj_founded_missing', 'is_obj_closed_missing',
       'is_obj_overview_missing', 'is_obj_state_missing',
       'is_obj_inv_rounds_missing', 'is_obj_inv_comp_missing',
       'cat_obj_status', 'obj_overview_fixed', 'cat_obj_overview',
       'obj_region_fixed', 'cat_obj_region', 'obj_state_filled',
       'obj_city_fixed', 'obj_category_filled',
       'is_obj_funding_total_usd_private', 'is_obj_funding_rounds_private',
     

In [15]:
cond1 = (obj["round_tempo_months"].notna())
obj.loc[cond1, ["objects_cfpr_id", "round_tempo_months"]].head()

Unnamed: 0,objects_cfpr_id,round_tempo_months
0,c:1,16
19,c:10015,17
20,c:100155,5
24,c:10018,28
36,c:100238,10


<span style="font-size: 15px;">
<code>time_to_last_round</code> : 첫 투자 → 마지막 투자 기간(개월)
</span>

In [16]:
obj["first_funding_at"] = pd.to_datetime(obj["first_funding_at"], errors="coerce")
obj["last_funding_at"]  = pd.to_datetime(obj["last_funding_at"],  errors="coerce")

# object에 있는 first_funding_at, last_funding_at 사용
obj["time_to_last_round_months"] = (
    (obj["last_funding_at"] - obj["first_funding_at"]).dt.days / 30
)
obj["time_to_last_round_months"] = obj["time_to_last_round_months"].round().astype("Int64") # 정수형으로 변경

# 유효하지 않은 값(NaT, 음수)을 NaN으로 변경
cond1 = obj["first_funding_at"].notna()
cond2 = obj["last_funding_at"].notna()
cond3 = (obj["last_funding_at"] >= obj["first_funding_at"]) # 마지막 < 첫 번째
mask_valid = ( 
    cond1 & cond2 & cond3 
)

obj.loc[~mask_valid, "time_to_last_round_months"] = np.nan
display(obj.loc[mask_valid, ["first_funding_at", "last_funding_at", "time_to_last_round_months"]].head())

Unnamed: 0,first_funding_at,last_funding_at,time_to_last_round_months
0,2005-10-01,2008-05-19,32
13,2008-02-26,2008-02-26,0
18,2008-09-01,2008-09-01,0
19,2008-10-10,2013-08-13,59
20,2011-03-08,2012-01-26,11


##### ◼ funding_rounds

In [17]:
frs.columns

Index(['funding_round_id', 'fr_c_id', 'funded_at', 'funding_round_type',
       'funding_round_code', 'raised_amount_usd', 'pre_money_valuation_usd',
       'post_money_valuation_usd', 'participants', 'is_first_round',
       'is_last_round', 'funded_year', 'funded_quarter', 'cat_fr_type',
       'num_fr_type', 'log_participants', 'is_fr_raised_private'],
      dtype='object')

In [18]:
cols_use = [
    #funding_rounds
    "funding_round_id"  , "fr_c_id"         , "funded_at"           , 
    "raised_amount_usd" , "participants"    , "is_first_round"      , 
    "is_last_round"     , "num_fr_type"     , "is_fr_raised_private",
    "cat_fr_type"  
]
frs = frs[cols_use].copy()
print(frs.columns)

Index(['funding_round_id', 'fr_c_id', 'funded_at', 'raised_amount_usd',
       'participants', 'is_first_round', 'is_last_round', 'num_fr_type',
       'is_fr_raised_private', 'cat_fr_type'],
      dtype='object')


##### ◼ investments

In [19]:
inv.columns

Index(['investments_id', 'funding_round_id', 'invested_c_id',
       'investor_cfp_id'],
      dtype='object')

In [20]:
cols_use = [
    #investments
    "investments_id", "funding_round_id", "invested_c_id", "investor_cfp_id"
]
inv = inv[cols_use].copy()
print(inv.columns)

Index(['investments_id', 'funding_round_id', 'invested_c_id',
       'investor_cfp_id'],
      dtype='object')


##### ◼ acquisitions

In [21]:
acq.columns

Index(['acquisition_id', 'acquiring_c_id', 'acquired_c_id', 'term_code',
       'price_amount', 'price_currency_code', 'acquired_at',
       'is_acq_price_private', 'is_acquisitions_acq_at_missing',
       'price_amount_usd', 'acqusition_currency_rate'],
      dtype='object')

In [22]:
cols_use = [
    #acquisitions
    "acquisition_id"    , "acquired_c_id"       , "acquiring_c_id"  , 
    "acquired_at"       , "is_acq_price_private", "price_amount_usd" 
]
acq = acq[cols_use].copy()
print(acq.columns)

Index(['acquisition_id', 'acquired_c_id', 'acquiring_c_id', 'acquired_at',
       'is_acq_price_private', 'price_amount_usd'],
      dtype='object')


In [23]:
acq["acquired_at"] = pd.to_datetime(acq["acquired_at"], errors='coerce')
acq_by_company = (
    acq
    .groupby("acquired_c_id")
    .agg(
            acquisition_id        = ("acquisition_id","first"),
            acquiring_c_id        = ("acquiring_c_id","first"),
            acquired_at           = ("acquired_at","min"), # 첫번째 이벤트
            is_acq_price_private  = ("is_acq_price_private","max"),
            price_amount_usd      = ("price_amount_usd","sum")
        )
)
acq_by_company

Unnamed: 0_level_0,acquisition_id,acquiring_c_id,acquired_at,is_acq_price_private,price_amount_usd
acquired_c_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c:10,1,c:11,2007-05-30,0,20000000.0
c:100,20,c:377,2005-05-29,1,0.0
c:1001,1901,c:5,2009-08-10,0,47500000.0
c:10014,3878,c:23054,2010-09-30,1,0.0
c:100265,6106,c:38215,2011-09-06,1,0.0
...,...,...,...,...,...
c:9949,9692,c:267149,2013-09-25,0,400000000.0
c:99685,7240,c:161312,2011-08-01,0,4800000.0
c:997,85,c:29,2007-09-01,1,350000000.0
c:99737,6463,c:67724,2011-11-30,1,0.0


##### ◼ ipos

In [24]:
ipo.columns

Index(['ipo_id', 'ipos_c_id', 'valuation_amount', 'valuation_currency_code',
       'raised_amount', 'raised_currency_code', 'public_at', 'stock_symbol',
       'is_ipos_public_at_missing', 'stock_normalized',
       'is_ipos_valuation_private', 'valuation_amount_usd',
       'valuation_currency_rate', 'is_ipos_raised_private',
       'ipo_raised_amount_usd', 'ipo_raised_currency_rate'],
      dtype='object')

In [25]:
cols_use = [
    #ipos
    "ipo_id"                , "ipos_c_id"                   , "public_at"               , 
    "valuation_amount_usd"  , "is_ipos_valuation_private"   , "is_ipos_raised_private"  , 
    "ipo_raised_amount_usd" 
]
ipo = ipo[cols_use].copy()
print(ipo.columns)

Index(['ipo_id', 'ipos_c_id', 'public_at', 'valuation_amount_usd',
       'is_ipos_valuation_private', 'is_ipos_raised_private',
       'ipo_raised_amount_usd'],
      dtype='object')


In [26]:
ipo["public_at"] = pd.to_datetime(ipo["public_at"], errors='coerce')
ipo_by_company = (
    ipo
    .groupby("ipos_c_id")
    .agg(
            ipo_id                      = ("ipo_id","first"),
            first_public_at             = ("public_at","min"),   # 첫번째 이벤트
            valuation_amount_usd        = ("valuation_amount_usd","sum"),
            ipo_raised_amount_usd       = ("ipo_raised_amount_usd","sum"),
            is_ipos_valuation_private   = ("is_ipos_valuation_private","max"),
            is_ipos_raised_private      = ("is_ipos_raised_private","max")
        )
)
ipo_by_company

Unnamed: 0_level_0,ipo_id,first_public_at,valuation_amount_usd,ipo_raised_amount_usd,is_ipos_valuation_private,is_ipos_raised_private
ipos_c_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
c:100844,1022,2007-08-01,0.0,0.0,1,1
c:10106,1110,2002-06-21,0.0,0.0,1,1
c:10166,840,NaT,0.0,0.0,1,1
c:10222,804,1978-01-13,0.0,0.0,1,1
c:10241,513,NaT,0.0,0.0,1,1
...,...,...,...,...,...,...
c:966,28,1986-04-11,0.0,0.0,1,1
c:9743,1153,2012-07-27,0.0,0.0,1,1
c:9786,528,NaT,0.0,0.0,1,1
c:988,1146,2008-01-11,0.0,0.0,1,1


##### ◼ offices

In [27]:
off.columns

Index(['offices_c_id', 'office_id', 'description', 'city', 'state_code',
       'country_code', 'latitude', 'longitude', 'offices_description_fixed',
       'cat_offices_description', 'offices_state_filled'],
      dtype='object')

In [28]:
cols_use = [
    #offices
    "offices_c_id"  , "office_id"  , "city"       , 
    "country_code"  , "latitude"   , "longitude"  , 
    "cat_offices_description" 
]
off = off[cols_use].copy()
print(off.columns)

Index(['offices_c_id', 'office_id', 'city', 'country_code', 'latitude',
       'longitude', 'cat_offices_description'],
      dtype='object')


In [29]:
off_by_company = (
    off
    .groupby("offices_c_id")
    .agg(
            n_offices        = ("office_id","nunique")
        )
)
off_by_company

Unnamed: 0_level_0,n_offices
offices_c_id,Unnamed: 1_level_1
c:1,2
c:10,1
c:100,1
c:10002,2
c:10003,1
...,...
f:9994,1
f:9995,1
f:9997,1
f:9998,1


##### 📌 join (start-up main)
<table style="font-size: 12px;">
  <tr>
    <th>구분</th>
    <th>설명</th>
  </tr>
  <tr>
    <td><b>grain</b></td>
    <td>스타트업 id</td>
  </tr>
  <tr>
    <td><b>contents</b></td>
    <td>스타트업 성공률 분석의 기준이 되는 테이블</td>
  </tr>
  <tr>
    <td><b>tables</b></td>
    <td>objects, funding_rounds, investments, acquisitions, ipos</td>
  </tr>
</table>


In [30]:
print(obj.shape)
print(obj.columns)

(462620, 49)
Index(['objects_cfpr_id', 'entity_type', 'parent_c_id', 'normalized_name',
       'category_code', 'status', 'founded_at', 'closed_at', 'description',
       'overview', 'tag_list', 'country_code', 'state_code', 'city', 'region',
       'first_investment_at', 'last_investment_at', 'investment_rounds',
       'invested_companies', 'first_funding_at', 'last_funding_at',
       'funding_rounds', 'funding_total_usd', 'first_milestone_at',
       'last_milestone_at', 'milestones', 'relationships',
       'is_obj_parent_id_missing', 'is_obj_category_missing',
       'is_obj_founded_missing', 'is_obj_closed_missing',
       'is_obj_overview_missing', 'is_obj_state_missing',
       'is_obj_inv_rounds_missing', 'is_obj_inv_comp_missing',
       'cat_obj_status', 'obj_overview_fixed', 'cat_obj_overview',
       'obj_region_fixed', 'cat_obj_region', 'obj_state_filled',
       'obj_city_fixed', 'obj_category_filled',
       'is_obj_funding_total_usd_private', 'is_obj_funding_rounds_pr

In [31]:
# frs, inv
# acq_by_company
# ipo_by_company

merge_tmp = (
    obj
    # 1) obj ← frs
    .merge(
        frs,
        left_on="objects_cfpr_id",
        right_on="fr_c_id",
        how="left"
    )
    # 2) frs ← inv
    .merge(
        inv,
        on="funding_round_id",
        how="left"
    )
    # 3) inv ← acq
    .merge(
        acq_by_company,
        left_on="invested_c_id",
        right_on="acquired_c_id",
        how="left"
    )
    # 4) inv ← ipo
    .merge(
        ipo_by_company,
        left_on="invested_c_id",
        right_on="ipos_c_id",
        how="left"
    )
)
display(merge_tmp.head())
print("merge_tmp shape:", merge_tmp.shape)
print("merge_tmp columns:", merge_tmp.columns)

Unnamed: 0,objects_cfpr_id,entity_type,parent_c_id,normalized_name,category_code,status,founded_at,closed_at,description,overview,tag_list,country_code,state_code,city,region,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,...,time_to_last_round_months,funding_round_id,fr_c_id,funded_at,raised_amount_usd,participants,is_first_round,is_last_round,num_fr_type,is_fr_raised_private,cat_fr_type,investments_id,invested_c_id,investor_cfp_id,acquisition_id,acquiring_c_id,acquired_at,is_acq_price_private,price_amount_usd,ipo_id,first_public_at,valuation_amount_usd,ipo_raised_amount_usd,is_ipos_valuation_private,is_ipos_raised_private
0,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,...,32,888.0,c:1,2005-10-01,5250000.0,2.0,0.0,1.0,1.0,0.0,series-a,1289.0,c:1,f:430,,,NaT,,,,NaT,,,,
1,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,...,32,888.0,c:1,2005-10-01,5250000.0,2.0,0.0,1.0,1.0,0.0,series-a,1290.0,c:1,f:3,,,NaT,,,,NaT,,,,
2,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,...,32,889.0,c:1,2007-01-01,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1291.0,c:1,f:4,,,NaT,,,,NaT,,,,
3,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,...,32,889.0,c:1,2007-01-01,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1292.0,c:1,f:430,,,NaT,,,,NaT,,,,
4,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,...,32,889.0,c:1,2007-01-01,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1293.0,c:1,f:3,,,NaT,,,,NaT,,,,


merge_tmp shape: (531081, 73)
merge_tmp columns: Index(['objects_cfpr_id', 'entity_type', 'parent_c_id', 'normalized_name',
       'category_code', 'status', 'founded_at', 'closed_at', 'description',
       'overview', 'tag_list', 'country_code', 'state_code', 'city', 'region',
       'first_investment_at', 'last_investment_at', 'investment_rounds',
       'invested_companies', 'first_funding_at', 'last_funding_at',
       'funding_rounds', 'funding_total_usd', 'first_milestone_at',
       'last_milestone_at', 'milestones', 'relationships',
       'is_obj_parent_id_missing', 'is_obj_category_missing',
       'is_obj_founded_missing', 'is_obj_closed_missing',
       'is_obj_overview_missing', 'is_obj_state_missing',
       'is_obj_inv_rounds_missing', 'is_obj_inv_comp_missing',
       'cat_obj_status', 'obj_overview_fixed', 'cat_obj_overview',
       'obj_region_fixed', 'cat_obj_region', 'obj_state_filled',
       'obj_city_fixed', 'obj_category_filled',
       'is_obj_funding_total_usd

<span style="font-size: 15px;">
<code>reinvest_flag</code> : 재투자여부
</span>

In [32]:
# 투자자가 동일한 회사에 동일한 라운드 타입으로 재투자했는지 여부 확인
reinvest_check = (
    merge_tmp
    .groupby(["investor_cfp_id", "cat_fr_type"])["invested_c_id"]
    .nunique()
    .gt(1)   # gt → greater than 1
    .rename("reinvest_flag")
    .reset_index()
)

# merge_tmp에 reinvest_flag 병합
merge_tmp = merge_tmp.merge(
    reinvest_check,
    on=["investor_cfp_id", "cat_fr_type"],
    how="left"
)

In [33]:
display(merge_tmp[["objects_cfpr_id", "investor_cfp_id", "cat_fr_type", "invested_c_id", "reinvest_flag"]])

Unnamed: 0,objects_cfpr_id,investor_cfp_id,cat_fr_type,invested_c_id,reinvest_flag
0,c:1,f:430,series-a,c:1,True
1,c:1,f:3,series-a,c:1,True
2,c:1,f:4,series-b,c:1,True
3,c:1,f:430,series-b,c:1,True
4,c:1,f:3,series-b,c:1,True
...,...,...,...,...,...
531076,r:9995,,,,
531077,r:9996,,,,
531078,r:9997,,,,
531079,r:9998,,,,


In [34]:
print(merge_tmp.shape)
print(merge_tmp.columns)

(531081, 74)
Index(['objects_cfpr_id', 'entity_type', 'parent_c_id', 'normalized_name',
       'category_code', 'status', 'founded_at', 'closed_at', 'description',
       'overview', 'tag_list', 'country_code', 'state_code', 'city', 'region',
       'first_investment_at', 'last_investment_at', 'investment_rounds',
       'invested_companies', 'first_funding_at', 'last_funding_at',
       'funding_rounds', 'funding_total_usd', 'first_milestone_at',
       'last_milestone_at', 'milestones', 'relationships',
       'is_obj_parent_id_missing', 'is_obj_category_missing',
       'is_obj_founded_missing', 'is_obj_closed_missing',
       'is_obj_overview_missing', 'is_obj_state_missing',
       'is_obj_inv_rounds_missing', 'is_obj_inv_comp_missing',
       'cat_obj_status', 'obj_overview_fixed', 'cat_obj_overview',
       'obj_region_fixed', 'cat_obj_region', 'obj_state_filled',
       'obj_city_fixed', 'obj_category_filled',
       'is_obj_funding_total_usd_private', 'is_obj_funding_rounds_pr

In [35]:
merge_tmp.to_csv("./data/join/success_master.csv", encoding="utf-8", index=False)
print("="*60)
print("csv 추출 완료!")
print("="*60)

csv 추출 완료!


In [36]:
# final_merge (off_by_company)
startup_info = (
    # 5) merge_tmp ← off
    merge_tmp
    .merge(
        off_by_company,
        left_on="objects_cfpr_id",
        right_on="offices_c_id",
        how="left"
    )  
)
display(startup_info.head())
print("startup_info shape:", startup_info.shape)
print("startup_info columns:", startup_info.columns)

Unnamed: 0,objects_cfpr_id,entity_type,parent_c_id,normalized_name,category_code,status,founded_at,closed_at,description,overview,tag_list,country_code,state_code,city,region,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,...,fr_c_id,funded_at,raised_amount_usd,participants,is_first_round,is_last_round,num_fr_type,is_fr_raised_private,cat_fr_type,investments_id,invested_c_id,investor_cfp_id,acquisition_id,acquiring_c_id,acquired_at,is_acq_price_private,price_amount_usd,ipo_id,first_public_at,valuation_amount_usd,ipo_raised_amount_usd,is_ipos_valuation_private,is_ipos_raised_private,reinvest_flag,n_offices
0,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,...,c:1,2005-10-01,5250000.0,2.0,0.0,1.0,1.0,0.0,series-a,1289.0,c:1,f:430,,,NaT,,,,NaT,,,,,True,2.0
1,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,...,c:1,2005-10-01,5250000.0,2.0,0.0,1.0,1.0,0.0,series-a,1290.0,c:1,f:3,,,NaT,,,,NaT,,,,,True,2.0
2,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,...,c:1,2007-01-01,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1291.0,c:1,f:4,,,NaT,,,,NaT,,,,,True,2.0
3,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,...,c:1,2007-01-01,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1292.0,c:1,f:430,,,NaT,,,,NaT,,,,,True,2.0
4,c:1,Company,,wetpaint,web,operating,2005-10-17,,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,,,0,0,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,...,c:1,2007-01-01,9500000.0,3.0,0.0,0.0,2.0,0.0,series-b,1293.0,c:1,f:3,,,NaT,,,,NaT,,,,,True,2.0


startup_info shape: (531081, 75)
startup_info columns: Index(['objects_cfpr_id', 'entity_type', 'parent_c_id', 'normalized_name',
       'category_code', 'status', 'founded_at', 'closed_at', 'description',
       'overview', 'tag_list', 'country_code', 'state_code', 'city', 'region',
       'first_investment_at', 'last_investment_at', 'investment_rounds',
       'invested_companies', 'first_funding_at', 'last_funding_at',
       'funding_rounds', 'funding_total_usd', 'first_milestone_at',
       'last_milestone_at', 'milestones', 'relationships',
       'is_obj_parent_id_missing', 'is_obj_category_missing',
       'is_obj_founded_missing', 'is_obj_closed_missing',
       'is_obj_overview_missing', 'is_obj_state_missing',
       'is_obj_inv_rounds_missing', 'is_obj_inv_comp_missing',
       'cat_obj_status', 'obj_overview_fixed', 'cat_obj_overview',
       'obj_region_fixed', 'cat_obj_region', 'obj_state_filled',
       'obj_city_fixed', 'obj_category_filled',
       'is_obj_funding_tot

In [37]:
startup_info.to_csv("./data/join/startup_info.csv", encoding="utf-8", index=False)
print("="*60)
print("csv 추출 완료!")
print("="*60)

csv 추출 완료!


##### ◼ 추가 파생변수 생성

<span style="font-size: 15px;">
<code>diff_tot_cur_rel</code> : 관계 변화량
</span>

In [38]:
# # 전체에서 현재 관계 수가 얼마나 바뀌었나
# startup_info["diff_tot_cur_rel"] = (
#     mna_size_rel["total_rel_count"] - mna_size_rel["current_rel_count"]
# )

# mna_size_rel_sorted = mna_size_rel.sort_values(
#     by="diff_tot_cur_rel",
#     ascending=False
# )

# display(mna_size_rel_sorted.sort_values(by="diff_tot_cur_rel", ascending=False))