In [1]:
#초기 설정및 시스템 라이브러리
import platform
import warnings

# 데이터 시각화 라이브러리
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta
print(platform.system())
warnings.filterwarnings('ignore')

# 행,열,결과값 생략 없이 보기,세팅
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)

# 한글 폰트 설정
pd.set_option("display.float_format","{:.2f}".format)
# OS에 따라 다른 폰트 지정
if platform.system() == 'Darwin':   # macOS
    plt.rcParams['font.family'] = 'AppleGothic'
elif platform.system() == 'Windows':  # Windows
    plt.rcParams['font.family'] = 'Malgun Gothic'
else:  # Linux (예: Colab, Ubuntu)
    plt.rcParams['font.family'] = 'NanumGothic'

Darwin


# 전처리 파일 불러오기

In [2]:
peo = pd.read_csv("./data/clean_data/clean_people_final.csv")
rel = pd.read_csv("./data/clean_data/clean_relationships_final.csv")
obj = pd.read_csv("./data/clean_data/clean_objects_final.csv")
frs = pd.read_csv("./data/clean_data/clean_fr_final.csv")
fds = pd.read_csv("./data/clean_data/clean_funds_final.csv")
off = pd.read_csv("./data/clean_data/clean_offices_final.csv")
ipo = pd.read_csv("./data/clean_data/clean_ipos_final.csv")
acq = pd.read_csv("./data/clean_data/clean_acquisitions_final.csv")
deg = pd.read_csv("./data/clean_data/clean_degrees_final.csv")
mil = pd.read_csv("./data/clean_data/clean_milestones_final.csv")
inv = pd.read_csv("./data/clean_data/clean_investments_final.csv")

In [3]:
peo['people_p_id'].duplicated().sum()
peo

Unnamed: 0,people_p_id,birthplace,affiliation_name,birthplace_norm,cat_people_birthplace
0,p:2,,Blue Nile,,
1,p:3,,Wetpaint,,
2,p:4,,Zoho,,
3,p:5,,Zoho,,
4,p:6,"Redding, CA",i/o Ventures,"redding, ca",United States
...,...,...,...,...,...
226704,p:268589,,Unaffiliated,,
226705,p:268590,,Unaffiliated,,
226706,p:268592,,Unaffiliated,,
226707,p:268593,,Unaffiliated,,


# 투자 성공 테이블 - 파생변수
 - success_flag : 성공 플래그 ✅

 - size_bin : 회사 규모 ✅

 - diff_tot_cur_rel : 관계 변화량

 - reinvest_flag : 재투자 여부 ✅

 - total_invested : 원금 ✅

 - round_tempo_months : 투자템포(개월) ✅

 - time_to_last_round : 첫 투자 -> 마지막 투자 ✅


# 테이블 별 사용 컬럼 추출

In [4]:
# 1. 각 테이블별 사용할 컬럼 추출

# relationships 컬럼 추출
rel_use_cols = [
    'relationship_id', 'rel_p_id', 'rel_cf_id',
    'is_past',  # p와 cf 관계가 현재인지 과거인지 (0/1)
    'sequence', # 한 사람에 대해 여러 관계가 있을 때, 관계의 순서/우선순위를 나타내는 인덱스(?)
    'cat_rel_title' 
]

# people 컬럼 추출
peo_use_cols = [
    'people_p_id', 'cat_people_birthplace'
]

# degrees 컬럼 추출
deg_use_cols = [
    'degrees_p_id', 'degree_level'  
    'is_degree_missing',
    'cat_degrees_degree_type',  # 학위 종류
    'cat_degrees_subject',      # 전공
    'institution_normalized',    # 출신 학교
    'graduated_at'
]

# objects 컬럼 추출
obj_use_cols = [
    'objects_cfpr_id',
    'founded_at', 'closed_at',
    'description', 'country_code', 'obj_city_fixed',
    'first_investment_at', 'last_investment_at', 'invested_companies',
    'first_funding_at', 'last_funding_at', 'funding_rounds', 'funding_total_usd',  # funding_rounds : 이 회사가 지금까지 받은 투자 라운드의 수
    'relationships',  # 해당 엔티티가 가진 관계수(관련된 사람 또는 조직)
    'cat_obj_status', 'obj_category_filled', 'cat_obj_overview', 'obj_state_filled',
    'is_obj_funding_total_usd_private'   # funding_round_usd 비공개 플래그 
]

# founder_master 만들기

## rel(창업자 관계만 필터링) - peo : 사람 정보 조인

In [5]:
# rel(창업자 관계만 필터링) - peo : 사람 정보 조인
founder_rel = rel[rel['cat_rel_title'] == 'Founder'][rel_use_cols].copy()

founder_with_people = founder_rel.merge(
    peo[peo_use_cols],
    how = 'left',
    left_on = 'rel_p_id',
    right_on = 'people_p_id'
)
founder_with_people

Unnamed: 0,relationship_id,rel_p_id,rel_cf_id,is_past,sequence,cat_rel_title,people_p_id,cat_people_birthplace
0,1,p:2,c:1,0,8,Founder,p:2,
1,9,p:10,c:5,0,1,Founder,p:10,
2,10,p:11,c:5,1,1,Founder,p:11,United States
3,14,p:16,c:7299,1,2,Founder,p:16,
4,15,p:17,c:7299,1,2,Founder,p:17,
...,...,...,...,...,...,...,...,...
62110,480741,p:29245,c:286154,1,1,Founder,p:29245,
62111,480794,p:254302,c:286157,0,3,Founder,p:254302,
62112,480796,p:254302,c:286160,0,4,Founder,p:254302,
62113,480797,p:268562,c:286160,0,1,Founder,p:268562,


## degree 최종학력 관련 처리 (degrees_p_id 당 한개의 행으로 집계 -> deg_final_one)

- 최종학력을 2개 이상을 보유한 사람 존재

- graduated_at 값이 있다면 max로 최근 졸업일 데이터를 가져옴

- 최종학력이 1개인 행들은 그대로 유지

- 나머지 컬럼(학교, 전공 등) 대푯값 붙이기

In [6]:
# 1️⃣ degrees_p_id별 최종 학위 레벨 계산
max_level = (
    deg
    .groupby('degrees_p_id')['degree_level']
    .transform('max')    # 그룹별 집계와 같이 쓰임. 모든 행에 맞게 확장하여 결과 반환
)

# 2️⃣ 최종 학위에 해당하는 행만 필터
final_degree_rows = deg[deg['degree_level'] == max_level]
final_degree_rows

# degrees_p_id별 행 개수
cnt_per_id = final_degree_rows['degrees_p_id'].value_counts()

# 2행 이상인 degrees_p_id만 추출
multi_ids = cnt_per_id[cnt_per_id >= 2].index

# 해당 degrees_p_id에 속한 모든 행 # 복수 학위를 가진 사람들에 해당하는 모든 행
deg_multi = final_degree_rows[final_degree_rows['degrees_p_id'].isin(multi_ids)]

deg_multi  



Unnamed: 0,degrees_p_id,degree_type,subject,institution,graduated_at,cat_degrees_degree_type,degree_level,is_degree_missing,cat_degrees_subject,institution_normalized
12,p:2351,LLB,Law,University of Toronto School of Law,1993-01-01,Bachelor’s degree,2.00,0,Law,toronto university
13,p:2351,BA,Economics,Princeton University,1990-01-01,Bachelor’s degree,2.00,0,Computer Science / Software,princeton university
27,p:6559,BA,"Communications, Publishing, Social Anthropology","Simon Fraser University, Ryerson University",,Bachelor’s degree,2.00,0,Marketing / Communications,simon fraser university
34,p:6614,BS,Psychology,University of Texas,1998-01-01,Bachelor’s degree,2.00,0,Psychology / Sociology,texas university
35,p:6614,BS,criminal jusitce,University of Texas,1998-01-01,Bachelor’s degree,2.00,0,Computer Science / Software,texas university
...,...,...,...,...,...,...,...,...,...,...
109257,p:207574,MBA,,IESE Business School,,Master’s degree,3.00,0,,iese business school
109278,p:268488,Bachelor of Fine Arts (BFA),Theatre/Theater,SUNY Purchase,2005-01-01,Bachelor’s degree,2.00,0,Arts / Humanities,
109279,p:268488,Bachelor of Arts (BA),Theatre/Theater,Southern Methodist University,2004-01-01,Bachelor’s degree,2.00,0,Arts / Humanities,southern methodist university
109287,p:268428,bachelor of arts,,Cambridge University,,Bachelor’s degree,2.00,0,,cambridge university


In [7]:
# degrees_p_id별 최종 학위 레벨 계산
max_level = (
    deg
    .groupby('degrees_p_id')['degree_level']
    .transform('max')    # 그룹별 집계와 같이 쓰임. 모든 행에 맞게 확장하여 결과 반환
)

# 최종 학위에 해당하는 행만 필터
final_degree_rows = deg[deg['degree_level'] == max_level]
final_degree_rows

# degrees_p_id별 행 개수
cnt_per_id = final_degree_rows['degrees_p_id'].value_counts()

# 2행 이상인 degrees_p_id만 추출
multi_ids = cnt_per_id[cnt_per_id >= 2].index

# 해당 degrees_p_id에 속한 모든 행 # 복수 학위를 가진 사람들에 해당하는 모든 행
deg_multi = final_degree_rows[final_degree_rows['degrees_p_id'].isin(multi_ids)]

# 날짜컬럼 형식변환
deg_multi["graduated_at"] = pd.to_datetime(deg_multi["graduated_at"], errors="coerce")

# 최근 졸업일자 열 생성
deg_multi["max_graduated_at"] = (
    deg_multi.groupby("degrees_p_id")["graduated_at"]
    .transform("max")
)

# 확인
display(deg_multi[["degrees_p_id", "degree_level", "graduated_at", "max_graduated_at"]])

# 최종학위가 2개 이상인 행들 중 graduated_at 결측 확인
num_missing_graduated_at = deg_multi['graduated_at'].isna().sum()

# 최종학력 2개 이상인 행들 중 졸업일자 결측 비율
print((f"전체 비율: {round((num_missing_graduated_at/deg_multi.shape[0])*100, 2)}%"))
print((f"전체 비율: {round((num_missing_graduated_at/deg.shape[0])*100, 2)}%"))

Unnamed: 0,degrees_p_id,degree_level,graduated_at,max_graduated_at
12,p:2351,2.00,1993-01-01,1993-01-01
13,p:2351,2.00,1990-01-01,1993-01-01
27,p:6559,2.00,NaT,2003-01-01
34,p:6614,2.00,1998-01-01,1998-01-01
35,p:6614,2.00,1998-01-01,1998-01-01
...,...,...,...,...
109257,p:207574,3.00,NaT,NaT
109278,p:268488,2.00,2005-01-01,2005-01-01
109279,p:268488,2.00,2004-01-01,2005-01-01
109287,p:268428,2.00,NaT,NaT


전체 비율: 44.45%
전체 비율: 4.09%


In [8]:
deg_multi["graduated_at"] = pd.to_datetime(deg_multi["graduated_at"], errors="coerce")

# 날짜가 있는 그룹: idxmax
tmp = deg_multi.dropna(subset=["graduated_at"])
idx1 = tmp.groupby("degrees_p_id")["graduated_at"].idxmax()

# 날짜가 전부 NaN인 그룹: 첫 행을 대표로 선택 → 항상 1행을 보장하고, 실행할 때마다 동일한 결과를 줌
all_na = deg_multi.groupby("degrees_p_id")["graduated_at"].transform(lambda s: s.isna().all())
idx2 = deg_multi.loc[all_na].groupby("degrees_p_id").head(1).index

# ✅ idx1을 Index로 변환해서 합치기
idx = pd.Index(idx1).union(idx2)

deg_multi_one_row = deg_multi.loc[idx].copy()
display(deg_multi_one_row)

print(deg["degrees_p_id"].is_unique)
print(deg_multi_one_row["degrees_p_id"].is_unique)

deg_tmp = deg.copy()

# 1) 치환 대상 p_id 목록
ids = deg_multi_one_row["degrees_p_id"].unique()

# 2) deg에서 해당 p_id 행은 전부 제거(= drop)
deg_keep = deg_tmp.loc[~deg_tmp["degrees_p_id"].isin(ids)].copy()

# 3) 대표행만 다시 붙이기(= replace)
deg_final = pd.concat([deg_keep, deg_multi_one_row], ignore_index=True, sort=False)

# degrees_p_id별 1행인지 점검 (deg_final은 전체적으로는 1행이 아닐 수 있음: p_id가 원래 여러 학력 가진 경우 유지되니까)
# 하지만 ids에 해당하는 p_id는 반드시 1행만 남아야 함
check = deg_final["degrees_p_id"].isin(ids)
assert (deg_final.loc[check, "degrees_p_id"].value_counts() == 1).all()

display(deg_final)

# deg_final에서 중복 degrees_p_id 대표행 선택 기준
# 1. degree_re_level.max()
# 2. graduated_at이 있으면 최신일(max)값이 우선
# 3. 그래도 동률이면 결측이 적은(정보량이 많은 행) 우선

print(deg_final.shape[0])
deg_final.drop_duplicates(keep="first", inplace=True)
print(deg_final.shape[0])

# (1) 중복 현황
dup_size = deg_final.groupby("degrees_p_id").size().sort_values(ascending=False)
print("전체 p_id 수:", deg_final["degrees_p_id"].nunique())
print("전체 행 수:", len(deg_final))
print("중복 p_id 수(2행 이상):", (dup_size >= 2).sum())

# (2) 중복 분포 (2행,3행... 몇 명인지)
display(dup_size.value_counts().sort_index())

# (3) 중복이 있는 p_id 예시 몇 개 보기
dup_ids = dup_size[dup_size >= 2].head(10).index
display(deg_final.loc[deg_final["degrees_p_id"].isin(dup_ids)].sort_values(["degrees_p_id"]))

# 0) 타입 정리
deg_final["degree_level_num"] = pd.to_numeric(deg_final["degree_level"], errors="coerce")
deg_final["graduated_at_dt"] = pd.to_datetime(deg_final["graduated_at"], errors="coerce")

# 1) 규칙 2를 위해: graduated_at 존재 여부 (있으면 1, 없으면 0)
deg_final["has_grad"] = deg_final["graduated_at_dt"].notna().astype(int)

# 2) 규칙 3을 위해: 정보량(결측이 적을수록 큼)
deg_final["non_null_cnt"] = deg_final.notna().sum(axis=1)

# 3) 우선순위대로 정렬
# - degree_re_level_num: 큰 값 우선
# - has_grad: 1(있음) 우선
# - graduated_at_dt: 최신(큰 값) 우선
# - non_null_cnt: 큰 값 우선
# (정렬 안정성을 위해 mergesort 권장)
deg_final_sorted = deg_final.sort_values(
    by=["degrees_p_id", "degree_level_num", "has_grad", "graduated_at_dt", "non_null_cnt"],
    ascending=[True, False, False, False, False],
    kind="mergesort"
)

# 4) p_id당 대표 1행 선택
deg_final_one = deg_final_sorted.drop_duplicates(subset=["degrees_p_id"], keep="first").copy()

# 5) 검증: degrees_p_id가 유니크여야 함
assert deg_final_one["degrees_p_id"].is_unique

# 6) 보조컬럼 정리
deg_final_one = deg_final_one.drop(
    columns=["degree_level_num", "graduated_at_dt", "has_grad", "non_null_cnt"],
    errors="ignore"
)

display(deg_final_one)

deg_final_one["degrees_p_id"].is_unique

Unnamed: 0,degrees_p_id,degree_type,subject,institution,graduated_at,cat_degrees_degree_type,degree_level,is_degree_missing,cat_degrees_subject,institution_normalized,max_graduated_at
12,p:2351,LLB,Law,University of Toronto School of Law,1993-01-01,Bachelor’s degree,2.00,0,Law,toronto university,1993-01-01
34,p:6614,BS,Psychology,University of Texas,1998-01-01,Bachelor’s degree,2.00,0,Psychology / Sociology,texas university,1998-01-01
53,p:2371,BE,Electrical Engineering and Computer Science,"University of California, Berkeley",NaT,Bachelor’s degree,2.00,0,Computer Science / Software,"california, berkeley university",NaT
89,p:7080,MS,MBA,Harvard University,1986-01-01,Master’s degree,3.00,0,,harvard university,1986-01-01
133,p:2755,MBA,,London Business School,2005-01-01,Master’s degree,3.00,0,,london business school,2005-01-01
...,...,...,...,...,...,...,...,...,...,...,...
109238,p:268462,M.S,Materials Science,Northwestern University,NaT,Master’s degree,3.00,0,Mechanical / Industrial Engineering,northwestern university,NaT
109241,p:268466,Diploma,Business Administration,ICADE,NaT,Associate degree,1.00,0,Business Administration,,NaT
109256,p:207574,MSc,Information Science,IESE Business School,NaT,Master’s degree,3.00,0,Computer Science / Software,iese business school,NaT
109278,p:268488,Bachelor of Fine Arts (BFA),Theatre/Theater,SUNY Purchase,2005-01-01,Bachelor’s degree,2.00,0,Arts / Humanities,,2005-01-01


False
True


Unnamed: 0,degrees_p_id,degree_type,subject,institution,graduated_at,cat_degrees_degree_type,degree_level,is_degree_missing,cat_degrees_subject,institution_normalized,max_graduated_at
0,p:6117,MBA,,,,Master’s degree,3.00,0,,,NaT
1,p:6136,BA,"English, French","Washington University, St. Louis",1990-01-01,Bachelor’s degree,2.00,0,Arts / Humanities,washington university,NaT
2,p:6136,MS,Mass Communication,Boston University,1992-01-01,Master’s degree,3.00,0,Marketing / Communications,boston university,NaT
3,p:6005,MS,Internet Technology,University of Greenwich,2006-01-01,Master’s degree,3.00,0,Computer Science / Software,greenwich university,NaT
4,p:5832,BCS,"Computer Science, Psychology",Rice University,,Bachelor’s degree,2.00,0,Computer Science / Software,rice university,NaT
...,...,...,...,...,...,...,...,...,...,...,...
101528,p:268462,M.S,Materials Science,Northwestern University,NaT,Master’s degree,3.00,0,Mechanical / Industrial Engineering,northwestern university,NaT
101529,p:268466,Diploma,Business Administration,ICADE,NaT,Associate degree,1.00,0,Business Administration,,NaT
101530,p:207574,MSc,Information Science,IESE Business School,NaT,Master’s degree,3.00,0,Computer Science / Software,iese business school,NaT
101531,p:268488,Bachelor of Fine Arts (BFA),Theatre/Theater,SUNY Purchase,2005-01-01 00:00:00,Bachelor’s degree,2.00,0,Arts / Humanities,,2005-01-01


101533
101533
전체 p_id 수: 68451
전체 행 수: 101533
중복 p_id 수(2행 이상): 27401


1     41050
2     22576
3      4121
4       598
5        76
6        20
7         7
8         1
9         1
10        1
Name: count, dtype: int64

Unnamed: 0,degrees_p_id,degree_type,subject,institution,graduated_at,cat_degrees_degree_type,degree_level,is_degree_missing,cat_degrees_subject,institution_normalized,max_graduated_at
26087,p:183805,Microsoft Project 2007 Full Edit,,European School of Project Management,2010-01-01,Others,0.0,0,,european school of project management,NaT
26091,p:183805,5 years,Computer science,Università degli Studi di Torino,2003-01-01,Master’s degree,3.0,0,Computer Science / Software,università degli studi di torino,NaT
26090,p:183805,Thesis development,Symbolic solutions for Petri Nets,"William & Mary College, Willamsburg",2003-01-01,Others,0.0,0,Others / Unknown,william & mary college,NaT
26089,p:183805,Training course,PVS Theorem Prover,"NASA Langley Research Center, National Institute for Aerospace,",2003-01-01,Others,0.0,0,Marketing / Communications,"nasa langley research center, national institute for aerospace",NaT
26088,p:183805,EEF Foundations Summer School,Concurrency (Concurrent Software Systems),EEF Foundations Summer School,2003-01-01,Others,0.0,0,Computer Science / Software,eef foundations summer school,NaT
26086,p:183805,Certified Tester Foundation Leve,Software Testing,International Software Testing Qualification Board,2011-01-01,Others,0.0,0,Computer Science / Software,international software testing qualification board,NaT
26085,p:183805,Certified Tester Advanced Level,Software Testing,International Software Testing Qualification Board,2012-01-01,Others,0.0,0,Computer Science / Software,international software testing qualification board,NaT
26092,p:183805,Diploma,Aviation Technology,I.T.I.S. Carlo Grassi,1994-01-01,Associate degree,1.0,0,Computer Science / Software,i.t.i.s. carlo grassi,NaT
26084,p:183805,ITIL Foundation Certificate,IT Service Management,Information Systems Examinations Board,2012-01-01,Others,0.0,0,Computer Science / Software,information systems examinations board,NaT
26093,p:183805,,,Fedeli Compagne di Gesù,,,,1,,fedeli compagne di gesù,NaT


Unnamed: 0,degrees_p_id,degree_type,subject,institution,graduated_at,cat_degrees_degree_type,degree_level,is_degree_missing,cat_degrees_subject,institution_normalized,max_graduated_at
2195,p:10,,Computer Science,Harvard University,,,,1,Computer Science / Software,harvard university,NaT
73646,p:100006,MBA,Strategy & Marketing,MIT Sloan School of Management,2004-01-01,Master’s degree,3.00,0,Marketing / Communications,mit sloan school of management,NaT
98252,p:100013,MBA,Finance,Rutgers University,NaT,Master’s degree,3.00,0,Finance / Accounting,rutgers university,NaT
12924,p:100018,MBA,"Entrepreneurship, Marketing","Columbia University, Business School",2004-01-01,Master’s degree,3.00,0,Marketing / Communications,columbia university,NaT
12927,p:100031,BS,Information Science & Engineering,Gogte Institute of Technology,2010-01-01,Bachelor’s degree,2.00,0,Computer Science / Software,gogte institute of technology,NaT
...,...,...,...,...,...,...,...,...,...,...,...
12921,p:99983,,Finance,University of Washington,,,,1,Finance / Accounting,washington university,NaT
12922,p:99988,BS,Maths and Computer Science,Bristol University,2004-01-01,Bachelor’s degree,2.00,0,Computer Science / Software,bristol university,NaT
13025,p:99990,engineering,,Technion,1971-01-01,Others,0.00,0,,technion,NaT
90350,p:99994,BE,Electronics and Communications,"University of Madras, India",,Bachelor’s degree,2.00,0,Marketing / Communications,"madras, india university",NaT


True

## founder_with_people와 deg_final_one(최종학력 행만 남김) 최종학력 조인 

In [9]:
# 학력 정보 조인 (deg) - 최종학력 행만 남기기 
founder_profile = founder_with_people.merge(
    deg_final_one,
    how = 'left',
    left_on = 'people_p_id',
    right_on = 'degrees_p_id'
)
founder_profile      # ✅ 1 row = 창업자 x 회사 x 출생지 x 학위수준 x 전공 x 학교

# 회사 정보 조인 (company만 필터)
companies = obj[obj['objects_cfpr_id'].str.startswith('c:')][obj_use_cols].copy()

founder_company = founder_profile.merge(
    companies,
    how = 'left',
    left_on = 'rel_cf_id',
    right_on = 'objects_cfpr_id'
)
founder_company
founder_profile


Unnamed: 0,relationship_id,rel_p_id,rel_cf_id,is_past,sequence,cat_rel_title,people_p_id,cat_people_birthplace,degrees_p_id,degree_type,subject,institution,graduated_at,cat_degrees_degree_type,degree_level,is_degree_missing,cat_degrees_subject,institution_normalized,max_graduated_at
0,1,p:2,c:1,0,8,Founder,p:2,,p:2,BS,Electrical Engineering/Computer Science,"University of California, Berkeley",1994-01-01 00:00:00,Bachelor’s degree,2.00,0.00,Computer Science / Software,"california, berkeley university",1994-01-01
1,9,p:10,c:5,0,1,Founder,p:10,,p:10,,Computer Science,Harvard University,,,,1.00,Computer Science / Software,harvard university,NaT
2,10,p:11,c:5,1,1,Founder,p:11,United States,p:11,No degree,,Harvard University,,Bachelor’s degree,2.00,0.00,,harvard university,NaT
3,14,p:16,c:7299,1,2,Founder,p:16,,p:16,BA,Computer Information Systems,Colorado State University,,Bachelor’s degree,2.00,0.00,Computer Science / Software,colorado state university,NaT
4,15,p:17,c:7299,1,2,Founder,p:17,,p:17,,Electrical Engineering,University of Texas at Austin,,,,1.00,Computer Science / Software,texas at austin university,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62110,480741,p:29245,c:286154,1,1,Founder,p:29245,,,,,,,,,,,,NaT
62111,480794,p:254302,c:286157,0,3,Founder,p:254302,,,,,,,,,,,,NaT
62112,480796,p:254302,c:286160,0,4,Founder,p:254302,,,,,,,,,,,,NaT
62113,480797,p:268562,c:286160,0,1,Founder,p:268562,,,,,,,,,,,,NaT


In [10]:
founder_profile.duplicated().sum()

np.int64(0)

## founder_profile을 집계 : rel_p_id, rel_cf_id 기준

In [11]:
founder_profile.columns

Index(['relationship_id', 'rel_p_id', 'rel_cf_id', 'is_past', 'sequence',
       'cat_rel_title', 'people_p_id', 'cat_people_birthplace', 'degrees_p_id',
       'degree_type', 'subject', 'institution', 'graduated_at',
       'cat_degrees_degree_type', 'degree_level', 'is_degree_missing',
       'cat_degrees_subject', 'institution_normalized', 'max_graduated_at'],
      dtype='object')

In [12]:
# rel_p_id, rel_cf_id 기준 집계 : agg_founder_profile
agg_founder_profile = founder_profile.groupby(['rel_p_id', 'rel_cf_id']).agg(
    is_past      = ('is_past', 'first'),
    sequence     = ('sequence', 'max'),
    birthplace   = ('cat_people_birthplace', 'first'),
    degree_level = ('degree_level', 'max'),
    subject      = ('cat_degrees_subject', 'first'),
    institution  = ('institution_normalized', 'first'),
    graduated_at = ('graduated_at', 'first')
).reset_index()

In [13]:
# agg_founder_profile 에서 c:만 필터링 (이유 : 창업자와 창업한 회사만 보고, 투자자는 제외하려고)
c_agg_founder_profile = agg_founder_profile[agg_founder_profile['rel_cf_id'].str.startswith('c:')].copy()
c_agg_founder_profile

Unnamed: 0,rel_p_id,rel_cf_id,is_past,sequence,birthplace,degree_level,subject,institution,graduated_at
0,p:10,c:5,0,1,,,Computer Science / Software,harvard university,
2,p:100009,c:152329,0,1,,,,,
3,p:100009,c:81559,0,2,,,,,
4,p:100017,c:81564,0,1,,,,,
5,p:100025,c:287,0,1,,,,,
...,...,...,...,...,...,...,...,...,...
61822,p:99982,c:16993,1,1,,,,,
61823,p:99982,c:81541,0,2,,,,,
61824,p:99988,c:64683,0,1,,2.00,Computer Science / Software,bristol university,2004-01-01
61825,p:99994,c:3643,0,1,,2.00,Marketing / Communications,"madras, india university",


## 창업자가 창업한 회사(c:)의 수 = 즉, 창업경험 수

In [14]:
# 창업자가 창업한 회사 수 : cnt_founding
cnt_founding = c_agg_founder_profile.groupby('rel_p_id').agg(
    n_founding = ('rel_cf_id', 'nunique')
)
cnt_founding

Unnamed: 0_level_0,n_founding
rel_p_id,Unnamed: 1_level_1
p:10,1
p:100009,2
p:100017,1
p:100025,1
p:100026,1
...,...
p:99980,4
p:99982,2
p:99988,1
p:99994,1


## c_agg_founder_profile과 cnt_founding을 merge

In [15]:
# agg_founder_profile과 cnt_founding을 merge
founder_master = c_agg_founder_profile.merge(
    cnt_founding,
    how = 'left',
    on = 'rel_p_id'
)
founder_master

# 컬럼 순서 변경
fm_cols = ['rel_p_id', 'rel_cf_id', 'n_founding', 'is_past', 'sequence', 
    'birthplace', 'degree_level', 'subject', 'institution', 'graduated_at'
    ]
founder_master = founder_master[fm_cols]
founder_master

Unnamed: 0,rel_p_id,rel_cf_id,n_founding,is_past,sequence,birthplace,degree_level,subject,institution,graduated_at
0,p:10,c:5,1,0,1,,,Computer Science / Software,harvard university,
1,p:100009,c:152329,2,0,1,,,,,
2,p:100009,c:81559,2,0,2,,,,,
3,p:100017,c:81564,1,0,1,,,,,
4,p:100025,c:287,1,0,1,,,,,
...,...,...,...,...,...,...,...,...,...,...
59961,p:99982,c:16993,2,1,1,,,,,
59962,p:99982,c:81541,2,0,2,,,,,
59963,p:99988,c:64683,1,0,1,,2.00,Computer Science / Software,bristol university,2004-01-01
59964,p:99994,c:3643,1,0,1,,2.00,Marketing / Communications,"madras, india university",


#### 스타트업의 창업자 조건(with 수아님) : 투자기업 창업자(투자만 행한)만 제외한 모든 스타트업의 창업자

In [16]:
# 0) 투자자 / 투자받은 회사 ID 집합
investor_ids = set(inv['investor_cfp_id'].dropna())
invested_ids = set(inv['invested_c_id'].dropna())

# 1) 투자자 전용 ID (투자만 하고, 투자받지 않은 주체)
pure_investor_ids = investor_ids - invested_ids

# 2) 조건 1: 금융기관(f:) 제외
cond1 = ~founder_master['rel_cf_id'].str.startswith('f:', na=False)

# 3) 조건 2: 투자자 전용 ID 제외
cond2 = ~founder_master['rel_cf_id'].isin(pure_investor_ids)

# 4) 최종 필터링
founder_master_filtered = founder_master[cond1 & cond2].copy()

In [17]:
# 창업자 - 회사 관계 중복 확인
founder_master_filtered[['rel_p_id', 'rel_cf_id']].duplicated().sum()

np.int64(0)

In [18]:
# founder_master csv 파일로 저장!
founder_master_filtered.to_csv("./data/founder_master.csv", index=False)

---

# 창업자와 회사 관계 : only 투자를 받은 회사 (invested_c_id)

- with 채연님

In [19]:
# 1) [투자를 받은 회사 & 투자를 한 회사] 확인
overlap_ids_1 = set(inv['invested_c_id']) & set(inv['investor_cfp_id'])
print("겹치는 ID 개수:", len(overlap_ids_1))

# 조건1 : f: 가 아닌 행들
cond1 = ~founder_master['rel_cf_id'].str.startswith('f:', na=False)

# 조건2 : rel_cf_id 에서 1)을 포함하는 행들 + rel_cf_id 에서 invested_c_id가 들어있는 행들 => 투자를 받은 스타트업 회사
cond2 = founder_master['rel_cf_id'].isin(overlap_ids_1) | founder_master['rel_cf_id'].isin(inv['invested_c_id'])

invested_founder_master = founder_master[cond1 & cond2].copy()
invested_founder_master[['rel_p_id', 'rel_cf_id']].duplicated().sum()

# 중복제거
# dup_founder_master = founder_master.drop_duplicates(subset=['rel_cf_id'])
# print(f"중복제거 전 행수:", founder_master.shape)
# print(f"중복제거 후 행수:", dup_founder_master.shape)


겹치는 ID 개수: 193


np.int64(0)

In [20]:
# 투자받은 스타트업 회사 테이블
invested_founder_master.to_csv("./data/invested_founder_master.csv", index=False)

In [21]:
print(f"모든 스타트업의 창업자 수:",founder_master.shape[0])
print(f"투자만 행한 스타트업을 제외한 모든 스타트업의 창업자 수", founder_master_filtered.shape[0])

# 투자자인 창업자
investor_founder = founder_master.shape[0] - founder_master_filtered.shape[0]
print(f"\n투자만 행한 스타트업의 창업자 수:", investor_founder)

# 투자받은 스타트업
print(f"\n투자받은 스타트업의 창업자 수:", invested_founder_master.shape[0])

모든 스타트업의 창업자 수: 59966
투자만 행한 스타트업을 제외한 모든 스타트업의 창업자 수 59516

투자만 행한 스타트업의 창업자 수: 450

투자받은 스타트업의 창업자 수: 14747


---

# success_master 불러오기

In [22]:
success_master = pd.read_csv("./data/success_master.csv")

In [23]:
success_master[success_master['objects_cfpr_id'].str.startswith('c:')]
success_master

Unnamed: 0,objects_cfpr_id,founded_at,closed_at,description,country_code,obj_city_fixed,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,relationships,cat_obj_status,obj_category_filled,cat_obj_overview,obj_state_filled,is_obj_funding_total_usd_private,success_flag,size_bin,time_to_last_round_months,funding_round_id,fr_c_id,...,is_last_round,funded_year,funded_quarter,cat_fr_type,num_fr_type,log_participants,is_fr_raised_private,prev_round_date,round_tempo_days,round_tempo_months,acquired_c_id,acquisition_id,acquiring_c_id,acquired_at,is_acq_price_private,price_amount_usd,ipos_c_id,ipo_id,first_public_at,valuation_amount_usd,ipo_raised_amount_usd,is_ipos_valuation_private,is_ipos_raised_private,offices_c_id,n_offices
0,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.00,39750000.00,17,operating,web,Software & Technology,WA,0,1,대형팀,32.00,888.00,c:1,...,1.00,2005.00,2005Q4,series-a,1.00,1.10,0.00,,,,,,,,,,,,,,,,,c:1,2.00
1,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.00,39750000.00,17,operating,web,Software & Technology,WA,0,1,대형팀,32.00,889.00,c:1,...,0.00,2007.00,2007Q1,series-b,2.00,1.39,0.00,2005-10-01,,,,,,,,,,,,,,,,c:1,2.00
2,c:1,2005-10-17,,Technology Platform Company,USA,seattle,,,0,0,2005-10-01,2008-05-19,3.00,39750000.00,17,operating,web,Software & Technology,WA,0,1,대형팀,32.00,2312.00,c:1,...,0.00,2008.00,2008Q2,series-c+,3.00,1.61,0.00,,,,,,,,,,,,,,,,,c:1,2.00
3,c:1001,2007-10-01,,Social network aggregator,USA,mountain view,,,0,0,,2008-02-26,1.00,5000000.00,14,acquired,web,Software & Technology,CA,0,1,대형팀,,1644.00,c:1001,...,1.00,2008.00,2008Q1,series-a,1.00,1.39,0.00,,,,c:1001,1901.00,c:5,2009-08-10,0.00,47500000.00,,,,,,,,c:1001,1.00
4,c:10014,2008-03-01,,iPhone Ad Exchange Provider,USA,palo alto,,,0,0,,2008-09-01,,,9,acquired,mobile,Software & Technology,CA,1,1,대형팀,,6682.00,c:10014,...,1.00,2008.00,2008Q3,seed,0.00,0.69,1.00,,,,c:10014,3878.00,c:23054,2010-09-30,1.00,,,,,,,,,c:10014,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483534,r:9995,,,,,,,,0,0,,,0.00,0.00,0,operating,software,Software & Technology,,0,0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
483535,r:9996,,,,,,,,0,0,,,0.00,0.00,0,operating,software,Software & Technology,,0,0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
483536,r:9997,,,,,,,,0,0,,,0.00,0.00,0,operating,software,Software & Technology,,0,0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
483537,r:9998,,,,,,,,0,0,,,0.00,0.00,0,operating,software,Software & Technology,,0,0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


# success_master의 success_flag 집계

- founder_company_success : 창업자 x 회사 x 회사의 성공(success_flag)

In [24]:
# 1) success_master 에서 objects_cfpr_id가 [c:]인 것만 필터링
success_companies = success_master[success_master['objects_cfpr_id'].str.startswith('c:')].copy()

# 2) objects_cfpr_id 기준으로 success_flag 집계 
scs_flag = (success_companies
    .groupby('objects_cfpr_id')['success_flag']
    .first()
    .reset_index(name='success_flag')
    )
scs_flag    # ✅ c: 별 성공 플래그

Unnamed: 0,objects_cfpr_id,success_flag
0,c:1,1
1,c:10,1
2,c:100,1
3,c:10000,0
4,c:10001,0
...,...,...
196544,c:99940,0
196545,c:9995,0
196546,c:9996,0
196547,c:9997,0


In [25]:
# founder_master와 scs_flag를 merge
founder_company_success = founder_master_filtered.merge(
    scs_flag,
    how = 'left',
    left_on = 'rel_cf_id',
    right_on = 'objects_cfpr_id'
)

founder_company_success     # ✅ 창업자 x 회사 x 회사의 성공(success_flag)
founder_company_success[['rel_p_id', 'rel_cf_id']].duplicated().sum()      # 중복 X

np.int64(0)

In [26]:
# 창업자 입장 : 평생 한번이라도 success를 경험했는지
founder_success = founder_company_success.groupby('rel_p_id').agg(
    founder_ever_success = ('success_flag', lambda x: int((x > 0).any()))
).reset_index()

founder_success     # ✅ founder_ever_success =1 : 해당 창업자(rel_p_id)는 한번이라도 success를 했다.

Unnamed: 0,rel_p_id,founder_ever_success
0,p:10,1
1,p:100009,0
2,p:100017,0
3,p:100025,0
4,p:100026,0
...,...,...
49738,p:99980,1
49739,p:99982,1
49740,p:99988,0
49741,p:99994,0


In [27]:
# founder_company_success와 founder_success을 merge
founder_with_success = founder_success.merge(
    founder_company_success,
    how = 'inner',
    on = 'rel_p_id',
)
founder_with_success        # ✅ 창업자(출생지, 최종학력, 학교, 전공) x 창업자의 성공경험 x 회사의 성공

Unnamed: 0,rel_p_id,founder_ever_success,rel_cf_id,n_founding,is_past,sequence,birthplace,degree_level,subject,institution,graduated_at,objects_cfpr_id,success_flag
0,p:10,1,c:5,1,0,1,,,Computer Science / Software,harvard university,,c:5,1.00
1,p:100009,0,c:152329,2,0,1,,,,,,c:152329,0.00
2,p:100009,0,c:81559,2,0,2,,,,,,c:81559,0.00
3,p:100017,0,c:81564,1,0,1,,,,,,c:81564,0.00
4,p:100025,0,c:287,1,0,1,,,,,,c:287,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59511,p:99982,1,c:16993,2,1,1,,,,,,c:16993,1.00
59512,p:99982,1,c:81541,2,0,2,,,,,,c:81541,0.00
59513,p:99988,0,c:64683,1,0,1,,2.00,Computer Science / Software,bristol university,2004-01-01,c:64683,0.00
59514,p:99994,0,c:3643,1,0,1,,2.00,Marketing / Communications,"madras, india university",,c:3643,0.00


- 회사 기준 집계

In [28]:
success_companies.columns

# use_cols = ['objects_cfpr_id', 'relationships', 'obj_category_filled', 'size_bin']

Index(['objects_cfpr_id', 'founded_at', 'closed_at', 'description',
       'country_code', 'obj_city_fixed', 'first_investment_at',
       'last_investment_at', 'investment_rounds', 'invested_companies',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'relationships', 'cat_obj_status',
       'obj_category_filled', 'cat_obj_overview', 'obj_state_filled',
       'is_obj_funding_total_usd_private', 'success_flag', 'size_bin',
       'time_to_last_round_months', 'funding_round_id', 'fr_c_id', 'funded_at',
       'funding_round_type', 'funding_round_code', 'raised_amount_usd',
       'pre_money_valuation_usd', 'post_money_valuation_usd', 'participants',
       'is_first_round', 'is_last_round', 'funded_year', 'funded_quarter',
       'cat_fr_type', 'num_fr_type', 'log_participants',
       'is_fr_raised_private', 'prev_round_date', 'round_tempo_days',
       'round_tempo_months', 'acquired_c_id', 'acquisition_id',
       'acquiring_c_id', 'acquir

In [29]:
company_with_success = success_companies.groupby('objects_cfpr_id').agg(
    success_flag  = ('success_flag', 'max'),
    relationships = ('relationships', 'first'),
    category      = ('obj_category_filled', 'first'),
    company_size  = ('size_bin', 'first')
).reset_index()

company_with_success        # ✅  회사(c) x 성공여부 x 관계수 x 산업 x 규모 

Unnamed: 0,objects_cfpr_id,success_flag,relationships,category,company_size
0,c:1,1,17,web,대형팀
1,c:10,1,6,games_video,대형팀
2,c:100,1,12,games_video,대형팀
3,c:10000,0,0,network_hosting,초소형팀
4,c:10001,0,0,games_video,초소형팀
...,...,...,...,...,...
196544,c:99940,0,5,ecommerce,대형팀
196545,c:9995,0,14,public_relations,대형팀
196546,c:9996,0,44,consulting,대형팀
196547,c:9997,0,1,search,소형팀


In [30]:
print(founder_with_success.columns)
print(f"founder_with_success 행수:", founder_with_success.shape[0])
print('==='*30)
print(company_with_success.columns)
print(f"company_with_success 행수:", company_with_success.shape[0])

Index(['rel_p_id', 'founder_ever_success', 'rel_cf_id', 'n_founding',
       'is_past', 'sequence', 'birthplace', 'degree_level', 'subject',
       'institution', 'graduated_at', 'objects_cfpr_id', 'success_flag'],
      dtype='object')
founder_with_success 행수: 59516
Index(['objects_cfpr_id', 'success_flag', 'relationships', 'category',
       'company_size'],
      dtype='object')
company_with_success 행수: 196549


# founder_with_success, company_with_success 를 조인

In [31]:
successed_founder_company = founder_with_success.merge(
    company_with_success,
    how = 'left',
    on = 'objects_cfpr_id'
)

successed_founder_company[['rel_p_id', 'rel_cf_id']].duplicated().sum()     # 중복 X
successed_founder_company       # ✅ successed_founder_company : 창업자(출생지, 학교, 전공, 학위수준) x 회사(산업, 규모) 
                                # 나중에 마일스톤을 여기에 조인하면 될듯!

Unnamed: 0,rel_p_id,founder_ever_success,rel_cf_id,n_founding,is_past,sequence,birthplace,degree_level,subject,institution,graduated_at,objects_cfpr_id,success_flag_x,success_flag_y,relationships,category,company_size
0,p:10,1,c:5,1,0,1,,,Computer Science / Software,harvard university,,c:5,1.00,1.00,269.00,social,대형팀
1,p:100009,0,c:152329,2,0,1,,,,,,c:152329,0.00,0.00,2.00,software,중형팀
2,p:100009,0,c:81559,2,0,2,,,,,,c:81559,0.00,0.00,2.00,consulting,중형팀
3,p:100017,0,c:81564,1,0,1,,,,,,c:81564,0.00,0.00,15.00,medical,대형팀
4,p:100025,0,c:287,1,0,1,,,,,,c:287,0.00,0.00,21.00,games_video,대형팀
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59511,p:99982,1,c:16993,2,1,1,,,,,,c:16993,1.00,1.00,2.00,games_video,중형팀
59512,p:99982,1,c:81541,2,0,2,,,,,,c:81541,0.00,0.00,3.00,software,대형팀
59513,p:99988,0,c:64683,1,0,1,,2.00,Computer Science / Software,bristol university,2004-01-01,c:64683,0.00,0.00,1.00,software,소형팀
59514,p:99994,0,c:3643,1,0,1,,2.00,Marketing / Communications,"madras, india university",,c:3643,0.00,0.00,3.00,public_relations,대형팀


In [32]:
# # 창업자 기준 성공 집계
# agg_founder_success = successed_founder_company.groupby('rel_p_id').agg(
#     n_founding           = ('n_founding', 'first'),              # n_founding : 창업자의 창업 횟수
#     founder_ever_success = ('founder_ever_success', 'max'),      # founder_ever_success : 창업자가 한번이라도 성공경험이 있는가
#     # success_flag         = ('success_flag_x', 'max'),            # success_flag : 창업자의 회사는 하나라도 성공했는가  
#     bplace               = ('birthplace', 'first'),              # 창업자 출생지
#     degree_level         = ('degree_level', 'first'),            # 창업자 학위 수준 
#     institution          = ('institution', 'first'),             # 창업자 출신 학교
#     subject              = ('subject', 'first')                  # 창업자 전공
#     # company_size         = ('size_bin', 'first'),              # company_size : 회사규모 -> 따로 집계 or 성공테이블 확인
#     # company_category     = ('obj_category_filled', 'first')    # 창업한 회사의 산업분야     -> 따로 집계 or 성공테이블 확인
# ).reset_index()

# display(agg_founder_success)     # ✅ agg_founder_success : 창업자 x 성공경험 여부 x 출생지 x 학위 x 학교 x 전공

# 데이터 csv 파일로 저장

In [33]:
successed_founder_company.to_csv('./data/successed_founder_company.csv', index=False, encoding='utf-8-sig')
print("CSV 파일 저장 완료! (인덱스 제외)")

CSV 파일 저장 완료! (인덱스 제외)


---