In [14]:
#초기 설정및 시스템 라이브러리
import platform
import warnings

# 데이터 시각화 라이브러리
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta
# Echo the detected OS name; the font selection at the end of this cell branches on the same value.
print(platform.system())
# Install a global "ignore" filter, so no warnings are shown from here on.
warnings.filterwarnings('ignore')

# 통계 라이브러리
from scipy import stats
from scipy.stats import chi2_contingency, mannwhitneyu, kruskal
import statsmodels.api as sm

# 머신러닝
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score

from scipy.cluster.hierarchy import linkage, dendrogram

# pandas display options: show up to 50 columns / 500 rows, never truncate cell
# text, and render floats with two decimals.
for _option, _value in (
    ('display.max_columns', 50),
    ('display.max_rows', 500),
    ('display.max_colwidth', None),
    ("display.float_format", "{:.2f}".format),
):
    pd.set_option(_option, _value)

# Korean font for matplotlib, chosen by OS: macOS and Windows have dedicated
# entries, every other platform (e.g. Linux: Colab, Ubuntu) gets NanumGothic.
_korean_font_by_os = {
    'Darwin': 'AppleGothic',
    'Windows': 'Malgun Gothic',
}
plt.rcParams['font.family'] = _korean_font_by_os.get(platform.system(), 'NanumGothic')

Darwin


In [15]:
import pandas as pd

# CSV loader that parses every "*_at" column as a datetime
def read_csv_auto_dates(path):
    """Read a CSV file, parsing every column whose name ends in ``_at`` as datetime.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the ``*_at`` columns handed to ``parse_dates``.
    """
    # 1) Only the header row is needed to discover the column names, so read
    #    zero data rows instead of parsing the whole file twice.
    header = pd.read_csv(path, nrows=0)

    # 2) Columns ending in "_at" are the date columns.
    date_cols = [col for col in header.columns if col.endswith('_at')]

    # 3) Single full read, with the date columns parsed as datetime.
    return pd.read_csv(path, parse_dates=date_cols, low_memory=False)

# Load the cleaned source tables. Names on the left are bound, in order, to the
# files listed on the right; every file goes through read_csv_auto_dates.
(
    acq, deg, frs, fds, inv, ipo,
    mil, obj, peo, off, rel,
) = map(
    read_csv_auto_dates,
    (
        "./clean_data_v1/clean_acquisitions_final.csv",   # acq
        "./clean_data_v1/clean_degrees_final.csv",        # deg
        "./clean_data_v1/clean_fr_final.csv",             # frs
        "./clean_data_v1/clean_funds_final.csv",          # fds
        "./clean_data_v1/clean_investments_final.csv",    # inv
        "./clean_data_v1/clean_ipos_final.csv",           # ipo
        "./clean_data_v1/clean_milestones_final.csv",     # mil
        "./clean_data_v1/clean_objects_final.csv",        # obj
        "./clean_data_v1/clean_people_final.csv",         # peo
        "./clean_data_v1/clean_offices_final.csv",        # off
        "./clean_data_v1/clean_relationships_final.csv",  # rel
    ),
)

# Table stored under ./statistic_join
startup_office = read_csv_auto_dates("./statistic_join/startup_office.csv")

In [16]:
startup_office

Unnamed: 0,objects_cfpr_id,founded_at,closed_at,description,country_code,obj_city_fixed,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,relationships,cat_obj_status,obj_category_filled,cat_obj_overview,obj_state_filled,is_obj_funding_total_usd_private,success_flag,size_bin,round_tempo_months,time_to_last_round_months,funding_round_id,fr_c_id,funded_at,raised_amount_usd,participants,is_first_round,is_last_round,num_fr_type,is_fr_raised_private,cat_fr_type,investments_id,invested_c_id,investor_cfp_id,acquisition_id,acquiring_c_id,acquired_at,is_acq_price_private,price_amount_usd,ipo_id,first_public_at,valuation_amount_usd,ipo_raised_amount_usd,is_ipos_valuation_private,is_ipos_raised_private,reinvest_flag,n_offices
0,c:1,2005-10-17,NaT,Technology Platform Company,USA,seattle,NaT,NaT,0,0,2005-10-01,2008-05-19,3.00,39750000.00,17,operating,web,Software & Technology,WA,0,1,대형팀,16.00,32.00,888.00,c:1,2005-10-01,5250000.00,2.00,0.00,1.00,1.00,0.00,series-a,1289.00,c:1,f:430,,,NaT,,,,NaT,,,,,False,2.00
1,c:1,2005-10-17,NaT,Technology Platform Company,USA,seattle,NaT,NaT,0,0,2005-10-01,2008-05-19,3.00,39750000.00,17,operating,web,Software & Technology,WA,0,1,대형팀,16.00,32.00,888.00,c:1,2005-10-01,5250000.00,2.00,0.00,1.00,1.00,0.00,series-a,1290.00,c:1,f:3,,,NaT,,,,NaT,,,,,False,2.00
2,c:1,2005-10-17,NaT,Technology Platform Company,USA,seattle,NaT,NaT,0,0,2005-10-01,2008-05-19,3.00,39750000.00,17,operating,web,Software & Technology,WA,0,1,대형팀,16.00,32.00,889.00,c:1,2007-01-01,9500000.00,3.00,0.00,0.00,2.00,0.00,series-b,1291.00,c:1,f:4,,,NaT,,,,NaT,,,,,False,2.00
3,c:1,2005-10-17,NaT,Technology Platform Company,USA,seattle,NaT,NaT,0,0,2005-10-01,2008-05-19,3.00,39750000.00,17,operating,web,Software & Technology,WA,0,1,대형팀,16.00,32.00,889.00,c:1,2007-01-01,9500000.00,3.00,0.00,0.00,2.00,0.00,series-b,1292.00,c:1,f:430,,,NaT,,,,NaT,,,,,True,2.00
4,c:1,2005-10-17,NaT,Technology Platform Company,USA,seattle,NaT,NaT,0,0,2005-10-01,2008-05-19,3.00,39750000.00,17,operating,web,Software & Technology,WA,0,1,대형팀,16.00,32.00,889.00,c:1,2007-01-01,9500000.00,3.00,0.00,0.00,2.00,0.00,series-b,1293.00,c:1,f:3,,,NaT,,,,NaT,,,,,True,2.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531076,r:9995,NaT,NaT,,,,NaT,NaT,0,0,NaT,NaT,0.00,0.00,0,operating,software,Software & Technology,,0,0,,,,,,NaT,,,,,,,,,,,,,NaT,,,,NaT,,,,,False,
531077,r:9996,NaT,NaT,,,,NaT,NaT,0,0,NaT,NaT,0.00,0.00,0,operating,software,Software & Technology,,0,0,,,,,,NaT,,,,,,,,,,,,,NaT,,,,NaT,,,,,False,
531078,r:9997,NaT,NaT,,,,NaT,NaT,0,0,NaT,NaT,0.00,0.00,0,operating,software,Software & Technology,,0,0,,,,,,NaT,,,,,,,,,,,,,NaT,,,,NaT,,,,,False,
531079,r:9998,NaT,NaT,,,,NaT,NaT,0,0,NaT,NaT,0.00,0.00,0,operating,software,Software & Technology,,0,0,,,,,,NaT,,,,,,,,,,,,,NaT,,,,NaT,,,,,False,


## [조인] ML용 피처를 위한 조인

### 회사 기준 n_offices 만들기

In [17]:
# n_offices to join later: collapse startup_office to one row per
# objects_cfpr_id, keeping the largest n_offices value seen for that id.
cnt_offices = (
    startup_office
    .groupby('objects_cfpr_id', as_index=False)['n_offices']
    .max()
)
cnt_offices

Unnamed: 0,objects_cfpr_id,n_offices
0,c:1,2.00
1,c:10,1.00
2,c:100,1.00
3,c:10000,
4,c:10001,
...,...,...
462615,r:9995,
462616,r:9996,
462617,r:9997,
462618,r:9998,


### founder만 추출해서 학교, 전공, 학위 만들기

In [18]:
# Left-join degree records onto every relationship row by person id; a person
# with several degree records therefore appears on several rows.
rd = pd.merge(rel, deg, how='left', left_on='rel_p_id', right_on='degrees_p_id')

# Keep only the rows whose normalized title category is exactly 'Founder'.
is_founder = rd['cat_rel_title'].eq('Founder')
rd_founder = rd.loc[is_founder].copy()
rd_founder  # rel_p_id, rel_cf_id, degree_level, cat_degrees_subject, institution_normalized

Unnamed: 0,relationship_id,rel_p_id,rel_cf_id,start_at,end_at,is_past,sequence,title,has_end_date,cat_rel_title,degrees_p_id,degree_type,subject,institution,graduated_at,cat_degrees_degree_type,degree_level,is_degree_missing,cat_degrees_subject,institution_normalized
0,1,p:2,c:1,NaT,NaT,0,8,Co-Founder/CEO/Board of Directors,0,Founder,p:2,BS,Electrical Engineering/Computer Science,"University of California, Berkeley",1994-01-01,Bachelor’s degree,2.00,0.00,Computer Science / Software,"california, berkeley university"
1,1,p:2,c:1,NaT,NaT,0,8,Co-Founder/CEO/Board of Directors,0,Founder,p:2,BS,Applied Mathematics,"University of California, Berkeley",1994-01-01,Bachelor’s degree,2.00,0.00,Computer Science / Software,"california, berkeley university"
11,9,p:10,c:5,NaT,NaT,0,1,"Founder and CEO, Board Of Directors",0,Founder,p:10,,Computer Science,Harvard University,NaT,,,1.00,Computer Science / Software,harvard university
12,10,p:11,c:5,NaT,NaT,1,1,Co-Founder,0,Founder,p:11,No degree,,Harvard University,NaT,Bachelor’s degree,2.00,0.00,,harvard university
15,14,p:16,c:7299,NaT,NaT,1,2,Founder and CEO,0,Founder,p:16,BA,Computer Information Systems,Colorado State University,NaT,Bachelor’s degree,2.00,0.00,Computer Science / Software,colorado state university
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563564,480741,p:29245,c:286154,2004-01-01,NaT,1,1,Founder,0,Founder,,,,,NaT,,,,,
563629,480794,p:254302,c:286157,NaT,NaT,0,3,Founder,0,Founder,,,,,NaT,,,,,
563631,480796,p:254302,c:286160,NaT,NaT,0,4,Co-founder and Director,0,Founder,,,,,NaT,,,,,
563632,480797,p:268562,c:286160,NaT,NaT,0,1,Co-founder and CEO,0,Founder,,,,,NaT,,,,,


### rel_cf_id, rel_p_id 쌍 기준 집계

In [19]:
# Collapse rd_founder to one row per (person, organisation) pair. The same pair
# can occur on several rows because each degree record produced its own row.
c_rd_founder = (
    rd_founder
    .groupby(['rel_p_id', 'rel_cf_id'])
    .agg(
        degree_level=('degree_level', 'max'),             # largest degree_level value
        subject=('cat_degrees_subject', 'first'),         # first non-missing subject
        institution=('institution_normalized', 'first'),  # first non-missing institution
    )
    .reset_index()
)

# Keep only ids starting with 'c:'. na=False makes a missing or non-string id
# count as "no match" instead of yielding a NaN mask, consistent with the same
# prefix filter applied to obj and inv in this notebook.
c_rd_founder = c_rd_founder[c_rd_founder['rel_cf_id'].str.startswith('c:', na=False)].copy()
c_rd_founder

Unnamed: 0,rel_p_id,rel_cf_id,degree_level,subject,institution
0,p:10,c:5,,Computer Science / Software,harvard university
2,p:100009,c:152329,,,
3,p:100009,c:81559,,,
4,p:100017,c:81564,,,
5,p:100025,c:287,,,
...,...,...,...,...,...
61822,p:99982,c:16993,,,
61823,p:99982,c:81541,,,
61824,p:99988,c:64683,2.00,Computer Science / Software,bristol university
61825,p:99994,c:3643,2.00,Marketing / Communications,louisiana university


### 사람 기준 n_founding 만들기

In [20]:
# n_founding: number of distinct rel_cf_id values per person in c_rd_founder.
cnt_founding = (
    c_rd_founder
    .groupby('rel_p_id')
    .agg(n_founding=('rel_cf_id', 'nunique'))
    .reset_index()
)
cnt_founding

Unnamed: 0,rel_p_id,n_founding
0,p:10,1
1,p:100009,2
2,p:100017,1
3,p:100025,1
4,p:100026,1
...,...,...
50016,p:99980,4
50017,p:99982,2
50018,p:99988,1
50019,p:99994,1


In [21]:
# Attach each person's n_founding to their (person, company) rows.
rd_founding = pd.merge(c_rd_founder, cnt_founding, on='rel_p_id', how='left')
rd_founding

Unnamed: 0,rel_p_id,rel_cf_id,degree_level,subject,institution,n_founding
0,p:10,c:5,,Computer Science / Software,harvard university,1
1,p:100009,c:152329,,,,2
2,p:100009,c:81559,,,,2
3,p:100017,c:81564,,,,1
4,p:100025,c:287,,,,1
...,...,...,...,...,...,...
59961,p:99982,c:16993,,,,2
59962,p:99982,c:81541,,,,2
59963,p:99988,c:64683,2.00,Computer Science / Software,bristol university,1
59964,p:99994,c:3643,2.00,Marketing / Communications,louisiana university,1


In [22]:
# Main features to be used for startup-type clustering.
# NOTE(review): the string below is a bare expression, not a comment, so as the
# last expression of the cell it is echoed back as the cell output.
# NOTE(review): the list numbers two entries "4."; it says 'cat_degree_subject'
# while the column used in this notebook is 'cat_degrees_subject'; and it cites
# 'startup_info' although n_offices is aggregated from startup_office.
"""
1. 'obj_category_filled'        -> obj : objects_cfpr_id 
2. 'obj_city_fixed'             -> obj : objects_cfpr_id
3. 'institution_normalized'     -> deg : degrees_p_id 
4. 'cat_degree_subject'         -> deg : degrees_p_id 
4. 'degree_level'               -> deg : degrees_p_id 
5. 'n_offices'                  -> startup_info : objects_cfpr_id
6. 'n_founding'                 -> rel : rel_p_id, rel_cf_id 
"""

"\n1. 'obj_category_filled'        -> obj : objects_cfpr_id \n2. 'obj_city_fixed'             -> obj : objects_cfpr_id\n3. 'institution_normalized'     -> deg : degrees_p_id \n4. 'cat_degree_subject'         -> deg : degrees_p_id \n4. 'degree_level'               -> deg : degrees_p_id \n5. 'n_offices'                  -> startup_info : objects_cfpr_id\n6. 'n_founding'                 -> rel : rel_p_id, rel_cf_id \n"

### obj에서 only investor 제거

In [23]:
# 0) Company rows of obj: ids that start with 'c:' (missing ids do not match).
is_company = obj['objects_cfpr_id'].str.startswith('c:', na=False)
obj_c = obj.loc[is_company].copy()

# 1) 'c:' ids that appear in investments on the investor side.
investor_ids = inv['investor_cfp_id']
c_investors = set(investor_ids[investor_ids.str.startswith('c:', na=False)])

# 2) 'c:' ids that appear in investments on the invested (funded) side.
invested_ids = inv['invested_c_id']
c_invested = set(invested_ids[invested_ids.str.startswith('c:', na=False)])

# 3) Companies seen only as investors, never as an invested company: to be removed.
c_only_investor = c_investors.difference(c_invested)

# 4) Drop those investor-only companies from the company rows.
is_only_investor = obj_c['objects_cfpr_id'].isin(c_only_investor)
obj_c_filtered = obj_c.loc[~is_only_investor].copy()

In [24]:
# One row per company id: first non-missing category and city, and the largest
# relationships value.
agg_obj_c_filtered = obj_c_filtered.groupby('objects_cfpr_id', as_index=False).agg(
    category=('obj_category_filled', 'first'),
    city=('obj_city_fixed', 'first'),
    relationships=('relationships', 'max'),
)
agg_obj_c_filtered

Unnamed: 0,objects_cfpr_id,category,city,relationships
0,c:1,web,seattle,17
1,c:10,games_video,culver city,6
2,c:100,games_video,san mateo,12
3,c:10000,network_hosting,,0
4,c:10001,games_video,,0
...,...,...,...,...
194146,c:99940,ecommerce,san francisco,5
194147,c:9995,public_relations,san jose,14
194148,c:9996,consulting,falls church,44
194149,c:9997,search,los angeles,1


In [25]:
# Final join, in two left-merges that keep every company row:
#   1) office counts, matched on objects_cfpr_id
#   2) founder rows, matched objects_cfpr_id == rel_cf_id (a company with
#      several founders is repeated once per founder)
company_with_offices = pd.merge(
    agg_obj_c_filtered, cnt_offices, on='objects_cfpr_id', how='left'
)
startup_df = pd.merge(
    company_with_offices,
    rd_founding,
    how='left',
    left_on='objects_cfpr_id',
    right_on='rel_cf_id',
)
startup_df

Unnamed: 0,objects_cfpr_id,category,city,relationships,n_offices,rel_p_id,rel_cf_id,degree_level,subject,institution,n_founding
0,c:1,web,seattle,17,2.00,p:2,c:1,2.00,Computer Science / Software,"california, berkeley university",2.00
1,c:1,web,seattle,17,2.00,p:59304,c:1,2.00,Computer Science / Software,london school of economics and political science,1.00
2,c:10,games_video,culver city,6,1.00,p:39,c:10,2.00,Computer Science / Software,michigan university,2.00
3,c:10,games_video,culver city,6,1.00,p:40,c:10,,,,1.00
4,c:100,games_video,san mateo,12,1.00,p:1274,c:100,,,,1.00
...,...,...,...,...,...,...,...,...,...,...,...
213294,c:99940,ecommerce,san francisco,5,1.00,,,,,,
213295,c:9995,public_relations,san jose,14,1.00,,,,,,
213296,c:9996,consulting,falls church,44,1.00,,,,,,
213297,c:9997,search,los angeles,1,1.00,,,,,,


## [최종] 회사기준 1행으로 만들기 : 집계

In [26]:
# Final aggregation: collapse startup_df (one row per company/founder pair)
# to exactly one row per company.
startup_ml_df = (
    startup_df
    .groupby('objects_cfpr_id')
    .agg(
        category=('category', 'first'),
        city=('city', 'first'),
        n_offices=('n_offices', 'max'),
        relationships=('relationships', 'max'),
        degree_level=('degree_level', 'max'),
        # subject and institution are aggregated independently of each other:
        # each takes the first non-missing value in the group, so a founder row
        # that has one but not the other still contributes what it has.
        # The built-in 'first' aggregation already skips missing values (and
        # leaves all-missing groups missing), so it replaces the per-group
        # Python lambdas and matches how these columns were aggregated earlier.
        subject=('subject', 'first'),
        inst=('institution', 'first'),
        n_founding=('n_founding', 'max'),
    )
    .reset_index()
)


In [27]:
startup_ml_df

Unnamed: 0,objects_cfpr_id,category,city,n_offices,relationships,degree_level,subject,inst,n_founding
0,c:1,web,seattle,2.00,17,2.00,Computer Science / Software,"california, berkeley university",2.00
1,c:10,games_video,culver city,1.00,6,2.00,Computer Science / Software,michigan university,2.00
2,c:100,games_video,san mateo,1.00,12,4.00,Computer Science / Software,stanford university,1.00
3,c:10000,network_hosting,,,0,,,,
4,c:10001,games_video,,,0,,,,
...,...,...,...,...,...,...,...,...,...
194146,c:99940,ecommerce,san francisco,1.00,5,,,,
194147,c:9995,public_relations,san jose,1.00,14,,,,
194148,c:9996,consulting,falls church,1.00,44,,,,
194149,c:9997,search,los angeles,1.00,1,,,,


In [28]:
# Fraction of missing values in each column, largest first.
missing_share = startup_ml_df.isna().mean()
missing_share.sort_values(ascending=False)

subject           0.91
degree_level      0.91
inst              0.90
n_founding        0.79
city              0.57
n_offices         0.55
category          0.01
objects_cfpr_id   0.00
relationships     0.00
dtype: float64

## [저장] startup_ml_df csv 파일로 저장!

In [29]:
# startup_ml_df.to_csv("./data/startup_ml_final.csv", index=False)
# print("머신러닝용 파일 startup_ml_final.csv 저장완료!!!")