In [1]:

#초기 설정및 시스템 라이브러리
import platform
import warnings

# 데이터 시각화 라이브러리
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta
print(platform.system())
warnings.filterwarnings('ignore')

# 행,열,결과값 생략 없이 보기,세팅
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)

# setting Korean font
import platform
if platform.system() == 'Windows':
    plt.rcParams['font.family'] = 'Malgun Gothic'
elif platform.system() == 'Darwin':  # macOS
    plt.rcParams['font.family'] = 'AppleGothic'
else:  # Linux
    plt.rcParams['font.family'] = 'NanumGothic'

# statistic
from scipy import stats
from scipy.stats import shapiro, levene, ttest_ind, chi2_contingency, f_oneway
from scipy.stats import mannwhitneyu, fisher_exact, kruskal
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison
import pingouin as pg
import scikit_posthocs as sp

from collections import Counter
from sklearn.datasets import load_diabetes
import scipy.stats as st
from pathlib import Path

# setting seed
np.random.seed(42)

Darwin


In [2]:
# Load Files
Acquisitions = pd.read_csv("./data/acquisitions.csv")
Degrees = pd.read_csv("./data/degrees.csv")
FundingRounds = pd.read_csv("./data/funding_rounds.csv")
Funds = pd.read_csv("./data/funds.csv")
Investments = pd.read_csv("./data/investments.csv")
Ipos = pd.read_csv("./data/ipos.csv")
Milestones = pd.read_csv("./data/milestones.csv")
Objects = pd.read_csv("./data/objects.csv")
Offices = pd.read_csv("./data/offices.csv")
People = pd.read_csv("./data/people.csv")
Relationships = pd.read_csv("./data/relationships.csv")

print("="*60)
print("Dataset 로드 완료!")
print("="*60)

Dataset 로드 완료!


# 20251206

## Objects 전처리(2025.12.06)

### 기본 전처리

In [3]:
#########################
# 변경자: 수아
# 변경일자: 25.12.06
# 변경
# 내용: 1. Objects 테이블 필요 없는 컬럼 제거
#      2. 컬럼명 변경
#      3. 데이터 형변환
#      4. 결측 플래그 생성
#      5. 결측치 NaN/NaT 대치 및 삭제
#########################

# 필요 없는 컬럼 drop
Objects_cleaned = Objects[['id', 'entity_type', 'parent_id','normalized_name', 'category_code', 'status', 
                           'founded_at', 'closed_at', 'description','overview', 'tag_list', 'country_code', 
                           'state_code', 'city', 'region','first_investment_at', 'last_investment_at', 'investment_rounds', 
                           'invested_companies', 'first_funding_at', 'last_funding_at','funding_rounds', 'funding_total_usd', 
                           'first_milestone_at', 'last_milestone_at', 'milestones', 'relationships']]



# 컬럼명 변경: id, parent_id 
Objects_cleaned = Objects_cleaned.rename(columns={'id':'objects_cfpr_id', 'parent_id': 'parent_c_id'})



#  데이터 형변환: founded_at, closed_at, first_investment_at, last_investment_at, first_funding_at, last_funding_at, first_milestone_at, last_milestone_at
dtype_cols = ['founded_at', 'closed_at', 'first_investment_at', 'last_investment_at', 'first_funding_at', 
              'last_funding_at', 'first_milestone_at', 'last_milestone_at']
Objects_cleaned[dtype_cols] = Objects_cleaned[dtype_cols].apply(pd.to_datetime, errors='coerce')



# 결측 플래그 생성: parent_c_id(is_obj_parent_id_missing), category_code(is_obj_category_missing),founded_at(is_obj_founded_missing), closed_at(is_obj_closed_missing), overview(is_obj_overview_missing), state_code(is_obj_state_missing), investment_rounds(is_obj_inv_rounds_missing), invested_companies(is_obj_inv_comp_missing)
miss_cols = ["parent_c_id", "category_code", "founded_at", 'closed_at', 'overview', 'state_code', 
             'investment_rounds', 'invested_companies']

for c in miss_cols:
    Objects_cleaned[f"{c}_miss"] = Objects_cleaned[c].isna().astype(int)
    
# 결측 플래그 컬럼명 변경  
Objects_cleaned = Objects_cleaned.rename(columns={"parent_c_id_miss" : "is_obj_parent_id_missing",
                                                  "category_code_miss" : "is_obj_category_missing",
                                                  "founded_at_miss" : "is_obj_founded_missing",
                                                  "closed_at_miss" : "is_obj_closed_missing",
                                                  "overview_miss" : "is_obj_overview_missing",
                                                  "state_code_miss" : "is_obj_state_missing",
                                                  "investment_rounds_miss" : "is_obj_inv_rounds_missing",
                                                  "invested_companies_miss" : "is_obj_inv_comp_missing"})


# 결측치 NaN, NaT 대치
# NaN : parent_c_id, category_code, description, overview, tag_list, country_code, state_code, city, investment_rounds, invested_companies
# NaT: founded_at, closed_at, first_investment_at, last_investment_at, first_funding_at, last_funding_at, first_milestone_at, last_milestone_at

N_cols = ['parent_c_id', 'category_code', 'description', 'overview', 'tag_list', 
          'country_code', 'state_code', 'city', 'investment_rounds', 'invested_companies']
          
T_cols = ['founded_at', 'closed_at', 'first_investment_at', 'last_investment_at', 'first_funding_at', 
          'last_funding_at', 'first_milestone_at', 'last_milestone_at']

Objects_cleaned[N_cols] = Objects_cleaned[N_cols].fillna(np.nan)
Objects_cleaned[T_cols] = Objects_cleaned[T_cols].fillna(pd.NaT)

# normalized_name 결측치 삭제
Objects_cleaned = Objects_cleaned.dropna(subset=['normalized_name'])

### 카테고리 분류

##### status 카테고리 생성 완료

In [4]:
#########################
# 변경자: 수아
# 변경일자: 25.12.06
# 변경
# 내용: 1. status(cat_obj_status) 카테고리 분류 컬럼 생성
#########################

# 카테고리 분류 컬럼 생성
# status : cat_obj_status
Objects_cleaned['status'].unique()

# status 상위 카테고리 매핑 딕셔너리
status_map = {
    'operating': 'operating',
    'live': 'operating',
    'beta': 'operating',
    'private': 'operating',
    'alpha': 'operating',
    'development': 'operating',
    'acquired': 'acquired',
    'closed': 'closed',
    'ipo': 'ipo'
}

# 상위 카테고리 분류(cat_obj_status) 컬럼 생성
Objects_cleaned['cat_obj_status'] = Objects_cleaned['status'].map(status_map).fillna('other')

print("="*60)
print("[cat_obj_status] 카테고리 컬럼 생성 완료!")


[cat_obj_status] 카테고리 컬럼 생성 완료!


##### overview 카테고리 생성

In [5]:
#########################
# 변경자: 수아
# 변경일자: 25.12.06
# 변경
# 내용: 1. overview(cat_obj_overview) 카테고리 분류 컬럼 생성
#        - 텍스트 정규화
#        - 카테고리 매핑
#########################

# ====================== 텍스트 정규화 ======================
# obj_overview_fixed 원본 생성
Objects_cleaned['obj_overview_fixed'] = Objects_cleaned['overview']

# 1) 앞뒤 공백 제거
Objects_cleaned['obj_overview_fixed'] = Objects_cleaned['obj_overview_fixed'].str.strip()

# 2) 소문자로 변환
Objects_cleaned['obj_overview_fixed'] = Objects_cleaned['obj_overview_fixed'].str.lower()

# 3) 문자열 앞뒤의 쉼표, 점, 슬래시 정도 제거
Objects_cleaned['obj_overview_fixed'] = Objects_cleaned['obj_overview_fixed'].str.replace(r'^[\s\.,/]+', '', regex=True)
Objects_cleaned['obj_overview_fixed'] = Objects_cleaned['obj_overview_fixed'].str.replace(r'[\s\.,/]+$', '', regex=True)

# 4) the 제거
Objects_cleaned['obj_overview_fixed'] = Objects_cleaned['obj_overview_fixed'].str.replace(r'^the\s+', '', regex=True)

# 5) 탭, 여러 칸 공백 등을 모두 ' ' 하나로
Objects_cleaned['obj_overview_fixed'] = Objects_cleaned['obj_overview_fixed'].str.replace(r'\s+', ' ', regex=True)

print("="*60)
print("[obj_overview_fixed] 텍스트 정규화 완료!")



# ====================== 카테고리 분류 ======================
# 카테고리 분류
def classify_overview(text):
    if pd.isna(text):
        return "Software & Technology"  # 최빈 산업군 (실제 데이터 기반)

    t = text  # 이미 lower, cleaned

    # SOFTWARE & TECHNOLOGY
    if any(kw in t for kw in [
        "software","technology","platform","system","tools","cloud","application",
        "data","database","analytics","engine","processing","infrastructure",
        "development","developer","api","integration","web","online","solution",
        "digital","automation"
    ]):
        return "Software & Technology"

    # MEDIA & CONTENT
    if any(kw in t for kw in [
        "media","content","video","movie","tv","stream","broadcast",
        "photo","photos","news","entertainment","publisher","music","audio"
    ]):
        return "Media & Content"

    # MARKETING & ADTECH
    if any(kw in t for kw in [
        "marketing","advertis","campaign","brand","analytics","banner",
        "impression","monetiz","targeting","visibility"
    ]):
        return "Marketing & AdTech"

    # SOCIAL & COMMUNITY
    if any(kw in t for kw in [
        "social","community","network","networking","share","sharing","friends",
        "messaging","chat","meet","user","users","profiles","collaboration"
    ]):
        return "Social & Community"

    # GAMING / VIRTUAL / 3D
    if any(kw in t for kw in [
        "game","gaming","virtual","3d","avatar","simulation","immersive",
        "metaverse","virtual world","render","graphics","online world"
    ]):
        return "Gaming & Virtual"

    # E-COMMERCE & RETAIL
    if any(kw in t for kw in [
        "ecommerce","retail","shop","shopping","buy","sell","store","marketplace",
        "merchant","deal"
    ]):
        return "E-commerce & Retail"

    # FINANCE & FINTECH
    if any(kw in t for kw in [
        "finance","financial","payment","payments","credit","transaction","wallet",
        "loan","investment","fund","trading","bank"
    ]):
        return "Finance & FinTech"

    # HEALTH & WELLNESS
    if any(kw in t for kw in [
        "health","medical","doctor","healthcare","clinic","wellness",
        "nutrition","disease","therapy"
    ]):
        return "Health & Wellness"

    # EDUCATION & TRAINING
    if any(kw in t for kw in [
        "education","learning","school","student","teacher","tutoring",
        "course","curriculum","training"
    ]):
        return "Education & Training"

    # PROFESSIONAL SERVICES / CONSULTING
    if any(kw in t for kw in [
        "consulting","consultant","advisor","professional service","agency",
        "outsourcing","support services"
    ]):
        return "Professional Services"

    # FALLBACK (데이터 기반 1위 산업)
    return "Software & Technology"
print("="*60)
print("[obj_overview_fixed] 카테고리 분류 완료!")

# ====================== 카테고리 컬럼 생성 ======================
Objects_cleaned["cat_obj_overview"] = Objects_cleaned["obj_overview_fixed"].apply(classify_overview)
Objects_cleaned["cat_obj_overview"].value_counts()
print("="*60)
print("[cat_obj_overview] 카테고리 컬럼 생성 완료!")

[obj_overview_fixed] 텍스트 정규화 완료!
[obj_overview_fixed] 카테고리 분류 완료!
[cat_obj_overview] 카테고리 컬럼 생성 완료!


##### region 카테고리 생성

In [6]:
#########################
# 변경자: 수아
# 변경일자: 25.12.06
# 변경
# 내용: 1. region(cat_obj_region) 카테고리 분류 컬럼 생성
#        - 텍스트 정규화
#        - 카테고리 매핑
#########################

# ====================== 텍스트 정규화 ======================
# region 원본 보존
Objects_cleaned["obj_region_fixed"] = Objects_cleaned["region"].astype(str)

# 1) lower + strip
Objects_cleaned["obj_region_fixed"] = (
    Objects_cleaned["obj_region_fixed"]
        .str.lower()
        .str.strip()
)

# 2) unknown → NaN
Objects_cleaned["obj_region_fixed"] = Objects_cleaned["obj_region_fixed"].replace(
    ["unknown", "none", "-", ""], 
    pd.NA
)

# 3) region 대표 표기 통일(일부 메트로 지역)
region_replace = {
    "sf bay": "sf bay area",
    "bay area": "sf bay area",
    "san fran": "sf bay area",
    "nyc": "new york",
    "new york city": "new york",
    "la": "los angeles",
}
Objects_cleaned["obj_region_fixed"] = Objects_cleaned["obj_region_fixed"].replace(region_replace)
print("="*60)
print("[obj_region_fixed] 텍스트 정규화 완료!")


# ====================== 매핑 딕셔너리 ======================
hub_map = {
    # US Major Hubs
    "san francisco": "US – SF Bay Area",
    "sf bay area": "US – SF Bay Area",
    "palo alto": "US – SF Bay Area",
    "mountain view": "US – SF Bay Area",
    "new york": "US – New York",
    "los angeles": "US – Los Angeles",
    "seattle": "US – Seattle",
    "boston": "US – Boston",
    "chicago": "US – Chicago",
    "washington": "US – Washington DC",
    "austin": "US – Austin",
    "denver": "US – Denver",
    "san diego": "US – San Diego",
    "atlanta": "US – Atlanta",
    "dallas": "US – Dallas",

    # Canada
    "toronto": "Canada – Toronto",
    "vancouver": "Canada – Vancouver",

    # Europe
    "london": "Europe – London",
    "paris": "Europe – Paris",
    "berlin": "Europe – Berlin",
    "amsterdam": "Europe – Amsterdam",
    "dublin": "Europe – Dublin",
    "stockholm": "Europe – Stockholm",
    "helsinki": "Europe – Helsinki",
    "madrid": "Europe – Madrid",
    "barcelona": "Europe – Barcelona",

    # Middle East
    "tel aviv": "Middle East – Tel Aviv",

    # APAC
    "singapore": "APAC – Singapore",
    "tokyo": "APAC – Tokyo",
    "sydney": "APAC – Sydney",
    "seoul": "APAC – Seoul",
}
print("="*60)
print("[obj_region_fixed] 카테고리 분류 완료!")

# ====================== 카테고리 분류 ======================
# region 카테고리 분류
def map_region_hub(row):
    city = row["city"]
    region = row["obj_region_fixed"]

    # 1) city 우선 (원본 그대로 사용하지만 lower로 비교)
    if pd.notna(city):
        c = city.lower().strip()
        if c in hub_map:
            return hub_map[c]

    # 2) region_fixed fallback
    if pd.notna(region) and region in hub_map:
        return hub_map[region]

    # 3) 매핑 실패: 모두 other로 통일 (unknown 제거)
    return "Other"

Objects_cleaned["cat_obj_region"] = Objects_cleaned.apply(map_region_hub, axis=1)

print("="*60)
print("[cat_obj_region] 카테고리 컬럼 생성 완료!")

[obj_region_fixed] 텍스트 정규화 완료!
[obj_region_fixed] 카테고리 분류 완료!
[cat_obj_region] 카테고리 컬럼 생성 완료!


##### city, region, state_code, country_code 결측 상호 보완

In [7]:
#########################
# 변경자: 수아
# 변경일자: 25.12.06
# 변경
# 내용: 1. state_code 결측값 city, country_code, region_fixed 이용해서 보완
#########################

# state_code 보완 컬럼 state_filled
import numpy as np
import pandas as pd

# 매핑 테이블: city → state_code
city_to_state = {
    "seattle": "WA",
    "redmond": "WA",
    "san mateo": "CA",
    "culver city": "CA",
    "los angeles": "CA",
    "san francisco": "CA",

    "toronto": "ON",
    "vancouver": "BC",

    "carlton vic": "VIC",
}

# state_filled 생성
def fill_state(row):
    state = row["state_code"]
    city = row["city"]

    # 1) state_code가 이미 있으면 그대로 유지
    if pd.notna(state):
        return state

    # 2) city가 있고 매핑이 가능하면 보완
    if pd.notna(city):
        c = city.lower().strip()
        if c in city_to_state:
            return city_to_state[c]

    # 3) 보완 불가 → NaN 유지
    return np.nan


Objects_cleaned["obj_state_filled"] = Objects_cleaned.apply(fill_state, axis=1)
print("="*60)
print("[obj_state_filled] state_code 결측 보완 컬럼 생성 완료!")

[obj_state_filled] state_code 결측 보완 컬럼 생성 완료!


In [8]:
Objects_cleaned

Unnamed: 0,objects_cfpr_id,entity_type,parent_c_id,normalized_name,category_code,status,founded_at,closed_at,description,overview,tag_list,country_code,state_code,city,region,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,is_obj_parent_id_missing,is_obj_category_missing,is_obj_founded_missing,is_obj_closed_missing,is_obj_overview_missing,is_obj_state_missing,is_obj_inv_rounds_missing,is_obj_inv_comp_missing,cat_obj_status,obj_overview_fixed,cat_obj_overview,obj_region_fixed,cat_obj_region,obj_state_filled
0,c:1,Company,,wetpaint,web,operating,2005-10-17,NaT,Technology Platform Company,"Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaints own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the companys success in building and engaging audiences. Media companies can license Wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.","wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system",USA,WA,Seattle,Seattle,NaT,NaT,0,0,2005-10-01,2008-05-19,3,39750000.0,2010-09-05,2013-09-18,5,17,1,0,0,1,0,0,0,0,operating,"wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. wetpaints own online property, wetpaint entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million facebook fans, is a proof point to the companys success in building and engaging audiences. media companies can license wetpaints platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. founded by internet pioneer ben elowitz, and with offices in new york and seattle, wetpaint is backed by accel partners, the investors behind facebook",Software & Technology,seattle,US – Seattle,WA
1,c:10,Company,,flektor,games_video,acquired,NaT,NaT,,"Flektor is a rich-media mash-up platform that enables consumers to create, remix and share photos and videos on the internet without the need for advanced video-editing skills or software.\n\nFox Interactive Media, a division of News Corporation, announced that it had completed the purchase of Flektor on May 30, 2007. The estimated puchase price is $15-20 million.","flektor, photo, video",USA,CA,Culver City,Los Angeles,NaT,NaT,0,0,NaT,NaT,0,0.0,NaT,NaT,0,6,1,0,1,1,0,0,0,0,acquired,"flektor is a rich-media mash-up platform that enables consumers to create, remix and share photos and videos on the internet without the need for advanced video-editing skills or software. fox interactive media, a division of news corporation, announced that it had completed the purchase of flektor on may 30, 2007. the estimated puchase price is $15-20 million",Software & Technology,los angeles,US – Los Angeles,CA
2,c:100,Company,,there,games_video,acquired,NaT,NaT,,"There.com is an online virtual world where anyone can explore, meet friends and play games. It was founded in 1998 by Will Harvey, a Stanford computer science Ph.D. and game developer, and Jeffrey Ventrella, an expert on artificial life from MIT's Media Lab. The duo raised approximately $37 million - including $20 million from employees, $11 million from angel investors and $6 million from Sutter Hill Ventures. In 2005 the company was spun off under Makena Technologies, and in March 2010 There closed to the public. In May 2011, There announced it would reopen as a 18+ Cloud-based service. As of Nov 2013, There is open.\n\nThere.com is a subscription service with a monthly fee of $10.00. Additional in-game accessories can be purchased for separate fees.\n\nOther online virtual worlds include [Kaneva](http://www.crunchbase.com/company/kaneva), [Second Life](http://www.crunchbase.com/company/secondlife) and [Cyworld](http://www.crunchbase.com/company/cyworld).","virtualworld, there, teens",USA,CA,San Mateo,SF Bay,NaT,NaT,0,0,NaT,NaT,0,0.0,2003-02-01,2011-09-23,4,12,1,0,1,1,0,0,0,0,acquired,"there.com is an online virtual world where anyone can explore, meet friends and play games. it was founded in 1998 by will harvey, a stanford computer science ph.d. and game developer, and jeffrey ventrella, an expert on artificial life from mit's media lab. the duo raised approximately $37 million - including $20 million from employees, $11 million from angel investors and $6 million from sutter hill ventures. in 2005 the company was spun off under makena technologies, and in march 2010 there closed to the public. in may 2011, there announced it would reopen as a 18+ cloud-based service. as of nov 2013, there is open. there.com is a subscription service with a monthly fee of $10.00. additional in-game accessories can be purchased for separate fees. other online virtual worlds include [kaneva](http://www.crunchbase.com/company/kaneva), [second life](http://www.crunchbase.com/company/secondlife) and [cyworld](http://www.crunchbase.com/company/cyworld)",Software & Technology,sf bay area,US – SF Bay Area,CA
3,c:10000,Company,,mywebbo,network_hosting,operating,2008-07-26,NaT,,"BRAND NEW ONLINE SOCIAL NETWORKING WEBSITE,FOR MAKING NEW FRIENDS OR CHATTING TO OLD ONE'S.\n\nPACKED WITH NEW FEATURES SUCH AS RATING PROFILES , RATING MUSIC,VIDEO'S AND PICTURES ,UPLOADING MUSIC ,VIDEO'S PICTURES , CREATING CLASSIFIED ADS ,SHOUTOUT BOX!, AND ONLINE CHAT AREA FOR MAKING NEW FRIENDS OR SIMPLY CHATTING TO YOUR OLD ONE'S ,THERE ARE LOADS OF GREAT FEATURES FOR ANYONE TO TRY .. PLUS MANY MORE TO COME .","social-network, new, website, web, friends, chat, people",,,,unknown,NaT,NaT,0,0,NaT,NaT,0,0.0,NaT,NaT,0,0,1,0,0,1,0,1,0,0,operating,"brand new online social networking website,for making new friends or chatting to old one's. packed with new features such as rating profiles , rating music,video's and pictures ,uploading music ,video's pictures , creating classified ads ,shoutout box!, and online chat area for making new friends or simply chatting to your old one's ,there are loads of great features for anyone to try .. plus many more to come",Software & Technology,,Other,
4,c:10001,Company,,the movie streamer,games_video,operating,2008-07-26,NaT,,"This company shows free movies online on their website which, in fact, is not illegal since they are not the ones hosting the videos.","watch, full-length, moives, online, for, free, streaming, videos, tv-shows",,,,unknown,NaT,NaT,0,0,NaT,NaT,0,0.0,NaT,NaT,0,0,1,0,0,1,0,1,0,0,operating,"this company shows free movies online on their website which, in fact, is not illegal since they are not the ones hosting the videos",Software & Technology,,Other,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462646,r:9995,Product,c:14164,sitelink listing feed for brokerages,,operating,NaT,NaT,,,,,,,unknown,NaT,NaT,0,0,NaT,NaT,0,0.0,NaT,NaT,0,0,0,1,1,1,1,1,0,0,operating,,Software & Technology,,Other,
462647,r:9996,Product,c:14164,edclink listing feed for economic development groups,,operating,NaT,NaT,,,,,,,unknown,NaT,NaT,0,0,NaT,NaT,0,0.0,NaT,NaT,0,0,0,1,1,1,1,1,0,0,operating,,Software & Technology,,Other,
462648,r:9997,Product,c:14164,cmail broadcast email marketing,,operating,NaT,NaT,,,,,,,unknown,NaT,NaT,0,0,NaT,NaT,0,0.0,NaT,NaT,0,0,0,1,1,1,1,1,0,0,operating,,Software & Technology,,Other,
462649,r:9998,Product,c:14164,catylistcrm contact database,,operating,NaT,NaT,,,,,,,unknown,NaT,NaT,0,0,NaT,NaT,0,0.0,NaT,NaT,0,0,0,1,1,1,1,1,0,0,operating,,Software & Technology,,Other,


In [9]:
# city 전처리: 원본 보존 + 새로운 city_clean 생성
def clean_city_keep_original(df, col='city'):
    temp = df[col].astype(str)

    # 1) 소문자 변환 + 공백 제거
    city_clean = temp.str.lower().str.strip()

    # 2) 쉼표 뒤 제거: "San Francisco, CA" → "san francisco"
    city_clean = city_clean.str.split(',').str[0]

    # 3) 문자와 공백만 허용 (숫자·특수문자 제거)
    city_clean = city_clean.str.replace(r'[^a-z\s]', '', regex=True).str.strip()

    # 4) 빈 문자열 → NaN
    city_clean = city_clean.replace('', pd.NA)

    # 원본 손상 X → 새로운 컬럼만 추가
    df['obj_city_fixed'] = city_clean

    return df


# Objects_cleaned에 적용
Objects_cleaned = clean_city_keep_original(Objects_cleaned, 'city')


In [10]:
# Objects_cleaned 전처리 파일 csv 저장
Objects_cleaned.to_csv('./cleaned_data/clean_objects_v3.csv', index=False)

## objects.funding_total_usd==0값 비공개 플래그 전처리

In [2]:
# Load Files
acquisitions = pd.read_csv("./cleaned_data/clean_acquisitions_v1.csv")
fundingrounds = pd.read_csv("./cleaned_data/clean_fr_v4.csv")
investments = pd.read_csv("./cleaned_data/clean_investments_v1.csv")
ipos = pd.read_csv("./cleaned_data/clean_ipos_v2.csv")
objects = pd.read_csv("./cleaned_data/clean_objects_v3.csv")

print("="*60)
print("cleaned dataset 로드 완료!")
print("="*60)

cleaned dataset 로드 완료!


In [3]:
#########################
# 변경자: 수아
# 변경일자: 25.12.12
# 변경
# 내용: 1. objects.funding_total_usd==0 비공개 플래그 생성: is_obj_funding_total_usd_private
#      2. objects.funding_rounds==0 비공개 플래그 생성: is_obj_funding_rounds_private
#########################

# ==============================
# 임시 테이블 조인
# ==============================
# objects 테이블 - funding_rounds 테이블 조인
obj_fr = (objects.merge(fundingrounds, how='left', left_on="objects_cfpr_id", right_on="fr_c_id"))
# objects 테이블 - investments 테이블 조인
obj_inv = (objects.merge(investments, how='left', left_on="objects_cfpr_id", right_on="invested_c_id"))
# objects 테이블 - acquisitions 테이블 조인
obj_acq = (objects.merge(acquisitions, how='left', left_on="objects_cfpr_id", right_on="acquired_c_id"))
# objects 테이블 - ipos 테이블 조인
obj_ipo = (objects.merge(ipos, how='left', left_on="objects_cfpr_id", right_on="ipos_c_id"))

# ==============================
# 비공개 아이디 추출
# ==============================
import numpy as np

# 각 조건별로 아이디 추출
ids_fr = obj_fr[(obj_fr['fr_c_id'].notna()) & (obj_fr['funding_total_usd']==0)]['objects_cfpr_id'].unique()
ids_inv = obj_inv[(obj_inv['invested_c_id'].notna()) & (obj_inv['funding_total_usd']==0)]['objects_cfpr_id'].unique()
ids_acq = obj_acq[(obj_acq['acquired_c_id'].notna()) & (obj_acq['funding_total_usd']==0)]['objects_cfpr_id'].unique()
ids_ipo = obj_ipo[(obj_ipo['ipos_c_id'].notna()) & (obj_ipo['funding_total_usd']==0)]['objects_cfpr_id'].unique()

# 리스트로 변환 (생략 없이 출력됨)
anomaly_ids_list = sorted(list(set(ids_fr) | set(ids_inv) | set(ids_acq) | set(ids_ipo)))

print(f"총 이상 아이디 개수: {len(anomaly_ids_list)}")

import numpy as np

# ======================================================
# 비공개 ID 집합 생성 (이미 anomaly_ids_list 존재)
# ======================================================
anomaly_set = set(anomaly_ids_list)

# ======================================================
# 비공개 플래그 생성
# ======================================================
objects['is_obj_funding_total_usd_private'] = (
    objects['objects_cfpr_id'].isin(anomaly_set).astype(int)
)

objects['is_obj_funding_rounds_private'] = (
    objects['is_obj_funding_total_usd_private']
)

# ======================================================
# 비공개 객체 원본 컬럼 NaN 처리
# ======================================================
private_mask = objects['is_obj_funding_total_usd_private'] == 1

objects.loc[private_mask, [
    'funding_total_usd',
    'funding_rounds'
]] = np.nan

# ======================================================
# 결과 저장
# ======================================================
objects.to_csv('./cleaned_data/clean_objects_v3.csv', index=False)



총 이상 아이디 개수: 11278


## Offices 전처리 20251206

In [13]:
#########################
# 변경자: 수아
# 변경일자: 25.12.06
# 변경
# 내용: 1. Offices 테이블 필요없는 컬럼 제거
#      2. 컬럼명 변경
#      3. 결측값 채우기
#########################

# ====================== 필요 없는 컬럼 제거 ======================
Offices_cleaned = Offices[['object_id', 'office_id', 'description', 'city', 'state_code', 'country_code', 'latitude', 'longitude']]
print("="*60)
print("불필요한 컬럼 제거 완료!")

# ====================== 컬럼명 변경 ======================
Offices_cleaned = Offices_cleaned.rename(columns={'object_id':'offices_c_id'})
print("="*60)
print("컬럼명 변경 완료!")

# ====================== city 결측치 대치 ======================
Offices_cleaned['city'] = Offices_cleaned['city'].fillna('Unknown')
print("="*60)
print("city 결측치 Unknown 대치 완료!")

불필요한 컬럼 제거 완료!
컬럼명 변경 완료!
city 결측치 Unknown 대치 완료!


In [14]:
#########################
# 변경자: 수아
# 변경일자: 25.12.06
# 변경
# 내용: 1. description 텍스트 정규화 및 카테고리 분류 컬럼(cat_offices_description) 생성
#########################

import pandas as pd
import numpy as np
import re

# =========================================================
# 1) offices_description_fixed 생성 (결측은 모두 no office info)
# =========================================================
def clean_desc(x):
    # ---------------------------
    # ① 결측이면 무조건 no office info
    # ---------------------------
    if pd.isna(x):
        return "no office info"

    x = str(x).lower().strip()

    # ---------------------------
    # ② 텍스트 정규화
    # ---------------------------
    x = x.replace("headquarters", "headquarter")
    x = x.replace("head office", "head")
    x = x.replace("corporate headquarters", "corporate hq")
    x = x.replace("corporate headquarter", "corporate hq")
    x = x.replace("main office", "main")
    x = x.replace("home office", "home")
    x = x.replace("branch office", "branch")

    # 다중 공백 제거
    x = " ".join(x.split())

    return x

Offices_cleaned["offices_description_fixed"] = Offices_cleaned["description"].apply(clean_desc)
print("="*60)
print("description 텍스트 정규화 완료!")

# =========================================================
# 2) description_category 생성
# =========================================================
def categorize_desc(x):
    # x는 절대 NaN이 아님 → 이미 no office info로 정규화됨

    # ---------------------------
    # No Office Info
    # ---------------------------
    if x == "no office info":
        return "No Office Info"

    # ---------------------------
    # Headquarters
    # ---------------------------
    if (
        x.startswith("hq")
        or x.startswith("head")
        or "headquarter" in x
        or x.startswith("main")
        or x.startswith("corporate")
        or x.startswith("home")
        or "corporate hq" in x
    ):
        return "Headquarters"

    # ---------------------------
    # R&D
    # ---------------------------
    if x.startswith("r&d") or x.startswith("development"):
        return "R&D"

    # ---------------------------
    # Sales
    # ---------------------------
    if x.startswith("sales"):
        return "Sales"

    # ---------------------------
    # Operations
    # ---------------------------
    if x.startswith("operations") or "central office" in x:
        return "Operations"

    # ---------------------------
    # Branch
    # ---------------------------
    if "office" in x:
        return "Branch"

    if "european" in x:
        return "Branch"

    # 도시명 단독
    if bool(re.fullmatch(r"[a-z ]+", x)) and len(x.split()) <= 3:
        return "Branch"

    # ---------------------------
    # Others
    # ---------------------------
    return "Others"


Offices_cleaned["cat_offices_description"] = Offices_cleaned["offices_description_fixed"].apply(categorize_desc)
print("="*60)
print("cat_offices_description 카테고리 컬럼 생성 완료!")


description 텍스트 정규화 완료!
cat_offices_description 카테고리 컬럼 생성 완료!


In [15]:
#########################
# 변경자: 수아
# 변경일자: 25.12.06
# 변경
# 내용: 1. state_code 결측치 city 컬럼 참고해서 보완
#########################

# ====================== state_code 결측 보완 ======================
import numpy as np
import pandas as pd

# ============================================================
# 1) city → state_code 매핑 테이블 생성
# - Offices_cleaned 내부 실제 데이터로만 매핑 생성
# ============================================================
# city가 있고, state_code가 있는 row만 사용
city_state_map = (
    Offices_cleaned
    .dropna(subset=["city", "state_code"])
    .assign(city=lambda x: x["city"].str.lower().str.strip())
    .drop_duplicates(subset=["city"])
    .set_index("city")["state_code"]
    .to_dict()
)

# 예시 출력:
# {'seattle': 'WA', 'pleasanton': 'CA', 'san francisco': 'CA', ...}


# ============================================================
# 2) offices_state_filled 생성 함수
# ============================================================
def fill_state(row):
    state = row["state_code"]
    city = row["city"]

    # 1) 원래 state_code가 있으면 그대로 유지
    if pd.notna(state):
        return state

    # 2) city로 보완 가능한 경우
    if pd.notna(city):
        c = city.lower().strip()
        if c in city_state_map:
            return city_state_map[c]

    # 3) city로도 못 채우면 Unknown
    return "Unknown"


# ============================================================
# 3) Offices_cleaned에 새로운 컬럼 생성
# ============================================================
Offices_cleaned["offices_state_filled"] = Offices_cleaned.apply(fill_state, axis=1)

print("="*60)
print("state_code 결측 보완 완료!")

state_code 결측 보완 완료!


In [16]:
# Offices_cleaned 전처리 파일 csv 저장
Offices_cleaned.to_csv('./cleaned_data/clean_offices_v1.csv', index=False)

In [17]:
Offices_cleaned

Unnamed: 0,offices_c_id,office_id,description,city,state_code,country_code,latitude,longitude,offices_description_fixed,cat_offices_description,offices_state_filled
0,c:1,1,,Seattle,WA,USA,47.603122,-122.333253,no office info,No Office Info,WA
1,c:3,3,Headquarters,Pleasanton,CA,USA,37.692934,-121.904945,headquarter,Headquarters,CA
2,c:4,4,,San Francisco,CA,USA,37.764726,-122.394523,no office info,No Office Info,CA
3,c:5,5,Headquarters,Menlo Park,CA,USA,37.416050,-122.151801,headquarter,Headquarters,CA
4,c:7,7,,Palo Alto,CA,ISR,0.000000,0.000000,no office info,No Office Info,CA
...,...,...,...,...,...,...,...,...,...,...,...
112713,f:15098,127846,,Greenwich,CT,USA,0.000000,0.000000,no office info,No Office Info,CT
112714,c:286200,127847,,Santa Barbara,CA,USA,0.000000,0.000000,no office info,No Office Info,CA
112715,c:256895,127848,,Los Angeles,CA,USA,0.000000,0.000000,no office info,No Office Info,CA
112716,c:256200,127849,,New York,NY,USA,0.000000,0.000000,no office info,No Office Info,NY


# 20251209

In [6]:
# Load Files
acquisitions = pd.read_csv("./cleaned_data/clean_acquisitions_v1.csv")
degrees = pd.read_csv("./cleaned_data/clean_degrees_v3.csv")
fundingrounds = pd.read_csv("./cleaned_data/clean_fr_v4.csv")
funds = pd.read_csv("./cleaned_data/clean_funds_v3.csv")
investments = pd.read_csv("./cleaned_data/clean_investments_v1.csv")
ipos = pd.read_csv("./cleaned_data/clean_ipos_v2.csv")
milestones = pd.read_csv("./cleaned_data/clean_milestons_v1.csv")
objects = pd.read_csv("./cleaned_data/clean_objects_v3.csv")
offices = pd.read_csv("./cleaned_data/clean_offices_v1.csv")
people = pd.read_csv("./cleaned_data/clean_people_v1.csv")
relationships = pd.read_csv("./cleaned_data/clean_relationships_v1.csv")

print("="*60)
print("cleaned dataset 로드 완료!")
print("="*60)

cleaned dataset 로드 완료!


## 테이블 조인(투자자 관점)

In [7]:
print(f"investment 행수:{investments.shape}")
print(f"fundingrounds 행수:{fundingrounds.shape}")
print(f"objects 행수:{objects.shape}")
print(f"acquisitions 행수:{acquisitions.shape}")
print(f"ipos 행수:{ipos.shape}")
print(f"relationships 행수:{relationships.shape}")
print(f"milestones 행수:{milestones.shape}")

investment 행수:(80902, 4)
fundingrounds 행수:(52928, 17)
objects 행수:(462620, 44)
acquisitions 행수:(9562, 8)
ipos 행수:(1259, 16)
relationships 행수:(402412, 11)
milestones 행수:(39456, 6)


#### Q1.VC는 주로 어떤 투자 라운드에 투자했을 때, 엑싯 성공률이 높을까?

In [8]:
#########################
# 변경자: 수아
# 변경일자: 25.12.09
# 변경
# 내용: 1. 투자자 분석 테이블 조인
#      2. 투자 라운드-엑싯 성공률
#########################

# ============================================================
# investments - funding_rounds - objects - acquisitions 조인
# Q1. VC는 어떤 투자 라운드에 투자했을 때, 엑싯 성공률(인수합병)이 높을까?
# ============================================================
vc_invest_fr_acq= (investments
                   .merge(fundingrounds, how='left', left_on="funding_round_id", right_on="funding_round_id")
                   .merge(objects, how='left', left_on='fr_c_id', right_on='objects_cfpr_id')
                   .merge(acquisitions, how='left', left_on='objects_cfpr_id', right_on='acquired_c_id')
                   )
print(f"vc_invest_fr_acq 행수:{vc_invest_fr_acq.shape}")
print("acquisitions 테이블 조인 후, 행 수 80902 → 81143 증가: 하나의 회사가 여러 번 인수합병되었기에 정상")
print("investments - funding_rounds - objects - acquisitions 조인 완료!")

# ============================================================
# investments - funding_rounds - objects - ipos 조인
# Q1. VC는 어떤 투자 라운드에 투자했을 때, 엑싯 성공률(상장)이 높을까?
# ============================================================
vc_invest_fr_ipo = (investments
                    .merge(fundingrounds, how='left', left_on="funding_round_id", right_on="funding_round_id")
                    .merge(objects, how='left', left_on='fr_c_id', right_on='objects_cfpr_id')
                    .merge(ipos, how='left', left_on='objects_cfpr_id', right_on='ipos_c_id')
                    )
print(f"vc_invest_fr_ipo 행수:{vc_invest_fr_ipo.shape}")
print("ipos 테이블 조인 후, 행 수 80902 → 82238 증가")
print("investments - funding_rounds - objects - ipos 조인 완료!")

vc_invest_fr_acq 행수:(81143, 72)
acquisitions 테이블 조인 후, 행 수 80902 → 81143 증가: 하나의 회사가 여러 번 인수합병되었기에 정상
investments - funding_rounds - objects - acquisitions 조인 완료!
vc_invest_fr_ipo 행수:(82230, 80)
ipos 테이블 조인 후, 행 수 80902 → 82238 증가
investments - funding_rounds - objects - ipos 조인 완료!


#### Q2. VC는 어떤 기업일까?(VC의 특징)

In [9]:
#########################
# 변경자: 수아
# 변경일자: 25.12.09
# 변경
# 내용: 1. 투자자 분석 테이블 조인
#      2. VC 특징-라운드 단계/관계/산업/엑싯 여부/이슈
#########################

# ============================================================
# investments - fundingrounds
# Q2. VC는 어떤 라운드 단계의 기업일까?
# ============================================================
vc_round = (investments
            .merge(fundingrounds, how='left',left_on='investor_cfp_id', right_on='fr_c_id')
            )
print(f"vc_round 행수:{vc_round.shape}")
print("fundingrounds 테이블 조인 후, 행 수 80902 → 82958 증가")
print("investments - funding_rounds 조인 완료!")

# ============================================================
# investments - relationships
# Q2. VC는 어떤 관계를 가진 기업일까?
# ============================================================
vc_rel = (investments
          .merge(relationships, how='left', left_on='investor_cfp_id', right_on='rel_cf_id')
          )
print(f"vc_rel 행수:{vc_rel.shape}")
print("relationships 테이블 조인 후, 행 수 80902 → 1699572 증가: 하나의 투자 기업이 여러 관계를 가지기에 정상")
print("investments - relationships 조인 완료!")

# ============================================================
# investments - objects
# Q2. VC는 어떤 산업의 기업일까?
# ============================================================
vc_category = (investments
               .merge(objects, how='left', left_on='investor_cfp_id', right_on='objects_cfpr_id')
               )
print(f"vc_category 행수:{vc_category.shape}")
print("investments - objects 조인 완료!")

# ============================================================
# investments - objects - acquisitions
# Q2. VC는 인수합병한 기업일까?
# ============================================================
vc_acquired = (investments
               .merge(objects, how='left', left_on='investor_cfp_id', right_on='objects_cfpr_id')
               .merge(acquisitions, how='left', left_on='objects_cfpr_id', right_on='acquired_c_id')
               )
print(f"vc_acquired 행수:{vc_acquired.shape}")
print("acquisitions 테이블 조인 후, 행 수 80902 → 80918 증가")
print("investments - objects - acquisitions 조인 완료!")

# ============================================================
# investments - objects - ipos 
# Q2. VC는 상장한 기업일까?
# ============================================================
vc_ipo = (investments
          .merge(objects, how='left', left_on='investor_cfp_id', right_on='objects_cfpr_id')
          .merge(ipos, how='left', left_on='objects_cfpr_id', right_on='ipos_c_id')
          )
print(f"vc_ipo 행수:{vc_ipo.shape}")
print("ipos 테이블 조인 후, 행 수 80902 → 82238 증가: ipos 테이블의 c_id 중복으로 정상적")
print("investments - objects - ipos 조인 완료!")

# ============================================================
# investments - objects - milestones
# Q2. VC는 어떤 이슈를 가진 기업일까?
# 투자 당시, 그 VC는 어떤 milestone(이슈)를 가진 상태인지 확인
# ============================================================
vc_issues = (investments
        .merge(objects, how='left', left_on='investor_cfp_id', right_on='objects_cfpr_id')
        .merge(milestones, how='left', left_on='investor_cfp_id', right_on='mile_cfpr_id')
        )
print(f"vc_issues 행수:{vc_issues.shape}")
print("milestones 테이블 조인 후, 행 수 80902 → 148951 증가")
print("investments - objects - milestones 조인 완료!")

vc_round 행수:(82958, 21)
fundingrounds 테이블 조인 후, 행 수 80902 → 82958 증가
investments - funding_rounds 조인 완료!
vc_rel 행수:(1699572, 15)
relationships 테이블 조인 후, 행 수 80902 → 1699572 증가: 하나의 투자 기업이 여러 관계를 가지기에 정상
investments - relationships 조인 완료!
vc_category 행수:(80902, 48)
investments - objects 조인 완료!
vc_acquired 행수:(80918, 56)
acquisitions 테이블 조인 후, 행 수 80902 → 80918 증가
investments - objects - acquisitions 조인 완료!
vc_ipo 행수:(82238, 64)
ipos 테이블 조인 후, 행 수 80902 → 82238 증가: ipos 테이블의 c_id 중복으로 정상적
investments - objects - ipos 조인 완료!
vc_issues 행수:(148951, 54)
milestones 테이블 조인 후, 행 수 80902 → 148951 증가
investments - objects - milestones 조인 완료!


#### Q3. VC는 주로 어떤 기업에 투자할까?

In [10]:
#########################
# 변경자: 수아
# 변경일자: 25.12.09
# 변경
# 내용: 1. 투자자 분석 테이블 조인
#      2. VC 투자 패턴-산업/라운드/기업 수/금액/창업자
#########################

# ============================================================
# investments - funding_rounds - objects - relationships - degrees
# Q3. VC는 어떤 창업자에게 투자할까?(어떤 창업자: 학력/지역/국가 출신)
# ============================================================
vc_invest_founder = (investments
                     .merge(fundingrounds, how='left', on='funding_round_id')
                     .merge(objects, how='left', left_on='fr_c_id', right_on='objects_cfpr_id')
                     .merge(relationships, how='left', left_on='objects_cfpr_id', right_on='rel_cf_id')
                     .merge(degrees, how='left', left_on='rel_p_id', right_on='degrees_p_id')
                     )
print(f"vc_invest_founder 행수:{vc_invest_founder.shape}")
print("relationships 테이블 조인 후, 행 수 80902 → 659048 증가")
print("degrees 테이블 조인 후, 행 수 659048 → 959671 증가")
print("investments - funding_rounds - objects - relationships - degrees 조인 완료!")
# 나중에 EDA할 때, relationships.title → 'founder'만 필터링하기

# ============================================================
# investments - funding_rounds - objects
# Q3. VC는 어떤 라운드의 기업에 투자할까?
# Q3. VC는 어떤 산업의 기업에 투자할까?
# Q3. VC는 투자할 때 한번에 몇 개의 기업에, 얼마를 투자할까?
# ============================================================
vc_invest_cat_fr = (investments
        .merge(fundingrounds, how='left', left_on="funding_round_id", right_on="funding_round_id")
        .merge(objects, how='left', left_on='fr_c_id', right_on='objects_cfpr_id')
        )
print(f"vc_invest_cat_fr 행수:{vc_invest_cat_fr.shape}")
print("investments - funding_rounds - objects 조인 완료!")

vc_invest_founder 행수:(959671, 84)
relationships 테이블 조인 후, 행 수 80902 → 659048 증가
degrees 테이블 조인 후, 행 수 659048 → 959671 증가
investments - funding_rounds - objects - relationships - degrees 조인 완료!
vc_invest_cat_fr 행수:(80902, 64)
investments - funding_rounds - objects 조인 완료!


#### Q4. VC가 모은 펀드 규모는 주로 얼마일까?

In [11]:
#########################
# 변경자: 수아
# 변경일자: 25.12.09
# 변경
# 내용: 1. 투자자 분석 테이블 조인
#      2. VC 펀드 규모
#########################

# ============================================================
# investments - funds
# Q4. VC가 모은 펀드 규모는 주로 얼마일까?
# ============================================================
vc_fundraising = (investments
                  .merge(funds, how='left', left_on="investor_cfp_id", right_on="funds_f_id")
                  )
print(f"vc_fundraising 행수:{vc_fundraising.shape}")
print("funds 테이블 조인 후, 행 수 80902 → 125969 증가: 하나의 투자기업이 여러 펀드를 만들기에 정상")
print("investments - funds 조인 완료!")


vc_fundraising 행수:(125969, 16)
funds 테이블 조인 후, 행 수 80902 → 125969 증가: 하나의 투자기업이 여러 펀드를 만들기에 정상
investments - funds 조인 완료!


In [12]:
#########################
# 변경자: 수아
# 변경일자: 25.12.12
# 변경
# 내용: 1. VC의 특징 분석(VC의 학력)
#########################

# ============================================================
# investments - relationships - degrees
# Q4. VC의 학력은 어떨까?
# ============================================================
vc_degree = (investments
                  .merge(relationships, how='left', left_on="investor_cfp_id", right_on="rel_cf_id")
                  .merge(degrees, how='left', left_on="rel_p_id", right_on="degrees_p_id")
                  )
print(f"vc_degree 행수:{vc_degree.shape}")
print("investments - relationships - degrees 조인 완료!")

vc_degree 행수:(2753868, 24)
investments - relationships - degrees 조인 완료!


In [13]:
# 조인 테이블 csv 저장
vc_invest_fr_acq.to_csv('./join_data/vc_invest_fr_acq.csv', index=False)
vc_invest_fr_ipo.to_csv('./join_data/vc_invest_fr_ipo.csv', index=False)
vc_round.to_csv('./join_data/vc_round.csv', index=False)
vc_rel.to_csv('./join_data/vc_rel.csv', index=False)
vc_category.to_csv('./join_data/vc_category.csv', index=False)
vc_acquired.to_csv('./join_data/vc_acquired.csv', index=False)
vc_ipo.to_csv('./join_data/vc_ipo.csv', index=False)
vc_issues.to_csv('./join_data/vc_issues.csv', index=False)
vc_invest_cat_fr.to_csv('./join_data/vc_invest_cat_fr.csv', index=False)
vc_invest_founder.to_csv('./join_data/vc_invest_founder.csv', index=False)
vc_fundraising.to_csv('./join_data/vc_fundraising.csv', index=False)
vc_degree.to_csv('./join_data/vc_degree.csv', index=False)