## Preprocessing

In [1]:
# Standard library
import platform
import warnings
from datetime import datetime, timedelta
from pathlib import Path

# Third-party: data handling, statistics and visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
print(platform.system())
warnings.filterwarnings('ignore')

# 행,열,결과값 생략 없이 보기,세팅
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)
%matplotlib inline

# 시각화 OS별 한글폰트 설정
if platform.system() == 'Windows':
    plt.rcParams['font.family'] = 'Malgun Gothic'  # Windows 폰트 설정
elif platform.system() == 'Mac':
    plt.rcParams['font.family'] = 'AppleGothic'  # Mac 폰트 설정
    
print("="*60)
print("라이브러리 로드 완료!")
print("한글 폰트 설정 완료!")
print("="*60)

Windows
라이브러리 로드 완료!
한글 폰트 설정 완료!


### Load Data
* funding_rounds.csv
* funds.csv

In [2]:
# Load the two tables this notebook preprocesses.
# Other project CSVs referenced here before but not needed in this notebook:
# acquisitions, degrees, investments, ipos, milestones, objects, offices,
# people, relationships (all under ./data/).
df_fr, df_funds = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/funding_rounds.csv", "./data/funds.csv")
)

print("="*60, "Dataset 로드 완료!", "="*60, sep="\n")

Dataset 로드 완료!


### Functions

In [3]:
# 간단 요약/통계 함수 
def compute_skew_kurtosis(df, col):
    """
    Print and return the skewness and kurtosis of one column.

    Missing values are dropped first. Both statistics come from
    scipy.stats with its default settings.

    Returns:
        (skew, kurt) tuple.
    """
    observed = df[col].dropna()
    moments = (stats.skew(observed), stats.kurtosis(observed))
    for label, value in zip(("왜도skew", "첨도kurt"), moments):
        print(f"{label}={value}")

    return moments

# 결측치 관련 함수
def null_cnt_ratio(df, col):
    """
    Report how many values of column ``col`` are missing.

    Prints the column name, the missing count and the missing share
    (percent of len(df), two decimals).

    Returns:
        (null_cnt, ratio) where ratio is the percentage of missing rows.
    """
    missing_mask = df[col].isnull()
    null_cnt = missing_mask.sum()
    ratio = (null_cnt / len(df)) * 100

    print(f"[{col}]", f"결측 수 : {null_cnt}", f"결측 비율 : {ratio:.2f}%", sep="\n")

    return null_cnt, ratio

# 결측 플래그 컬럼 추가
def add_missing_flag(df, col, flag_col=None):
    """
    Return a copy of ``df`` with a 0/1 column marking where ``col`` is missing.

    The flag column is named ``flag_col`` when given, otherwise
    ``"<col>_was_missing"`` (e.g. col='income' -> income_was_missing).
    The input frame is not modified.
    """
    target = f"{col}_was_missing" if flag_col is None else flag_col
    flagged = df.copy()
    flagged[target] = flagged[col].isna().astype(int)
    return flagged

# 중복 관련 함수
def drop_full_duplicates(df):
    """
    Drop rows that duplicate an earlier row in every column, then renumber
    the index from 0.
    """
    deduplicated = df.drop_duplicates()
    return deduplicated.reset_index(drop=True)

def is_duplicated(df, cols):
    """
    For each column in ``cols``, print whether it contains repeated values
    and how many rows repeat an earlier value. Returns nothing.
    """
    for col in cols:
        repeat_mask = df[col].duplicated()
        print(f"[{col}] 중복 존재: {repeat_mask.any()} (중복 개수: {repeat_mask.sum()})")
        
# 이상치 관련 함수 
def iqr_bounds(series, k=1.5):
    """
    Compute IQR-based outlier fences for ``series``.

    Returns:
        (lo, hi, q1, q3, iqr) with lo = q1 - k*iqr and hi = q3 + k*iqr.
    """
    quartiles = series.quantile([0.25, 0.75])
    q1, q3 = quartiles.iloc[0], quartiles.iloc[1]
    iqr = q3 - q1
    margin = k*iqr
    return q1 - margin, q3 + margin, q1, q3, iqr


def cap_with_iqr(df, col, new_col=None, k=1.5):
    """
    Return a copy of ``df`` with an extra column holding ``col`` clipped to
    its IQR fences (winsorizing).

    The fences come from iqr_bounds() on the non-missing values of ``col``.
    The new column is ``new_col`` when given, otherwise ``"<col>_cap"``
    (e.g. income -> income_cap).
    """
    target = f"{col}_cap" if new_col is None else new_col
    capped = df.copy()
    bounds = iqr_bounds(capped[col].dropna(), k=k)
    lower, upper = bounds[0], bounds[1]
    capped[target] = capped[col].clip(lower, upper)
    return capped


def add_log1p(df, col, new_col=None, clip_lower=0):
    """
    Return a copy of ``df`` with a log1p-transformed version of ``col``
    (used to soften a long right tail).

    Values below ``clip_lower`` are raised to ``clip_lower`` before the
    transform. The new column is ``new_col`` when given, otherwise
    ``"<col>_log1p"``.
    """
    target = f"{col}_log1p" if new_col is None else new_col
    transformed = df.copy()
    floored = transformed[col].clip(lower=clip_lower)
    transformed[target] = np.log1p(floored)
    return transformed

# Count and share of rows whose value is exactly 0
def zero_cnt_ratio(df, col):
    """
    Report how many rows of column ``col`` equal 0.

    Prints the column name, the zero count and the zero share
    (percent of all rows, two decimals).

    Returns:
        (zero_cnt, ratio), mirroring null_cnt_ratio(), so the numbers can be
        reused instead of only being printed.
    """
    cond = (df[col] == 0)
    zero_cnt = cond.sum()              # number of True values
    ratio = cond.mean() * 100          # share of True values, in percent

    print(f"[{col}]")
    print("0인 개수:", zero_cnt)
    print(f"0 비율: {ratio:.2f}%")

    return zero_cnt, ratio

### funding_rounds 전처리

In [4]:
# Keep only the columns used downstream.
# .copy() makes fr_cleaned an independent frame, so the column assignments in
# later cells never act on a slice of df_fr (SettingWithCopyWarning territory).
fr_cleaned = df_fr[['funding_round_id', 'object_id', 'funded_at', 'funding_round_type', 'raised_amount_usd',
                    'pre_money_valuation_usd', 'post_money_valuation_usd', 'participants', 'is_first_round', 'is_last_round']].copy()
print("="*60)
print("불필요한 컬럼 제거 완료!")

불필요한 컬럼 제거 완료!


In [5]:
# object_id
print("="*60)
print("[object_id 컬럼명 변경 전]")
display(fr_cleaned.columns)

# Rename object_id -> fr_c_id.
# Reassigning the result (instead of inplace=True) avoids mutating a frame
# that was created as a slice of df_fr, and keeps the cell safe to re-run.
fr_cleaned = fr_cleaned.rename(columns={'object_id': 'fr_c_id'})

print("="*60)
print("[object_id 컬럼명 변경 후]")
display(fr_cleaned.columns)

[object_id 컬럼명 변경 전]


Index(['funding_round_id', 'object_id', 'funded_at', 'funding_round_type',
       'raised_amount_usd', 'pre_money_valuation_usd',
       'post_money_valuation_usd', 'participants', 'is_first_round',
       'is_last_round'],
      dtype='object')

[object_id 컬럼명 변경 후]


Index(['funding_round_id', 'fr_c_id', 'funded_at', 'funding_round_type',
       'raised_amount_usd', 'pre_money_valuation_usd',
       'post_money_valuation_usd', 'participants', 'is_first_round',
       'is_last_round'],
      dtype='object')

In [6]:
# dtype of funded_at before any conversion
print(fr_cleaned.dtypes['funded_at'])

object


In [7]:
# funded_at
print("="*60)
print("[funded_at 형변환 전]")
print(f"[funded_at] 데이터타입: {fr_cleaned['funded_at'].dtype}")

# Convert object -> datetime; values that cannot be parsed become NaT
# (errors="coerce").
fr_cleaned["funded_at"] = pd.to_datetime(fr_cleaned["funded_at"], errors="coerce")

print("="*60)
print("[funded_at 형변환 후]")
print(f"[funded_at] 데이터타입: {fr_cleaned['funded_at'].dtype}")

# NaT already is the missing-value marker of a datetime column, so the former
# fillna(pd.NaT) changed nothing. Report how many dates are missing instead of
# claiming an imputation happened.
print(f"[funded_at] 결측치(NaT) 개수: {fr_cleaned['funded_at'].isna().sum()}")


[funded_at 형변환 전]
[funded_at] 데이터타입: object
[funded_at 형변환 후]
[funded_at] 데이터타입: datetime64[ns]
[funded_at] 결측치 대치 완료!


In [8]:
# funded_at
print("="*60)
print("[funded_at 형변환 전]")
print(f"[funded_at] 데이터타입: {fr_cleaned['funded_at'].dtype}")

# Convert to datetime; unparseable values become NaT (errors="coerce").
# Harmless when the column is already datetime.
fr_cleaned["funded_at"] = pd.to_datetime(fr_cleaned["funded_at"], errors="coerce")

print("="*60)
print("[funded_at 형변환 후]")
print(f"[funded_at] 데이터타입: {fr_cleaned['funded_at'].dtype}")

# NaT already is the missing-value marker of a datetime column, so there is
# nothing to fill; just report how many dates are missing.
print(f"[funded_at] 결측치(NaT) 개수: {fr_cleaned['funded_at'].isna().sum()}")

# Derived columns for time-series analysis
print("="*60)
# funded_year: calendar year
fr_cleaned['funded_year'] = fr_cleaned['funded_at'].dt.year
print("[funded_year] 파생변수 생성 완료!")
# funded_quarter: quarter label such as "2006Q4".
# astype(str) on its own turns a missing date into the literal text "NaT";
# .where() keeps those rows as real missing values so isna() and the exported
# CSV treat them as missing.
fr_cleaned['funded_quarter'] = (
    fr_cleaned['funded_at'].dt.to_period('Q').astype(str)
    .where(fr_cleaned['funded_at'].notna())
)
print("[funded_quarter] 파생변수 생성 완료!")

# Quick look at the result
print("="*60)
print(fr_cleaned[['funded_at', 'funded_year', 'funded_quarter']].head())

[funded_at 형변환 전]
[funded_at] 데이터타입: datetime64[ns]
[funded_at 형변환 후]
[funded_at] 데이터타입: datetime64[ns]
[funded_at] 결측치 채우기 완료!
[funded_year] 파생변수 생성 완료!
[funded_quarter] 파생변수 생성 완료!
   funded_at  funded_year funded_quarter
0 2006-12-01       2006.0         2006Q4
1 2004-09-01       2004.0         2004Q3
2 2005-05-01       2005.0         2005Q2
3 2006-04-01       2006.0         2006Q2
4 2006-05-01       2006.0         2006Q2


In [9]:
# Row count per raw funding round type
fr_cleaned.loc[:, 'funding_round_type'].value_counts()

funding_round_type
venture           15342
angel             13163
series-a           9873
series-b           4892
series-c+          4216
other              4201
private-equity     1043
crowdfunding        111
post-ipo             87
Name: count, dtype: int64

In [10]:
# funding_round_type
print("="*60)
print(f"[funding_round_type] 데이터 타입: {fr_cleaned['funding_round_type'].dtype}")

# Raw round type -> stage label.
# venture / angel / crowdfunding are pooled into 'seed'; every other known
# type keeps its own name.
fr_type_map = dict.fromkeys(['venture', 'angel', 'crowdfunding'], 'seed')
fr_type_map.update({
    round_type: round_type
    for round_type in ['series-a', 'series-b', 'series-c+', 'private-equity', 'post-ipo', 'other']
})

# New column cat_fr_type (still object dtype here).
# Round types missing from the map, including NaN, fall back to 'other'.
fr_cleaned['cat_fr_type'] = (
    fr_cleaned['funding_round_type']
    .map(fr_type_map)
    .fillna('other')
)
print("="*60)
print("[cat_fr_type] 컬럼 생성 완료!")

display(fr_cleaned['cat_fr_type'].value_counts())
print(fr_cleaned['cat_fr_type'].dtype)

[funding_round_type] 데이터 타입: object
[cat_fr_type] 컬럼 생성 완료!


cat_fr_type
seed              28616
series-a           9873
series-b           4892
series-c+          4216
other              4201
private-equity     1043
post-ipo             87
Name: count, dtype: int64

object


In [11]:
# Cast cat_fr_type: object -> category
fr_cleaned['cat_fr_type'] = fr_cleaned['cat_fr_type'].astype('category')
print("="*60, f"[cat_fr_type] 데이터 타입: {fr_cleaned['cat_fr_type'].dtype}", sep="\n")
print(fr_cleaned[['funding_round_type', 'cat_fr_type']].head())

[cat_fr_type] 데이터 타입: category
  funding_round_type cat_fr_type
0           series-b    series-b
1              angel        seed
2           series-a    series-a
3           series-b    series-b
4           series-b    series-b


In [12]:
# funding_round_type
print("="*60)
print(f"[funding_round_type] 데이터 타입: {fr_cleaned['funding_round_type'].dtype}")

# Raw round type -> numeric stage code (stored as text).
# 0: seed, 1: series-a, 2: series-b, 3: series-c+
# 97: private-equity, 98: post-ipo, 99: other
fr_type_map = {
    'venture'       : '0',
    'angel'         : '0',
    'crowdfunding'  : '0',
    'series-a'      : '1',
    'series-b'      : '2',
    'series-c+'     : '3',
    'private-equity': '97',
    'post-ipo'      : '98',
    'other'         : '99'
}

# New column num_fr_type.
# Unmapped or missing round types get the code for "other" ('99'); the
# previous fallback was the label 'other', which mixed a text label into a
# column of numeric codes.
fr_cleaned['num_fr_type'] = fr_cleaned['funding_round_type'].map(fr_type_map).fillna('99')
print("="*60)
print("[num_fr_type] 컬럼 생성 완료!")

display(fr_cleaned['num_fr_type'].value_counts())
print(fr_cleaned['num_fr_type'].dtype)

[funding_round_type] 데이터 타입: object
[num_fr_type] 컬럼 생성 완료!


num_fr_type
0     28616
1      9873
2      4892
3      4216
99     4201
97     1043
98       87
Name: count, dtype: int64

object


In [13]:
# Cast num_fr_type: object -> category
fr_cleaned['num_fr_type'] = fr_cleaned['num_fr_type'].astype('category')
print("="*60, f"[num_fr_type] 데이터 타입: {fr_cleaned['num_fr_type'].dtype}", sep="\n")
print(fr_cleaned[['cat_fr_type', 'num_fr_type']].head())

[num_fr_type] 데이터 타입: category
  cat_fr_type num_fr_type
0    series-b           2
1        seed           0
2    series-a           1
3    series-b           2
4    series-b           2


In [14]:
# participants

# New column log_participants = log1p(participants)
fr_cleaned['log_participants'] = np.log1p(fr_cleaned['participants'])
print("="*60)
print("[log_participants] 컬럼 생성 완료!")

# Quick look at the result (the column is computed once, above; the second
# identical assignment that used to sit here was redundant)
print(fr_cleaned[['participants', 'log_participants']].head())


[log_participants] 컬럼 생성 완료!
   participants  log_participants
0             2          1.098612
1             2          1.098612
2             3          1.386294
3             4          1.609438
4             2          1.098612


In [15]:
# raised_amount_usd: replace 0 with NaN
# (presumably 0 means the amount was not reported — TODO confirm)
fr_cleaned["raised_amount_usd"] = fr_cleaned["raised_amount_usd"].replace(0, np.nan)

# Flag column is_fr_raised_disclosed: 1 = amount missing, 0 = amount present.
# NOTE(review): despite the "disclosed" name, 1 marks rows WITHOUT an amount
# (the original comment calls this an "undisclosed" flag). Name and coding are
# left unchanged so existing users of the column and of the exported CSV keep
# working — consider renaming it.
# The earlier `= np.nan` initialisation was dead code (overwritten at once).
fr_cleaned["is_fr_raised_disclosed"] = np.where(
    fr_cleaned["raised_amount_usd"].isna(),
    1,
    0
)
display(fr_cleaned["is_fr_raised_disclosed"].value_counts())

is_fr_raised_disclosed
0    46928
1     6000
Name: count, dtype: int64

In [16]:
# pre_money_valuation_usd: turn 0 into NaN
fr_cleaned["pre_money_valuation_usd"] = fr_cleaned["pre_money_valuation_usd"].replace({0: np.nan})

In [17]:
# post_money_valuation_usd: turn 0 into NaN
fr_cleaned["post_money_valuation_usd"] = fr_cleaned["post_money_valuation_usd"].replace({0: np.nan})

In [18]:
# Export the cleaned funding_rounds table.
# to_csv does not create missing folders, so make sure ./data/clean exists;
# otherwise a fresh checkout fails at this cell.
clean_fr_path = Path("./data/clean/clean_fr_v3.csv")
clean_fr_path.parent.mkdir(parents=True, exist_ok=True)
fr_cleaned.to_csv(clean_fr_path, encoding="utf-8", index=False)
print("="*60)
print("funding_rounds 전처리 완료 csv 추출 완료!")
print("="*60)

funding_rounds 전처리 완료 csv 추출 완료!


### funds 전처리

In [19]:
# Keep only the columns used downstream.
# .copy() makes funds_cleaned an independent frame, so the column assignments
# in later cells never act on a slice of df_funds.
funds_cleaned = df_funds[['fund_id', 'object_id', 'name', 'funded_at', 'raised_amount', 'raised_currency_code']].copy()
print("="*60)
print("불필요한 컬럼 제거 완료!")

불필요한 컬럼 제거 완료!


In [20]:
# object_id
print("="*60)
print("[object_id 컬럼명 변경 전]")
display(funds_cleaned.columns)

# Rename object_id -> funds_f_id.
# Reassigning the result (instead of inplace=True) avoids mutating a frame
# that was created as a slice of df_funds, and keeps the cell safe to re-run.
funds_cleaned = funds_cleaned.rename(columns={'object_id': 'funds_f_id'})

print("="*60)
print("[object_id 컬럼명 변경 후]")
display(funds_cleaned.columns)

[object_id 컬럼명 변경 전]


Index(['fund_id', 'object_id', 'name', 'funded_at', 'raised_amount',
       'raised_currency_code'],
      dtype='object')

[object_id 컬럼명 변경 후]


Index(['fund_id', 'funds_f_id', 'name', 'funded_at', 'raised_amount',
       'raised_currency_code'],
      dtype='object')

In [21]:
# Row count per raw fund name
funds_cleaned.loc[:, 'name'].value_counts()

name
Fund I                       61
Fund II                      48
Fund III                     37
Fund IV                      17
Fund VI                      14
                             ..
Gilde Healthcare III Fund     1
Capital Factory IV            1
FTV IV LP                     1
GTCR FUND XI/B LP             1
Hyper Ventures Fund I         1
Name: count, Length: 1313, dtype: int64

In [22]:
# name
print("="*60)
print(f"[name] 데이터 타입: {funds_cleaned['name'].dtype}")

# New column cat_funds_name, starting out as an exact copy of name
name_copy = funds_cleaned['name'].copy()
funds_cleaned['cat_funds_name'] = name_copy
del name_copy
print("="*60)
print("[cat_funds_name] 컬럼 생성 완료!")

funds_cleaned.loc[:, 'cat_funds_name'].value_counts()

[name] 데이터 타입: object
[cat_funds_name] 컬럼 생성 완료!


cat_funds_name
Fund I                       61
Fund II                      48
Fund III                     37
Fund IV                      17
Fund VI                      14
                             ..
Gilde Healthcare III Fund     1
Capital Factory IV            1
FTV IV LP                     1
GTCR FUND XI/B LP             1
Hyper Ventures Fund I         1
Name: count, Length: 1313, dtype: int64

In [23]:
# Text normalisation of cat_funds_name, applied in this order:
#   1) trim surrounding whitespace
#   2) lowercase
#   3) strip whitespace / '.' / ',' / '/' from both ends (inner text untouched)
#   4) drop a leading "the "
#   5) collapse any run of whitespace (tabs, multiple spaces) to one space
funds_cleaned['cat_funds_name'] = (
    funds_cleaned['cat_funds_name']
    .str.strip()
    .str.lower()
    .str.replace(r'^[\s\.,/]+', '', regex=True)
    .str.replace(r'[\s\.,/]+$', '', regex=True)
    .str.replace(r'^the\s+', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)

print("="*60)
print("[cat_funds_name] 텍스트 정규화 완료!")

# Cast: object -> category
funds_cleaned['cat_funds_name'] = funds_cleaned['cat_funds_name'].astype('category')
print(f"[cat_funds_name] 데이터 타입: {funds_cleaned['cat_funds_name'].dtype}")
print(funds_cleaned[['name', 'cat_funds_name']].head())

[cat_funds_name] 텍스트 정규화 완료!
[cat_funds_name] 데이터 타입: category
                         name              cat_funds_name
0                 Second Fund                 second fund
1  Sequoia Israel Fourth Fund  sequoia israel fourth fund
2                  Tenth fund                  tenth fund
3           New funds acquire           new funds acquire
4                  Third fund                  third fund


In [24]:
# funded_at
print("="*60)
print("[funded_at 형변환 전]")
print(f"[funded_at] 데이터타입: {funds_cleaned['funded_at'].dtype}")

# Convert object -> datetime; values that cannot be parsed become NaT
# (errors="coerce").
funds_cleaned["funded_at"] = pd.to_datetime(funds_cleaned["funded_at"], errors="coerce")

print("="*60)
print("[funded_at 형변환 후]")
print(f"[funded_at] 데이터타입: {funds_cleaned['funded_at'].dtype}")

# NaT already is the missing-value marker of a datetime column, so there is
# nothing to fill; just report how many dates are missing.
print(f"[funded_at] 결측치(NaT) 개수: {funds_cleaned['funded_at'].isna().sum()}")

# Derived columns for time-series analysis
print("="*60)
# funded_year: calendar year
funds_cleaned['funded_year'] = funds_cleaned['funded_at'].dt.year
print("[funded_year] 파생변수 생성 완료!")
# funded_quarter: quarter label such as "2008Q4".
# astype(str) on its own turns a missing date into the literal text "NaT"
# (this frame does contain missing dates); .where() keeps those rows as real
# missing values so isna() and the exported CSV treat them as missing.
funds_cleaned['funded_quarter'] = (
    funds_cleaned['funded_at'].dt.to_period('Q').astype(str)
    .where(funds_cleaned['funded_at'].notna())
)
print("[funded_quarter] 파생변수 생성 완료!")

# Quick look at the result
print("="*60)
print(funds_cleaned[['funded_at', 'funded_year', 'funded_quarter']].head())

[funded_at 형변환 전]
[funded_at] 데이터타입: object
[funded_at 형변환 후]
[funded_at] 데이터타입: datetime64[ns]
[funded_at] 결측치 채우기 완료!
[funded_year] 파생변수 생성 완료!
[funded_quarter] 파생변수 생성 완료!
   funded_at  funded_year funded_quarter
0 2008-12-16       2008.0         2008Q4
1 2008-12-17       2008.0         2008Q4
2 2008-08-11       2008.0         2008Q3
3        NaT          NaN            NaT
4 2008-05-20       2008.0         2008Q2


In [25]:
# raised_amount: replace 0 with NaN
funds_cleaned["raised_amount"] = funds_cleaned["raised_amount"].replace(0, np.nan)

# Flag column is_funds_raised_disclosed: 1 = amount missing, 0 = amount present.
# NOTE(review): despite the "disclosed" name, 1 marks rows WITHOUT an amount
# (the original comment calls this an "undisclosed" flag). Name and coding are
# left unchanged so existing users of the column and of the exported CSV keep
# working — consider renaming it.
# The earlier `= np.nan` initialisation was dead code (overwritten at once).
funds_cleaned["is_funds_raised_disclosed"] = np.where(
    funds_cleaned["raised_amount"].isna(),
    1,
    0
)
display(funds_cleaned["is_funds_raised_disclosed"].value_counts())

is_funds_raised_disclosed
0    1393
1     171
Name: count, dtype: int64

In [26]:
# Load the exchange-rate tables (check the paths before running!)
usd_aud, usd_cad, usd_eur, usd_gbp, usd_jpy, usd_sek = (
    pd.read_csv(rate_csv)
    for rate_csv in (
        "./data/rate/USD_AUD_rate.csv",
        "./data/rate/USD_CAD_rate.csv",
        "./data/rate/USD_EUR_rate.csv",
        "./data/rate/USD_GBP_rate.csv",
        "./data/rate/USD_JPY_rate.csv",
        "./data/rate/USD_SEK_rate.csv",
    )
)

print("="*60, "환율 Dataset 로드 완료!", "="*60, sep="\n")

환율 Dataset 로드 완료!


In [27]:
# Earliest and latest funded_at for each non-USD currency code.
# One loop replaces six copy-pasted stanzas; the printed output is identical.
for currency in ["AUD", "CAD", "EUR", "GBP", "JPY", "SEK"]:
    funded_dates = funds_cleaned.loc[funds_cleaned['raised_currency_code'] == currency, "funded_at"]
    print("="*60)
    print(currency)
    print(funded_dates.min())
    print(funded_dates.max())

AUD
2013-03-13 00:00:00
2013-11-01 00:00:00
CAD
2006-04-01 00:00:00
2013-11-12 00:00:00
EUR
1999-01-01 00:00:00
2013-12-09 00:00:00
GBP
2001-07-12 00:00:00
2013-12-10 00:00:00
JPY
2000-02-10 00:00:00
2011-07-26 00:00:00
SEK
2013-10-08 00:00:00
2013-10-08 00:00:00


In [28]:
# 환율 계산 함수
def cal_rate(code):
    """
    Average exchange rate for one currency over a fixed date window.

    Reads the module-level rate tables (usd_aud, usd_cad, usd_eur, usd_gbp,
    usd_jpy, usd_sek), keeps the rows whose ``observation_date`` lies inside
    the currency's window (both ends inclusive) and returns the mean of that
    table's rate column. The result is NaN when no row falls in the window.

    ``observation_date`` is compared against date strings, so it is expected
    to hold ISO "YYYY-MM-DD" text as read from the CSV.

    NOTE(review): the windows are hard-coded and do not match the
    per-currency funded_at min/max printed earlier in the notebook (e.g. EUR
    funds start in 1999, the window in 2009-12) — confirm how they were chosen.

    Args:
        code: one of "AUD", "CAD", "EUR", "GBP", "JPY", "SEK".

    Raises:
        ValueError: for any other code (previously this surfaced as an
            UnboundLocalError on the return statement).
    """
    # currency -> (rate table, rate column, first date, last date)
    rate_windows = {
        "AUD": (usd_aud, "DEXUSAL", "2013-11-01", "2013-11-01"),
        "CAD": (usd_cad, "DEXCAUS", "2009-12-21", "2013-11-12"),
        "EUR": (usd_eur, "DEXUSEU", "2009-12-01", "2013-12-09"),
        "GBP": (usd_gbp, "DEXUSUK", "2004-12-31", "2013-12-10"),
        "JPY": (usd_jpy, "DEXJPUS", "2010-11-10", "2010-11-10"),
        "SEK": (usd_sek, "DEXSDUS", "2013-10-08", "2013-10-08"),
    }
    if code not in rate_windows:
        raise ValueError(f"Unsupported currency code: {code!r}")

    rates, rate_col, first_day, last_day = rate_windows[code]
    in_window = (rates["observation_date"] >= first_day) & (rates["observation_date"] <= last_day)
    return rates.loc[in_window, rate_col].mean()

In [29]:
# Average rate per currency over the windows defined in cal_rate()
aud_mean = cal_rate("AUD")
cad_mean = cal_rate("CAD")
eur_mean = cal_rate("EUR")
gbp_mean = cal_rate("GBP")
jpy_mean = cal_rate("JPY")
sek_mean = cal_rate("SEK")
usd_mean = 1.0

funds_cleaned["raised_amount_usd"] = np.nan
funds_cleaned["currency_rate"] = np.nan

avg_rate = {
    "AUD": aud_mean,   # DEXUSAL: USD per 1 AUD -> multiply
    "CAD": cad_mean,   # DEXCAUS: CAD per 1 USD -> divide
    "EUR": eur_mean,   # DEXUSEU: USD per 1 EUR -> multiply
    "GBP": gbp_mean,   # DEXUSUK: USD per 1 GBP -> multiply
    "JPY": jpy_mean,   # DEXJPUS: JPY per 1 USD -> divide
    "SEK": sek_mean,   # DEXSDUS: SEK per 1 USD -> divide
    "USD": usd_mean    # already USD: rate 1.0 leaves the amount unchanged
}

# Currencies whose rate is quoted as "units of that currency per 1 USD".
# The rate column names match FRED daily FX series ids, where DEX<AA><BB>
# means AA per one BB. CAD belongs here: it used to be multiplied, which
# converts in the wrong direction.
# TODO confirm the rate CSVs really are the FRED series of the same name.
per_usd_quoted = {"CAD", "JPY", "SEK"}

for cur, r in avg_rate.items():
    cond = funds_cleaned["raised_currency_code"] == cur
    amount = funds_cleaned.loc[cond, "raised_amount"]

    # Divide or multiply depending on how the rate is quoted
    if cur in per_usd_quoted:
        funds_cleaned.loc[cond, "raised_amount_usd"] = amount / r
    else:
        funds_cleaned.loc[cond, "raised_amount_usd"] = amount * r
    # currency_rate keeps the rate as quoted, whatever the direction
    funds_cleaned.loc[cond, "currency_rate"] = r


# Check: non-USD rows that have an amount
is_non_usd = (funds_cleaned["raised_currency_code"] != "USD")
has_amount = (funds_cleaned["raised_amount"] > 0)
display(funds_cleaned.loc[is_non_usd & has_amount, ["funds_f_id","raised_currency_code", "currency_rate", "raised_amount", "raised_amount_usd"]])

Unnamed: 0,funds_f_id,raised_currency_code,currency_rate,raised_amount,raised_amount_usd
12,f:1363,GBP,1.710477,22500000.0,38485740.0
13,f:1363,GBP,1.710477,45300000.0,77484630.0
45,f:18,EUR,1.335427,350000000.0,467399500.0
50,f:1142,EUR,1.335427,112000000.0,149567800.0
51,f:1142,EUR,1.335427,113000000.0,150903300.0
62,f:512,GBP,1.710477,33000000.0,56445760.0
63,f:512,GBP,1.710477,20000000.0,34209550.0
66,f:512,GBP,1.710477,15000000.0,25657160.0
82,f:18,EUR,1.335427,400000000.0,534170900.0
87,f:18,EUR,1.335427,350000000.0,467399500.0


In [30]:
# Export the cleaned funds table.
# to_csv does not create missing folders, so make sure ./data/clean exists;
# otherwise a fresh checkout fails at this cell.
clean_funds_path = Path("./data/clean/clean_funds_v3.csv")
clean_funds_path.parent.mkdir(parents=True, exist_ok=True)
funds_cleaned.to_csv(clean_funds_path, encoding="utf-8", index=False)
print("="*60)
print("funds 전처리 완료 csv 추출 완료!")
print("="*60)

funds 전처리 완료 csv 추출 완료!


In [31]:
# Final dtype of funds_cleaned.funded_at (expected: datetime64)
funds_cleaned.dtypes["funded_at"]

dtype('<M8[ns]')

In [32]:
# Final dtype of fr_cleaned.funded_at (expected: datetime64)
fr_cleaned.dtypes["funded_at"]

dtype('<M8[ns]')