In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import IPython
from datetime import datetime
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pickle
from pandas.plotting import scatter_matrix
from dateutil.relativedelta import relativedelta
from sklearn.decomposition import PCA

In [2]:
with open('google_earnings.csv', 'r', encoding='utf-8') as file:
  lines = file.readlines()
  for i, line in enumerate(lines):
    print(line.strip())
    if i >= 8:
      break

﻿Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
GOOGL,Alphabet Inc,"Oct 27, 2025, 4 PMEDT",,,
GOOGL,Alphabet Inc,"Jul 21, 2025, 4 PMEDT",,,
GOOGL,Alphabet Inc,"Apr 23, 2025, 4 PMEDT",,,
GOOGL,Alphabet Inc,"Jan 28, 2025, 4 PMEST",2.12,,
GOOGL,Alphabet Inc.,"Oct 29, 2024, 4 PMEDT",1.85,2.12,14.91
GOOGL,Alphabet Inc.,"Jul 23, 2024, 4 PMEDT",1.84,1.89,2.47
GOOGL,Alphabet Inc.,"Apr 25, 2024, 4 PMEDT",1.51,1.89,24.77
GOOGL,Alphabet Inc.,"Jan 30, 2024, 4 PMEST",1.59,1.64,2.98


In [97]:
# head() : 데이터 프레임의 시작부를 출력.
earning_df = pd.read_csv("google_earnings.csv")
earning_df.head()

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
0,GOOGL,Alphabet Inc,"Oct 27, 2025, 4 PMEDT",,,
1,GOOGL,Alphabet Inc,"Jul 21, 2025, 4 PMEDT",,,
2,GOOGL,Alphabet Inc,"Apr 23, 2025, 4 PMEDT",,,
3,GOOGL,Alphabet Inc,"Jan 28, 2025, 4 PMEST",2.12,,
4,GOOGL,Alphabet Inc.,"Oct 29, 2024, 4 PMEDT",1.85,2.12,14.91


In [10]:
# dtypes: 각 열이 어떤 데이터 타입인지. (보통 문자열(String)타입일때 object라고 표시됨.)
earning_df.dtypes

Unnamed: 0,0
Symbol,object
Company,object
Earnings Date,object
EPS Estimate,float64
Reported EPS,float64
Surprise(%),float64


In [12]:
# to_csv('저장할 파일 경로', 인덱스 포함여부)
earning_df.to_csv('earning.csv', index=False)

earning_df_from_csv = pd.read_csv('earning.csv')
earning_df_from_csv.head()

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
0,GOOGL,Alphabet Inc,"Oct 27, 2025, 4 PMEDT",,,
1,GOOGL,Alphabet Inc,"Jul 21, 2025, 4 PMEDT",,,
2,GOOGL,Alphabet Inc,"Apr 23, 2025, 4 PMEDT",,,
3,GOOGL,Alphabet Inc,"Jan 28, 2025, 4 PMEST",2.12,,
4,GOOGL,Alphabet Inc.,"Oct 29, 2024, 4 PMEDT",1.85,2.12,14.91


In [13]:
# 데이터프레임을 Numpy 배열로 읽어오기.
earning_array = earning_df.values
earning_array[:2]

array([['GOOGL', 'Alphabet Inc', 'Oct 27, 2025, 4 PMEDT', nan, nan, nan],
       ['GOOGL', 'Alphabet Inc', 'Jul 21, 2025, 4 PMEDT', nan, nan, nan]],
      dtype=object)

In [15]:
# Numpy 배열을 데이터프레임으로 만듦. DataFrame(사용할 데이터, 사용할 컬럼명)
earning_df_from_array = pd.DataFrame(data=earning_array, columns=earning_df.columns.values)
earning_df_from_array.head()

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
0,GOOGL,Alphabet Inc,"Oct 27, 2025, 4 PMEDT",,,
1,GOOGL,Alphabet Inc,"Jul 21, 2025, 4 PMEDT",,,
2,GOOGL,Alphabet Inc,"Apr 23, 2025, 4 PMEDT",,,
3,GOOGL,Alphabet Inc,"Jan 28, 2025, 4 PMEST",2.12,,
4,GOOGL,Alphabet Inc.,"Oct 29, 2024, 4 PMEDT",1.85,2.12,14.91


---



In [20]:
# 데이터 프레임의 특정 열(column) 선택.
earning_cols = earning_df[["Company", "Earnings Date"]]
earning_cols.head()

Unnamed: 0,Company,Earnings Date
0,Alphabet Inc,"Oct 27, 2025, 4 PMEDT"
1,Alphabet Inc,"Jul 21, 2025, 4 PMEDT"
2,Alphabet Inc,"Apr 23, 2025, 4 PMEDT"
3,Alphabet Inc,"Jan 28, 2025, 4 PMEST"
4,Alphabet Inc.,"Oct 29, 2024, 4 PMEDT"


In [21]:
# 시리즈(Series): 일차원 배열 형태의 데이터타입.
earning_df["Company"]

Unnamed: 0,Company
0,Alphabet Inc
1,Alphabet Inc
2,Alphabet Inc
3,Alphabet Inc
4,Alphabet Inc.
...,...
80,Alphabet Inc.
81,Alphabet Inc.
82,Alphabet Inc.
83,Alphabet Inc.


In [25]:
type(earning_df["Company"]), type(earning_df[["Company", "Earnings Date"]])

(pandas.core.series.Series, pandas.core.frame.DataFrame)

In [40]:
# 데이터 프레임의 특정 행을 선택: 파이썬 리스트의 서브리스트를 선택하는 것과 같은 방식으로 하면 됨.
[1, 2, 3, 4, 5][0:3]

# 인덱스가 0번부터 시작.
print(earning_df)
# []에 입력하는 범위는 인덱스가 아님에 유의. (즉, 인덱스가 아니라 데이터 순서에 기반함. )
earning_rows = earning_df[1:3]
earning_rows

   Symbol        Company           Earnings Date  EPS Estimate  Reported EPS  \
0   GOOGL   Alphabet Inc   Oct 27, 2025, 4 PMEDT           NaN           NaN   
1   GOOGL   Alphabet Inc   Jul 21, 2025, 4 PMEDT           NaN           NaN   
2   GOOGL   Alphabet Inc   Apr 23, 2025, 4 PMEDT           NaN           NaN   
3   GOOGL   Alphabet Inc   Jan 28, 2025, 4 PMEST          2.12           NaN   
4   GOOGL  Alphabet Inc.   Oct 29, 2024, 4 PMEDT          1.85          2.12   
..    ...            ...                     ...           ...           ...   
80  GOOGL  Alphabet Inc.  Oct 20, 2005, 12 AMEDT          0.03          0.04   
81  GOOGL  Alphabet Inc.  Jul 21, 2005, 12 AMEDT          0.03          0.03   
82  GOOGL  Alphabet Inc.  Apr 21, 2005, 12 AMEDT          0.02          0.03   
83  GOOGL  Alphabet Inc.  Feb 01, 2005, 12 AMEST          0.02          0.02   
84  GOOGL  Alphabet Inc.  Oct 21, 2004, 12 AMEDT          0.01          0.02   

    Surprise(%)  
0           NaN  
1  

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
1,GOOGL,Alphabet Inc,"Jul 21, 2025, 4 PMEDT",,,
2,GOOGL,Alphabet Inc,"Apr 23, 2025, 4 PMEDT",,,


In [42]:
earning_df_new_index = pd.DataFrame(
    data=earning_array,
    columns=earning_df.columns.values,
    index=range(1, len(earning_df) + 1))
# 인덱스가 1번부터 시작.
print(earning_df_new_index)
earning_df_new_index.head()
earning_df_new_index[1:3]

   Symbol        Company           Earnings Date EPS Estimate Reported EPS  \
1   GOOGL   Alphabet Inc   Oct 27, 2025, 4 PMEDT          NaN          NaN   
2   GOOGL   Alphabet Inc   Jul 21, 2025, 4 PMEDT          NaN          NaN   
3   GOOGL   Alphabet Inc   Apr 23, 2025, 4 PMEDT          NaN          NaN   
4   GOOGL   Alphabet Inc   Jan 28, 2025, 4 PMEST         2.12          NaN   
5   GOOGL  Alphabet Inc.   Oct 29, 2024, 4 PMEDT         1.85         2.12   
..    ...            ...                     ...          ...          ...   
81  GOOGL  Alphabet Inc.  Oct 20, 2005, 12 AMEDT         0.03         0.04   
82  GOOGL  Alphabet Inc.  Jul 21, 2005, 12 AMEDT         0.03         0.03   
83  GOOGL  Alphabet Inc.  Apr 21, 2005, 12 AMEDT         0.02         0.03   
84  GOOGL  Alphabet Inc.  Feb 01, 2005, 12 AMEST         0.02         0.02   
85  GOOGL  Alphabet Inc.  Oct 21, 2004, 12 AMEDT         0.01         0.02   

   Surprise(%)  
1          NaN  
2          NaN  
3          N

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
2,GOOGL,Alphabet Inc,"Jul 21, 2025, 4 PMEDT",,,
3,GOOGL,Alphabet Inc,"Apr 23, 2025, 4 PMEDT",,,


In [46]:
# iloc: 기존과 같은 데이터 순서 기반.
earning_df_new_index.iloc[1:3]

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
2,GOOGL,Alphabet Inc,"Jul 21, 2025, 4 PMEDT",,,
3,GOOGL,Alphabet Inc,"Apr 23, 2025, 4 PMEDT",,,


In [48]:
# loc: 인덱스 기반. (loc를 사용할 경우 범위의 마지막에 해당하는 값도 결과에 포함됨.)
earning_df_new_index.loc[1:3]

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
1,GOOGL,Alphabet Inc,"Oct 27, 2025, 4 PMEDT",,,
2,GOOGL,Alphabet Inc,"Jul 21, 2025, 4 PMEDT",,,
3,GOOGL,Alphabet Inc,"Apr 23, 2025, 4 PMEDT",,,


In [49]:
# 행과 열의 범위를 동시에 지정. ( iloc[행의 범위 , [선택할 열]] )
# iloc는 순서에 의존해서 선택이 이뤄지기 때문에 숫자 범위로 넣으면 됨.
earning_df_new_index.iloc[1:4,[2,3]]

Unnamed: 0,Earnings Date,EPS Estimate
2,"Jul 21, 2025, 4 PMEDT",
3,"Apr 23, 2025, 4 PMEDT",
4,"Jan 28, 2025, 4 PMEST",2.12


In [52]:
# loc은 index, column값을 가지고 선택이 이뤄지기 때문에 선택하려는 index, column 값들을 넣어줘야함.
earning_df_new_index.loc[1:4,["Earnings Date", "EPS Estimate"]]

Unnamed: 0,Earnings Date,EPS Estimate
1,"Oct 27, 2025, 4 PMEDT",
2,"Jul 21, 2025, 4 PMEDT",
3,"Apr 23, 2025, 4 PMEDT",
4,"Jan 28, 2025, 4 PMEST",2.12


In [98]:
# 조건으로 필터링.
earning_df[earning_df["EPS Estimate"] > 1.0]

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
3,GOOGL,Alphabet Inc,"Jan 28, 2025, 4 PMEST",2.12,,
4,GOOGL,Alphabet Inc.,"Oct 29, 2024, 4 PMEDT",1.85,2.12,14.91
5,GOOGL,Alphabet Inc.,"Jul 23, 2024, 4 PMEDT",1.84,1.89,2.47
6,GOOGL,Alphabet Inc.,"Apr 25, 2024, 4 PMEDT",1.51,1.89,24.77
7,GOOGL,Alphabet Inc.,"Jan 30, 2024, 4 PMEST",1.59,1.64,2.98
8,GOOGL,Alphabet Inc.,"Oct 24, 2023, 4 PMEDT",1.45,1.55,6.84
9,GOOGL,Alphabet Inc.,"Jul 25, 2023, 4 PMEDT",1.34,1.44,7.54
10,GOOGL,Alphabet Inc.,"Apr 25, 2023, 4 PMEDT",1.07,1.17,9.72
11,GOOGL,Alphabet Inc.,"Feb 02, 2023, 4 PMEST",1.18,1.05,-10.73
12,GOOGL,Alphabet Inc.,"Oct 25, 2022, 4 PMEDT",1.25,1.06,-15.14


In [69]:
earning_df[(earning_df["EPS Estimate"] > 0.8) &
           (earning_df["Earnings Date"] > "2021-01-01")]

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
3,GOOGL,Alphabet Inc,"Jan 28, 2025, 4 PMEST",2.12,,
4,GOOGL,Alphabet Inc.,"Oct 29, 2024, 4 PMEDT",1.85,2.12,14.91
5,GOOGL,Alphabet Inc.,"Jul 23, 2024, 4 PMEDT",1.84,1.89,2.47
6,GOOGL,Alphabet Inc.,"Apr 25, 2024, 4 PMEDT",1.51,1.89,24.77
7,GOOGL,Alphabet Inc.,"Jan 30, 2024, 4 PMEST",1.59,1.64,2.98
8,GOOGL,Alphabet Inc.,"Oct 24, 2023, 4 PMEDT",1.45,1.55,6.84
9,GOOGL,Alphabet Inc.,"Jul 25, 2023, 4 PMEDT",1.34,1.44,7.54
10,GOOGL,Alphabet Inc.,"Apr 25, 2023, 4 PMEDT",1.07,1.17,9.72
11,GOOGL,Alphabet Inc.,"Feb 02, 2023, 4 PMEST",1.18,1.05,-10.73
12,GOOGL,Alphabet Inc.,"Oct 25, 2022, 4 PMEDT",1.25,1.06,-15.14


In [99]:
# 조건에 맞는 인덱스만 추출. (NumPy의 where 메소드 활용)
index = np.where((earning_df["EPS Estimate"] > 0.8) &
           (earning_df["Earnings Date"] > "2021-01-01"))
index

(array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17]),)

In [74]:
earning_df.loc[index]

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
3,GOOGL,Alphabet Inc,"Jan 28, 2025, 4 PMEST",2.12,,
4,GOOGL,Alphabet Inc.,"Oct 29, 2024, 4 PMEDT",1.85,2.12,14.91
5,GOOGL,Alphabet Inc.,"Jul 23, 2024, 4 PMEDT",1.84,1.89,2.47
6,GOOGL,Alphabet Inc.,"Apr 25, 2024, 4 PMEDT",1.51,1.89,24.77
7,GOOGL,Alphabet Inc.,"Jan 30, 2024, 4 PMEST",1.59,1.64,2.98
8,GOOGL,Alphabet Inc.,"Oct 24, 2023, 4 PMEDT",1.45,1.55,6.84
9,GOOGL,Alphabet Inc.,"Jul 25, 2023, 4 PMEDT",1.34,1.44,7.54
10,GOOGL,Alphabet Inc.,"Apr 25, 2023, 4 PMEDT",1.07,1.17,9.72
11,GOOGL,Alphabet Inc.,"Feb 02, 2023, 4 PMEST",1.18,1.05,-10.73
12,GOOGL,Alphabet Inc.,"Oct 25, 2022, 4 PMEDT",1.25,1.06,-15.14


In [82]:
# sample: 데이터프레임의 행들을 랜덤하게 추출.
# (n개만큼)
print(len(earning_df.sample(n=5)))
# 전체 샘플수를 1로 봤을때 frac 값 비율만큼의 샘플들을 랜덤으로 뽑음.
len(earning_df.sample(frac=0.5)), len(earning_df)

5


(42, 85)

In [94]:
earning_df = earning_df[["Symbol",
                         "Company",
                         "Earnings Date"]]
earning_df.head()

Unnamed: 0,Symbol,Company,Earnings Date
0,GOOGL,Alphabet Inc,"Oct 27, 2025, 4 PMEDT"
1,GOOGL,Alphabet Inc,"Jul 21, 2025, 4 PMEDT"
2,GOOGL,Alphabet Inc,"Apr 23, 2025, 4 PMEDT"
3,GOOGL,Alphabet Inc,"Jan 28, 2025, 4 PMEST"
4,GOOGL,Alphabet Inc.,"Oct 29, 2024, 4 PMEDT"
