## 4.1 制定資料分析的例行程序

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the college dataset and preview one randomly sampled row
# (random_state is fixed so the same row is shown on every run)
college = pd.read_csv('../../data/college.csv')
preview_row = college.sample(random_state=42)
print(preview_row)

                    INSTNM         CITY STABBR  HBCU  MENONLY  WOMENONLY  \
3649  Career Point College  San Antonio     TX   0.0      0.0        0.0   

      RELAFFIL  SATVRMID  SATMTMID  DISTANCEONLY  ...  UGDS_2MOR  UGDS_NRA  \
3649         0       NaN       NaN           0.0  ...        0.0       0.0   

      UGDS_UNKN  PPTUG_EF  CURROPER  PCTPELL  PCTFLOAN  UG25ABV  \
3649        0.0       0.0         1   0.9172    0.9172    0.697   

      MD_EARN_WNE_P10  GRAD_DEBT_MDN_SUPP  
3649            20700               14977  

[1 rows x 27 columns]


In [3]:
# Use the shape attribute to get the DataFrame's dimensions as (rows, columns)
college.shape

(7535, 27)

In [4]:
# Use info() to see more metadata: the index range plus each column's
# non-null count and dtype
college.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   INSTNM              7535 non-null   object 
 1   CITY                7535 non-null   object 
 2   STABBR              7535 non-null   object 
 3   HBCU                7164 non-null   float64
 4   MENONLY             7164 non-null   float64
 5   WOMENONLY           7164 non-null   float64
 6   RELAFFIL            7535 non-null   int64  
 7   SATVRMID            1185 non-null   float64
 8   SATMTMID            1196 non-null   float64
 9   DISTANCEONLY        7164 non-null   float64
 10  UGDS                6874 non-null   float64
 11  UGDS_WHITE          6874 non-null   float64
 12  UGDS_BLACK          6874 non-null   float64
 13  UGDS_HISP           6874 non-null   float64
 14  UGDS_ASIAN          6874 non-null   float64
 15  UGDS_AIAN           6874 non-null   float64
 16  UGDS_N

In [5]:
# Summary statistics from describe(), transposed so that every
# described column becomes one row of the report
numeric_summary = college.describe()
print(numeric_summary.transpose())

               count         mean          std    min         25%        50%  \
HBCU          7164.0     0.014238     0.118478    0.0    0.000000    0.00000   
MENONLY       7164.0     0.009213     0.095546    0.0    0.000000    0.00000   
WOMENONLY     7164.0     0.005304     0.072642    0.0    0.000000    0.00000   
RELAFFIL      7535.0     0.190975     0.393096    0.0    0.000000    0.00000   
SATVRMID      1185.0   522.819409    68.578862  290.0  475.000000  510.00000   
SATMTMID      1196.0   530.765050    73.469767  310.0  482.000000  520.00000   
DISTANCEONLY  7164.0     0.005583     0.074519    0.0    0.000000    0.00000   
UGDS          6874.0  2356.837940  5474.275871    0.0  117.000000  412.50000   
UGDS_WHITE    6874.0     0.510207     0.286958    0.0    0.267500    0.55570   
UGDS_BLACK    6874.0     0.189997     0.224587    0.0    0.036125    0.10005   
UGDS_HISP     6874.0     0.161635     0.221854    0.0    0.027600    0.07140   
UGDS_ASIAN    6874.0     0.033544     0.

In [6]:
# Use the include parameter to get summary statistics (count, unique, top, freq)
# for the object-dtype columns instead of the numeric ones
object_summary = college.describe(include=[object])
print(object_summary.transpose())

                   count unique                                      top  freq
INSTNM              7535   7535  Excel Learning Center-San Antonio South     1
CITY                7535   2514                                 New York    87
STABBR              7535     59                                       CA   773
MD_EARN_WNE_P10     6413    598                        PrivacySuppressed   822
GRAD_DEBT_MDN_SUPP  7503   2038                        PrivacySuppressed  1510


In [7]:
# Ask describe() for a custom set of percentiles on the numeric columns
wanted_percentiles = [.01, .05, .10, .25, .50, .75, .9, .95, .99]
percentile_summary = college.describe(include=[np.number],
                                      percentiles=wanted_percentiles)
print(percentile_summary.transpose())

               count         mean          std    min          1%          5%  \
HBCU          7164.0     0.014238     0.118478    0.0    0.000000    0.000000   
MENONLY       7164.0     0.009213     0.095546    0.0    0.000000    0.000000   
WOMENONLY     7164.0     0.005304     0.072642    0.0    0.000000    0.000000   
RELAFFIL      7535.0     0.190975     0.393096    0.0    0.000000    0.000000   
SATVRMID      1185.0   522.819409    68.578862  290.0  390.000000  430.000000   
SATMTMID      1196.0   530.765050    73.469767  310.0  395.000000  430.000000   
DISTANCEONLY  7164.0     0.005583     0.074519    0.0    0.000000    0.000000   
UGDS          6874.0  2356.837940  5474.275871    0.0   14.000000   31.650000   
UGDS_WHITE    6874.0     0.510207     0.286958    0.0    0.000000    0.013265   
UGDS_BLACK    6874.0     0.189997     0.224587    0.0    0.000000    0.000000   
UGDS_HISP     6874.0     0.161635     0.221854    0.0    0.000000    0.000000   
UGDS_ASIAN    6874.0     0.0

## 4.2 資料字典

In [8]:
# Load the data dictionary: one row per column of the college dataset,
# giving the column name and its description
data_dictionary = pd.read_csv('../../data/college_data_dictionary.csv')
print(data_dictionary)

           column_name                                        description
0               INSTNM                                   Institution Name
1                 CITY                                      City Location
2               STABBR                                 State Abbreviation
3                 HBCU           Historically Black College or University
4              MENONLY                                       0/1 Men Only
5            WOMENONLY                                     0/1 Women only
6             RELAFFIL                          0/1 Religious Affiliation
7             SATVRMID                                  SAT Verbal Median
8             SATMTMID                                    SAT Math Median
9         DISTANCEONLY                            Distance Education Only
10                UGDS                           Undergraduate Enrollment
11          UGDS_WHITE                            Percent Undergrad White
12          UGDS_BLACK                

## 4.3 改變資料型別以減少記憶體用量

In [9]:
# Pick a few example columns that cover the different dtypes in the dataset
college = pd.read_csv('../../data/college.csv')
different_cols = [
    'RELAFFIL',
    'SATMTMID',
    'CURROPER',
    'INSTNM',
    'STABBR',
]
# Later cells assign new columns into col2, so keep this exact .loc selection
col2 = college.loc[:, different_cols]
print(col2.head(5))

   RELAFFIL  SATMTMID  CURROPER                               INSTNM STABBR
0         0     420.0         1             Alabama A & M University     AL
1         0     565.0         1  University of Alabama at Birmingham     AL
2         1       NaN         1                   Amridge University     AL
3         0     590.0         1  University of Alabama in Huntsville     AL
4         0     430.0         1             Alabama State University     AL


In [10]:
# Inspect the dtype of each selected column
col2.dtypes

RELAFFIL      int64
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [11]:
# Record the per-column memory usage as a baseline for the comparisons below;
# deep=True asks pandas to also measure the contents of object columns
original_mem = col2.memory_usage(deep=True)
original_mem

Index          132
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      599848
STABBR      384285
dtype: int64

In [12]:
# Downcast RELAFFIL to an 8-bit integer
col2['RELAFFIL'] = col2['RELAFFIL'].astype('int8')

# Show the resulting dtypes first, then the per-column memory usage
for report in (col2.dtypes, col2.memory_usage(deep=True)):
    print(report)

RELAFFIL       int8
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object
Index          132
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      599848
STABBR      384285
dtype: int64


In [13]:
# Count the distinct values in each object column to see how much repetition
# there is (few distinct values relative to the row count means many repeats)
col2.select_dtypes(include=['object']).nunique()

INSTNM    7535
STABBR      59
dtype: int64

In [14]:
# Convert STABBR to the categorical dtype
col2['STABBR'] = col2['STABBR'].astype('category')

# Confirm the dtype change. A bare expression that is not the last statement
# of a cell is evaluated but never displayed, so it has to be printed.
print(col2.dtypes)

# Re-check the per-column memory usage after the conversion
print(col2.memory_usage(deep=True))

Index          132
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      600307
STABBR       12648
dtype: int64


In [15]:
# Compare memory before and after: ratio of new usage to the original baseline,
# column by column (a value below 1 means that column now uses less memory)
new_mem = col2.memory_usage(deep=True)

new_mem.div(original_mem)

Index       1.000000
RELAFFIL    0.125000
SATMTMID    1.000000
CURROPER    1.000000
INSTNM      1.000765
STABBR      0.032913
dtype: float64

In [16]:
# Look at the dtype of a column that contains missing values (NaN)
print(college['MENONLY'])

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
7530    NaN
7531    NaN
7532    NaN
7533    NaN
7534    NaN
Name: MENONLY, Length: 7535, dtype: float64


In [19]:
# Compare the memory used by the original column with the memory used after
# converting it to the nullable Int64 dtype
menonly = college['MENONLY']
for candidate in (menonly, menonly.astype('Int64')):
    print(candidate.memory_usage(deep=True))

60412
67947


## 4.4 資料的排序

In [5]:
# Load the movie dataset and keep only title, score and budget
movie = pd.read_csv('../../data/movie.csv')
score_budget_cols = ['movie_title', 'imdb_score', 'budget']
movie2 = movie[score_budget_cols]
print(movie2.head())

                                  movie_title  imdb_score       budget
0                                      Avatar         7.9  237000000.0
1    Pirates of the Caribbean: At World's End         7.1  300000000.0
2                                     Spectre         6.8  245000000.0
3                       The Dark Knight Rises         8.5  250000000.0
4  Star Wars: Episode VII - The Force Awakens         7.1          NaN


In [25]:
# Select the 100 highest-rated movies and preview the first few of them
top_rated = movie2.nlargest(n=100, columns='imdb_score')
print(top_rated.head())

                   movie_title  imdb_score      budget
2725          Towering Inferno         9.5         NaN
1920  The Shawshank Redemption         9.3  25000000.0
3402             The Godfather         9.2   6000000.0
2779                   Dekalog         9.1         NaN
4312      Kickboxer: Vengeance         9.1  17000000.0


In [6]:
# First take the 100 highest-rated movies, then the 5 of those with the
# smallest budgets
top_rated = movie2.nlargest(n=100, columns='imdb_score')
cheapest_top_rated = top_rated.nsmallest(n=5, columns='budget')
print(cheapest_top_rated)

               movie_title  imdb_score    budget
4804        Butterfly Girl         8.7  180000.0
4801    Children of Heaven         8.5  180000.0
4706          12 Angry Men         8.9  350000.0
4550          A Separation         8.4  500000.0
4636  The Other Dream Team         8.4  500000.0


## 4.5 排序後選取每組的最大值和最小值

In [2]:
# Load the movie dataset and keep only title, release year and score
movie = pd.read_csv('../../data/movie.csv')
year_score_cols = ['movie_title', 'title_year', 'imdb_score']
movie3 = movie[year_score_cols]
print(movie3.head())

                                  movie_title  title_year  imdb_score
0                                      Avatar      2009.0         7.9
1    Pirates of the Caribbean: At World's End      2007.0         7.1
2                                     Spectre      2015.0         6.8
3                       The Dark Knight Rises      2012.0         8.5
4  Star Wars: Episode VII - The Force Awakens         NaN         7.1


In [32]:
# Sort by year and then by score, both in descending order
by_year_and_score = movie3.sort_values(by=['title_year', 'imdb_score'],
                                       ascending=False)
print(by_year_and_score.head())

                      movie_title  title_year  imdb_score
4312         Kickboxer: Vengeance      2016.0         9.1
4277  A Beginner's Guide to Snuff      2016.0         8.7
3798                      Airlift      2016.0         8.5
27     Captain America: Civil War      2016.0         8.2
98            Godzilla Resurgence      2016.0         8.2


In [None]:
# Keep only the first row of each year; after the descending sort that row is
# the highest-scored movie of the year
by_year_and_score = movie3.sort_values(['title_year', 'imdb_score'],
                                       ascending=False)
top_per_year = by_year_and_score.drop_duplicates(subset='title_year')
print(top_per_year.head())

                                  movie_title  title_year  imdb_score
4312                     Kickboxer: Vengeance      2016.0         9.1
3745                          Running Forever      2015.0         8.6
4369                   Queen of the Mountains      2014.0         8.7
3935  Batman: The Dark Knight Returns, Part 2      2013.0         8.4
3                       The Dark Knight Rises      2012.0         8.5


In [None]:
# Alternative approach with groupby: after sorting, take the first row of
# every title_year group
by_year_and_score = movie3.sort_values(['title_year', 'imdb_score'],
                                       ascending=[False, False])
first_per_year = by_year_and_score.groupby('title_year').head(1)
print(first_per_year.head())

                                  movie_title  title_year  imdb_score
4312                     Kickboxer: Vengeance      2016.0         9.1
3745                          Running Forever      2015.0         8.6
4369                   Queen of the Mountains      2014.0         8.7
3935  Batman: The Dark Knight Returns, Part 2      2013.0         8.4
3                       The Dark Knight Rises      2012.0         8.5


In [97]:
# Same question answered with groupby.apply: sort each year's rows by score
# and keep the top one, then list the most recent years first.
def _top_rated(group):
    """Return the first row of `group` after sorting by imdb_score, descending."""
    return group.sort_values('imdb_score', ascending=False).head(1)


best_per_year = movie3.groupby('title_year').apply(_top_rated, include_groups=False)
# NOTE(review): when several movies tie for a year's top score, the row kept
# here can differ from the sort-then-deduplicate approach (see 2014 in the
# recorded outputs).
print(best_per_year.sort_values('title_year', ascending=False).head())

                                             movie_title  imdb_score
title_year                                                          
2016.0     4312                     Kickboxer: Vengeance         9.1
2015.0     3745                          Running Forever         8.6
2014.0     4804                           Butterfly Girl         8.7
2013.0     3935  Batman: The Dark Knight Returns, Part 2         8.4
2012.0     3                       The Dark Knight Rises         8.5


In [None]:
# For every (year, content rating) pair, find the movie with the lowest budget:
# sort year and rating descending and budget ascending, then keep the first
# row of each pair
rating_cols = ['movie_title', 'title_year', 'content_rating', 'budget']
group_keys = ['title_year', 'content_rating']
ordered = movie[rating_cols].sort_values(group_keys + ['budget'],
                                         ascending=[False, False, True])
print(ordered.drop_duplicates(subset=group_keys))

                         movie_title  title_year content_rating      budget
4026                       Compadres      2016.0              R   3000000.0
4658             Fight to the Finish      2016.0          PG-13    150000.0
4661                      Rodeo Girl      2016.0             PG    500000.0
3252                     The Wailing      2016.0      Not Rated         NaN
4659  Alleluia! The Devil's Carnival      2016.0            NaN    500000.0
...                              ...         ...            ...         ...
2558                      Lilyhammer         NaN          TV-MA  34000000.0
807       Sabrina, the Teenage Witch         NaN           TV-G   3000000.0
848                    Stargate SG-1         NaN          TV-14   1400000.0
2436                          Carlos         NaN      Not Rated         NaN
2119                    The Bachelor         NaN            NaN   3000000.0

[359 rows x 4 columns]


## 4.6 用sort_values() 選取最大值

In [7]:
# Load the movie dataset and keep only title, score and budget
movie = pd.read_csv('../../data/movie.csv')
score_budget_cols = ['movie_title', 'imdb_score', 'budget']
movie2 = movie[score_budget_cols]

# First take the 100 highest-rated movies, then the 5 of those with the
# smallest budgets
top_rated = movie2.nlargest(n=100, columns='imdb_score')
print(top_rated.nsmallest(n=5, columns='budget'))

               movie_title  imdb_score    budget
4804        Butterfly Girl         8.7  180000.0
4801    Children of Heaven         8.5  180000.0
4706          12 Angry Men         8.9  350000.0
4550          A Separation         8.4  500000.0
4636  The Other Dream Team         8.4  500000.0


In [8]:
# Attempt the same selection using only sort_values() and head()
# NOTE(review): the recorded output differs from the nlargest/nsmallest
# version; presumably rows tied at the cutoff score are kept differently.
top_rated_sorted = movie2.sort_values('imdb_score', ascending=False).head(100)
cheapest_sorted = top_rated_sorted.sort_values('budget').head(5)
print(cheapest_sorted)

                    movie_title  imdb_score    budget
4815  A Charlie Brown Christmas         8.4  150000.0
4804             Butterfly Girl         8.7  180000.0
4801         Children of Heaven         8.5  180000.0
4706               12 Angry Men         8.9  350000.0
4550               A Separation         8.4  500000.0


In [15]:
# Take the 100 highest-rated movies with nlargest and look at the last rows,
# i.e. the ones sitting at the cutoff
top_rated = movie2.nlargest(n=100, columns='imdb_score')
print(top_rated.tail())

                movie_title  imdb_score     budget
4023                 Oldboy         8.4  3000000.0
4163  To Kill a Mockingbird         8.4  2000000.0
4395         Reservoir Dogs         8.4  1200000.0
4550           A Separation         8.4   500000.0
4636   The Other Dream Team         8.4   500000.0


In [16]:
# Do the same with sort_values(): sort by score descending, keep the first
# 100 rows and look at the last 5 of them
top_rated_sorted = movie2.sort_values('imdb_score', ascending=False).head(100)
print(top_rated_sorted.tail(5))

                    movie_title  imdb_score      budget
2646                      U2 3D         8.4         NaN
4815  A Charlie Brown Christmas         8.4    150000.0
3902                    M*A*S*H         8.4         NaN
2922                   Das Boot         8.4  14000000.0
2605         Lawrence of Arabia         8.4  15000000.0


## 4.7 案例演練: 計算移動停損單價格

In [6]:
# Simulate a price history with synthetic data (no real market data is used)
import pandas as pd
import numpy as np

np.random.seed(0)  # fixed seed so the simulated series is reproducible
dates = pd.date_range(start="2025-01-01", end="2025-04-01", freq="B")
n_days = len(dates)

# Closing prices: cumulative sum of N(0, 2) daily steps, offset by 250
prices = 250 + np.random.normal(0, 2, n_days).cumsum()

# The remaining columns are drawn in column order (Open, High, Low, Volume);
# every draw advances the global random state, so the order must not change.
open_prices = prices + np.random.normal(0, 1, n_days)
high_prices = prices + np.random.uniform(0, 2, n_days)
low_prices = prices - np.random.uniform(0, 2, n_days)
volumes = np.random.randint(1_000_000, 3_000_000, n_days)

# NOTE(review): Open is drawn independently of High/Low, so on some days it
# falls outside the [Low, High] range; only Close is used by the later cells.
tsla_sim = pd.DataFrame({
    "Date": dates,
    "Open": open_prices,
    "High": high_prices,
    "Low": low_prices,
    "Close": prices,
    "Volume": volumes,
}).set_index("Date")
print(tsla_sim.head())

                  Open        High         Low       Close   Volume
Date                                                               
2025-01-01  253.126324  255.479148  252.066393  253.528105  1380163
2025-01-02  252.698221  256.040026  253.820536  254.328419  1221310
2025-01-03  256.748677  256.309323  255.859271  256.285895  1841514
2025-01-06  259.860383  261.487638  259.731280  260.767681  1991809
2025-01-07  264.554743  265.962779  264.451472  264.502797  1808468


In [7]:
# Work with the closing prices only
tsla_close = tsla_sim['Close']

# Running maximum: the highest closing price seen up to and including each date
tsla_cummax = tsla_close.cummax()
print(tsla_cummax.head())

Date
2025-01-01    253.528105
2025-01-02    254.328419
2025-01-03    256.285895
2025-01-06    260.767681
2025-01-07    264.502797
Name: Close, dtype: float64


In [9]:
# Set the stop-loss level: 90% of the running maximum close,
# i.e. 10% below the highest close seen so far
stop_fraction = .9
trailing_stop = tsla_sim['Close'].cummax() * stop_fraction
print(trailing_stop.head())

Date
2025-01-01    228.175294
2025-01-02    228.895577
2025-01-03    230.657306
2025-01-06    234.690913
2025-01-07    238.052518
Name: Close, dtype: float64
