# Coding for Economists - Session 4

***

## 1. Environment Setup 

In [None]:
# make sure current environment is correct
import sys
print(sys.executable)

In [None]:
# use %conda to install libraries in current environment
%conda install pandas numpy openpyxl -y

In [None]:
# Use %pip to install libraries not available in conda
%pip install yfinance pandas_datareader

<div class="alert alert-block alert-info">
<b>Reminder:</b> Restart the kernel after installing any libraries.
</div>

In [None]:
import numpy as np
import pandas as pd
from datetime import date

***

## 2. File Handling

### Directory Paths

- __Windows__: `'C:\\Users\\Tommy\\'`
- __macOS / Linux__: `'/Users/Tommy/'` (macOS) or `'/home/Tommy/'` (Linux)

In [None]:
import os
# os.path.join() adds no separator after a bare drive letter: on Windows
# os.path.join('C:', 'Users') gives 'C:Users', a path relative to the current
# folder on drive C. Appending os.sep to the drive makes the path absolute.
windows_path = os.path.join('C:' + os.sep, 'Users', 'Tommy')
windows_path

In [None]:
# A leading os.sep makes the result an absolute path ('/home/Tommy' on
# macOS/Linux); without it the result is the relative path 'home/Tommy'.
unix_path = os.path.join(os.sep, 'home', 'Tommy')
unix_path

### Current Working Directory

- __Print Current Working Directory__ `os.getcwd()`:

In [None]:
os.getcwd()

- __Change Current Working Directory__ `os.chdir()`:

In [None]:
os.chdir(os.getcwd().replace('Session 3', ''))
os.getcwd()

In [None]:
os.chdir(os.path.join(os.getcwd(), 'Session 3'))
os.getcwd()

### Absolute and Relative Paths:
> 1. `'.'`: current folder
> 2. `'..'`: parent folder

In [None]:
os.listdir('.')

In [None]:
os.listdir('..')

In [None]:
print(os.getcwd())
os.listdir(os.getcwd())

### Directory Manipulation

- __Create New Folders__ `os.makedirs()`:

In [None]:
# exist_ok=True lets this cell be re-run without raising FileExistsError
# when 'FOLDER_A' is already there.
os.makedirs('FOLDER_A', exist_ok=True)

In [None]:
os.chdir('FOLDER_A')
os.getcwd()

In [None]:
os.chdir('..')
os.getcwd()

- __Copy Files__ `shutil.copy()`:

In [None]:
# Create an empty file in folder 'FOLDER_A'
open(os.path.join('FOLDER_A', 'FILE_1.txt'), 'w').close()
os.listdir('FOLDER_A')

In [None]:
# Create another folder 'FOLDER_B'
# exist_ok=True lets this cell be re-run without raising FileExistsError
# when 'FOLDER_B' is left over from an earlier run.
os.makedirs('FOLDER_B', exist_ok=True)

In [None]:
# Copy file 'FILE_1.txt' from folder 'FOLDER_A' to 'FOLDER_B'
import shutil
shutil.copy(os.path.join('FOLDER_A', 'FILE_1.txt'), 'FOLDER_B')
print(os.listdir('FOLDER_A'))
print(os.listdir('FOLDER_B'))

- __Move Files__ `shutil.move()`:

In [None]:
# Create another empty file in folder 'FOLDER_A'
open(os.path.join('FOLDER_A', 'FILE_2.txt'), 'w').close()
os.listdir('FOLDER_A')

In [None]:
# Move 'FILE_2.txt' to folder 'FOLDER_B'
# The destination names the file, not just the folder: when the target is a
# folder that already holds 'FILE_2.txt' (e.g. on a re-run), shutil.move()
# raises an error instead of replacing the file.
shutil.move(os.path.join('FOLDER_A', 'FILE_2.txt'), os.path.join('FOLDER_B', 'FILE_2.txt'))
print(os.listdir('FOLDER_A'))
print(os.listdir('FOLDER_B'))

- __Delete Folders__ `os.rmdir()`:

In [None]:
# Cannot delete folder if it is not empty: os.rmdir() raises OSError.
# The error is caught and printed so the demonstration still shows the
# message while "Run All" can continue to the cells below.
try:
    os.rmdir('FOLDER_A')
except OSError as error:
    print(f'Cannot delete folder: {error}')

- __Delete Files__ `os.remove()`:

In [None]:
# Delete 'FILE_1.txt'
os.remove(os.path.join('FOLDER_A', 'FILE_1.txt'))
os.listdir('FOLDER_A')

In [None]:
# Delete folder 'FOLDER_A'
os.rmdir('FOLDER_A')
os.listdir()

### Read Files

- __(Not Useful) Built-in Method__ `csv.reader()`:

In [None]:
import csv
# The csv module expects files to be opened with newline='' so that line
# breaks inside quoted fields are parsed correctly on every platform.
with open('economic_indicators.csv', newline='') as file:
    reader = csv.reader(file)
    data = list(reader) # Convert csv data into a list
data[:5]

- __Read <span style="color: red;">csv</span> Files__ `pd.read_csv()`:

In [None]:
import pandas as pd
data = pd.read_csv('economic_indicators.csv')
data.head()

- __Read <span style="color: red;">Excel</span>(xlsx) Files__ `pd.read_excel()`:

In [None]:
# Need to install library openpyxl
data = pd.read_excel('economic_indicators.xlsx')
data.head()

- __Read <span style="color: red;">Stata</span>(dta) Files__ `pd.read_stata()`:

In [None]:
data = pd.read_stata('economic_indicators.dta')
data.head()

### Write Files

- __(Not Useful) Built-in Method__ `csv.writer()`:

In [None]:
# Read the csv file into a list
# (newline='' is what the csv module expects for both reading and writing)
with open('economic_indicators.csv', newline='') as file:
    reader = csv.reader(file)
    data = list(reader)

# Write the list into a csv file
with open('economic_indicators_csv_writer.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write all rows from data
    writer.writerows(data)

- __Write <span style="color: red;">csv</span> Files__ `DataFrame.to_csv()`:

In [None]:
# Read the csv file into a pd.DataFrame
data = pd.read_csv('economic_indicators.csv')

# Write the pd.DataFrame into a csv file
data.to_csv('economic_indicators_pandas.csv', index=False)

- __Write <span style="color: red;">Excel</span>(xlsx) Files__ `DataFrame.to_excel()`:

In [None]:
data.to_excel('economic_indicators_pandas.xlsx', index=False)

- __Write <span style="color: red;">Stata</span>(dta) Files__ `DataFrame.to_stata()`:

In [None]:
data.to_stata('economic_indicators_pandas.dta', write_index=False)

### Best Practice in File Handling

- Use folders to organize files
- Use relative path
- Use `pandas` to read and write files
- Use meaningful file names

***

## 3. Data Collection

#### __World Bank__ `pandas_datareader.wb`

Use __World Bank Indicators__ webpage https://data.worldbank.org/indicator to find the indicator ID
1. Search for the keyword "GDP"
2. Click on the indicator `GDP (current US$)` from the search results
3. The indicator ID is found in the URL address: https:\/\/data.worldbank.org/indicator/<span style="color: red;">NY.GDP.MKTP.CD</span>?view=chart

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pandas_datareader import wb

# Define indicators & country list
indicators_dict = {
    "NY.GDP.MKTP.CD": "GDP (Current US$)",  # GDP in current US dollars
    "FP.CPI.TOTL.ZG": "Inflation (CPI)"  # Inflation (Consumer Price Index)
}

countries = ["USA", "CHN", "DEU"]  # Using ISO country codes

# Fetch data from World Bank
data = wb.download(
    indicator=list(indicators_dict.keys()),
    country=countries,
    start=2019,
    end=2024
)

# Display the data
print(data)

#### __Federal Reserve Economic Data (FRED)__ `pandas_datareader.data`

Use __FRED__ webpage https://fred.stlouisfed.org/ to find the indicator ID
1. Search for the keyword "GDP"
2. Click on the indicator `Gross Domestic Product` from the search results
3. The indicator ID is found in the URL address: https:\/\/fred.stlouisfed.org/series/<span style="color: red;">GDP</span>


In [None]:
import pandas_datareader.data as web
import datetime

# Define the time period
start = datetime.datetime(2019, 1, 1)
end = datetime.datetime(2024, 12, 31)

# Fetch the US unemployment rate and CPI from FRED
data = web.DataReader(['UNRATE', 'CPIAUCSL'], 'fred', start, end)

# Display the data
print(data)

#### __Fama/French Data__ `pandas_datareader.data`

Use the `get_available_datasets()` function from `pandas_datareader.famafrench` to list all available data sets

In [None]:
from pandas_datareader.famafrench import get_available_datasets
get_available_datasets()[:20]

In [None]:
import pandas_datareader.data as web
ds = web.DataReader('6_Portfolios_2x3', 'famafrench')
print(ds['DESCR'])

In [None]:
ds[2].head()

#### __Yahoo Finance__ `yfinance`

Use standard stock tickers to fetch data

In [None]:
import yfinance as yf
import pandas as pd

# Define stock tickers
tickers = ["AAPL", "MSFT"]

# Define date range (Optional: adjust as needed)
start_date = "2025-02-01"
end_date = "2025-02-15"

# Fetch daily historical data
df = yf.download(tickers, start=start_date, end=end_date, interval="1d")

# Display first few rows
print(df.head())

#### __International Monetary Fund (IMF) Data__ `requests`

Follow these steps to find the data set codes: https://www.bd-econ.com/imfapi2.html

In [None]:
import requests

url = 'http://dataservices.imf.org/REST/SDMX_JSON.svc/'
key = 'CompactData/IFS/M.GB.PMP_IX' # adjust codes here

# Request the series. The timeout stops the cell from hanging indefinitely if
# the server does not answer, and raise_for_status() reports an HTTP error
# directly instead of failing later while decoding the response.
response = requests.get(f'{url}{key}', timeout=30)
response.raise_for_status()

# Navigate to series in API-returned JSON data
obj = response.json()
data = obj['CompactData']['DataSet']['Series']

print(data['Obs'][:5])

#### __JSON Object__

In [None]:
# Create a JSON object.
data = {
  "pets": [
    {
      "type": "dog",
      "name": "Buddy",
      "age": 5,
      "vaccinations": ["rabies", "distemper", "parvovirus"],
      "owner": {
        "name": "Alice",
        "contact": "alice@example.com"
      }
    },
    {
      "type": "cat",
      "name": "Whiskers",
      "age": 3,
      "vaccinations": ["feline distemper", "rabies"],
      "owner": {
        "name": "Bob",
        "contact": "bob@example.com"
      }
    },
    {
      "type": "parrot",
      "name": "Polly",
      "age": 2,
      "vaccinations": [],
      "owner": {
        "name": "Charlie",
        "contact": "charlie@example.com"
      }
    }
  ]
}

In [None]:
data['pets'][1]

In [None]:
data['pets'][1]['name']

***

## 4. Data Manipulation

### NumPy Basics

In [None]:
import numpy as np

#### NumPy Array (<span style="color: orange;">same-type</span>, <span style="color: blue;">mutable</span>, ideal for numerical computation)
- Must be the same type of data (e.g. number, string)
- Total size of the array cannot change (appending an array creates a new array)
- Each row must have the same number of columns (supports n-dimensional data)
- Please pay attention to __alias__

- __Create NumPy Arrays__

In [None]:
# Create a 1-D array
data = np.array([1, 2, 3, 4])
print(data)
data.shape

In [None]:
# Create a 2-D array
data = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
print(data)
data.shape

In [None]:
y = np.array([[1, 2, 3],
              [4, 5, 6]])

In [None]:
# Create a 3-D array
data = np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]])
print(data)
data.shape

In [None]:
# Create zeros
data = np.zeros(5)
data

In [None]:
# Create ones
data = np.ones(5)
data

In [None]:
# Create a range of numbers
data = np.arange(5)
data

In [None]:
# Create a range of numbers from 10 to 19 with step size 2
data = np.arange(10, 19, 2)
data

In [None]:
# Create 5 numbers linearly spaced between 10 and 19
data = np.linspace(10, 19, num = 5)
data

- __Reshaping__ `.reshape()`

In [None]:
# np.arange(30) builds the integers 0..29 directly as an array
data = np.arange(30)
print(data)
# Reshape the 30 elements into 2 layers x 3 rows x 5 columns
data_reshaped = data.reshape(2,3,5)
print(data_reshaped)

- __Transposing__ `.transpose()`

In [None]:
data = np.array([[1,2,3],[4,5,6]])
print(data)
print(data.transpose())

- __Indexing and Slicing__ (similar to lists, with one index per axis separated by commas)

In [None]:
# Print the third element in the first row in the second layer
data = np.array([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]])
print(data)
print(data[1,0,2])

In [None]:
# Replace the value
data = np.array([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]])
print(data)
data[1,0,2] = 999
print(data)

In [None]:
# Slice the first two rows and the last two columns of the first layer
data = np.array([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]])
print(data)
print(data[0, :2, -2:])

- __Concatenate__ `np.concatenate()`

In [None]:
# Concat rows (along the first axis = 0)
x = np.array([[1, 2, 3, 4]])
y = np.array([[5, 6, 7, 8]])
data = np.concatenate((x, y), axis = 0)  # put x, y in a tuple (x, y)
print(data)

# Use .vstack instead
data = np.vstack((x, y))
print(data)

In [None]:
# Concat columns (along the second axis = 1)
x = np.array([[1, 2, 3, 4]])
y = np.array([[5, 6, 7, 8]])
data = np.concatenate((x, y), axis = 1)  # put x, y in a tuple (x, y)
print(data)

# Use .hstack instead
data = np.hstack((x, y))
print(data)

<div class="alert alert-block alert-info">
<b>Reminder:</b> For 3-D arrays, axis = 0 refers to layers, 1 refers to rows, 2 refers to columns.
</div>

- __Split__ `np.split()`

In [None]:
data = np.array([[1,2,3,4],[5,6,7,8]])
print(np.split(data, 2))

- __Select Values Using Boolean Conditions__

In [None]:
# Create the 2-D array
x = np.array([[1, 2, 3, 4]])
y = np.array([[5, 6, 7, 8]])
data = np.concatenate((x, y), axis = 0)

# Print values between [3, 8]
data[(data >= 3) & (data <= 8)]

- __Math Operations__

In [None]:
# Create the 2-D array
data = np.array([[1, 2, 3, 4],[5, 6, 7, 8]])
data

In [None]:
# Addition
print(data + 1)
print(data + data)

In [None]:
# Multiplication
print(data*2)
print(data*data)

In [None]:
# Division
print(data/2)
print(data/data)

In [None]:
# Summation
print(data)
print(data.sum())
print(data.sum(axis = 0))
print(data.sum(axis = 1))

In [None]:
# Max, Min, and Mean
print(data.max())
print(data.min())
print(data.min(axis = 1))
print(data.mean(axis = 1))

In [None]:
# Broadcasting
# A 2D array
a = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])

# A 1D array (vector)
b = np.array([10, 20, 30])

# Broadcasting: add the vector b to each row of a
print(a + b)

- __Linear Algebra__

In [None]:
x = np.array([[1, 2], [3, 4]])
y = np.array([[2, 0], [1, 2]])

# Matrix multiplication
print(x @ y)

In [None]:
# Inverse matrix
inv_x = np.linalg.inv(x)
print(inv_x)

- __Count Unique Values__

In [None]:
# Create the 2-D array
data = np.array([[1, 2, 3, 4],[6, 5, 4, 3]])
# With return_counts=True, np.unique() also returns how often each distinct value occurs
values, count = np.unique(data, return_counts = True)
# Walk through the values and their counts in parallel
for value, occurrences in zip(values, count):
    print(f'value: {value} - count: {occurrences}')

#### Generate Random Numbers

In [None]:
# Generate 5 uniformly distributed random numbers in [0, 1) (1 itself is never returned)
np.random.rand(5)

In [None]:
# Generate a random 2 by 3 matrix
np.random.rand(2, 3)

In [None]:
# Generate a 2 by 3 matrix with random integers from 0 to 9 (the upper bound 10 is excluded)
np.random.randint(0, 10, size=(2, 3))

In [None]:
# Generate a 2 by 3 matrix with standard normal random numbers
np.random.randn(2, 3)

### Pandas Basics

In [None]:
import pandas as pd

#### Pandas Series (<span style="color: green;">mixed-type</span>, <span style="color: blue;">mutable</span>)
- Supports labels (row names)
- Automatically matches labels when creating a data frame
- Please pay attention to __alias__.

In [None]:
data = pd.Series([1, 2, np.nan, 'a'], index = ['A', 'B', 'C', 'D'])
data

#### Pandas DataFrame (<span style="color: green;">mixed-type</span>, <span style="color: blue;">mutable</span>, ideal for data manipulation)
- Works swiftly with most data manipulation, analysis, and visualization tools.
- Data structure is similar to spreadsheet and Stata. Easy to read.
- Please pay attention to __alias__.

- __Create pd.DataFrame__

In [None]:
# Create using a dictionary
pets = pd.DataFrame(
    {
        'Type': ['rabbit', 'fish', 'cat', 'dog'],
        'Name': ['Meatball', 'Flash', 'Coco', 'Fluffy'],
        'Age': np.random.randint(0, 10, size=(4)),
        'Last_visit': pd.date_range('20250210', periods = 4)
    }
)
pets

In [None]:
# Create using a np.array
daily_return = pd.DataFrame(
    np.random.randn(10, 4),
    index = pd.date_range('20250210', periods = 10),
    columns = ['AAPL', 'GOOG', 'META', 'NVDA']
)
daily_return.head()

In [None]:
daily_return.tail()

In [None]:
daily_return.shape

In [None]:
print(daily_return.columns)
print(daily_return.index)

- __Describe__ `.describe()`

In [None]:
# Describe data frame
daily_return.describe()

- __Indexing__ `.iloc[]`, `.loc[]`, `.at[]`, `.iat[]`, `.COLNAME`, and `[]`

In [None]:
# Create using a dictionary
pets = pd.DataFrame(
    {
        'Type': ['rabbit', 'fish', 'cat', 'dog'],
        'Name': ['Meatball', 'Flash', 'Coco', 'Fluffy'],
        'Age': np.random.randint(0, 10, size=(4)),
        'Last_visit': pd.date_range('20250210', periods = 4)
    }
)

# Selection by position using .iloc[]   (indices in [] must be integers or :)
print(pets.iloc[3, 1])

# Selection by label using .loc[]   (indices in [] must be row/column names or :)
print(pets.loc[3, 'Name'])

# Get single value by position using .iat[] (indices in [] must be integers)
print(pets.iat[3, 1])

# Get single value by label using .at[] (indices in [] must be row/column names)
print(pets.at[3, 'Name'])

# Selection using .COLNAME
print(pets.Name[3])

# Selection using []
print(pets['Name'][3])

In [None]:
# Boolean indexing
pets[pets['Age'] > 2]

- __Editing__

In [None]:
# Edit using indices
pets.iloc[3, 0] = 'parrot'
pets.loc[3, 'Name'] = 'Polly'
pets.loc[3, 'Age'] = 35
pets

In [None]:
# Edit using a boolean condition
pets.loc[pets['Last_visit'] < '2025-02-12', 'Last_visit'] = pd.to_datetime('2000-01-01')
pets

- __Adding Columns and Rows__

In [None]:
# Create a DataFrame
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6]
})

# Add a new column 'C'
df['C'] = [7, 8, 9]
print("After adding column 'C':")
print(df)

In [None]:
# Remove the column 'B'
df_drop_col = df.drop('B', axis = 1)
print("\nAfter removing column 'B':")
print(df_drop_col)

In [None]:
# Add a new row
df.loc[3] = [10, 11, 12]
print("\nAfter adding a new row with loc:")
print(df)

In [None]:
# Remove the first row (the row whose index label is 0)
df_drop_row = df.drop(0, axis = 0)
print("\nAfter removing the first row:")
print(df_drop_row)

- __Merging DataFrames by a Column__

In [None]:
# Create two DataFrames with a common column 'key'
df1 = pd.DataFrame({
    'key': ['A', 'B', 'C', 'D'],
    'value1': [1, 2, 3, 4]
})

df2 = pd.DataFrame({
    'key': ['B', 'D', 'E', 'F'],
    'value2': [5, 6, 7, 8]
})

# Merge the DataFrames on 'key'
merged_df = pd.merge(df1, df2, on='key', how='left')
print("Merged DataFrame using pd.merge():")
print(merged_df)

- __Joining DataFrames by labels (row names)__

In [None]:
# Create two DataFrames with indexes
df1 = pd.DataFrame({'value1': [1, 2, 3]}, index=['A', 'B', 'C'])
df2 = pd.DataFrame({'value2': [4, 5, 6]}, index=['A', 'B', 'D'])

# Join the DataFrames on the index
joined_df = df1.join(df2, how='left')
print("\nJoined DataFrame using df.join():")
print(joined_df)

- __Concatenating DataFrames__

In [None]:
# Create two DataFrames with the same columns
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})

# Concatenate vertically (axis=0 stacks the rows of df2 below df1)
vertical_concat = pd.concat([df1, df2], axis=0)
print("\nVertical Concatenation using pd.concat():")
print(vertical_concat)

# Concatenate horizontally (axis=1 places the columns of df2 next to df1)
# Stored under its own name so the vertical result above is not overwritten.
horizontal_concat = pd.concat([df1, df2], axis=1)
print("\nHorizontal Concatenation using pd.concat():")
print(horizontal_concat)