In [3]:
# Import Libraries

import numpy as np # numPy - mathematical operations
import matplotlib.pyplot as plt # matplotlib - generate graphs
import pandas as pd # pandas - manipulate datasets
import statsmodels.api as sm # function to estimate models
import statsmodels.formula.api as smf # general use statistical options


Python Built-in Functions

In [5]:
# Important built-in functions to remember.
# Note: in a notebook cell, only print() output and the value of the LAST
# expression (here, len) are shown; wrap the other calls in print() to see them.

print("String") # prints the text passed to it
type(3.14) # returns the type of the object: float here (other common types: int, str, bool)
round(np.pi, 6) # rounds the float to 6 decimal places -> 3.141593
abs(-4) # returns the absolute value -> 4
len("happy") # returns the length of a string or the number of elements in a list -> 5

String printed


5

Lecture 8: User-defined functions

In [None]:
# Define function
def fn_status(numeric_grade, passing_grade = 55):
    """Classify a numeric grade as "pass" or "fail".

    Parameters
    ----------
    numeric_grade : number
        The grade to classify.
    passing_grade : number, optional
        Minimum grade (inclusive) needed to pass. Defaults to 55, the
        threshold that was previously hard-coded in the function body.

    Returns
    -------
    str
        "pass" if numeric_grade >= passing_grade, otherwise "fail".
    """
    if numeric_grade >= passing_grade:
        status = "pass"
    else:
        status = "fail"
    return status


print(fn_status(56))

In [None]:
# Lambda functions: short one-line functions written as
#     name = lambda arguments: expression
fn_iseligible_vote = lambda age: age >= 18    # True when age is at least 18
fn_sum = lambda x, y, z: (x + y) + z          # adds its three arguments, left to right

Lecture 9: Local/Global Variables

In [6]:
# Global variables are stored in the working environment, so they can be
# referenced from other parts of the notebook (ex. hi = "hello").
# Although a global variable can be read inside a function, it is better to
# pass the value in as a parameter.

# Local variables (ex. those defined inside a function) are stored temporarily
# and supersede global variables of the same name while the function runs.
# To permanently modify a global variable from inside a function, use "global".
def modify_x():
    """Add 5 to the global variable x.

    x is read before it is reassigned, so it must already be defined in the
    notebook when this function is called. The function returns nothing; its
    only effect is rebinding the global x.
    """
    global x
    x = x + 5

In [7]:
# Start from an empty data frame and add one column per variable
data = pd.DataFrame([])

data["age"]                   = [18, 29, 15, 32, 6]
data["num_underage_siblings"] = [0, 0, 1, 1, 0]
data["num_adult_siblings"]    = [1, 0, 0, 1, 0]

# Helper functions written as lambdas
fn_iseligible_vote = lambda age: age >= 18
fn_istwenties      = lambda age: (age >= 20) & (age < 30)
fn_sum             = lambda x, y: x + y

# One-argument function: .apply() passes each element of the column to the
# function and collects the returned values into a new column
data["can_vote"] = data["age"].apply(fn_iseligible_vote)

# Two-argument function: walk both columns in parallel, one row at a time
# (same result as list(map(fn_sum, column_1, column_2)))
data["num_siblings"] = [
    fn_sum(underage, adult)
    for underage, adult in zip(data["num_underage_siblings"],
                               data["num_adult_siblings"])
]

Lecture 10: Subsetting Data

In [None]:
# Read the car features dataset
carfeatures = pd.read_csv("data_raw/features.csv")

# In a notebook, display() shows a long data frame truncated to its
# first five and last five rows
display(carfeatures)

# Extract column names
car_colnames = carfeatures.columns.values

# Subset columns
display(carfeatures[["weight","mpg"]])

# Sort
# (done BEFORE the row-subsetting examples, which use the sorted data frames;
#  previously "carsorted" was used before being created and
#  "car_ascendingmpg" was never created, so the cell failed on a fresh kernel)
carsorted        = carfeatures.sort_values(by = "mpg", ascending = False)
car_ascendingmpg = carfeatures.sort_values(by = "mpg", ascending = True)

# Subset rows: data.iloc[lower:upper, : ]   (the upper bound is excluded)
display(carsorted.iloc[0,:])          # Extract the row in position 0 (highest mpg)
display(carfeatures.iloc[0:5,:])      # Extract rows 0 to 4 (five rows)
display(car_ascendingmpg.iloc[:, 2])  # Extract the column in position 2 (third column)

# Query
data_rangeweight    = carfeatures.query("(acceleration >= 10) & (acceleration < 18)")
threshold = 25
data_varthreshold_mpg = carfeatures.query("mpg >= @threshold") # "@" references a Python variable inside the query string
# Backticks are needed for column names that contain spaces.
# NOTE(review): this assumes "features.csv" has a column called "new variable";
# no such column is created in this cell -- confirm, otherwise this line raises an error.
data_spacesthreshold_mpg = carfeatures.query("`new variable` >= 25")

Lecture 11: Linear Regression

In [8]:
dataset = pd.DataFrame([])

# Seeded random number generator, so that "Restart & Run All" reproduces the
# same simulated data (change or remove the seed to draw a different sample)
rng = np.random.default_rng(seed = 151)

# 2 random variables
n = 50
dataset["x"] = rng.normal(loc = 0,scale = 1, size = n)
dataset["e"] = rng.normal(loc = 0,scale = 1, size = n) # error term

# create data from linear model
b0 = 1 # intercept
b1 = 2 # slope
dataset["y"] = b0 + b1 * dataset["x"] + dataset["e"] # compute formulas

# theoretical best fit line
dataset["p"] = b0 + b1*dataset["x"]

# summary statistics
ybar = dataset["y"].mean()
stdv_sample = dataset["y"].std()
y_median = dataset["y"].median()

#------------------------------------------------------------------------------#
# We use the subfunction "ols()" in the library "smf"
#---- (i) The first argument is a string called "formula" with the format 
#-------- "outcome ~ independent_vars"
#----(ii) the second argument is the dataset
# The second line fits the model with the standard errors chosen by the
# argument "cov_type". In this case we use "robust" standard errors (HC1).
# Note: the keyword must be "cov_type"; an unrecognized keyword such as "cov"
# does not select the covariance type, so default (non-robust) errors are used.
#-------------------------------------------------------------------------------#

model   = smf.ols(formula = 'y ~  x',data = dataset)
results = model.fit(cov_type = "HC1")

# Can also run as one line
# results = smf.ols(formula = 'y ~ x',data = dataset).fit(cov_type = "HC1")

# We will use ".params" to get the attribute "parameters from the results"
# (a pandas Series labelled "Intercept" and "x")

b_list = results.params
print(b_list)

# We can then compute the "estimated" best fit lines
# by extracting the intercept and slope from "b_list".
# The values are selected by label: selecting by position with b_list[0]
# is deprecated for a Series whose index is made of strings.

dataset["p_estimated"] = b_list["Intercept"] + b_list["x"]  * dataset["x"]

# Note: The estimators for "b0" and "b1" are close to 
# the values we used to generate the data



In [None]:
"""
(d) Split a dataset into subsets

- You will be asked to randomly assign a status to each row
- Split the data into separate datasets using ".query()"
- This will closely follow the material in Lecture 12 (this one)
- You will need this result to answer questions (e), (f)
"""

In [None]:
# Write your own code here -- subset by querying
subset_above2 = dataset.query("y >= 2")

# Number of rows before and after subsetting, then the share that was kept
original = len(dataset)
new      = len(subset_above2)
print(original, new, sep = "\n")
print("Proportion: " + str(new/original))


# Write your own code

# Sample error: observed outcome minus the estimated best fit line
dataset["sample_error"] = dataset["y"].sub(dataset["p_estimated"])

# Status function: True when the error is zero or positive
fn_pos_error = lambda error: error >= 0

# The same status is stored twice, computed in two equivalent ways:
# element by element, and with .apply()
dataset["pos_error"]      = [fn_pos_error(value) for value in dataset["sample_error"]]
dataset["positive_error"] = dataset["sample_error"].apply(fn_pos_error)
display(dataset)

In [None]:
"""

(e) Create a function with four inputs $f(y,x,b0,b1)$

- Start by using "def" to define the function
- The function will include arithmetic operations (Lecture 3) <br>
and summary statistics for pandas (mean, std, min, max, etc.)
- You will be asked to test different values of $(y,x,b0,b1)$
- You will get $y$ and $x$ from the two datasets in part (d)
- Note: You will **not** be required to use the "statsmodels" library
"""

In [None]:
# Summary statistics of the outcome column "y" (output of pandas .describe())
dataset.loc[:, "y"].describe()

In [None]:
"""(f) Create two overlapping histogram plots

- You will use a variable from the two datasets in (d)
- You need to use the "alpha" option to make the graphs semitransparent
- You will need to add a legend, label the axes, and the title
- Note: The goal of this question is to illustrate that random <br>
assignment produces very similar distributions between two groups"""

In [None]:
# Write your own code here
data_urban = pd.read_csv("data/wdi_urban.csv")

# Draw one semitransparent histogram per year on the same axes.
# Each histogram is given its own label, so every legend entry is tied to the
# histogram it describes instead of relying on the plotting order.
list_years = pd.unique(data_urban["year"])
for year in list_years:
    df = data_urban.query("year == @year")
    plt.hist(x = df["prop_urbanpopulation"], alpha = 0.5, label = str(year))

plt.xlabel("Proportion of Urban Population")
plt.ylabel("Number of Countries")
# NOTE(review): the title assumes the file contains exactly the years 1980 and
# 2020 -- confirm against the data
plt.title("Proportion of Urban Population for Countries in 1980 vs 2020")
plt.legend(title = "Years")
plt.show()

# From this graph we can compare the distribution of different proportions 
# of urban populations in countries across 40 years. Countries in 2020 shift
# right, suggesting a higher urban population globally. 