# What is a Generator and why use it?

In [3]:
def csv_reader(file_name):
    """Lazily yield the lines of a text file, one at a time.

    Being a generator, this reads the file incrementally as the caller asks
    for rows instead of loading the whole file up front.

    Args:
        file_name: Path of the file to read (opened in text mode "r").

    Yields:
        Each line of the file as a string, trailing newline included.
    """
    # The context manager closes the handle when the file is exhausted or the
    # generator is closed early; iterating a bare open() never closes it
    # explicitly.
    with open(file_name, "r") as csv_file:
        for row in csv_file:
            yield row
# Download the sample CSV to data.csv (IPython shell escape; needs wget and network access).
!wget -O data.csv https://www.stats.govt.nz/assets/Uploads/Annual-enterprise-survey/Annual-enterprise-survey-2020-financial-year-provisional/Download-data/annual-enterprise-survey-2020-financial-year-provisional-csv.csv
# Calling the generator function only creates the generator object; no row is read yet.
reader = csv_reader("data.csv")
# Each next() pulls exactly one more row from the file.
print(next(reader))
print(next(reader))

--2022-01-07 14:54:15--  https://www.stats.govt.nz/assets/Uploads/Annual-enterprise-survey/Annual-enterprise-survey-2020-financial-year-provisional/Download-data/annual-enterprise-survey-2020-financial-year-provisional-csv.csv
Resolving www.stats.govt.nz (www.stats.govt.nz)... 45.60.11.104
Connecting to www.stats.govt.nz (www.stats.govt.nz)|45.60.11.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5881081 (5.6M) [text/csv]
Saving to: ‘data.csv’


2022-01-07 14:54:22 (1.17 MB/s) - ‘data.csv’ saved [5881081/5881081]

Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06

2020,Level 1,99999,All industries,Dollars (millions),H01,Total income,Financial performance,"733,258","ANZSIC06 divisions A-S (excluding classes K6330, L6711, O7552, O760, O771, O772, S9540, S9601, S9602, and S9603)"



# What is a Decorator and why use it?

In [None]:
def getName():
    """Return the hard-coded example name used by the decorator demos."""
    name = "Daniel"
    return name

def addTextToName(function):
    """Decorate a callable so that its result is embedded in a labelled string.

    Args:
        function: Callable whose return value is the name to embed.

    Returns:
        A wrapper that forwards its arguments to ``function`` and returns
        ``"This is your name: <result>"``.
    """
    import functools  # local import keeps this definition self-contained

    # functools.wraps copies __name__/__doc__ from the wrapped function, and
    # *args/**kwargs let the decorator wrap functions that take arguments.
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        return "This is your name: {}".format(function(*args, **kwargs))
    return wrapper

# Decorate getName by hand: passing it through addTextToName is what writing
# @addTextToName above a def does implicitly.
namePrinter = addTextToName(getName)

def wrapName(func):
    """Decorate a callable so that calling it prints a two-line greeting.

    Args:
        func: Callable whose return value is the name to print.

    Returns:
        A wrapper that forwards its arguments to ``func``, prints
        ``"Your name is: <result>"`` followed by ``"Nice to meet you."``, and
        returns ``None`` (the wrapped function's result is printed, not
        returned).
    """
    import functools  # local import keeps this definition self-contained

    # functools.wraps copies __name__/__doc__ from the wrapped function, and
    # *args/**kwargs let the decorator wrap functions that take arguments.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print("Your name is: {}".format(func(*args, **kwargs)))
        print("Nice to meet you.")
    return wrapper

@wrapName
def getNameWrapped():
    """Supply the name that the wrapName wrapper prints."""
    name = "Daniel"
    return name

# Undecorated result, then the manually decorated one, each followed by a
# separator line; sep="\n" puts every item on its own line.
print(getName(), "=====", namePrinter(), "=====", sep="\n")
# The @wrapName version prints its own greeting when called.
getNameWrapped()

Daniel
=====
This is your name: Daniel
=====
Your name is: Daniel
Nice to meet you.


# What is a list/dict comprehension and why use it?

In [None]:
# Current salary for each employee, keyed by first name.
salaries = {
    'Anne': 50000,
    'Bert': 60000,
    'Carl': 70000,
    'Dom': 80000,
}
# Dict comprehension: a single expression builds a new dict with "key_"
# prefixed keys and every salary multiplied by 1.13, rounded to 0 decimals.
raisedSalaries = {
    "key_" + name: round(pay * 1.13, 0)
    for name, pay in salaries.items()
}
print(raisedSalaries)

{'key_Anne': 56500.0, 'key_Bert': 67800.0, 'key_Carl': 79100.0, 'key_Dom': 90400.0}


In [None]:
something = [1, 2, 3]
# List comprehension with a filter clause: keep only the values below 2.
a = [value for value in something if value < 2]
a

[1]

# You are multithreading a list of parallel tasks using a thread pool. What is t.join() used for? What is the purpose of using t.join() rather than skipping it? This program seems to work with or without the t.join(). Why do we still include it?

In [None]:
from threading import Thread
from queue import Queue
import time

def worker(args,q):
    """Simulate one second of work, report completion, and enqueue a result.

    Args:
        args: Value identifying this worker; interpolated into the message.
        q: Queue-like object that receives the integer 1 via ``put``.
    """
    time.sleep(1)
    message = "done {}".format(args)
    print(message)
    q.put(1)

# Start three workers, each on its own thread with its own result queue.
workerList = []
for worker_id in range(3):
    result_queue = Queue()
    thread = Thread(target=worker, args=(worker_id, result_queue))
    thread.start()
    workerList.append([result_queue, thread])

# join() blocks until that thread has finished, so the message below is only
# printed once every worker is done.
for _queue, thread in workerList:
    thread.join()

print("ALL WORK DONE")

# Every worker put one value on its queue; add them up.
total = sum(result_queue.get() for result_queue, _thread in workerList)

print("TOTAL={}".format(total))

done 1done 2
done 0

ALL WORK DONE
TOTAL=3


# What is df.apply(myFunction, axis=1)?

In [None]:
import pandas as pd

def reverseName(row):
    """Return the value stored under ``row["name"]`` with its items reversed.

    Args:
        row: Object indexable by the key "name".

    Returns:
        A string made of the value's elements joined in reverse order.
    """
    characters = list(row["name"])
    characters.reverse()
    return "".join(characters)

# One-column frame holding the example names.
df = pd.DataFrame(data={'name': ["Alice", "Bob"]})
# axis=1 applies reverseName to each row; the results are stored as the new "reversed" column.
df["reversed"]=df.apply(reverseName,axis=1)
# display() is supplied by IPython/Jupyter rather than imported in this cell.
display(df.head())

Unnamed: 0,name,reversed
0,Alice,ecilA
1,Bob,boB


# What operation is df1.merge(df2, left_on='lkey', right_on='rkey') doing? What would this be called in SQL?

In [None]:
# Two frames that share the "name" column.
df1 = pd.DataFrame({
    'name': ['Brian', 'Bill', 'Frank'],
    'demerits': [2, 3, 5],
})
df2 = pd.DataFrame({
    'name': ['Brian', 'Bill', 'Frank'],
    'convictions': [6, 7, 8],
})
# how="inner" is merge's default, written out here to make the join type visible.
df1.merge(df2, on='name', how="inner")

Unnamed: 0,name,demerits,convictions
0,Brian,2,6
1,Bill,3,7
2,Frank,5,8


# What is faster, pd.concat([df1,df2,df3]) or a loop of df.append()? Explain your reasoning.

In [None]:
%%time
import random
import pandas as pd
numRows=10000

df = pd.DataFrame(columns=["age","gender"])
for _ in range(numRows):
    df2=pd.DataFrame(data={'age': [random.randint(0,120)], 'gender': [random.choice(["M","F"])]})
    df=df.append(df2)
df.head()

Wall time: 8.55 s


Unnamed: 0,age,gender
0,110,M
0,75,M
0,116,F
0,9,M
0,4,F


In [None]:
%%time
import random
numRows=10000
# NOTE: `pd` is not imported in this cell; it relies on an earlier cell having
# run `import pandas as pd`.
# Collect the single-row frames in a plain Python list first ...
resultArr = []
for _ in range(numRows):
    df2=pd.DataFrame(data={'age': [random.randint(0,120)], 'gender': [random.choice(["M","F"])]})
    resultArr.append(df2)

# ... then combine them with a single pd.concat call after the loop.
df=pd.concat(resultArr)
df.head()

Wall time: 3.52 s


Unnamed: 0,age,gender
0,109,F
0,62,M
0,113,M
0,43,M
0,50,M


# What is the purpose of tools such as Flask and Django?

# Compare the purposes of 1, 2, and 3:

1) Flask/Django/Others

2) apache2/nginx

3) gunicorn/other WSGI

# You are writing a program that scrapes text from a long list of websites. How would you apply parallelism to speed up the scraping task?

# You are writing a Python 3 program. When should you use Docker and when should you use venv?

# What is requirements.txt used for?

# Why not use `git add .`?

# Compare matrix multiplication using the GPU, the x86 CPU, and x86 SIMD vector extensions such as AVX2/SSE3. Why do these different kinds of hardware all exist in our personal computers?

# Compare and contrast Ubuntu and RHEL.