## Create Large Data File

Create a version of the Norfolk weather file that is more than 1M lines long.

In [None]:
# Build a 1M+ line CSV by repeating the rows of the 1999 Norfolk weather file.
with open('files/NorfolkWeather1999.csv', 'r') as f:
    data = f.read()

# Split into lines and drop empty ones, so a trailing newline in the source
# file does not turn into a blank line between every repeated copy.
lines = [line for line in data.split('\n') if line]

# The first line is treated as a header and written only once; repeating the
# whole file text would scatter 3000 header rows through the output.
# NOTE(review): assumes the file starts with a header row (another cell skips
# the first line with readline()) -- confirm against the actual file.
header, rows = lines[:1], lines[1:]

with open('files/NorfolkWeatherLong.csv', 'w') as f:
    f.writelines(line + '\n' for line in header)
    # Stream the rows out instead of materialising one giant string in memory.
    f.writelines(row + '\n' for _ in range(3000) for row in rows)

If you prefer, the cell below creates 1M lines of random temperatures.

In [None]:
import random

# Alternative long file: one "index,value" row per line for 1,000,000 rows,
# where each value is a random draw scaled by 100. Rows are joined with '\n',
# so the file has no trailing newline.
with open('files/NorfolkWeatherLong.csv', 'w') as f:
    f.write('\n'.join(f'{i},{100 * random.random()}' for i in range(1_000_000)))

In [None]:
from itertools import islice

# Discard the first line, then pull two consecutive 5-line slices from the
# same file handle. The second slice continues where the first one stopped
# because islice consumes the underlying iterator.
with open('files/NorfolkWeather1999.csv', 'r') as f:
    _ = f.readline()
    first_five = list(islice(f, 5))
    next_five = list(islice(f, 5))
    print(first_five)
    print(next_five)

In [None]:
with open('files/NorfolkWeather1999.csv', 'r') as f:
    # TODO: this cell was left unfinished (the original statement was a
    # dangling `f.`, which is a SyntaxError). `pass` keeps the notebook
    # runnable until the intended file operation is filled in.
    pass

# Speed Study

In [None]:
import pandas as pd
import time

# Time the same passenger-count tally for several read_csv chunk sizes.
cs = [1000, 10000, 100000, 1000000, 2000000, 5000000, 10000000]
times = {}   # chunk size -> elapsed seconds
for chunksize in cs:
    # perf_counter is the clock intended for measuring elapsed intervals.
    start = time.perf_counter()

    fh = pd.Series(dtype='int64')
    # Pass the loop's chunk size through to read_csv. It was previously
    # hard-coded to 2000000, so every iteration timed the same configuration.
    for chunk in pd.read_csv('files/new-york-city-taxi-fare-prediction/train.csv', chunksize=chunksize):
        # Merge this chunk's counts into the running total; fill_value=0
        # covers passenger counts missing from one side.
        fh = fh.add(chunk['passenger_count'].value_counts(), fill_value=0)
    print(fh)
    et = time.perf_counter() - start
    print(f'Execution time: {et: .2f} seconds')
    # Store the value that was just reported rather than re-reading the clock
    # after the prints.
    times[chunksize] = et
times

In [None]:
import gzip

# Read every line of the gzip-compressed TSV as raw bytes (binary mode),
# then show the first five lines as the cell output.
with gzip.open('files/title.crew.tsv.gz', 'rb') as f:
    data = list(f)
data[:5]

# Generate Number of Guests in Restaurant Parties

In [None]:
# Each entry is a (probability, number of guests) pair.
p_dist = (
    (0.1, 1), (0.4, 2), (0.15, 3), (0.15, 4), (0.05, 5), (0.07, 6),
    (0.01, 7), (0.01, 8), (0.01, 10), (0.01, 12), (0.04, 99),
)

# Running totals of the probabilities: entry k is the sum of the first k+1.
cum_dist = [sum(p for p, _ in p_dist[:k]) for k in range(1, len(p_dist) + 1)]
# Pin the last entry to exactly 1.0 so float rounding cannot leave it short.
cum_dist[-1] = 1.0

# Party sizes, in the same order as cum_dist.
num_guest = [size for _, size in p_dist]
cum_dist, num_guest

In [None]:
import json
import random

num_obs = 10000


def _draw_party_size():
    """Sample one party size by walking cum_dist until it reaches a uniform draw."""
    rv = random.random()
    idx = 0
    # Advance to the first cumulative probability that is not below rv.
    while rv > cum_dist[idx]:
        idx += 1
    return num_guest[idx]


# Draw num_obs independent party sizes and save them as a JSON array.
obs = [_draw_party_size() for _ in range(num_obs)]
with open('files/guests.json', 'w') as f:
    json.dump(obs, f)

In [None]:
# Read the saved observations back from disk and display them.
with open('files/guests.json', 'r') as f:
    x = json.loads(f.read())
x

# Small File for Bluebikes

In [None]:
import requests
import time

start = time.time()
url = 'https://jrbrad.people.wm.edu/data/ctba/bluebikes.csv'

response = requests.get(url)
# Fail loudly on an HTTP error instead of saving an error page as CSV.
response.raise_for_status()
print(time.time() - start)

# Split the downloaded bytes in one pass instead of using
# response.iter_lines(): iter_lines() splits each network chunk separately,
# so a '\r\n' that straddles two chunks can surface as a spurious empty line.
# Slicing also copes with a response shorter than 103 lines, where calling
# next() on an exhausted generator would raise.
data = [line.decode('utf-8') for line in response.content.splitlines()[:103]]

# Write into files/ so the cell that reads 'files/bluebikes_small.csv' finds
# the file that was just created.
with open('files/bluebikes_small.csv', 'w') as f:
    f.write('\n'.join(data))

In [None]:
# Load the small Bluebikes sample as a list of lines (newlines kept) and show it.
with open('files/bluebikes_small.csv', 'r') as f:
    data = list(f)
data

# Take out empty lines of Bluebikes data and replace decoding issues

__Big takeaway:__ the method <code>response.iter_lines()</code> is not reliable.  Do not use it to retrieve Internet data.  Instead, use <code>response.text.split('\r\n')</code>.

In [None]:
import requests
import time

start = time.time()

# Get the large data set from the Internet with the requests module.
url = 'https://jrbrad.people.wm.edu/data/ctba/bluebikes.csv'

response = requests.get(url)
# Fail loudly on an HTTP error instead of saving an error page as CSV.
response.raise_for_status()

# Clean the download in one pass over the complete body:
#   * split the raw bytes into lines here rather than with iter_lines(),
#     which splits each network chunk separately and can emit spurious
#     empty lines when a '\r\n' straddles two chunks;
#   * drop empty lines;
#   * decode as UTF-8, turning undecodable bytes into U+FFFD and then
#     replacing that marker with '/'.
data = [
    raw.decode(encoding='utf-8', errors='replace').replace('\uFFFD', '/')
    for raw in response.content.splitlines()
    if raw
]

with open('files/bluebikes_clean.csv', 'w') as f:
    f.write('\n'.join(data))

print(f'Acquisition time: {time.time() - start} seconds')

# Clean Monument Data

In [None]:
import unicodedata

# Read the raw file; bytes that are not valid UTF-8 become U+FFFD.
with open('files/nri-AntActspsht2021-10-21.csv', 'r', encoding='utf-8', errors='replace') as f:
    data = f.read()

# Decompose accented characters (NFD) and force the text down to ASCII.
# NOTE(review): with 'replace', every non-ASCII code point -- including the
# combining marks split off by NFD -- becomes '?'. 'ignore' would drop them
# instead; confirm which result is wanted.
data = unicodedata.normalize('NFD', data).encode('ascii', 'replace').decode('utf-8')

# The text is pure ASCII at this point, so it is written in one call. The
# earlier per-character write inside a bare `except:` was guarding against
# encoding failures that can no longer occur, and it would also have hidden
# unrelated errors such as a full disk or a keyboard interrupt.
with open('files/nri-AntActspsht2021-10-21_.csv', 'w') as f:
    f.write(data)
#data = [d.strip().split(',') for d in data]
# Put your code here

#print(data)

In [None]:
# Display whatever `data` currently holds as the cell output.
data