In [1]:
from customerSeg.config import raw_dir

from pathlib import Path
import requests
import re

In [2]:
# Download target used by the cells below: an Excel workbook of MLB player salaries.
url = "https://mathcs.org/statistics/datasets/MLBPlayerSalaries.xlsx" # just a small toy dataset

In [3]:
# Fetch the workbook. The timeout keeps a stalled server from hanging the
# kernel, and raise_for_status() stops an HTTP error page (404/500 ...) from
# being written to disk later as if it were the dataset.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [4]:
# Destination inside the raw-data folder; the file keeps the last segment of the URL as its name.
filepath = raw_dir.joinpath(Path(url).name)
print(filepath)

/Users/jared/DevProjects/customer-segmenter/data/raw/MLBPlayerSalaries.xlsx


In [5]:
# Persist the downloaded bytes; binary mode because the payload is an .xlsx workbook.
with filepath.open('wb') as f:
    f.write(response.content)

In [6]:
# checking quality of download
#
# Re-open the saved workbook and assert that it parsed into the expected
# table, so a corrupt or truncated download fails here instead of downstream.

import pandas as pd

df = pd.read_excel(filepath, engine='openpyxl')

assert not df.empty, f'{filepath} parsed to an empty frame'
expected_columns = {'Year', 'Player', 'Salary', 'Position', 'Team'}
missing_columns = expected_columns - set(df.columns)
assert not missing_columns, f'{filepath} is missing columns: {sorted(missing_columns)}'

In [7]:
# Display the loaded frame; the recorded output elides the middle rows of the long table.
df

Unnamed: 0,Year,Player,Salary,Position,Team
0,1988,Mike Witt,1400000,Pitcher,Los Angeles Angels
1,1988,George Hendrick,989333,Outfielder,Los Angeles Angels
2,1988,Chili Davis,950000,Outfielder,Los Angeles Angels
3,1988,Brian Downing,900000,Designated Hitter,Los Angeles Angels
4,1988,Bob Boone,883000,Catcher,Los Angeles Angels
...,...,...,...,...,...
19538,2011,Gustavo Molina,455000,Catcher,New York Yankees
19539,2011,Ivan Nova,432900,Pitcher,New York Yankees
19540,2011,Colin Curtis,420400,Outfielder,New York Yankees
19541,2011,Eduardo Nunez,419300,Shortstop,New York Yankees


In [8]:
### handling subdirectory
# Despite its name, `sub_dir` holds the full *file* path inside the
# 'baseball' folder below the raw-data root, not just the folder.
sub_dir = raw_dir.joinpath('baseball', Path(url).name)
sub_dir

PosixPath('/Users/jared/DevProjects/customer-segmenter/data/raw/baseball/MLBPlayerSalaries.xlsx')

In [9]:
def _mkdir(path):
    """Create directory ``path`` (and any missing parents), tolerating an existing one.

    Parameters
    ----------
    path : str or pathlib.Path
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``path`` already exists but is not a directory (for example a
        regular file).  An existing directory is not an error: a notice is
        printed and the function returns normally.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    try:
        path.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        # mkdir raises FileExistsError for *anything* already at `path`.
        # Only an existing directory is acceptable; a file in the way would
        # make every later write into this "directory" fail, so surface it.
        if not path.is_dir():
            raise
        print(f'Path {path} already exists')


def from_url(url, sub_dir='', mkdir=False, timeout=30):
    """Download ``url`` into ``raw_dir / sub_dir`` under the URL's file name.

    The file name is the last segment of the URL *path*; any query string
    (``?a=b``) or fragment (``#x``) is ignored so it cannot leak into the
    name.  If the URL has no path segment, the host part is used instead.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    sub_dir : str or pathlib.Path, default ''
        Folder below ``raw_dir`` that receives the file.
    mkdir : bool, default False
        When True, create ``raw_dir / sub_dir`` first via ``_mkdir``.  When
        False the folder is not created and must already exist for the file
        to be opened.
    timeout : float or tuple, default 30
        Forwarded to ``requests.get`` so an unresponsive server cannot block
        the call indefinitely.

    Returns
    -------
    int or None
        ``9`` if the request raised ``requests.RequestException`` (connection
        failure or HTTP error status, reported on stdout); ``None`` after the
        file has been written.
    """
    # Function-scope import keeps this definition usable without editing the import cell.
    from urllib.parse import urlsplit

    response = None
    try:
        response = requests.get(url, stream=True, timeout=timeout)  # stream=True for large files
        response.raise_for_status()
    except requests.RequestException as e:
        # An HTTP error status still leaves a streamed response open; release it.
        if response is not None:
            response.close()
        print(f'Connection failed to source {url}')
        print(e)
        return 9

    # `with response` closes the streamed response even if directory
    # creation or writing fails part-way through.
    with response:
        sub_path = raw_dir / sub_dir
        if mkdir:
            _mkdir(sub_path)

        # Name the file after the URL path only, never the query string.
        parts = urlsplit(url)
        filename = Path(parts.path).name or parts.netloc
        filepath = sub_path / filename

        with open(filepath, 'wb') as f:
            f.writelines(response.iter_content(1024))
        

In [10]:
# mkdir=False: from_url does not create the 'baseball' folder on this call.
from_url(url, sub_dir='baseball', mkdir=False)

In [11]:
# mkdir=True routes through _mkdir first; the folder is already there, so it only prints a notice.
from_url(url, sub_dir='baseball', mkdir=True)

Path /Users/jared/DevProjects/customer-segmenter/data/raw/baseball already exists


In [12]:
# Failure path: the request to this host fails, so from_url prints the error and returns 9.
from_url('https://www.bbbbbbb88888.com/nope.csv')

Connection failed to source https://www.bbbbbbb88888.com/nope.csv
HTTPSConnectionPool(host='www.bbbbbbb88888.com', port=443): Max retries exceeded with url: /nope.csv (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x115793eb0>: Failed to establish a new connection: [Errno 61] Connection refused'))


9

### Using `pandas.io` helpers to classify input sources (URLs, fsspec paths, file-like objects)

In [28]:
from io import StringIO
from pandas.io.common import is_url, is_file_like, is_fsspec_url
from pandas.io.parsers import get_filepath_or_buffer

In [30]:
# Wrap the path *text* in an in-memory text buffer. Note that the buffer
# contains the characters of the path string, not the contents of the file.
source = StringIO(str(filepath))

In [34]:
# A StringIO buffer is reported as file-like by pandas (True).
is_file_like(source)

True

In [17]:
# NOTE(review): get_filepath_or_buffer is imported from pandas.io.parsers and
# looks like an internal helper rather than documented public API — confirm it
# exists in the pandas version being targeted.
# For a local path the recorded result is a 4-tuple whose first item is the path string.
get_filepath_or_buffer(filepath)

('/Users/jared/DevProjects/customer-segmenter/data/raw/MLBPlayerSalaries.xlsx',
 None,
 None,
 False)

In [21]:
# An s3:// address is reported as an fsspec URL (True).
is_fsspec_url('s3://example-bucket/path/to/object')

True

In [22]:
# A plain HTTPS address is reported as a URL by is_url (True).
is_url('https://mathcs.org/statistics/datasets/MLBPlayerSalaries.xlsx')

True

In [35]:
# An S3 object addressed over plain http:// is NOT reported as an fsspec URL
# (False), unlike the s3:// form checked earlier.
s3_url = 'http://s3-aws-region.amazonaws.com/bucket/key1/key2'
is_fsspec_url(s3_url)

False

### Downloading from a JSON URL

In [84]:
import json

In [97]:
# World Bank API: US GDP in current US$ (indicator NY.GDP.MKTP.CD), returned as JSON.
# NOTE: this rebinds `url` and `response`, which earlier cells used for the Excel download.
url = 'http://api.worldbank.org/v2/countries/USA/indicators/NY.GDP.MKTP.CD?per_page=5000&format=json'

# The timeout keeps a stalled server from hanging the kernel; raise_for_status()
# surfaces an HTTP error here instead of as a confusing failure in .json().
response = requests.get(url, timeout=30)
response.raise_for_status()

In [104]:
# The parsed payload is a two-item list: a paging-metadata dict first, then the list of yearly records.
response.json()

[{'page': 1,
  'pages': 1,
  'per_page': 5000,
  'total': 61,
  'sourceid': '2',
  'lastupdated': '2020-12-16'},
 [{'indicator': {'id': 'NY.GDP.MKTP.CD', 'value': 'GDP (current US$)'},
   'country': {'id': 'US', 'value': 'United States'},
   'countryiso3code': 'USA',
   'date': '2020',
   'value': None,
   'unit': '',
   'obs_status': '',
   'decimal': 0},
  {'indicator': {'id': 'NY.GDP.MKTP.CD', 'value': 'GDP (current US$)'},
   'country': {'id': 'US', 'value': 'United States'},
   'countryiso3code': 'USA',
   'date': '2019',
   'value': 21433226000000,
   'unit': '',
   'obs_status': '',
   'decimal': 0},
  {'indicator': {'id': 'NY.GDP.MKTP.CD', 'value': 'GDP (current US$)'},
   'country': {'id': 'US', 'value': 'United States'},
   'countryiso3code': 'USA',
   'date': '2018',
   'value': 20580159776000,
   'unit': '',
   'obs_status': '',
   'decimal': 0},
  {'indicator': {'id': 'NY.GDP.MKTP.CD', 'value': 'GDP (current US$)'},
   'country': {'id': 'US', 'value': 'United States'},
   

In [105]:
# Save the same World Bank response to disk; sub_dir defaults to '', so the file lands directly in raw_dir.
from_url('http://api.worldbank.org/v2/countries/USA/indicators/NY.GDP.MKTP.CD?per_page=5000&format=json')

In [136]:
# Show the raw-data root imported from customerSeg.config.
raw_dir

PosixPath('/Users/jared/DevProjects/customer-segmenter/data/raw')