FIX PermissionError in datasets fetchers on Windows (scikit-learn#9847)
massich authored and jnothman committed Oct 3, 2017
1 parent c59af71 commit ab33915
Showing 3 changed files with 47 additions and 44 deletions.
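All three fetchers failed the same way: the downloaded archive was removed while a handle to it was still open, which Windows rejects with PermissionError (POSIX systems silently allow it). A minimal, self-contained sketch (not part of the commit) of that behaviour and of the close-before-remove ordering the patch enforces:

# Minimal sketch (illustrative, not from the diff): os.remove() raises
# PermissionError on Windows while any handle to the file is still open,
# which is why each fetcher now closes its archive before deleting it.
import os
import sys
import tempfile

fd, path = tempfile.mkstemp()
os.close(fd)                   # close the low-level handle from mkstemp

handle = open(path)            # an open handle, as the old fetchers kept
if sys.platform == "win32":
    try:
        os.remove(path)        # fails while `handle` is still open
    except PermissionError as exc:
        print("PermissionError:", exc)
handle.close()

os.remove(path)                # succeeds once every handle is closed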
22 changes: 12 additions & 10 deletions sklearn/datasets/california_housing.py
@@ -49,6 +49,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def fetch_california_housing(data_home=None, download_if_missing=True):
     """Loader for the California housing dataset from StatLib.
@@ -96,20 +97,21 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
 
         logger.info('Downloading Cal. housing from {} to {}'.format(
             ARCHIVE.url, data_home))
 
         archive_path = _fetch_remote(ARCHIVE, dirname=data_home)
 
-        fileobj = tarfile.open(
-            mode="r:gz",
-            name=archive_path).extractfile(
-                'CaliforniaHousing/cal_housing.data')
+        with tarfile.open(mode="r:gz", name=archive_path) as f:
+            cal_housing = np.loadtxt(
+                f.extractfile('CaliforniaHousing/cal_housing.data'),
+                delimiter=',')
+            # Columns are not in the same order compared to the previous
+            # URL resource on lib.stat.cmu.edu
+            columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
+            cal_housing = cal_housing[:, columns_index]
+
+            joblib.dump(cal_housing, filepath, compress=6)
         remove(archive_path)
 
-        cal_housing = np.loadtxt(fileobj, delimiter=',')
-        # Columns are not in the same order compared to the previous
-        # URL resource on lib.stat.cmu.edu
-        columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
-        cal_housing = cal_housing[:, columns_index]
-        joblib.dump(cal_housing, filepath, compress=6)
     else:
         cal_housing = joblib.load(filepath)
 
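The california_housing change above reads the CSV member inside a with block, so the tar handle is already closed when remove(archive_path) runs. A runnable sketch of the same pattern against a throwaway archive (the temporary paths below are stand-ins, not the real download):

# Sketch of the fixed california_housing flow against a dummy archive
# (temporary paths and member names here are hypothetical stand-ins).
import os
import tarfile
import tempfile

import numpy as np

tmpdir = tempfile.mkdtemp()
member_path = os.path.join(tmpdir, "cal_housing.data")
with open(member_path, "w") as fh:
    fh.write("1.0,2.0,3.0\n4.0,5.0,6.0\n")

archive_path = os.path.join(tmpdir, "cal_housing.tgz")
with tarfile.open(archive_path, mode="w:gz") as tar:
    tar.add(member_path, arcname="CaliforniaHousing/cal_housing.data")

# Read the member while the archive is open; the with block then closes it.
with tarfile.open(mode="r:gz", name=archive_path) as f:
    cal_housing = np.loadtxt(
        f.extractfile('CaliforniaHousing/cal_housing.data'),
        delimiter=',')
os.remove(archive_path)        # no PermissionError: the tar handle is closed
print(cal_housing.shape)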
39 changes: 21 additions & 18 deletions sklearn/datasets/rcv1.py
@@ -166,21 +166,23 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
 
         Xy = load_svmlight_files(files, n_features=N_FEATURES)
 
-        # delete archives
-        for f in files:
-            remove(f.name)
-
         # Training data is before testing data
         X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr()
         sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7]))
         sample_id = sample_id.astype(np.uint32)
 
         joblib.dump(X, samples_path, compress=9)
         joblib.dump(sample_id, sample_id_path, compress=9)
+
+        # delete archives
+        for f in files:
+            f.close()
+            remove(f.name)
     else:
         X = joblib.load(samples_path)
         sample_id = joblib.load(sample_id_path)
 
+
     # load target (y), categories, and sample_id_bis
     if download_if_missing and (not exists(sample_topics_path) or
                                 not exists(topics_path)):
@@ -195,20 +197,21 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
         y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8)
         sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32)
         category_names = {}
-        for line in GzipFile(filename=topics_archive_path, mode='rb'):
-            line_components = line.decode("ascii").split(u" ")
-            if len(line_components) == 3:
-                cat, doc, _ = line_components
-                if cat not in category_names:
-                    n_cat += 1
-                    category_names[cat] = n_cat
-
-                doc = int(doc)
-                if doc != doc_previous:
-                    doc_previous = doc
-                    n_doc += 1
-                    sample_id_bis[n_doc] = doc
-                y[n_doc, category_names[cat]] = 1
+        with GzipFile(filename=topics_archive_path, mode='rb') as f:
+            for line in f:
+                line_components = line.decode("ascii").split(u" ")
+                if len(line_components) == 3:
+                    cat, doc, _ = line_components
+                    if cat not in category_names:
+                        n_cat += 1
+                        category_names[cat] = n_cat
+
+                    doc = int(doc)
+                    if doc != doc_previous:
+                        doc_previous = doc
+                        n_doc += 1
+                        sample_id_bis[n_doc] = doc
+                    y[n_doc, category_names[cat]] = 1
 
         # delete archive
         remove(topics_archive_path)
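rcv1.py combines both idioms: the file objects handed to load_svmlight_files are closed explicitly before remove(f.name), and the topics file is read through a GzipFile context manager so it too is closed before deletion. A rough sketch with stand-in files (names and contents are invented for illustration):

# Sketch of the rcv1 cleanup order with stand-in files (hypothetical names
# and contents); the real fetcher parses them with load_svmlight_files.
import gzip
import os
import tempfile
from gzip import GzipFile

tmpdir = tempfile.mkdtemp()

files = []
for i in range(2):
    part_path = os.path.join(tmpdir, "vectors_part%d.dat" % i)
    with open(part_path, "w") as fh:
        fh.write("0 1:1.0\n")
    files.append(open(part_path, "rb"))

topics_archive_path = os.path.join(tmpdir, "topics.qrels.gz")
with gzip.open(topics_archive_path, "wb") as fh:
    fh.write(b"C11 2286 1\n")

# ... parse `files` here, as the fetcher does, before deleting them ...

# Close every handle before removing it; Windows rejects remove() otherwise.
for f in files:
    f.close()
    os.remove(f.name)

# The context manager closes the gzip archive before it is deleted.
with GzipFile(filename=topics_archive_path, mode='rb') as f:
    for line in f:
        cat, doc, _ = line.decode("ascii").split(u" ")
os.remove(topics_archive_path)
print(cat, doc)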
30 changes: 14 additions & 16 deletions sklearn/datasets/species_distributions.py
@@ -241,29 +241,27 @@ def fetch_species_distributions(data_home=None,
         logger.info('Downloading species data from %s to %s' % (
             SAMPLES.url, data_home))
         samples_path = _fetch_remote(SAMPLES, dirname=data_home)
-        X = np.load(samples_path)  # samples.zip is a valid npz
+        with np.load(samples_path) as X:  # samples.zip is a valid npz
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                if 'train' in f:
+                    train = _load_csv(fhandle)
+                if 'test' in f:
+                    test = _load_csv(fhandle)
         remove(samples_path)
 
-        for f in X.files:
-            fhandle = BytesIO(X[f])
-            if 'train' in f:
-                train = _load_csv(fhandle)
-            if 'test' in f:
-                test = _load_csv(fhandle)
-
         logger.info('Downloading coverage data from %s to %s' % (
             COVERAGES.url, data_home))
         coverages_path = _fetch_remote(COVERAGES, dirname=data_home)
-        X = np.load(coverages_path)  # coverages.zip is a valid npz
+        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
+            coverages = []
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                logger.debug(' - converting {}'.format(f))
+                coverages.append(_load_coverage(fhandle))
+            coverages = np.asarray(coverages, dtype=dtype)
         remove(coverages_path)
 
-        coverages = []
-        for f in X.files:
-            fhandle = BytesIO(X[f])
-            logger.debug(' - converting {}'.format(f))
-            coverages.append(_load_coverage(fhandle))
-        coverages = np.asarray(coverages, dtype=dtype)
-
         bunch = Bunch(coverages=coverages,
                       test=test,
                       train=train,
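species_distributions.py applies the same idea to the two npz archives: np.load is used as a context manager, the member arrays are copied out while the archive is open, and only then is the file removed. A small sketch with a throwaway .npz (file and member names assumed):

# Sketch of the species_distributions pattern with a throwaway npz archive
# (file name and member names are assumptions, not the real dataset).
import os
import tempfile

import numpy as np

tmpdir = tempfile.mkdtemp()
samples_path = os.path.join(tmpdir, "samples.npz")
np.savez(samples_path, train=np.arange(3), test=np.arange(3, 6))

with np.load(samples_path) as X:    # NpzFile closes its zip file on exit
    arrays = {f: X[f].copy() for f in X.files}   # copy while it is open
os.remove(samples_path)             # safe: no handle left on the archive

print(sorted(arrays), arrays["train"])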
