diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
index cc5882ecb9cb9..784f5742c1b43 100644
--- a/sklearn/datasets/california_housing.py
+++ b/sklearn/datasets/california_housing.py
@@ -49,6 +49,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def fetch_california_housing(data_home=None, download_if_missing=True):
     """Loader for the California housing dataset from StatLib.
 
@@ -96,20 +97,21 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
         logger.info('Downloading Cal. housing from {} to {}'.format(
             ARCHIVE.url, data_home))
+
         archive_path = _fetch_remote(ARCHIVE, dirname=data_home)
 
-        fileobj = tarfile.open(
-            mode="r:gz",
-            name=archive_path).extractfile(
-                'CaliforniaHousing/cal_housing.data')
+        with tarfile.open(mode="r:gz", name=archive_path) as f:
+            cal_housing = np.loadtxt(
+                f.extractfile('CaliforniaHousing/cal_housing.data'),
+                delimiter=',')
+            # Columns are not in the same order compared to the previous
+            # URL resource on lib.stat.cmu.edu
+            columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
+            cal_housing = cal_housing[:, columns_index]
+
+            joblib.dump(cal_housing, filepath, compress=6)
         remove(archive_path)
 
-        cal_housing = np.loadtxt(fileobj, delimiter=',')
-        # Columns are not in the same order compared to the previous
-        # URL resource on lib.stat.cmu.edu
-        columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
-        cal_housing = cal_housing[:, columns_index]
-        joblib.dump(cal_housing, filepath, compress=6)
     else:
         cal_housing = joblib.load(filepath)
 
diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py
index 7c3d6d3edde76..5b968907920fc 100644
--- a/sklearn/datasets/rcv1.py
+++ b/sklearn/datasets/rcv1.py
@@ -166,10 +166,6 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
 
         Xy = load_svmlight_files(files, n_features=N_FEATURES)
 
-        # delete archives
-        for f in files:
-            remove(f.name)
-
         # Training data is before testing data
         X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr()
         sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7]))
@@ -177,10 +173,16 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
         joblib.dump(X, samples_path, compress=9)
         joblib.dump(sample_id, sample_id_path, compress=9)
+
+        # delete archives
+        for f in files:
+            f.close()
+            remove(f.name)
 
     else:
         X = joblib.load(samples_path)
         sample_id = joblib.load(sample_id_path)
 
+
     # load target (y), categories, and sample_id_bis
     if download_if_missing and (not exists(sample_topics_path) or
                                 not exists(topics_path)):
@@ -195,20 +197,21 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
         y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8)
         sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32)
         category_names = {}
-        for line in GzipFile(filename=topics_archive_path, mode='rb'):
-            line_components = line.decode("ascii").split(u" ")
-            if len(line_components) == 3:
-                cat, doc, _ = line_components
-                if cat not in category_names:
-                    n_cat += 1
-                    category_names[cat] = n_cat
-
-                doc = int(doc)
-                if doc != doc_previous:
-                    doc_previous = doc
-                    n_doc += 1
-                    sample_id_bis[n_doc] = doc
-                y[n_doc, category_names[cat]] = 1
+        with GzipFile(filename=topics_archive_path, mode='rb') as f:
+            for line in f:
+                line_components = line.decode("ascii").split(u" ")
+                if len(line_components) == 3:
+                    cat, doc, _ = line_components
+                    if cat not in category_names:
+                        n_cat += 1
+                        category_names[cat] = n_cat
+
+                    doc = int(doc)
+                    if doc != doc_previous:
+                        doc_previous = doc
+                        n_doc += 1
+                        sample_id_bis[n_doc] = doc
+                    y[n_doc, category_names[cat]] = 1
 
         # delete archive
         remove(topics_archive_path)
diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
index 8735041d93e79..945fed5d02d2d 100644
--- a/sklearn/datasets/species_distributions.py
+++ b/sklearn/datasets/species_distributions.py
@@ -241,29 +241,27 @@ def fetch_species_distributions(data_home=None,
         logger.info('Downloading species data from %s to %s' % (
             SAMPLES.url, data_home))
         samples_path = _fetch_remote(SAMPLES, dirname=data_home)
-        X = np.load(samples_path)  # samples.zip is a valid npz
+        with np.load(samples_path) as X:  # samples.zip is a valid npz
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                if 'train' in f:
+                    train = _load_csv(fhandle)
+                if 'test' in f:
+                    test = _load_csv(fhandle)
         remove(samples_path)
 
-        for f in X.files:
-            fhandle = BytesIO(X[f])
-            if 'train' in f:
-                train = _load_csv(fhandle)
-            if 'test' in f:
-                test = _load_csv(fhandle)
-
         logger.info('Downloading coverage data from %s to %s' % (
             COVERAGES.url, data_home))
         coverages_path = _fetch_remote(COVERAGES, dirname=data_home)
-        X = np.load(coverages_path)  # coverages.zip is a valid npz
+        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
+            coverages = []
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                logger.debug(' - converting {}'.format(f))
+                coverages.append(_load_coverage(fhandle))
+            coverages = np.asarray(coverages, dtype=dtype)
         remove(coverages_path)
 
-        coverages = []
-        for f in X.files:
-            fhandle = BytesIO(X[f])
-            logger.debug(' - converting {}'.format(f))
-            coverages.append(_load_coverage(fhandle))
-        coverages = np.asarray(coverages, dtype=dtype)
-
         bunch = Bunch(coverages=coverages, test=test, train=train,
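
All three loaders are changed in the same way: each downloaded archive is now read inside a `with` block (or closed explicitly, as with the `GzipFile` objects in `rcv1.py`) before `remove()` is called on its path, so no open handle outlives the file. This matters on Windows, where deleting a file that still has an open handle raises a `PermissionError`. Below is a minimal sketch of the pattern; the archive path and member name are hypothetical and not part of this patch:

```python
import os
import tarfile

archive_path = "archive.tar.gz"  # hypothetical download target

# Read what we need while the archive is open; the context manager
# closes the handle even if extractfile() or read() raises.
with tarfile.open(name=archive_path, mode="r:gz") as f:
    raw = f.extractfile("data/table.csv").read()

# No handle to archive_path remains open at this point, so the
# delete succeeds on Windows as well as on POSIX.
os.remove(archive_path)
```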