FIX PermissionError in datasets fetchers on Windows (scikit-learn#9847)
massich authored and jnothman committed Oct 3, 2017
1 parent c59af71 commit ab33915
Showing 3 changed files with 47 additions and 44 deletions.
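All three fetchers failed the same way: the downloaded archive was removed while a handle to it was still open, which Windows rejects with PermissionError (POSIX systems silently allow it). A minimal, self-contained sketch (not part of the commit) of that behaviour and of the close-before-remove ordering the patch enforces:

# Minimal sketch (illustrative, not from the diff): os.remove() raises
# PermissionError on Windows while any handle to the file is still open,
# which is why each fetcher now closes its archive before deleting it.
import os
import sys
import tempfile

fd, path = tempfile.mkstemp()
os.close(fd)                   # close the low-level handle from mkstemp

handle = open(path)            # an open handle, as the old fetchers kept
if sys.platform == "win32":
    try:
        os.remove(path)        # fails while `handle` is still open
    except PermissionError as exc:
        print("PermissionError:", exc)
handle.close()

os.remove(path)                # succeeds once every handle is closed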
22 changes: 12 additions & 10 deletions sklearn/datasets/california_housing.py
@@ -49,6 +49,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def fetch_california_housing(data_home=None, download_if_missing=True):
     """Loader for the California housing dataset from StatLib.
@@ -96,20 +97,21 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
 
         logger.info('Downloading Cal. housing from {} to {}'.format(
             ARCHIVE.url, data_home))
 
         archive_path = _fetch_remote(ARCHIVE, dirname=data_home)
 
-        fileobj = tarfile.open(
-            mode="r:gz",
-            name=archive_path).extractfile(
-                'CaliforniaHousing/cal_housing.data')
+        with tarfile.open(mode="r:gz", name=archive_path) as f:
+            cal_housing = np.loadtxt(
+                f.extractfile('CaliforniaHousing/cal_housing.data'),
+                delimiter=',')
+            # Columns are not in the same order compared to the previous
+            # URL resource on lib.stat.cmu.edu
+            columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
+            cal_housing = cal_housing[:, columns_index]
+
+            joblib.dump(cal_housing, filepath, compress=6)
         remove(archive_path)
 
-        cal_housing = np.loadtxt(fileobj, delimiter=',')
-        # Columns are not in the same order compared to the previous
-        # URL resource on lib.stat.cmu.edu
-        columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
-        cal_housing = cal_housing[:, columns_index]
-        joblib.dump(cal_housing, filepath, compress=6)
     else:
         cal_housing = joblib.load(filepath)
 
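The california_housing change above reads the CSV member inside a with block, so the tar handle is already closed when remove(archive_path) runs. A runnable sketch of the same pattern against a throwaway archive (the temporary paths below are stand-ins, not the real download):

# Sketch of the fixed california_housing flow against a dummy archive
# (temporary paths and member names here are hypothetical stand-ins).
import os
import tarfile
import tempfile

import numpy as np

tmpdir = tempfile.mkdtemp()
member_path = os.path.join(tmpdir, "cal_housing.data")
with open(member_path, "w") as fh:
    fh.write("1.0,2.0,3.0\n4.0,5.0,6.0\n")

archive_path = os.path.join(tmpdir, "cal_housing.tgz")
with tarfile.open(archive_path, mode="w:gz") as tar:
    tar.add(member_path, arcname="CaliforniaHousing/cal_housing.data")

# Read the member while the archive is open; the with block then closes it.
with tarfile.open(mode="r:gz", name=archive_path) as f:
    cal_housing = np.loadtxt(
        f.extractfile('CaliforniaHousing/cal_housing.data'),
        delimiter=',')
os.remove(archive_path)        # no PermissionError: the tar handle is closed
print(cal_housing.shape)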
39 changes: 21 additions & 18 deletions sklearn/datasets/rcv1.py
@@ -166,21 +166,23 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
 
         Xy = load_svmlight_files(files, n_features=N_FEATURES)
 
-        # delete archives
-        for f in files:
-            remove(f.name)
-
         # Training data is before testing data
         X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr()
         sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7]))
         sample_id = sample_id.astype(np.uint32)
 
         joblib.dump(X, samples_path, compress=9)
         joblib.dump(sample_id, sample_id_path, compress=9)
+
+        # delete archives
+        for f in files:
+            f.close()
+            remove(f.name)
     else:
         X = joblib.load(samples_path)
         sample_id = joblib.load(sample_id_path)
 
+
     # load target (y), categories, and sample_id_bis
     if download_if_missing and (not exists(sample_topics_path) or
                                 not exists(topics_path)):
@@ -195,20 +197,21 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
         y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8)
         sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32)
         category_names = {}
-        for line in GzipFile(filename=topics_archive_path, mode='rb'):
-            line_components = line.decode("ascii").split(u" ")
-            if len(line_components) == 3:
-                cat, doc, _ = line_components
-                if cat not in category_names:
-                    n_cat += 1
-                    category_names[cat] = n_cat
-
-                doc = int(doc)
-                if doc != doc_previous:
-                    doc_previous = doc
-                    n_doc += 1
-                    sample_id_bis[n_doc] = doc
-                y[n_doc, category_names[cat]] = 1
+        with GzipFile(filename=topics_archive_path, mode='rb') as f:
+            for line in f:
+                line_components = line.decode("ascii").split(u" ")
+                if len(line_components) == 3:
+                    cat, doc, _ = line_components
+                    if cat not in category_names:
+                        n_cat += 1
+                        category_names[cat] = n_cat
+
+                    doc = int(doc)
+                    if doc != doc_previous:
+                        doc_previous = doc
+                        n_doc += 1
+                        sample_id_bis[n_doc] = doc
+                    y[n_doc, category_names[cat]] = 1
 
         # delete archive
         remove(topics_archive_path)
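rcv1.py combines both idioms: the file objects handed to load_svmlight_files are closed explicitly before remove(f.name), and the topics file is read through a GzipFile context manager so it too is closed before deletion. A rough sketch with stand-in files (names and contents are invented for illustration):

# Sketch of the rcv1 cleanup order with stand-in files (hypothetical names
# and contents); the real fetcher parses them with load_svmlight_files.
import gzip
import os
import tempfile
from gzip import GzipFile

tmpdir = tempfile.mkdtemp()

files = []
for i in range(2):
    part_path = os.path.join(tmpdir, "vectors_part%d.dat" % i)
    with open(part_path, "w") as fh:
        fh.write("0 1:1.0\n")
    files.append(open(part_path, "rb"))

topics_archive_path = os.path.join(tmpdir, "topics.qrels.gz")
with gzip.open(topics_archive_path, "wb") as fh:
    fh.write(b"C11 2286 1\n")

# ... parse `files` here, as the fetcher does, before deleting them ...

# Close every handle before removing it; Windows rejects remove() otherwise.
for f in files:
    f.close()
    os.remove(f.name)

# The context manager closes the gzip archive before it is deleted.
with GzipFile(filename=topics_archive_path, mode='rb') as f:
    for line in f:
        cat, doc, _ = line.decode("ascii").split(u" ")
os.remove(topics_archive_path)
print(cat, doc)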
30 changes: 14 additions & 16 deletions sklearn/datasets/species_distributions.py
@@ -241,29 +241,27 @@ def fetch_species_distributions(data_home=None,
         logger.info('Downloading species data from %s to %s' % (
             SAMPLES.url, data_home))
         samples_path = _fetch_remote(SAMPLES, dirname=data_home)
-        X = np.load(samples_path)  # samples.zip is a valid npz
+        with np.load(samples_path) as X:  # samples.zip is a valid npz
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                if 'train' in f:
+                    train = _load_csv(fhandle)
+                if 'test' in f:
+                    test = _load_csv(fhandle)
         remove(samples_path)
 
-        for f in X.files:
-            fhandle = BytesIO(X[f])
-            if 'train' in f:
-                train = _load_csv(fhandle)
-            if 'test' in f:
-                test = _load_csv(fhandle)
-
         logger.info('Downloading coverage data from %s to %s' % (
             COVERAGES.url, data_home))
         coverages_path = _fetch_remote(COVERAGES, dirname=data_home)
-        X = np.load(coverages_path)  # coverages.zip is a valid npz
+        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
+            coverages = []
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                logger.debug(' - converting {}'.format(f))
+                coverages.append(_load_coverage(fhandle))
+            coverages = np.asarray(coverages, dtype=dtype)
         remove(coverages_path)
 
-        coverages = []
-        for f in X.files:
-            fhandle = BytesIO(X[f])
-            logger.debug(' - converting {}'.format(f))
-            coverages.append(_load_coverage(fhandle))
-        coverages = np.asarray(coverages, dtype=dtype)
-
         bunch = Bunch(coverages=coverages,
                       test=test,
                       train=train,
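species_distributions.py applies the same idea to the two npz archives: np.load is used as a context manager, the member arrays are copied out while the archive is open, and only then is the file removed. A small sketch with a throwaway .npz (file and member names assumed):

# Sketch of the species_distributions pattern with a throwaway npz archive
# (file name and member names are assumptions, not the real dataset).
import os
import tempfile

import numpy as np

tmpdir = tempfile.mkdtemp()
samples_path = os.path.join(tmpdir, "samples.npz")
np.savez(samples_path, train=np.arange(3), test=np.arange(3, 6))

with np.load(samples_path) as X:    # NpzFile closes its zip file on exit
    arrays = {f: X[f].copy() for f in X.files}   # copy while it is open
os.remove(samples_path)             # safe: no handle left on the archive

print(sorted(arrays), arrays["train"])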
