* Update requirements (#30)

* Remove unnecessary warnings by seaborn and matplotlib
* Update tests
* Update version
* Add support for Python 3.9 and remove support from 3.6 and 3.7
* Fix bug for empty data frame return in preprocess::get_correlated_features

Co-authored-by: idan.morad <idan.morad@rsa.com>
idanmoradarthas · Feb 28, 2022 · 6a62475 · 6a62475
1 parent be4806e
commit 6a62475
Show file tree

Hide file tree

Showing 23 changed files with 28,310 additions and 43 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,9 +1,8 @@
 language: python
 
 python:
-  - 3.6
-  - 3.7
   - 3.8
+  - 3.9
 
 before_install:
   - sudo apt-get install graphviz

diff --git a/.version b/.version
@@ -1 +1 @@
-version==1.7
+version==1.7.1
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -3,7 +3,7 @@ Installation
 ############
 .. highlight:: bash
 
-Data Science Utils is compatible with Python 3.6 or later. The simplest way to install Data Science Utils and its
+Data Science Utils is compatible with Python 3.8 or later. The simplest way to install Data Science Utils and its
 dependencies is from PyPI with pip, Python's preferred package installer::
 
     pip install data-science-utils

diff --git a/ds_utils/metrics.py b/ds_utils/metrics.py
@@ -48,7 +48,7 @@ def plot_confusion_matrix(y_test: numpy.ndarray, y_pred: numpy.ndarray, labels:
         tn, fp, fn, tp = cnf_matrix.ravel()
         npv, ppv, tnr, tpr = _calc_precision_recall(fn, fp, tn, tp)
 
-        table = numpy.array([[tn, fp, tnr], [fn, tp, tpr], [npv, ppv, numpy.NaN]], dtype=numpy.float)
+        table = numpy.array([[tn, fp, tnr], [fn, tp, tpr], [npv, ppv, numpy.NaN]], dtype=numpy.float64)
         df = pandas.DataFrame(table, columns=[f"{labels[0]} - Predicted", f"{labels[1]} - Predicted", "Recall"],
                               index=[f"{labels[0]} - Actual", f"{labels[1]} - Actual", "Precision"])
     else:
@@ -60,8 +60,8 @@ def plot_confusion_matrix(y_test: numpy.ndarray, y_pred: numpy.ndarray, labels:
         df = pandas.DataFrame(cnf_matrix, columns=[f"{label} - Predicted" for label in labels],
                               index=[f"{label} - Actual" for label in labels])
         df["Recall"] = tpr
-        df = df.append(
-            pandas.DataFrame([ppv], columns=[f"{label} - Predicted" for label in labels], index=["Precision"]),
+        df = pandas.concat(
+            [df, pandas.DataFrame([ppv], columns=[f"{label} - Predicted" for label in labels], index=["Precision"])],
             sort=False)
 
     figure, subplots = pyplot.subplots(nrows=3, ncols=1, gridspec_kw={'height_ratios': [1, 8, 1]})

diff --git a/ds_utils/preprocess.py b/ds_utils/preprocess.py
@@ -1,9 +1,10 @@
+import warnings
 from typing import Optional, Union, Callable, List
 
 import numpy
 import pandas
 import seaborn
-from matplotlib import axes, pyplot, dates
+from matplotlib import axes, pyplot, dates, ticker
 from scipy.cluster import hierarchy
 
 
@@ -34,19 +35,21 @@ def visualize_feature(series: pandas.Series, remove_na: bool = False, *, ax: Opt
         feature_series = series
 
     if str(feature_series.dtype).startswith("float"):
-        seaborn.distplot(feature_series, ax=ax, hist_kws=kwargs)
+        seaborn.histplot(feature_series, ax=ax, kde=True, **kwargs)
         labels = ax.get_xticks()
     elif str(feature_series.dtype).startswith("datetime"):
         feature_series.value_counts().plot(kind="line", ax=ax, **kwargs)
         labels = ax.get_xticks()
     else:
-        seaborn.countplot(_copy_series_or_keep_top_10(feature_series), ax=ax, **kwargs)
+        seaborn.countplot(x=_copy_series_or_keep_top_10(feature_series), ax=ax, **kwargs)
         labels = ax.get_xticklabels()
 
     if not ax.get_title():
         ax.set_title(f"{feature_series.name} ({feature_series.dtype})")
         ax.set_xlabel("")
 
+    ticks_loc = ax.get_xticks().tolist()
+    ax.xaxis.set_major_locator(ticker.FixedLocator(ticks_loc))
     ax.set_xticklabels(labels, rotation=45, horizontalalignment='right')
 
     if str(feature_series.dtype).startswith("datetime"):
@@ -81,13 +84,18 @@ def get_correlated_features(data_frame: pandas.DataFrame, features: List[str], t
     correlations = _calc_corrections(data_frame[features + [target_feature]], method, min_periods)
     target_corr = correlations[target_feature].transpose()
     features_corr = correlations.loc[features, features]
-    corr_matrix = features_corr.where(numpy.triu(numpy.ones(features_corr.shape), k=1).astype(numpy.bool))
+    corr_matrix = features_corr.where(numpy.triu(numpy.ones(features_corr.shape), k=1).astype(numpy.bool_))
     corr_matrix = corr_matrix[(~numpy.isnan(corr_matrix))].stack().reset_index()
     corr_matrix = corr_matrix[corr_matrix[0].abs() >= threshold]
-    corr_matrix["level_0_target_corr"] = target_corr[corr_matrix["level_0"]].values.tolist()[0]
-    corr_matrix["level_1_target_corr"] = target_corr[corr_matrix["level_1"]].values.tolist()[0]
-    corr_matrix = corr_matrix.rename({0: "level_0_level_1_corr"}, axis=1).reset_index(drop=True)
-    return corr_matrix
+    if corr_matrix.shape[0] > 0:
+        corr_matrix["level_0_target_corr"] = target_corr[corr_matrix["level_0"]].values.tolist()[0]
+        corr_matrix["level_1_target_corr"] = target_corr[corr_matrix["level_1"]].values.tolist()[0]
+        corr_matrix = corr_matrix.rename({0: "level_0_level_1_corr"}, axis=1).reset_index(drop=True)
+        return corr_matrix
+    else:
+        warnings.warn(f"Correlation threshold {threshold} was too high. An empty frame was returned", UserWarning)
+        return pandas.DataFrame(
+            columns=['level_0', 'level_1', 'level_0_level_1_corr', 'level_0_target_corr', 'level_1_target_corr'])
 
 
 def visualize_correlations(data: pandas.DataFrame, method: Union[str, Callable] = 'pearson',
@@ -119,7 +127,7 @@ def visualize_correlations(data: pandas.DataFrame, method: Union[str, Callable]
         ax = pyplot.gca()
 
     corr = _calc_corrections(data, method, min_periods)
-    mask = numpy.triu(numpy.ones_like(corr, dtype=numpy.bool))
+    mask = numpy.triu(numpy.ones_like(corr, dtype=numpy.bool_))
     seaborn.heatmap(corr, mask=mask, annot=True, fmt=".3f", ax=ax, **kwargs)
     return ax
 
@@ -229,20 +237,24 @@ def plot_features_interaction(feature_1: str, feature_2: str, data: pandas.DataF
         elif str(data[feature_2].dtype).startswith("datetime"):
             # first feature is categorical and the second is datetime
             dup_df[feature_2] = data[feature_2].apply(dates.date2num)
-            chart = seaborn.violinplot(feature_2, feature_1, data=dup_df, ax=ax)
+            chart = seaborn.violinplot(x=feature_2, y=feature_1, data=dup_df, ax=ax)
+            ticks_loc = chart.get_xticks().tolist()
+            chart.xaxis.set_major_locator(ticker.FixedLocator(ticks_loc))
             chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
             ax.xaxis.set_major_formatter(_convert_numbers_to_dates)
         else:
             # first feature is categorical and the second is numeric
             dup_df[feature_2] = data[feature_2]
-            chart = seaborn.boxplot(feature_1, feature_2, data=dup_df, ax=ax, **kwargs)
+            chart = seaborn.boxplot(x=feature_1, y=feature_2, data=dup_df, ax=ax, **kwargs)
             chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
     elif str(data[feature_1].dtype).startswith("datetime"):
         if str(data[feature_2].dtype) in ["object", "category", "bool"]:
             # first feature is datetime and the second is categorical
             dup_df[feature_1] = data[feature_1].apply(dates.date2num)
             dup_df[feature_2] = _copy_series_or_keep_top_10(data[feature_2])
-            chart = seaborn.violinplot(feature_1, feature_2, data=dup_df, ax=ax)
+            chart = seaborn.violinplot(x=feature_1, y=feature_2, data=dup_df, ax=ax)
+            ticks_loc = chart.get_xticks().tolist()
+            chart.xaxis.set_major_locator(ticker.FixedLocator(ticks_loc))
             chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
             ax.xaxis.set_major_formatter(_convert_numbers_to_dates)
         else:
@@ -254,15 +266,15 @@ def plot_features_interaction(feature_1: str, feature_2: str, data: pandas.DataF
         # first feature is numeric and the second is categorical
         dup_df[feature_2] = _copy_series_or_keep_top_10(data[feature_2])
         dup_df[feature_1] = data[feature_1]
-        chart = seaborn.boxplot(feature_2, feature_1, data=dup_df, ax=ax, **kwargs)
+        chart = seaborn.boxplot(x=feature_2, y=feature_1, data=dup_df, ax=ax, **kwargs)
         chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
     elif str(data[feature_2].dtype).startswith("datetime"):
         # first feature is numeric and the second is datetime
         ax.plot(data[feature_2], data[feature_1], **kwargs)
         ax.set_xlabel(feature_2)
         ax.set_ylabel(feature_1)
     else:
-        # both feature are numeric
+        # both features are numeric
         ax.scatter(data[feature_1], data[feature_2], **kwargs)
         ax.set_xlabel(feature_1)
         ax.set_ylabel(feature_2)

diff --git a/ds_utils/strings.py b/ds_utils/strings.py
@@ -41,12 +41,12 @@ def append_tags_to_frame(X_train: pandas.DataFrame, X_test: pandas.DataFrame, fi
                                  min_df=min_df, max_features=max_features)
     x_train_count_matrix = vectorizer.fit_transform(X_train[field_name].dropna())
     x_train_tags = pandas.DataFrame(x_train_count_matrix.toarray(),
-                                    columns=[prefix + tag_name for tag_name in vectorizer.get_feature_names()])
+                                    columns=[prefix + tag_name for tag_name in vectorizer.get_feature_names_out()])
     x_train_tags.index = X_train.index
 
     x_test_count_matrix = vectorizer.transform(X_test[field_name].dropna())
     x_test_tags = pandas.DataFrame(x_test_count_matrix.toarray(),
-                                   columns=[prefix + tag_name for tag_name in vectorizer.get_feature_names()])
+                                   columns=[prefix + tag_name for tag_name in vectorizer.get_feature_names_out()])
     x_test_tags.index = X_test.index
 
     x_train_reduced = X_train.drop(columns=[field_name])
@@ -78,10 +78,10 @@ def extract_significant_terms_from_subset(data_frame: pandas.DataFrame, subset_d
     :author: `Eran Hirsch <https://github.com/eranhirs>`_
     """
     count_matrix = vectorizer.fit_transform(data_frame[field_name].dropna())
-    matrix_df = pandas.DataFrame(count_matrix.toarray(), columns=vectorizer.get_feature_names())
+    matrix_df = pandas.DataFrame(count_matrix.toarray(), columns=vectorizer.get_feature_names_out())
 
     subset_X = vectorizer.transform(subset_data_frame[field_name].dropna())
-    subset_matrix_df = pandas.DataFrame(subset_X.toarray(), columns=vectorizer.get_feature_names())
+    subset_matrix_df = pandas.DataFrame(subset_X.toarray(), columns=vectorizer.get_feature_names_out())
 
     subset_freq = subset_matrix_df.sum()
     superset_freq = matrix_df.sum()

diff --git a/ds_utils/xai.py b/ds_utils/xai.py
@@ -8,10 +8,7 @@
 from matplotlib import axes, pyplot, image
 from sklearn.tree import _tree as sklearn_tree, export_graphviz
 
-try:
-    from sklearn.tree import BaseDecisionTree
-except ImportError:
-    from sklearn.tree.tree import BaseDecisionTree
+from sklearn.tree import BaseDecisionTree
 
 
 def generate_decision_paths(classifier: BaseDecisionTree, feature_names: Optional[List[str]] = None,

diff --git a/requirements-conda.txt b/requirements-conda.txt
@@ -1,5 +1,5 @@
-conda-build==3.20.2
-anaconda-client==1.7.2
+conda-build==3.21.7
+anaconda-client==1.9.0
 m2-patch==2.7.5
 conda-verify==3.4.2
-ripgrep==11.0.2
+ripgrep==12.1.1
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,4 +1,4 @@
-pytest==6.0.1
-pytest-cov==2.10.1
+pytest==6.2.5
+pytest-cov==3.0.0
 nose==1.3.7
-coverage==5.2.1
+coverage==6.3.2
diff --git a/requirements-docs.txt b/requirements-docs.txt
@@ -1,2 +1,2 @@
-sphinx==3.2.1
+sphinx==4.4.0
 sphinx_rtd_theme==0.4.3
diff --git a/requirements-pypi.txt b/requirements-pypi.txt
@@ -1 +1 @@
-twine==3.2.0
+twine==3.7.1
diff --git a/requirements.txt b/requirements.txt
@@ -3,6 +3,6 @@ scipy>=1.0.0
 pandas>=0.23.0
 matplotlib>2.02
 seaborn>=0.8.0
-scikit-learn>=0.21.0
+scikit-learn>=1.0.0
 pydotplus>=2.0.2
 joblib>=0.12
diff --git a/setup.py b/setup.py
@@ -21,9 +21,8 @@
                    "Intended Audience :: Education",
                    "Intended Audience :: Science/Research",
                    "License :: OSI Approved :: MIT License",
-                   "Programming Language :: Python :: 3.6",
-                   "Programming Language :: Python :: 3.7",
                    "Programming Language :: Python :: 3.8",
+                   "Programming Language :: Python :: 3.9",
                    "Topic :: Scientific/Engineering :: Artificial Intelligence"],
       keywords="data-science utilities python machine-learning scikit-learn matplotlib",
       packages=find_packages(exclude=['contrib', 'docs', 'tests']),

diff --git a/tests/baseline_images/test_visualization_aids/test_visualize_feature_float.png b/tests/baseline_images/test_visualization_aids/test_visualize_feature_float.png
diff --git a/...seline_images/test_visualization_aids/test_visualize_feature_float_exist_ax.png b/...seline_images/test_visualization_aids/test_visualize_feature_float_exist_ax.png
diff --git a/tests/baseline_images/test_visualization_aids/test_visualize_feature_remove_na.png b/tests/baseline_images/test_visualization_aids/test_visualize_feature_remove_na.png