Skip to content

Commit

Permalink
* Update requirements (#30)
Browse files Browse the repository at this point in the history
* Remove unnecessary warnings by seaborn and matplotlib
* Update tests
* Update version
* Add support for Python 3.9 and remove support for Python 3.6 and 3.7
* Fix bug for empty data frame return in preprocess::get_correlated_features

Co-authored-by: idan.morad <idan.morad@rsa.com>
  • Loading branch information
idanmoradarthas and idan.morad committed Feb 28, 2022
1 parent be4806e commit 6a62475
Show file tree
Hide file tree
Showing 23 changed files with 28,310 additions and 43 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
@@ -1,9 +1,8 @@
language: python

python:
- 3.6
- 3.7
- 3.8
- 3.9

before_install:
- sudo apt-get install graphviz
Expand Down
2 changes: 1 addition & 1 deletion .version
@@ -1 +1 @@
version==1.7
version==1.7.1
2 changes: 1 addition & 1 deletion docs/source/installation.rst
Expand Up @@ -3,7 +3,7 @@ Installation
############
.. highlight:: bash

Data Science Utils is compatible with Python 3.6 or later. The simplest way to install Data Science Utils and its
Data Science Utils is compatible with Python 3.8 or later. The simplest way to install Data Science Utils and its
dependencies is from PyPI with pip, Python's preferred package installer::

pip install data-science-utils
Expand Down
6 changes: 3 additions & 3 deletions ds_utils/metrics.py
Expand Up @@ -48,7 +48,7 @@ def plot_confusion_matrix(y_test: numpy.ndarray, y_pred: numpy.ndarray, labels:
tn, fp, fn, tp = cnf_matrix.ravel()
npv, ppv, tnr, tpr = _calc_precision_recall(fn, fp, tn, tp)

table = numpy.array([[tn, fp, tnr], [fn, tp, tpr], [npv, ppv, numpy.NaN]], dtype=numpy.float)
table = numpy.array([[tn, fp, tnr], [fn, tp, tpr], [npv, ppv, numpy.NaN]], dtype=numpy.float64)
df = pandas.DataFrame(table, columns=[f"{labels[0]} - Predicted", f"{labels[1]} - Predicted", "Recall"],
index=[f"{labels[0]} - Actual", f"{labels[1]} - Actual", "Precision"])
else:
Expand All @@ -60,8 +60,8 @@ def plot_confusion_matrix(y_test: numpy.ndarray, y_pred: numpy.ndarray, labels:
df = pandas.DataFrame(cnf_matrix, columns=[f"{label} - Predicted" for label in labels],
index=[f"{label} - Actual" for label in labels])
df["Recall"] = tpr
df = df.append(
pandas.DataFrame([ppv], columns=[f"{label} - Predicted" for label in labels], index=["Precision"]),
df = pandas.concat(
[df, pandas.DataFrame([ppv], columns=[f"{label} - Predicted" for label in labels], index=["Precision"])],
sort=False)

figure, subplots = pyplot.subplots(nrows=3, ncols=1, gridspec_kw={'height_ratios': [1, 8, 1]})
Expand Down
40 changes: 26 additions & 14 deletions ds_utils/preprocess.py
@@ -1,9 +1,10 @@
import warnings
from typing import Optional, Union, Callable, List

import numpy
import pandas
import seaborn
from matplotlib import axes, pyplot, dates
from matplotlib import axes, pyplot, dates, ticker
from scipy.cluster import hierarchy


Expand Down Expand Up @@ -34,19 +35,21 @@ def visualize_feature(series: pandas.Series, remove_na: bool = False, *, ax: Opt
feature_series = series

if str(feature_series.dtype).startswith("float"):
seaborn.distplot(feature_series, ax=ax, hist_kws=kwargs)
seaborn.histplot(feature_series, ax=ax, kde=True, **kwargs)
labels = ax.get_xticks()
elif str(feature_series.dtype).startswith("datetime"):
feature_series.value_counts().plot(kind="line", ax=ax, **kwargs)
labels = ax.get_xticks()
else:
seaborn.countplot(_copy_series_or_keep_top_10(feature_series), ax=ax, **kwargs)
seaborn.countplot(x=_copy_series_or_keep_top_10(feature_series), ax=ax, **kwargs)
labels = ax.get_xticklabels()

if not ax.get_title():
ax.set_title(f"{feature_series.name} ({feature_series.dtype})")
ax.set_xlabel("")

ticks_loc = ax.get_xticks().tolist()
ax.xaxis.set_major_locator(ticker.FixedLocator(ticks_loc))
ax.set_xticklabels(labels, rotation=45, horizontalalignment='right')

if str(feature_series.dtype).startswith("datetime"):
Expand Down Expand Up @@ -81,13 +84,18 @@ def get_correlated_features(data_frame: pandas.DataFrame, features: List[str], t
correlations = _calc_corrections(data_frame[features + [target_feature]], method, min_periods)
target_corr = correlations[target_feature].transpose()
features_corr = correlations.loc[features, features]
corr_matrix = features_corr.where(numpy.triu(numpy.ones(features_corr.shape), k=1).astype(numpy.bool))
corr_matrix = features_corr.where(numpy.triu(numpy.ones(features_corr.shape), k=1).astype(numpy.bool_))
corr_matrix = corr_matrix[(~numpy.isnan(corr_matrix))].stack().reset_index()
corr_matrix = corr_matrix[corr_matrix[0].abs() >= threshold]
corr_matrix["level_0_target_corr"] = target_corr[corr_matrix["level_0"]].values.tolist()[0]
corr_matrix["level_1_target_corr"] = target_corr[corr_matrix["level_1"]].values.tolist()[0]
corr_matrix = corr_matrix.rename({0: "level_0_level_1_corr"}, axis=1).reset_index(drop=True)
return corr_matrix
if corr_matrix.shape[0] > 0:
corr_matrix["level_0_target_corr"] = target_corr[corr_matrix["level_0"]].values.tolist()[0]
corr_matrix["level_1_target_corr"] = target_corr[corr_matrix["level_1"]].values.tolist()[0]
corr_matrix = corr_matrix.rename({0: "level_0_level_1_corr"}, axis=1).reset_index(drop=True)
return corr_matrix
else:
warnings.warn(f"Correlation threshold {threshold} was too high. An empty frame was returned", UserWarning)
return pandas.DataFrame(
columns=['level_0', 'level_1', 'level_0_level_1_corr', 'level_0_target_corr', 'level_1_target_corr'])


def visualize_correlations(data: pandas.DataFrame, method: Union[str, Callable] = 'pearson',
Expand Down Expand Up @@ -119,7 +127,7 @@ def visualize_correlations(data: pandas.DataFrame, method: Union[str, Callable]
ax = pyplot.gca()

corr = _calc_corrections(data, method, min_periods)
mask = numpy.triu(numpy.ones_like(corr, dtype=numpy.bool))
mask = numpy.triu(numpy.ones_like(corr, dtype=numpy.bool_))
seaborn.heatmap(corr, mask=mask, annot=True, fmt=".3f", ax=ax, **kwargs)
return ax

Expand Down Expand Up @@ -229,20 +237,24 @@ def plot_features_interaction(feature_1: str, feature_2: str, data: pandas.DataF
elif str(data[feature_2].dtype).startswith("datetime"):
# first feature is categorical and the second is datetime
dup_df[feature_2] = data[feature_2].apply(dates.date2num)
chart = seaborn.violinplot(feature_2, feature_1, data=dup_df, ax=ax)
chart = seaborn.violinplot(x=feature_2, y=feature_1, data=dup_df, ax=ax)
ticks_loc = chart.get_xticks().tolist()
chart.xaxis.set_major_locator(ticker.FixedLocator(ticks_loc))
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
ax.xaxis.set_major_formatter(_convert_numbers_to_dates)
else:
# first feature is categorical and the second is numeric
dup_df[feature_2] = data[feature_2]
chart = seaborn.boxplot(feature_1, feature_2, data=dup_df, ax=ax, **kwargs)
chart = seaborn.boxplot(x=feature_1, y=feature_2, data=dup_df, ax=ax, **kwargs)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
elif str(data[feature_1].dtype).startswith("datetime"):
if str(data[feature_2].dtype) in ["object", "category", "bool"]:
# first feature is datetime and the second is categorical
dup_df[feature_1] = data[feature_1].apply(dates.date2num)
dup_df[feature_2] = _copy_series_or_keep_top_10(data[feature_2])
chart = seaborn.violinplot(feature_1, feature_2, data=dup_df, ax=ax)
chart = seaborn.violinplot(x=feature_1, y=feature_2, data=dup_df, ax=ax)
ticks_loc = chart.get_xticks().tolist()
chart.xaxis.set_major_locator(ticker.FixedLocator(ticks_loc))
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
ax.xaxis.set_major_formatter(_convert_numbers_to_dates)
else:
Expand All @@ -254,15 +266,15 @@ def plot_features_interaction(feature_1: str, feature_2: str, data: pandas.DataF
# first feature is numeric and the second is categorical
dup_df[feature_2] = _copy_series_or_keep_top_10(data[feature_2])
dup_df[feature_1] = data[feature_1]
chart = seaborn.boxplot(feature_2, feature_1, data=dup_df, ax=ax, **kwargs)
chart = seaborn.boxplot(x=feature_2, y=feature_1, data=dup_df, ax=ax, **kwargs)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
elif str(data[feature_2].dtype).startswith("datetime"):
# first feature is numeric and the second is datetime
ax.plot(data[feature_2], data[feature_1], **kwargs)
ax.set_xlabel(feature_2)
ax.set_ylabel(feature_1)
else:
# both feature are numeric
# both features are numeric
ax.scatter(data[feature_1], data[feature_2], **kwargs)
ax.set_xlabel(feature_1)
ax.set_ylabel(feature_2)
Expand Down
8 changes: 4 additions & 4 deletions ds_utils/strings.py
Expand Up @@ -41,12 +41,12 @@ def append_tags_to_frame(X_train: pandas.DataFrame, X_test: pandas.DataFrame, fi
min_df=min_df, max_features=max_features)
x_train_count_matrix = vectorizer.fit_transform(X_train[field_name].dropna())
x_train_tags = pandas.DataFrame(x_train_count_matrix.toarray(),
columns=[prefix + tag_name for tag_name in vectorizer.get_feature_names()])
columns=[prefix + tag_name for tag_name in vectorizer.get_feature_names_out()])
x_train_tags.index = X_train.index

x_test_count_matrix = vectorizer.transform(X_test[field_name].dropna())
x_test_tags = pandas.DataFrame(x_test_count_matrix.toarray(),
columns=[prefix + tag_name for tag_name in vectorizer.get_feature_names()])
columns=[prefix + tag_name for tag_name in vectorizer.get_feature_names_out()])
x_test_tags.index = X_test.index

x_train_reduced = X_train.drop(columns=[field_name])
Expand Down Expand Up @@ -78,10 +78,10 @@ def extract_significant_terms_from_subset(data_frame: pandas.DataFrame, subset_d
:author: `Eran Hirsch <https://github.com/eranhirs>`_
"""
count_matrix = vectorizer.fit_transform(data_frame[field_name].dropna())
matrix_df = pandas.DataFrame(count_matrix.toarray(), columns=vectorizer.get_feature_names())
matrix_df = pandas.DataFrame(count_matrix.toarray(), columns=vectorizer.get_feature_names_out())

subset_X = vectorizer.transform(subset_data_frame[field_name].dropna())
subset_matrix_df = pandas.DataFrame(subset_X.toarray(), columns=vectorizer.get_feature_names())
subset_matrix_df = pandas.DataFrame(subset_X.toarray(), columns=vectorizer.get_feature_names_out())

subset_freq = subset_matrix_df.sum()
superset_freq = matrix_df.sum()
Expand Down
5 changes: 1 addition & 4 deletions ds_utils/xai.py
Expand Up @@ -8,10 +8,7 @@
from matplotlib import axes, pyplot, image
from sklearn.tree import _tree as sklearn_tree, export_graphviz

try:
from sklearn.tree import BaseDecisionTree
except ImportError:
from sklearn.tree.tree import BaseDecisionTree
from sklearn.tree import BaseDecisionTree


def generate_decision_paths(classifier: BaseDecisionTree, feature_names: Optional[List[str]] = None,
Expand Down
6 changes: 3 additions & 3 deletions requirements-conda.txt
@@ -1,5 +1,5 @@
conda-build==3.20.2
anaconda-client==1.7.2
conda-build==3.21.7
anaconda-client==1.9.0
m2-patch==2.7.5
conda-verify==3.4.2
ripgrep==11.0.2
ripgrep==12.1.1
6 changes: 3 additions & 3 deletions requirements-dev.txt
@@ -1,4 +1,4 @@
pytest==6.0.1
pytest-cov==2.10.1
pytest==6.2.5
pytest-cov==3.0.0
nose==1.3.7
coverage==5.2.1
coverage==6.3.2
2 changes: 1 addition & 1 deletion requirements-docs.txt
@@ -1,2 +1,2 @@
sphinx==3.2.1
sphinx==4.4.0
sphinx_rtd_theme==0.4.3
2 changes: 1 addition & 1 deletion requirements-pypi.txt
@@ -1 +1 @@
twine==3.2.0
twine==3.7.1
2 changes: 1 addition & 1 deletion requirements.txt
Expand Up @@ -3,6 +3,6 @@ scipy>=1.0.0
pandas>=0.23.0
matplotlib>2.02
seaborn>=0.8.0
scikit-learn>=0.21.0
scikit-learn>=1.0.0
pydotplus>=2.0.2
joblib>=0.12
3 changes: 1 addition & 2 deletions setup.py
Expand Up @@ -21,9 +21,8 @@
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Topic :: Scientific/Engineering :: Artificial Intelligence"],
keywords="data-science utilities python machine-learning scikit-learn matplotlib",
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
Expand Down
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 6a62475

Please sign in to comment.