diff --git a/.github/release_message.sh b/.github/release_message.sh
new file mode 100644
index 0000000..cf04476
--- /dev/null
+++ b/.github/release_message.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# Print the commits made since the previous tag (second newest by creation date).
+previous_tag=$(git tag --sort=-creatordate | sed -n 2p)
+# NOTE(review): the filter after the pipe was truncated to `s` in the source; restored from the upstream template - confirm.
+git shortlog "${previous_tag}.." | sed 's/^./    &/'
\ No newline at end of file
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 0f1ba9c..1dfcdd8 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,25 +1,129 @@
-name: API workflow
+# name: API workflow
 
-on: [push, pull_request]
+# on: [push, pull_request]
+
+# jobs:
+#   build:
+#     runs-on: ubuntu-latest
+#     name: Test python API
+#     steps:
+#     - uses: actions/checkout@v1
+#     - name: Install requirements
+#       run: pip install -r requirements.txt
+#     - name: Run tests and collect coverage
+#       run: pytest --cov .
+#     - name: Upload coverage reports to Codecov
+#       run: |
+#         # Replace `linux` below with the appropriate OS
+#         # Options are `alpine`, `linux`, `macos`, `windows`
+#         curl -Os https://uploader.codecov.io/latest/linux/codecov
+#         chmod +x codecov
+#         ./codecov -t ${CODECOV_TOKEN}
+#     - name: Upload coverage reports to Codecov
+#       uses: codecov/codecov-action@v3
+#       env:
+#         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+
+
+
+
+# This is a basic workflow to help you get started with Actions
+
+name: CI
+
+# Controls when the workflow will run
+on:
+  # Triggers the workflow on push or pull request events but only for the main branch
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
 
 jobs:
-  build:
-    runs-on: ubuntu-latest
-    name: Test python API
+  linter:
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9"]
+        os: [ubuntu-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Debugging
+        run: |
+          ls -la
+          cat Makefile
+          make virtualenv
+      - name: Install project
+        run: |
+          make virtualenv
+          source .venv/bin/activate
+          make install
+      - name: Run linter
+        run: make lint
+
+  tests_linux:
+    needs: linter
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9"]
+        os: [ubuntu-latest]
+    runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v1
-    - name: Install requirements
-      run: pip install -r requirements.txt
-    - name: Run tests and collect coverage
-      run: pytest --cov .
-    - name: Upload coverage reports to Codecov
-      run: |
-        # Replace `linux` below with the appropriate OS
-        # Options are `alpine`, `linux`, `macos`, `windows`
-        curl -Os https://uploader.codecov.io/latest/linux/codecov
-        chmod +x codecov
-        ./codecov -t ${CODECOV_TOKEN}
-    - name: Upload coverage reports to Codecov
-      uses: codecov/codecov-action@v3
-      env:
-        CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install project
+        run: make install
+      - name: Run tests
+        run: make test
+      - name: "Upload coverage to Codecov"
+        uses: codecov/codecov-action@v3
+        # with:
+        #   fail_ci_if_error: true
+
+  # tests_mac:
+  #   needs: linter
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       python-version: [3.9]
+  #       os: [macos-latest]
+  #   runs-on: ${{ matrix.os }}
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: ${{ matrix.python-version }}
+  #     - name: Install project
+  #       run: make install
+  #     - name: Run tests
+  #       run: make test
+
+  # tests_win:
+  #   needs: linter
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       python-version: [3.9]
+  #       os: [windows-latest]
+  #   runs-on: ${{ matrix.os }}
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: ${{ matrix.python-version }}
+  #     - name: Install Pip
+  #       run: pip install --user --upgrade pip
+  #     - name: Install project
+  #       run: pip install -e .[test]
+  #     - name: run tests
+  #       run: pytest -s -vvvv -l --tb=long tests
\ No newline at end of file
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index 7a88e1d..867fd32 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -1,6 +1,6 @@
 name: Pylint
 
-on: [push]
+on: [push, pull_request]
 
 jobs:
   build:
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..06ea5bc
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,50 @@
+name: Upload Python Package
+
+on:
+  push:
+    # Sequence of patterns matched against refs/tags
+    tags:
+      - '*' # Push events to matching v*, i.e. v1.0, v20.15.10
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  release:
+    name: Create Release
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          # by default, it uses a depth of 1
+          # this fetches all history so that we can read each commit
+          fetch-depth: 0
+      - name: Generate Changelog
+        run: bash .github/release_message.sh > release_message.md
+      - name: Release
+        uses: softprops/action-gh-release@v1
+        with:
+          body_path: release_message.md
+
+  deploy:
+    needs: release
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools wheel twine
+      - name: Build and publish
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+        run: |
+          python setup.py sdist bdist_wheel
+          twine upload dist/*
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..b541671
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,126 @@
+.ONESHELL:
+ENV_PREFIX=$(shell python -c "if __import__('pathlib').Path('.venv/bin/pip').exists(): print('.venv/bin/')")
+USING_POETRY=$(shell grep "tool.poetry" pyproject.toml && echo "yes")
+
+.PHONY: help
+help:             ## Show the help.
+	@echo "Usage: make <target>"
+	@echo ""
+	@echo "Targets:"
+	@fgrep "##" Makefile | fgrep -v fgrep
+
+
+.PHONY: show
+show:             ## Show the current environment.
+	@echo "Current environment:"
+	@if [ "$(USING_POETRY)" ]; then poetry env info && exit; fi
+	@echo "Running using $(ENV_PREFIX)"
+	@$(ENV_PREFIX)python -V
+	@$(ENV_PREFIX)python -m site
+
+.PHONY: install
+install:          ## Install the project in dev mode.
+	@if [ "$(USING_POETRY)" ]; then poetry install && exit; fi
+	@echo "Don't forget to run 'make virtualenv' if you got errors."
+	$(ENV_PREFIX)pip install -e .[test]
+
+.PHONY: fmt
+fmt:              ## Format code using black & isort.
+	$(ENV_PREFIX)isort plotsandgraphs/
+	$(ENV_PREFIX)black -l 79 plotsandgraphs/
+	$(ENV_PREFIX)black -l 79 tests/
+
+.PHONY: lint
+lint:             ## Run pep8, black, mypy linters.
+	@echo "Running linters ..."
+	@echo "--- Running flake8 ---"
+	$(ENV_PREFIX)flake8 plotsandgraphs/
+	@echo "--- Running black ---"
+	$(ENV_PREFIX)black -l 79 --check plotsandgraphs/
+	$(ENV_PREFIX)black -l 79 --check tests/
+	@echo "--- Running mypy ---"
+	$(ENV_PREFIX)mypy --ignore-missing-imports plotsandgraphs/
+
+.PHONY: test
+test: lint        ## Run tests and generate coverage report.
+	$(ENV_PREFIX)pytest -v --cov-config .coveragerc --cov=plotsandgraphs -l --tb=short --maxfail=1 tests/
+	$(ENV_PREFIX)coverage xml
+	$(ENV_PREFIX)coverage html
+
+.PHONY: watch
+watch:            ## Run tests on every change.
+	ls **/**.py | entr $(ENV_PREFIX)pytest -s -vvv -l --tb=long --maxfail=1 tests/
+
+.PHONY: clean
+clean:            ## Clean unused files.
+	@find ./ -name '*.pyc' -exec rm -f {} \;
+	@find ./ -name '__pycache__' -exec rm -rf {} \;
+	@find ./ -name 'Thumbs.db' -exec rm -f {} \;
+	@find ./ -name '*~' -exec rm -f {} \;
+	@rm -rf .cache
+	@rm -rf .pytest_cache
+	@rm -rf .mypy_cache
+	@rm -rf build
+	@rm -rf dist
+	@rm -rf *.egg-info
+	@rm -rf htmlcov
+	@rm -rf .tox/
+	@rm -rf docs/_build
+
+.PHONY: virtualenv
+virtualenv:       ## Create a virtual environment.
+	@if [ "$(USING_POETRY)" ]; then poetry install && exit; fi
+	@echo "creating virtualenv ..."
+	@rm -rf .venv
+	@python3 -m venv .venv
+	@./.venv/bin/pip install -U pip
+	@./.venv/bin/pip install -e .[test]
+	@echo
+	@echo "!!! Please run 'source .venv/bin/activate' to enable the environment !!!"
+
+.PHONY: release
+release:          ## Create a new tag for release.
+	@echo "WARNING: This operation will create s version tag and push to github"
+	@read -p "Version? (provide the next x.y.z semver) : " TAG
+	@echo "$${TAG}" > plotsandgraphs/VERSION
+	@$(ENV_PREFIX)gitchangelog > HISTORY.md
+	@git add plotsandgraphs/VERSION HISTORY.md
+	@git commit -m "release: version $${TAG} 🚀"
+	@echo "creating git tag : $${TAG}"
+	@git tag $${TAG}
+	@git push -u origin HEAD --tags
+	@echo "Github Actions will detect the new tag and release the new version."
+
+.PHONY: docs
+docs:             ## Build the documentation.
+	@echo "building documentation ..."
+	@$(ENV_PREFIX)mkdocs build
+	URL="site/index.html"; xdg-open $$URL || sensible-browser $$URL || x-www-browser $$URL || gnome-open $$URL || open $$URL
+
+.PHONY: switch-to-poetry
+switch-to-poetry: ## Switch to poetry package manager.
+	@echo "Switching to poetry ..."
+	@if ! poetry --version > /dev/null; then echo 'poetry is required, install from https://python-poetry.org/'; exit 1; fi
+	@rm -rf .venv
+	@poetry init --no-interaction --name=plotsandgraphs --author="Joshua Wendland"
+	@echo "" >> pyproject.toml
+	@echo "[tool.poetry.scripts]" >> pyproject.toml
+	@echo "plotsandgraphs = 'plotsandgraphs.__main__:main'" >> pyproject.toml
+	@cat requirements.txt | while read in; do poetry add --no-interaction "$${in}"; done
+	@cat requirements-test.txt | while read in; do poetry add --no-interaction "$${in}" --dev; done
+	@poetry install --no-interaction
+	@mkdir -p .github/backup
+	@mv requirements* .github/backup
+	@mv setup.py .github/backup
+	@echo "You have switched to https://python-poetry.org/ package manager."
+	@echo "Please run 'poetry shell' or 'poetry run plotsandgraphs'"
+
+.PHONY: init
+init:             ## Initialize the project based on an application template.
+	@./.github/init.sh
+
+
+# This project has been generated from rochacbruno/python-project-template
+# __author__ = 'rochacbruno'
+# __repo__ = https://github.com/rochacbruno/python-project-template
+# __sponsor__ = https://github.com/sponsors/rochacbruno/
\ No newline at end of file
diff --git a/plotsandgraphs/VERSION b/plotsandgraphs/VERSION
new file mode 100644
index 0000000..9a26661
--- /dev/null
+++ b/plotsandgraphs/VERSION
@@ -0,0 +1 @@
+0.1.01
\ No newline at end of file
diff --git a/plotsandgraphs/__init__.py b/plotsandgraphs/__init__.py
new file mode 100644
index 0000000..e7f5024
--- /dev/null
+++ b/plotsandgraphs/__init__.py
@@ -0,0 +1,2 @@
+from . import binary_classifier
+from . import compare_distributions
\ No newline at end of file
diff --git a/plotsandgraphs/binary_classifier.py b/plotsandgraphs/binary_classifier.py
new file mode 100644
index 0000000..3c69486
--- /dev/null
+++ b/plotsandgraphs/binary_classifier.py
@@ -0,0 +1,450 @@
+import matplotlib.pyplot as plt
+from matplotlib.colors import to_rgba
+from matplotlib.figure import Figure
+import seaborn as sns
+import numpy as np
+import pandas as pd
+from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_curve, auc, accuracy_score, precision_recall_curve
+from sklearn.calibration import calibration_curve
+from sklearn.utils import resample
+from pathlib import Path
+from tqdm import tqdm
+from typing import Optional
+
+
+def plot_accuracy(y_true, y_pred, name='', save_fig_path=None) -> Figure:
+    """ Really ugly plot, I am not sure if the scalar value for accuracy should receive an entire plot."""
+    accuracy = accuracy_score(y_true, y_pred)
+
+    # accuracy = 0
+    # for t in range(max_seq_len):
+    #     accuracy += accuracy_score( y[:,t,0].round() , y_pred[:,t] )
+    # accuracy = accuracy / max_seq_len
+    fig= plt.figure( figsize=(4,5))
+    plt.bar( np.array([0]), np.array([ accuracy ]))
+    # axs[0].set_xticks(ticks=range(2))
+    # axs[0].set_xticklabels(["train", "test"])
+    plt.ylabel('Accuracy')
+    plt.ylim([0,1])
+    # axs[0].set_xlabel('Features')
+    title = "Predictor model: {}".format(name )
+    plt.title(title)
+    plt.tight_layout()
+
+    if save_fig_path is not None:
+        path = Path(save_fig_path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        fig.savefig(save_fig_path, bbox_inches='tight')
+    return fig
+
+def plot_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, save_fig_path=None) -> Figure:
+    import matplotlib.colors as colors
+
+    # Compute the confusion matrix
+    cm = confusion_matrix(y_true, y_pred.round())
+    # normalize the confusion matrix
+    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
+    # Create the ConfusionMatrixDisplay instance and plot it
+    cmd = ConfusionMatrixDisplay(cm, display_labels=['class 0\nnegative', 'class 1\npositive'])
+    fig, ax = plt.subplots(figsize=(4,4))
+    cmd.plot(cmap='YlOrRd', values_format='', colorbar=False, ax=ax, text_kw={'visible':False})
+    cmd.texts_ = []
+    cmd.text_ = []
+
+    text_labels = ['TN', 'FP', 'FN', 'TP']
+    cmap_min, cmap_max = cmd.im_.cmap(0), cmd.im_.cmap(1.0)
+    for i in range(2):
+        for j in range(2):
+            ax.text(j, i, f"{text_labels[i * 2 + j]}\n{cmd.im_.get_array()[i, j]:.2%}",
+                    ha="center", va="center", color=cmap_min if cmd.im_.get_array()[i, j] > 0.5 else cmap_max)
+
+    ax.vlines([0.5], *ax.get_ylim(), color='white', linewidth=1)
+    ax.hlines([0.49], *ax.get_xlim(), color='white', linewidth=1)
+    ax.spines[:].set_visible(False)
+
+
+    bounds = np.linspace(0, 1, 11)
+    cmap = plt.get_cmap('YlOrRd', len(bounds)+1)
+    norm = colors.BoundaryNorm(bounds, cmap.N)
+    cbar = ax.figure.colorbar(cmd.im_, ax=ax, cmap=cmap, norm=norm, boundaries=bounds, ticks=bounds[::2], location="right", shrink=0.8)
+    # cbar.set_ticks(np.arange(0,1.1,0.1))
+    cbar.ax.yaxis.set_ticks_position('both')
+    cbar.outline.set_visible(False)
+    plt.tight_layout()
+
+    if save_fig_path is not None:
+        path = Path(save_fig_path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        fig.savefig(save_fig_path, bbox_inches='tight')
+
+    return fig
+
+
+
+
+def plot_classification_report(y_test: np.ndarray,
+                               y_pred: np.ndarray,
+                               title='Classification Report',
+                               figsize=(8, 4),
+                               save_fig_path=None, **kwargs):
+    """
+    TODO: save all these plots
+    Plot the classification report of sklearn
+
+    Parameters
+    ----------
+    y_test : pandas.Series of shape (n_samples,)
+        Targets.
+    y_pred : pandas.Series of shape (n_samples,)
+        Predictions.
+    title : str, default = 'Classification Report'
+        Plot title.
+    figsize : tuple, default = (8, 4)
+        Size (inches) of the plot.
+    save_fig_path : str, default=None
+        Full path where to save the plot. Will generate the folders if they don't exist already.
+    **kwargs : attributes of classification_report class of sklearn
+
+    Returns
+    -------
+    fig : Matplotlib.pyplot.Figure
+        Figure from matplotlib
+    ax : Matplotlib.pyplot.Axe
+        Axe object from matplotlib
+    """
+    import matplotlib as mpl
+    import matplotlib.colors as colors
+    import seaborn as sns
+    import pathlib
+
+    fig, ax = plt.subplots(figsize=figsize)
+
+    cmap = 'YlOrRd'
+
+    clf_report = classification_report(y_test, y_pred, output_dict=True, **kwargs)
+    keys_to_plot = [key for key in clf_report.keys() if key not in ('accuracy', 'macro avg', 'weighted avg')]
+    df = pd.DataFrame(clf_report, columns=keys_to_plot).T
+    #the following line ensures that dataframe are sorted from the majority classes to the minority classes
+    df.sort_values(by=['support'], inplace=True)
+
+    #first, let's plot the heatmap by masking the 'support' column
+    rows, cols = df.shape
+    mask = np.zeros(df.shape)
+    mask[:,cols-1] = True
+
+    bounds = np.linspace(0, 1, 11)
+    cmap = plt.get_cmap('YlOrRd', len(bounds)+1) # type: ignore
+    norm = colors.BoundaryNorm(bounds, cmap.N) # type: ignore[attr-defined]
+
+    ax = sns.heatmap(df, mask=mask, annot=False, cmap=cmap, fmt='.3g',
+            cbar_kws={'ticks':bounds[::2], 'norm':norm, 'boundaries':bounds},
+            vmin=0.0,
+            vmax=1.0,
+            linewidths=2, linecolor='white'
+                    )
+    cbar = ax.collections[0].colorbar
+    cbar.ax.yaxis.set_ticks_position('both')
+
+    cmap_min, cmap_max = cbar.cmap(0), cbar.cmap(1.0)
+
+    # add text annotation to heatmap
+    dx, dy = 0.5, 0.5
+    for i in range(rows):
+        for j in range(cols-1):
+            # NOTE(review): this span was garbled in the source and has been reconstructed - confirm.
+            text = f"{df.iloc[i, j]:.2%}"
+            ax.text(j + dx , i + dy, text,
+                    ha="center", va="center", color=cmap_min if df.iloc[i, j] > 0.5 else cmap_max)
+
+    #then, let's add the support column by normalizing the colors in this column
+    mask = np.zeros(df.shape)
+    mask[:,:cols-1] = True
+
+    ax = sns.heatmap(df, mask=mask, annot=False, cmap=cmap, cbar=False,
+            linewidths=2, linecolor='white', fmt='.0f',
+            vmin=df['support'].min(),
+            vmax=df['support'].sum(),
+            norm=mpl.colors.Normalize(vmin=df['support'].min(),
+                                      vmax=df['support'].sum())
+            )
+
+    cmap_min, cmap_max = cbar.cmap(0), cbar.cmap(1.0)
+    for i in range(rows):
+        j = cols-1
+        # NOTE(review): this span was garbled in the source and has been reconstructed - confirm.
+        text = f"{df.iloc[i, j]:.0f}"
+        color = (df.iloc[i, j]) / (df['support'].sum())
+        ax.text(j + dx , i + dy, text,
+                ha="center", va="center", color=cmap_min if color > 0.5 else cmap_max)
+
+    plt.title(title)
+    plt.xticks(rotation = 45)
+    plt.yticks(rotation = 360)
+    plt.tight_layout()
+
+    if save_fig_path is not None:
+        path = Path(save_fig_path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        fig.savefig(save_fig_path, bbox_inches='tight')
+
+    return fig, ax
+
+
+
+
+
+def plot_roc_curve(
+        y_true: np.ndarray,
+        y_score: np.ndarray,
+        figsize=(5,5),
+        save_fig_path=None,
+        confidence_interval: float=0.95,
+        highlight_roc_area=True,
+        n_bootstraps=None) -> Figure:
+    """
+    Creates a ROC curve for a binary classifier. Includes the option for bootstrapping.
+
+    Parameters
+    ----------
+    y_true : np.ndarray
+        The actual labels of the data. Either 0 or 1.
+    y_score : np.ndarray
+        The output scores of the classifier. Between 0 and 1.
+    figsize : tuple, optional
+        The size of the figure. By default (5,5).
+    save_fig_path : str, optional
+        Path to folder where the figure should be saved. If None then plot is not saved, by default None. E.g. 'figures/roc_curve.png'.
+    confidence_interval : float, optional
+        The confidence interval to use for the ROC curve. By default 0.95. Between 0 and 1. Has no effect when not using n_bootstraps.
+    highlight_roc_area : bool, optional
+        Whether to highlight the area under the ROC curve. By default True. Has no effect when using n_bootstraps.
+    n_bootstraps : int, optional
+        Number of bootstrap samples to use for the ROC curve. Recommended minimum: 1000, moderate: 5000-10000, high: 50000-100000.
+        If None, then no bootstrapping is done. By default None.
+
+    Returns
+    -------
+    fig : matplotlib.pyplot figure
+        The figure of the ROC curve
+    """
+
+    # create figure
+    fig = plt.figure(figsize=figsize)
+    ax = fig.add_subplot(111)
+
+    if n_bootstraps is None:
+        base_fpr, mean_tprs, thresholds = roc_curve(y_true, y_score)
+        mean_auc = auc(base_fpr, mean_tprs)
+        if highlight_roc_area is True:
+            plt.fill_between(base_fpr, 0, mean_tprs, alpha=0.2, zorder=2)
+        if confidence_interval is not None:
+            print('Warning: confidence_intervals is not None, but n_bootstraps is None. Confidence intervals will not be plotted.')
+    else:
+        # Bootstrapping for AUROC
+        bootstrap_aucs, bootstrap_tprs = [], []
+        base_fpr = np.linspace(0, 1, 101)
+        for _ in tqdm(range(n_bootstraps), desc='Bootstrapping'):
+            indices = resample(np.arange(len(y_true)), replace=True)
+            fpr_i, tpr_i, _ = roc_curve(y_true[indices], y_score[indices])
+            roc_auc_i = auc(fpr_i, tpr_i)
+            bootstrap_aucs.append(roc_auc_i)
+
+            # Interpolate tpr_i to base_fpr, so we have the tpr for the same fpr values for each bootstrap iteration
+            tpr_i_interp = np.interp(base_fpr, fpr_i, tpr_i)
+            tpr_i_interp[0] = 0.0
+            bootstrap_tprs.append(tpr_i_interp)
+
+        mean_auc = np.mean(bootstrap_aucs)
+        tprs = np.array(bootstrap_tprs)
+        mean_tprs = tprs.mean(axis=0)
+
+        # visualize confidence intervals
+        if confidence_interval is not None:
+            CI_upper = confidence_interval + (1-confidence_interval)/2
+            CI_lower = (1-confidence_interval)/2
+            tprs_upper = np.quantile(tprs, CI_upper, axis=0)
+            tprs_lower = np.quantile(tprs, CI_lower, axis=0)
+            auc_upper = np.quantile(bootstrap_aucs, CI_upper)
+            auc_lower = np.quantile(bootstrap_aucs, CI_lower)
+            label = f'{confidence_interval:.0%} CI: [{auc_lower:.2f}, {auc_upper:.2f}]'
+            plt.fill_between(base_fpr, tprs_lower, tprs_upper, alpha=0.3, label=label, zorder=2)
+
+        if highlight_roc_area is True:
+            print('Warning: highlight_roc_area is True, but n_bootstraps is not None. The area under the ROC curve will not be highlighted.')
+
+    plt.plot(base_fpr, mean_tprs, label=f'ROC curve (AUROC = {mean_auc:.2f})', zorder=3)
+    plt.plot([0, 1], [0, 1], 'k--', label='Random classifier')
+    plt.xlim([0.0, 1.01])
+    plt.ylim([-0.01, 1.01])
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title('Receiver Operating Characteristic (ROC)')
+    # reverse legend entry order
+    handles, labels = plt.gca().get_legend_handles_labels()
+    handles = handles[::-1]
+    labels = labels[::-1]
+    plt.legend(handles, labels, loc="lower right", frameon=False)
+    ax.spines[:].set_visible(False)
+    ax.grid(True, linestyle='-', linewidth=0.5, color='grey', alpha=0.5)
+    ax.set_yticks(np.arange(0, 1.1, 0.2))
+    plt.tight_layout()
+
+    if save_fig_path:
+        path = Path(save_fig_path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        fig.savefig(save_fig_path, bbox_inches='tight')
+
+    return fig
+
+
+
+def plot_calibration_curve(y_prob: np.ndarray, y_true: np.ndarray, save_fig_path=None):
+    """
+    Creates calibration plot for a binary classifier.
+
+    Parameters
+    ----------
+    y_prob : np.ndarray
+        The output probabilities of the classifier. Between 0 and 1.
+    y_true : np.ndarray
+        The actual labels of the data. Either 0 or 1.
+    save_fig_path : _type_, optional
+        Path to folder where the figure should be saved. If None then plot is not saved, by default None
+
+    Returns
+    -------
+    fig : matplotlib.pyplot figure
+        The figure of the calibration plot
+    """
+    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=10, strategy='uniform')
+    expected_cal_error = np.abs(prob_pred-prob_true).mean().round(2)
+    fig = plt.figure(figsize=(5,5))
+    ax = fig.add_subplot(111)
+
+    # Calculate bar width
+    bar_width = (prob_pred[1:] - prob_pred[:-1]).mean() * 0.75
+
+    # Plotting
+    ax.bar(prob_pred, prob_true, width=bar_width, zorder=3, facecolor=to_rgba('C0',0.75), edgecolor='midnightblue', linewidth=2, label=f'True Calibration')
+    ax.bar(prob_pred, prob_pred - prob_true, bottom=prob_true, width=bar_width, zorder=3, alpha=0.5, edgecolor='red', fill=False, linewidth=2, label=f'Mean ECE = {expected_cal_error}', hatch='//')
+    ax.plot([0, 1], [0, 1], linestyle='--', color='grey', zorder=3, label='Perfect Calibration')
+
+    # Labels and titles
+    ax.set(xlabel='Predicted probability', ylabel='True probability')
+    plt.xlim([0.0, 1.005])
+    plt.ylim([-0.01, 1.0])
+    ax.legend(loc='upper left', frameon=False)
+
+    # show y-grid
+    ax.spines[:].set_visible(False)
+    ax.grid(True, linestyle='-', linewidth=0.5, color='grey', alpha=0.5)
+    ax.set_yticks(np.arange(0, 1.1, 0.2))
+    ax.set_xticks(np.arange(0, 1.1, 0.2))
+    plt.tight_layout()
+
+    # save plot
+    if save_fig_path is not None:
+        path = Path(save_fig_path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        fig.savefig(save_fig_path, bbox_inches='tight')
+
+    return fig
+
+
+def plot_y_prob_histogram(y_prob: np.ndarray, save_fig_path=None) -> Figure:
+    fig = plt.figure(figsize=(5,5))
+    ax = fig.add_subplot(111)
+    ax.hist(y_prob, bins=10, alpha=0.9, edgecolor='midnightblue', linewidth=2, rwidth=1)
+    # same histogram as above, but with border lines
+    # ax.hist(y_prob, bins=10, alpha=0.5, edgecolor='black', linewidth=1.2)
+    ax.set(xlabel='Predicted probability [-]', ylabel='Count [-]', xlim=(-0.01, 1.0))
+    ax.set_title('Histogram of predicted probabilities')
+
+    ax.spines[:].set_visible(False)
+    ax.grid(True, linestyle='-', linewidth=0.5, color='grey', alpha=0.5)
+    ax.set_xticks(np.arange(0, 1.1, 0.2))
+    plt.tight_layout()
+
+    # save plot
+    if save_fig_path is not None:
+        path = Path(save_fig_path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        fig.savefig(save_fig_path, bbox_inches='tight')
+
+    return fig
+
+
+
+def plot_pr_curve(
+        y_true: np.ndarray,
+        y_score: np.ndarray,
+        figsize=(5,5),
+        save_fig_path: Optional[str]=None,
+        color: Optional[str]= None,
+        label: Optional[str]=None,
+        title: Optional[str]=None
+        ) -> Figure:
+    """
+    Visualize the Precision-Recall curve for a binary classifier.
+
+    Parameters
+    ----------
+    y_true : np.ndarray
+        The actual labels of the data. Either 0 or 1.
+    y_score : np.ndarray
+        The output scores of the classifier. Between 0 and 1.
+    figsize : tuple, optional
+        The size of the figure. By default (5,5).
+    save_fig_path : str, optional
+        Path to folder where the figure should be saved. If None then plot is not saved, by default None. E.g. 'figures/pr_curve.png'.
+    color : str, optional
+        Color of the PR curve, by default None.
+    label : str, optional
+        Custom label for the plot. If None, a default label is used. By default None.
+    title : str, optional
+        Title of the plot. If None, no title is set. By default None.
+
+    Returns
+    -------
+    fig : matplotlib.pyplot figure
+        The figure of the PR curve
+    """
+
+    # Create a new figure
+    fig = plt.figure(figsize=figsize)
+    ax = fig.add_subplot(111)
+
+    # Compute Precision-Recall curve and area for each class
+    precision, recall, _ = precision_recall_curve(y_true, y_score)
+
+    pr_auc = auc(recall, precision)
+
+    if label is None:
+        # Use a default label if none is provided
+        label = 'PR curve'
+
+    label += f' (area = {pr_auc:.3f})'
+
+    # Plot Precision-Recall curve
+    ax.plot(recall, precision, label=label, color=color)
+    ax.set_xlim((0.0, 1.01))
+    ax.set_ylim((-0.01, 1.01))
+    ax.set_xlabel('Recall')
+    ax.set_ylabel('Precision')
+    if title is not None:
+        ax.set_title(title)
+    ax.legend(loc="lower right")
+    ax.spines[:].set_visible(False)
+    ax.grid(True, linestyle='-', linewidth=0.5, color='grey', alpha=0.5)
+    ax.set_yticks(np.arange(0, 1.1, 0.2))
+    plt.tight_layout()
+
+    # Save the figure if save_fig_path is specified
+    if save_fig_path:
+        path = Path(save_fig_path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        fig.savefig(save_fig_path, bbox_inches='tight')
+
+    return fig
+
diff --git a/plotsandgraphs/compare_distributions.py b/plotsandgraphs/compare_distributions.py
new file mode 100644
index 0000000..c9cbda2
--- /dev/null
+++ b/plotsandgraphs/compare_distributions.py
@@ -0,0 +1,103 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import pandas as pd
+from typing import List, Tuple, Optional
+
+
+def plot_raincloud(df: pd.DataFrame,
+                   x_col: str,
+                   y_col: str,
+                   colors: Optional[List[str]] = None,
+                   order: Optional[List[str]] = None,
+                   title: Optional[str] = None,
+                   x_label: Optional[str] = None,
+                   x_range: Optional[Tuple[float, float]] = None,
+                   show_violin = True,
+                   show_scatter = True,
+                   show_boxplot = True):
+
+    """
+    Generate a raincloud plot using Pandas DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): The data frame containing the data.
+    - x_col (str): The column name for the x-axis data.
+    - y_col (str): The column name for the y-axis categories.
+    - colors (List[str], optional): List of colors for each category. Defaults to tab10 cmap.
+    - order (List[str], optional): Order of categories on y-axis. Defaults to unique values in y_col.
+    - title (str, optional): Title of the plot.
+    - x_label (str, optional): Label for the x-axis.
+    - x_range (Tuple[float, float], optional): Range for the x-axis.
+    - show_violin (bool, optional): Whether to show violin plot. Defaults to True.
+    - show_scatter (bool, optional): Whether to show scatter plot. Defaults to True.
+    - show_boxplot (bool, optional): Whether to show boxplot. Defaults to True.
+
+    Returns:
+    - matplotlib.figure.Figure: The generated plot figure.
+    """
+
+    fig, ax = plt.subplots(figsize=(16, 8))
+    offset = 0.2  # Offset value to move plots
+
+    if order is None:
+        order = df[y_col].unique()
+
+    # if colors are none, use distinct colors for each group
+    if colors is None:
+        cmap = plt.get_cmap('tab10')
+        colors = [mpl.colors.to_hex(cmap(i)) for i in np.linspace(0, 1, len(order))]
+    else:
+        assert len(colors) == len(order), 'colors and order must be the same length'
+        colors = colors
+
+    # Boxplot
+    if show_boxplot:
+        bp = ax.boxplot([df[df[y_col] == grp][x_col].values for grp in order],
+                        patch_artist=True, vert=False, positions=np.arange(1 + offset, len(order) + 1 + offset), widths=0.2)
+
+        # Customize boxplot colors
+        for patch, color in zip(bp['boxes'], colors):
+            patch.set_facecolor(color)
+            patch.set_alpha(0.8)
+
+        # Set median line color to black
+        for median in bp['medians']:
+            median.set_color('black')
+
+    # Violinplot
+    if show_violin:
+        vp = ax.violinplot([df[df[y_col] == grp][x_col].values for grp in order],
+                           positions=np.arange(1 + offset, len(order) + 1 + offset), showmeans=False, showextrema=False, showmedians=False, vert=False)
+
+        # Customize violinplot colors
+        for idx, b in enumerate(vp['bodies']):
+            b.get_paths()[0].vertices[:, 1] = np.clip(b.get_paths()[0].vertices[:, 1], idx + 1 + offset, idx + 2 + offset)
+            b.set_color(colors[idx])
+
+    # Scatterplot with jitter
+    if show_scatter:
+        for idx, grp in enumerate(order):
+            features = df[df[y_col] == grp][x_col].values
+            y = np.full(len(features), idx + 1 - offset)
+            jitter_amount = 0.12
+            y += np.random.uniform(low=-jitter_amount, high=jitter_amount, size=len(y))
+            plt.scatter(features, y, s=10, c=colors[idx], alpha=0.3, facecolors='none')
+
+    # Labels
+    plt.yticks(np.arange(1, len(order) + 1), order)
+
+    if x_label is None:
+        x_label = x_col
+    plt.xlabel(x_label)
+    if title:
+        plt.title(title + '\n')
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.spines['left'].set_visible(False)
+    ax.xaxis.grid(True)
+
+    if x_range:
+        plt.xlim(x_range)
+
+    return fig
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..d390ec0
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,5 @@
+[tool.pylint."FORMAT"]
+max-line-length = 120
+
+[tool.pylint."BASIC"]
+variable-rgx = "[a-z_][a-z0-9_]{0,30}$|[a-z0-9_]+([A-Z][a-z0-9_]+)*$" # Allow snake case and camel case for variable names
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 0000000..f660f6a
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,10 @@
+# This requirements are for development and testing only, not for production.
+pytest
+coverage
+flake8
+black
+isort
+pytest-cov
+mypy
+gitchangelog
+mkdocs
\ No newline at end of file
diff --git a/setup.py b/setup.py
index dcc5f73..823dfc7 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,68 @@
+"""Python setup.py for plotsandgraphs package"""
+import io
+import os
 from setuptools import setup, find_packages
 
+# setup(
+#     name='plotsandgraphs',
+#     version='0.1.0',
+#     packages=find_packages(include=['plotsandgraphs', 'plotsandgraphs.*'])
+# )
+
+PROJECT_NAME = 'plotsandgraphs'
+
+
+
+
+def read(*paths, **kwargs):
+    """Read the contents of a text file safely.
+    >>> read("plotsandgraphs", "VERSION")
+    '0.1.01'
+    >>> read("README.md")
+    ...
+    """
+
+    content = ""
+    with io.open(
+        os.path.join(os.path.dirname(__file__), *paths),
+        encoding=kwargs.get("encoding", "utf8"),
+    ) as open_file:
+        content = open_file.read().strip()
+    return content
+
+
+def read_requirements(path):
+    return [
+        line.strip()
+        for line in read(path).split("\n")
+        if line.strip() and not line.strip().startswith(('"', "#", "-", "git+"))
+    ]
+
+
 setup(
-    name='plotsandgraphs',
-    version='0.1.0',
-    packages=find_packages(include=['plotsandgraphs', 'plotsandgraphs.*'])
+    name=PROJECT_NAME,
+    version=read(PROJECT_NAME, "VERSION"),
+    description="Create plots and graphs for your Machine Learning projects.",
+    url="https://github.com/joshuawe/plots_and_graphs",
+    long_description=read("README.md"),
+    long_description_content_type="text/markdown",
+    author="Joshua Wendland and Fabian Krüger",
+    packages=find_packages(exclude=["tests", ".github"]),
+    install_requires=read_requirements("requirements.txt"),
+    entry_points={
+        "console_scripts": [f"{PROJECT_NAME} = {PROJECT_NAME}.__main__:main"]
+    },
+    extras_require={"test": read_requirements("requirements-test.txt")},
+    license='GNU General Public License v3.0',
+    keywords=['plots', 'graphs', 'machine learning', 'data science', 'data visualization', 'data analysis', 'matplotlib'],
+    classifiers=[
+        'Development Status :: 2 - Pre-Alpha',
+        'Environment :: Console',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
+        'Natural Language :: English',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python :: 3',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence'
+    ],
 )
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_test.py b/tests/test_test.py
new file mode 100644
index 0000000..b2820fc
--- /dev/null
+++ b/tests/test_test.py
@@ -0,0 +1,4 @@
+# This is just a test for a test
+
+def test_test():
+    assert True
\ No newline at end of file