Skip to content

Commit

Permalink
Merge pull request #81 from iprafols/add_feature_importance
Browse files Browse the repository at this point in the history
Add feature importance
  • Loading branch information
iprafols committed Oct 3, 2023
2 parents 406322a + 97321a8 commit fecd691
Show file tree
Hide file tree
Showing 25 changed files with 1,496 additions and 110 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- name: Set up Python 3.8
uses: actions/setup-python@v1
with:
python-version: 3.8
python-version: ["3.9", "3.10", "3.11"]
- name: Install dependencies
run: |
sudo apt-get -y install libbz2-dev
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
timeout-minutes: 60
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v2
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -103,5 +103,8 @@ venv.bak/
# mypy
.mypy_cache/

# storage_files
**/.DS_Store

# test results
py/squeze/tests/results/
6 changes: 3 additions & 3 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ function-naming-style=snake_case
#function-rgx=

# Good variable names which should always be accepted, separated by a comma.
good-names=X,y,df,PeakFinder,PeakFinderType,gs,ax
good-names=X,y,z,w,df,PeakFinder,PeakFinderType,gs,ax

# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
Expand Down Expand Up @@ -488,5 +488,5 @@ min-public-methods=0

# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
Exception
overgeneral-exceptions=builtin.BaseException,
builtin.Exception
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ for an explanation on the arguments add `-h` to the previous line
### For Developers
Before submitting a PR please make sure to:
1. For every file you have modified run
```yapf --style google file.py > aux
mv aux file.py
```
yapf --style google file.py -i
```
to ensure the coding styles are maintained.
2. Consider using pylint to help in the debug process. From the repo folder run
Expand Down
3 changes: 2 additions & 1 deletion bin/format_superset_dr12q.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,8 @@ def main(cmdargs):
#print "missing file {}".format(spectrum_file)

# save spectra in the current plate
save_json("{}_plate{:04d}.json".format(args.out, plate), spectra)
if spectra.size() > 0:
save_json("{}_plate{:04d}.json".format(args.out, plate), spectra)

if len(missing_files) > 0:
print("Missing files:")
Expand Down
12 changes: 6 additions & 6 deletions py/squeze/candidates.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,22 +762,22 @@ def merge(self, others_list):
raise Error("The function merge is available in the " +
f"merge mode only. Detected mode is {self.mode}")

other_dfs = [self.candidates]
for index, candidates_filename in enumerate(others_list):
self.userprint(f"Merging... {index} of {len(others_list)}")
self.userprint(f"Loading... {index} of {len(others_list)}")

try:
# load candidates
other = load_df(candidates_filename)

# append to candidates list
self.candidates = self.candidates.append(other,
ignore_index=True)
other_dfs.append(load_df(candidates_filename))

except TypeError:
self.userprint(
f"Error occured when loading file {candidates_filename}.")
self.userprint("Ignoring file")

self.candidates = pd.concat(other_dfs, ignore_index=True)


def plot_histograms(self, plot_col, normed=True):
""" Plot the histogram of the specified column
Expand Down
2 changes: 1 addition & 1 deletion py/squeze/candidates_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def compute_line_ratios(wave, flux, ivar, peak_indexs, significances, try_lines,
z_try = z_try - 1.0
if z_try < 0.0:
continue
oneplusz = (1.0 + z_try)
oneplusz = 1.0 + z_try

candidate_info = []

Expand Down
4 changes: 0 additions & 4 deletions py/squeze/peak_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,8 @@

defaults = {
# This variable sets the width (in pixels) of the typical peak to be detected.
# This parameter will be passed to the peak finding function. Check the
# documentation on the module squeze_peak_finder for more details
"width": 70,
# This variable sets the minimum signal-to-noise ratio of a peak.
# This parameter will be passed to the peak finding function. Check the
# documentationon the module squeze_peak_finder for more details
"min significance": 6,
}

Expand Down
264 changes: 264 additions & 0 deletions py/squeze/peak_finder_power_law.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
"""
SQUEzE
======
This file provides a peak finder to be used by SQUEzE
"""
__author__ = "Ignasi Perez-Rafols (iprafols@gmail.com)"

import itertools

from numba import njit
import numpy as np
from scipy import odr

from squeze.utils import quietprint, verboseprint

accepted_options = ["min significance"]

defaults = {
# This variable sets the minimum sigmas of an outlier.
"min significance": 2.0,
}


class PeakFinderPowerLaw:
""" Create and manage the peak finder used by SQUEzE
CLASS: PeakFinder
PURPOSE: Create and manage the peak finder used by SQUEzE. This
peak finder looks for peaks by fiting a power law to the continum of the
spectra and locating the outliers. It also computes the significance of
the peaks and filters the results according to their significances.
"""

def __init__(self, config):
""" Initialize class instance
Arguments
---------
config: configparser.SectionProxy
Parsed options to initialize class
"""
self.min_significance = config.getfloat("min significance")

def find_peaks(self, spectrum):
""" Find significant peaks in a given spectrum.
Arguments
---------
spectrum : Spectrum
The spectrum where peaks are looked for
Return
------
An array with the position of the peaks
"""
wavelength = spectrum.wave
flux = spectrum.flux
ivar = spectrum.ivar
outliers_mask = np.ones_like(flux, dtype=bool)
significances = np.zeros_like(flux)
best_fit = np.array((0.0, 0.0))

userprint=quietprint
do_fit = True
index = 0
while do_fit:
userprint(
f"Running iteration {index}, {outliers_mask.sum()} pixels are "
f"considered in the fit. Best fit is {best_fit}")
index += 1
if index > 100:
userprint = verboseprint
# fit power law
new_outliers_mask, new_significances, new_best_fit = fit_power_law(
wavelength,
flux,
ivar,
outliers_mask,
self.min_significance)

if np.allclose(best_fit, new_best_fit, equal_nan=True):
do_fit = False
else:
outliers_mask = new_outliers_mask
significances = new_significances
best_fit = new_best_fit

userprint(
f"Fit converged after {index} iterations. {outliers_mask.sum()} "
f"pixels are considered in the fit. Best fit is {best_fit}")

# if fit did not converge, return
if any(best_fit == np.nan):
return np.array([]), np.array([])

# select only peaks
peaks = select_peaks(
wavelength, flux, outliers_mask, best_fit)

# compress neighbouring pixels into a single pixel
peak_indexs, peak_significances = compress(peaks, significances)

# return
return peak_indexs, peak_significances

def compress(peaks, significances):
"""Compress the neighbouring peak indexs into a single peak
Arguments
---------
peaks: array of bool
Pixels mask, only pixels with peaks=True are considered to be peaks
significances: array of float
Significance of the outlier detection, computed as
abs(flux - bestfit_flux)*np.sqrt(ivar)
Return
------
compressed_peak_indexs: array of int
Array containing the indexs of the compressed peaks. Contiguous pixels
are compressed by performing a weighted average according to their
significance
compressed_significances: array of float
Significance of the compressed peaks. Computed by adding the significances
of the relevant detections
"""
#find peak indexs
peak_indexs = np.array([index for index, peak in enumerate(peaks) if peak])

# compress
groups = list(group_contiguous(peak_indexs))
compressed_peak_indexs = np.zeros(len(groups), dtype=int)
compressed_significances = np.zeros_like(compressed_peak_indexs, dtype=float)
for index, group in enumerate(groups):
# single pixel
if group[1] == group[0]:
compressed_peak_indexs[index] = group[0]
compressed_significances[index] = significances[group[0]]
# grouped pixels
else:
aux = np.arange(group[0], group[1]+1, dtype=int)
compressed_peak_indexs[index] = int(round(np.average(
aux, weights=significances[aux]), 0))
compressed_significances[index] = significances[aux].sum()


return compressed_peak_indexs, compressed_significances

def fit_power_law(wavelength, flux, ivar, outliers_mask, min_significance):
""" Perform a power-law fit, then compute the outliers
Arguments
---------
wavelength: array of float
The wavelength
flux: array of float
The flux to fit
ivar: array of float
The inverse variance of the flux
outliers_mask: array of bool
Outliers mask, only pixels with outliers_mask=True are used in the fit
min_significance: float
The minimum significance to select outliers
Return
------
new_outliers_mask: array of bool
Outliers mask, only pixels with outliers_mask=True are used in the fit
significances: array of float
Significance of the outlier detection, computed as
abs(flux - bestfit_flux)*np.sqrt(ivar)
best_fit: (float, float)
The best fit power law amplitude and index
"""
# do the actual fit
data = odr.Data(wavelength, flux, we=ivar)
odr_instance = odr.ODR(
data,
POWER_LAW_MODEL,
beta0=(flux.mean(), 0.0))
odr_instance.set_job(fit_type=0)
fit_output = odr_instance.run()

# figure out the outliers
bestfit_flux = power_law(fit_output.beta, wavelength)
significances = np.abs(flux-bestfit_flux)*np.sqrt(ivar)
new_outliers_mask = np.zeros_like(outliers_mask)
new_outliers_mask[np.where(
significances > min_significance
)] = True
new_outliers_mask &= outliers_mask

return new_outliers_mask, significances, fit_output.beta

def group_contiguous(data):
"""Group continuous elements together
For example for data = [0, 1, 2, 3, 7, 8, 9] yield (0, 3) and (7, 9).
To generate a list call is as
`groups=list(group_contiguous(data))`
Arguments
---------
data: array of int
Sorted sequence of integers to group
"""
for _, group in itertools.groupby(enumerate(data), lambda x: x[1] - x[0]):
group = list(group)
yield group[0][1], group[-1][1]

@njit()
def power_law(parameters, x_data):
"""Power law function
Arguments
---------
parameters: (float, float)
The amplitude and power law index
x_data: array of float
The points where to compute the power law
"""
amplitude, power_law_index = parameters
return amplitude*x_data**(-power_law_index)

@njit()
def select_peaks(wavelength, flux, outliers_mask, power_law_params):
""" Select which of the outliers are peaks
Arguments
---------
wavelength: array of float
The wavelength
flux: array of float
The flux to fit
outliers_mask: array of bool
Outliers mask, only pixels with outliers_mask=True are used in the fit
power_law_params: (float, float)
The power law amplitude and index
Return
------
peaks: array of bool
Pixels mask, only pixels with peaks=True are considered to be peaks
"""
# figure out the peaks
bestfit_flux = power_law(power_law_params, wavelength)
peaks = outliers_mask & (flux > bestfit_flux)

return peaks

POWER_LAW_MODEL = odr.Model(power_law)

0 comments on commit fecd691

Please sign in to comment.