Skip to content
This repository has been archived by the owner on Feb 18, 2023. It is now read-only.

Commit

Permalink
Add lukas fingerprint, #3
Browse files Browse the repository at this point in the history
Also add a large, randomly generated CSV file for testing: the Pearson
correlation needs a lot of data, because the values are split into
100 bins and every bin has to contain at least one data point.

Close #3
  • Loading branch information
joclement committed Oct 23, 2020
1 parent 02ad6d1 commit f4e99ec
Show file tree
Hide file tree
Showing 11 changed files with 1,123 additions and 6 deletions.
80 changes: 77 additions & 3 deletions src/thesis/fingerprint.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import Tuple, Union
import math
from typing import Callable, Tuple, Union

import numpy as np
import pandas as pd
import scipy.stats as stats

Expand All @@ -20,6 +22,16 @@
TD_WEIB_B = "TimeDiff Weibull B"
PDS_PER_SEC = "Number of PDs/sec"

PD_MEAN = "PD Mean"
PD_MAX = "PD Max"
PD_DIFF_MEAN = "PD Diff Mean"
PD_DIFF_SKEW = "PD Diff Skewness"
PD_DIFF_KURT = "PD Diff Kurtosis"
PD_DIFF_WEIB_A = "PD Diff Weibull A"
TD_MEDIAN = "TimeDiff Median"
CORR_PD_DIFF_TO_PD = "Correlate PD Diff - PD"
CORR_NEXT_PD_TO_PD = "Correlate Next PD - PD"


# TODO Issue #22: ensure that weibull fit is correct
def calc_weibull_params(data: Union[list, pd.Series]) -> Tuple[float, float]:
Expand Down Expand Up @@ -50,8 +62,70 @@ def tu_graz(df: pd.DataFrame) -> pd.Series:
return finger


def build_set(measurements: list) -> pd.DataFrame:
fingers = pd.DataFrame([tu_graz(measurement) for measurement in measurements])
def _correlate_binned_means(values, partner_values, partner_name, n_bins=100):
    """Correlate the per-bin means of two positionally paired sequences.

    The value range of *values* is split into *n_bins* equal-width bins.
    Each pair ``(values[i], partner_values[i])`` is assigned to the bin of
    ``values[i]``; both quantities are then averaged per bin and the Pearson
    correlation coefficient of the two series of bin means is returned.

    Args:
        values: Sequence that determines the binning.
        partner_values: Sequence of the same length, paired by position.
        partner_name: Name of the partner quantity, used in error messages.
        n_bins: Number of equal-width bins.

    Returns:
        The Pearson correlation coefficient of the bin means.

    Raises:
        ValueError: If the inputs differ in length, if at least one bin
            contains no data point, or if the coefficient is NaN.
    """
    # Plain arrays make the pairing purely positional, so the result does not
    # depend on whatever pandas index the inputs happen to carry.
    values = np.asarray(values, dtype=float)
    partner_values = np.asarray(partner_values, dtype=float)
    if values.shape != partner_values.shape:
        raise ValueError(
            f"Correlation between PD and {partner_name} could not be computed:"
            " inputs differ in length."
        )

    # labels=False yields the integer bin number of every value.
    bin_codes = pd.cut(values, n_bins, labels=False)
    bin_sizes = np.bincount(bin_codes, minlength=n_bins)
    if not bin_sizes.all():
        raise ValueError("Correlation can not be computed: Too few data points")

    value_means = [values[bin_codes == code].mean() for code in range(n_bins)]
    partner_means = [
        partner_values[bin_codes == code].mean() for code in range(n_bins)
    ]

    correlation_coefficient, _ = stats.pearsonr(value_means, partner_means)
    if math.isnan(correlation_coefficient):
        raise ValueError(
            f"Correlation between PD and {partner_name} could not be computed."
        )
    return correlation_coefficient


def _correlate_pd_and_pd_diff(pds, pd_diffs):
    """Return the binned Pearson correlation between PD and PD difference.

    *pds* and *pd_diffs* are paired by position. Raises ValueError when the
    correlation can not be computed (see ``_correlate_binned_means``).
    """
    return _correlate_binned_means(pds, pd_diffs, "PD Diff")


def _correlate_pd_and_next_pd(pds, next_pds):
    """Return the binned Pearson correlation between PD and the next PD.

    *pds* and *next_pds* are paired by position. Raises ValueError when the
    correlation can not be computed (see ``_correlate_binned_means``).
    """
    return _correlate_binned_means(pds, next_pds, "Next PD")


def lukas(df: pd.DataFrame) -> pd.Series:
    """Compute the "Lukas" fingerprint of one measurement.

    The fingerprint combines statistics of the PD column (mean, variance,
    maximum, Weibull parameters), statistics of the differences between
    consecutive PD values (mean, skewness, kurtosis, first Weibull
    parameter), the median of the time-difference column and two
    correlation coefficients (PD vs. PD difference, PD vs. next PD).

    Args:
        df: Measurement providing the columns ``data.PD`` and
            ``data.TIMEDIFF``.

    Returns:
        A float Series indexed by the fingerprint feature names.

    Raises:
        ValueError: Propagated from the correlation helpers when a
            correlation can not be computed.
    """
    # Normalize to a 0-based index once, so that pairing each PD with its
    # successor below is purely positional and does not rely on the index
    # that *df* happens to carry.
    pds = df[data.PD].reset_index(drop=True)
    pd_diff = pds.diff()[1:].reset_index(drop=True)  # drop the leading NaN
    current_pds = pds[:-1]
    next_pds = pds[1:].reset_index(drop=True)

    finger = pd.Series(dtype=float)

    finger[PD_MEAN] = pds.mean()
    finger[PD_VAR] = pds.var()
    finger[PD_MAX] = pds.max()
    finger[PD_WEIB_A], finger[PD_WEIB_B] = calc_weibull_params(pds)

    finger[PD_DIFF_MEAN] = pd_diff.mean()
    finger[PD_DIFF_SKEW] = pd_diff.skew()
    finger[PD_DIFF_KURT] = pd_diff.kurt()
    # Only the first Weibull parameter of the differences is part of the
    # fingerprint.
    finger[PD_DIFF_WEIB_A], _ = calc_weibull_params(pd_diff)

    finger[TD_MEDIAN] = df[data.TIMEDIFF].median()

    finger[CORR_PD_DIFF_TO_PD] = _correlate_pd_and_pd_diff(current_pds, pd_diff)
    finger[CORR_NEXT_PD_TO_PD] = _correlate_pd_and_next_pd(current_pds, next_pds)

    return finger


def build_set(measurements: list, fingerprint: Callable = tu_graz) -> pd.DataFrame:
fingers = pd.DataFrame([fingerprint(measurement) for measurement in measurements])

defects = [measurement[data.CLASS][0] for measurement in measurements]
fingers[data.CLASS] = pd.Series(defects, dtype="category")
Expand Down
4 changes: 4 additions & 0 deletions src/thesis/measure_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@ def _echo_measurement_info(df):


def _echo_fingerprint_info(df):
    """Echo the TU Graz and the Lukas fingerprint of the measurement *df*.

    Each fingerprint is printed below its own heading; an empty line
    separates the two sections.
    """
    sections = (
        ("Fingerprint TU Graz:", fingerprint.tu_graz),
        ("Fingerprint Lukas:", fingerprint.lukas),
    )
    for position, (heading, compute_fingerprint) in enumerate(sections):
        if position > 0:
            # Blank separator line between consecutive sections.
            click.echo("")
        click.echo(heading)
        click.echo(compute_fingerprint(df))


def _ensure_unique(csv_filepaths: list):
Expand Down

0 comments on commit f4e99ec

Please sign in to comment.