In [283]:
import pandas as pd
import numpy as np
from pathlib import Path
import ipytest

pd.options.mode.copy_on_write = True

In [284]:
def remove_outliers(df: pd.DataFrame) -> pd.DataFrame:
    # Calculate mean and standard deviation for each column
    means = df.mean()
    stds = df.std()

    # Identify rows where any value deviates more than 3 standard deviations
    mask = (np.abs(df - means) <= (3 * stds)).all(axis=1)

    # Return DataFrame without outliers
    return df.loc[mask]

def test_remove_outliers():
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                       'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                       'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
    df_out = remove_outliers(df)
    assert df_out.shape[0] == 10



In [285]:
def compute_column_means(df: pd.DataFrame) -> pd.DataFrame:
    # Compute the mean for each column
    column_means = df.mean().to_frame().T
    
    return column_means

def test_compute_column_means():
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                       'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                       'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
    df_out = compute_column_means(df)
    for col in df.columns:
        assert df_out[col][0] == df[col].mean()
    assert df_out.shape[1] == 3

In [286]:

ipytest.config(rewrite_asserts=True, magics=True)
ipytest.run()

TRIANGLE_MEASUREMENTS_PATH = Path("triangle-measurements-data.csv")
assert TRIANGLE_MEASUREMENTS_PATH.exists()

platform darwin -- Python 3.11.0, pytest-7.2.2, pluggy-1.0.0
rootdir: /Users/igorkrzywda/Workspace/Projects/air-cs/miernictwo-2/statistics
plugins: anyio-3.6.2
collected 2 items

t_21e4ebcd6a19432ca61ee32e475a5016.py [32m.[0m[32m.[0m[32m                                                     [100%][0m



In [287]:
def compute_standard_undertainty(df: pd.DataFrame) -> pd.DataFrame:
    # compute standard uncertainty for each column defined as standard deviation from the mean divided by the square root of the number of measurements
    column_std = df.std().to_frame().T
    column_std = column_std / np.sqrt(df.shape[0])
    return column_std

def compute_measurement_uncertainty(df: pd.DataFrame, device_uncertainty: float) -> pd.DataFrame:
    # compute uncertaining for eaach column defined as square root of the sum of the standard uncertainty squared and the device uncertainty squared
    column_std = compute_standard_undertainty(df)
    column_std = np.sqrt(column_std**2 + device_uncertainty**2)
    return column_std

def compute_extended_standard_uncertainty(df: pd.DataFrame) -> pd.DataFrame:
    # compute extended standard uncertainty for each column defined as the standard uncertainty multipied by t-distribution value for 97% confidence interval
    column_std = compute_standard_undertainty(df)
    column_std = column_std * 3.64
    return column_std

# Wyniki pomiarów

In [288]:
raw_triangles_measurements = pd.read_csv(TRIANGLE_MEASUREMENTS_PATH)
display(raw_triangles_measurements)

Unnamed: 0,a,b,c,ha,hb,hc
0,91.1,81.3,74.3,63.5,70.9,77.35
1,91.2,81.35,74.25,63.0,71.0,77.5
2,90.9,81.15,74.2,63.25,70.9,77.6
3,91.1,81.15,74.3,63.5,71.9,77.3
4,91.2,81.3,74.3,63.4,71.4,77.6
5,90.75,80.9,74.1,63.7,70.7,77.5
6,90.9,80.0,74.0,63.0,70.65,77.3
7,91.3,81.25,74.35,63.3,70.9,77.5
8,91.0,81.2,74.15,63.15,70.75,77.3
9,91.0,81.2,74.2,63.05,70.9,77.65


## Przefiltrowane wyniki

In [289]:
sanitized_triangles_measurements = remove_outliers(raw_triangles_measurements)
display(sanitized_triangles_measurements)

Unnamed: 0,a,b,c,ha,hb,hc
0,91.1,81.3,74.3,63.5,70.9,77.35
1,91.2,81.35,74.25,63.0,71.0,77.5
2,90.9,81.15,74.2,63.25,70.9,77.6
3,91.1,81.15,74.3,63.5,71.9,77.3
4,91.2,81.3,74.3,63.4,71.4,77.6
5,90.75,80.9,74.1,63.7,70.7,77.5
7,91.3,81.25,74.35,63.3,70.9,77.5
8,91.0,81.2,74.15,63.15,70.75,77.3
9,91.0,81.2,74.2,63.05,70.9,77.65
10,91.1,81.25,74.3,63.25,70.75,77.5


## Średnie pomiarów boków

In [290]:
means = compute_column_means(sanitized_triangles_measurements)
# rename columns to have a mean sign above them in latex
means.columns = [f"s({col})" for col in means.columns]
display(means)

Unnamed: 0,s(a),s(b),s(c),s(ha),s(hb),s(hc)
0,91.045833,81.216667,74.220833,63.258333,70.933333,77.433333


## Niepewności standardowe mierzonych wartości

In [291]:
standard_uncertainty = compute_standard_undertainty(sanitized_triangles_measurements)
standard_uncertainty.columns = [f"u(s({col}))" for col in standard_uncertainty.columns]
display(standard_uncertainty)

Unnamed: 0,u(s(a)),u(s(b)),u(s(c)),u(s(ha)),u(s(hb)),u(s(hc))
0,0.047457,0.047805,0.026442,0.0712,0.109464,0.045366


## Niepewności standardowe całkowite

In [292]:
uncertainty = compute_measurement_uncertainty(sanitized_triangles_measurements, 0.05)
uncertainty.columns = [f"u({col})" for col in uncertainty.columns]
display(uncertainty)

Unnamed: 0,u(a),u(b),u(c),u(ha),u(hb),u(hc)
0,0.068936,0.069176,0.056561,0.087003,0.120343,0.067514


## Niepewności standardowe rozszerzone

In [293]:
extended_uncertainties = compute_extended_standard_uncertainty(sanitized_triangles_measurements)
extended_uncertainties.columns = [f"U({col})" for col in extended_uncertainties.columns]
display(extended_uncertainties)

Unnamed: 0,U(a),U(b),U(c),U(ha),U(hb),U(hc)
0,0.172745,0.174012,0.096249,0.259168,0.398448,0.165133


# Analiza pól trójkątów

In [294]:

def compute_triangle_surface_heron_method(df: pd.DataFrame) -> pd.DataFrame:
    # Compute the surface of the triangle using the Heron's method
    a = df['a']
    b = df['b']
    c = df['c']
    s = (a + b + c) / 2
    surface = np.sqrt(s * (s - a) * (s - b) * (s - c))
    
    return surface

def compute_triangle_surface_area(df: pd.DataFrame, side: str, height: str, descriptor: str) -> pd.DataFrame:
    # Compute the surface of the triangle using the area formula
    a = df[side]
    h = df[height]
    surface = a * h / 2
    
    # return series with results
    return surface

def compute_triangle_surface_all_methods(df: pd.DataFrame) -> pd.DataFrame:
    # Compute the surface of the triangle using all methods and create new data frame with just sufraces
    surfaces = pd.DataFrame()
    surfaces["area_heron"] = compute_triangle_surface_heron_method(df)
    surfaces["area_ah"] = compute_triangle_surface_area(df, "a", "ha", "a")
    surfaces["area_bh"] = compute_triangle_surface_area(df, "b", "hb", "b")
    surfaces["area_ch"] = compute_triangle_surface_area(df, "c", "hc", "c")

    return surfaces

In [295]:

def compute_combined_standard_deviation(areas_df: pd.DataFrame, means_df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the combined standard deviation for each row in the areas_df DataFrame.
    defined as sqrt(sum((area - mean)**2) / n(n-1)) and return dataframe with results
    """

    combined_std = np.sqrt(((areas_df - means_df)**2).sum() / (len(areas_df) * (len(areas_df) - 1)))
    
    # Create a new DataFrame with one row
    result = pd.DataFrame([combined_std.values], columns=combined_std.index)
    return result
    # compute the combined standard deviation for each row


In [296]:
raw_sufraces = compute_triangle_surface_all_methods(sanitized_triangles_measurements)
display(raw_sufraces)

Unnamed: 0,area_heron,area_ah,area_bh,area_ch
0,2864.421615,2892.425,2882.085,2873.5525
1,2865.697052,2872.8,2887.925,2877.1875
2,2854.502018,2874.7125,2876.7675,2878.96
3,2860.579109,2892.425,2917.3425,2871.695
4,2865.941423,2891.04,2902.41,2882.84
5,2842.806665,2890.3875,2859.815,2871.375
7,2867.695318,2889.645,2880.3125,2881.0625
8,2855.779797,2873.325,2872.45,2865.8975
9,2857.30088,2868.775,2878.54,2880.815
10,2863.142005,2881.0375,2874.21875,2879.125


In [297]:
sanitized_surfaces = remove_outliers(raw_sufraces)
display(sanitized_surfaces)

Unnamed: 0,area_heron,area_ah,area_bh,area_ch
0,2864.421615,2892.425,2882.085,2873.5525
1,2865.697052,2872.8,2887.925,2877.1875
2,2854.502018,2874.7125,2876.7675,2878.96
3,2860.579109,2892.425,2917.3425,2871.695
4,2865.941423,2891.04,2902.41,2882.84
5,2842.806665,2890.3875,2859.815,2871.375
7,2867.695318,2889.645,2880.3125,2881.0625
8,2855.779797,2873.325,2872.45,2865.8975
9,2857.30088,2868.775,2878.54,2880.815
10,2863.142005,2881.0375,2874.21875,2879.125


## Średnie wartości pól

In [298]:
mean_sufraces = compute_column_means(raw_sufraces)
display(mean_sufraces)

Unnamed: 0,area_heron,area_ah,area_bh,area_ch
0,2859.056424,2879.704375,2880.483646,2873.585833


## Średnie wartości na podstawie śrenich z pomiarów

In [299]:
# rename columns in means to be a,b,c, ha, hb, hc in mean 
means_tmp = means.copy()
means_tmp.columns = ["a", "b", "c", "ha", "hb", "hc"]
mean_surfaces_from_means = compute_triangle_surface_all_methods(means_tmp)
display(mean_surfaces_from_means)

Unnamed: 0,area_heron,area_ah,area_bh,area_ch
0,2859.057058,2879.703837,2880.484444,2873.583264


## Odchylenia standardowe pól

In [300]:
standard_deviations = raw_sufraces.std().to_frame().T
standard_deviations.columns = [f"s({col})" for col in standard_deviations.columns]
display(standard_deviations)

Unnamed: 0,s(area_heron),s(area_ah),s(area_bh),s(area_ch)
0,8.043574,12.495609,16.31731,7.9147


## Złozone niepewnosci pomiarowe

In [301]:

combined_standard_deviations = compute_combined_standard_deviation(raw_sufraces, mean_sufraces)
display(combined_standard_deviations)

Unnamed: 0,area_heron,area_ah,area_bh,area_ch
0,0.46698,1.107188,0.13938,0.002901
