<a href="https://colab.research.google.com/github/iGeology-Illinois/geol-581-module-3-principles-of-uncertainty-chapter-4-seanb7/blob/main/Python%20Lab%204.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Database Setup & Data Import
# Create SQLite data tables with CPT data collected from W. Roosevelt and S. Clark Site
import os
import pandas as pd
from sqlalchemy import create_engine

# 1) Create an engine and connect to 'cpt_data.db' (it will be created if not existing)
engine = create_engine('sqlite:///Lab 4 Site Investigation.db')

# 2) Read Excel files into a DataFrame
# Get the current working directory
current_directory = os.getcwd()

# List of Excel file names
file_names = [
    '24-61-27761_CP24-COOK-01-BSC.XLS',
    '24-61-27761_SP24-COOK-02-BSC.XLS',
    '24-61-27761_CP24-COOK-03-BSC.XLS',
    '24-61-27761_CP24-COOK-04-BSC.XLS',
    '24-61-27761_CP24-COOK-06-BSC.XLS',
    '24-61-27761_SP24-COOK-07-BSC.XLS',
    '24-61-27761_SP24-COOK-02-OFF01-BSC.XLS',
    '24-61-27761_SP24-COOK-02-OFF02-BSC.XLS',
    '24-61-27761_CP24-COOK-06-OFF01-BSC.XLS'
]

# Loop through the file names and read each Excel file
for file_name in file_names:
    # Construct the full file path
    file_path = os.path.join(current_directory, file_name)

    # Check if the file exists
    if os.path.exists(file_path):
        # Read the Excel file into a DataFrame
        df = pd.read_excel(file_path, sheet_name='Sheet1')
        print(f"Successfully read: {file_name}")  # Print a success message
    else:
        print(f"File not found: {file_name}")  # Print an error message

# Example: rename columns or filter out header rows, if needed
# Suppose we skip the first 39 rows with metadata, and then read the main data
df_main = pd.read_excel('24-61-27761_CP24-COOK-01-BSC.XLS', sheet_name='Sheet1', skiprows=39)
df_main = pd.read_excel('24-61-27761_SP24-COOK-02-BSC.XLS', sheet_name='Sheet1', skiprows=39)
df_main = pd.read_excel('24-61-27761_CP24-COOK-03-BSC.XLS', sheet_name='Sheet1', skiprows=39)
df_main = pd.read_excel('24-61-27761_CP24-COOK-04-BSC.XLS', sheet_name='Sheet1', skiprows=39)
df_main = pd.read_excel('24-61-27761_CP24-COOK-06-BSC.XLS', sheet_name='Sheet1', skiprows=39)
df_main = pd.read_excel('24-61-27761_SP24-COOK-07-BSC.XLS', sheet_name='Sheet1', skiprows=39)
df_main = pd.read_excel('24-61-27761_SP24-COOK-02-OFF01-BSC.XLS', sheet_name='Sheet1', skiprows=39)
df_main = pd.read_excel('24-61-27761_SP24-COOK-02-OFF02-BSC.XLS', sheet_name='Sheet1', skiprows=39)
df_main = pd.read_excel('24-61-27761_CP24-COOK-06-OFF01-BSC.XLS', sheet_name='Sheet1', skiprows=39)
df_main.columns = ['Layer', 'Depth_m', 'Depth_ft', 'qc_tsf', 'qt_tsf', 'fs_tsf', 'u_ft', 'Rf_pct']

# 3) Write to SQL table
# Use a context manager to ensure the connection is closed properly
with engine.begin() as conn: # using a context manager to handle the connection
    df_main.to_sql('cpt_cook_01', con=conn, if_exists='replace', index=False)
    df_main.to_sql('cpt_cook_02', con=conn, if_exists='replace', index=False)
    df_main.to_sql('cpt_cook_03', con=conn, if_exists='replace', index=False)
    df_main.to_sql('cpt_cook_04', con=conn, if_exists='replace', index=False)
    df_main.to_sql('cpt_cook_06', con=conn, if_exists='replace', index=False)
    df_main.to_sql('cpt_cook_07', con=conn, if_exists='replace', index=False)
    df_main.to_sql('cpt_cook_02_off01', con=conn, if_exists='replace', index=False)
    df_main.to_sql('cpt_cook_02_off02', con=conn, if_exists='replace', index=False)
    df_main.to_sql('cpt_cook_06_off01', con=conn, if_exists='replace', index=False)

# 4) Run a SELECT query to see results
# Correct the query to select from one table and apply the LIMIT clause correctly
query1 = "SELECT * FROM cpt_cook_01 LIMIT 453"
query2 = "SELECT * FROM cpt_cook_02 LIMIT 453"
query3 = "SELECT * FROM cpt_cook_03 LIMIT 453"
query4 = "SELECT * FROM cpt_cook_04 LIMIT 453"
query5 = "SELECT * FROM cpt_cook_06 LIMIT 453"
query6 = "SELECT * FROM cpt_cook_07 LIMIT 453"
query7 = "SELECT * FROM cpt_cook_02_off01 LIMIT 453"
query8 = "SELECT * FROM cpt_cook_02_off02 LIMIT 453"
query9 = "SELECT * FROM cpt_cook_06_off01 LIMIT 453"
result1 = pd.read_sql(query1, con=engine)
result2 = pd.read_sql(query2, con=engine)
result3 = pd.read_sql(query3, con=engine)
result4 = pd.read_sql(query4, con=engine)
result5 = pd.read_sql(query5, con=engine)
result6 = pd.read_sql(query6, con=engine)
result7 = pd.read_sql(query7, con=engine)
result8 = pd.read_sql(query8, con=engine)
result9 = pd.read_sql(query9, con=engine)
print(result1)
print(result2)
print(result3)
print(result4)
print(result5)
print(result6)
print(result7)
print(result8)
print(result9)

Successfully read: 24-61-27761_CP24-COOK-01-BSC.XLS
Successfully read: 24-61-27761_SP24-COOK-02-BSC.XLS
Successfully read: 24-61-27761_CP24-COOK-03-BSC.XLS
Successfully read: 24-61-27761_CP24-COOK-04-BSC.XLS
Successfully read: 24-61-27761_CP24-COOK-06-BSC.XLS
Successfully read: 24-61-27761_SP24-COOK-07-BSC.XLS
Successfully read: 24-61-27761_SP24-COOK-02-OFF01-BSC.XLS
Successfully read: 24-61-27761_SP24-COOK-02-OFF02-BSC.XLS
Successfully read: 24-61-27761_CP24-COOK-06-OFF01-BSC.XLS
    Layer  Depth_m  Depth_ft   qc_tsf      qt_tsf  fs_tsf   u_ft    Rf_pct
0       1    0.025   0.08202   13.162   13.162893   0.159  0.143  1.207941
1       2    0.050   0.16404   41.110   41.115450   0.758  0.873  1.843589
2       3    0.075   0.24606   50.933   50.947240   1.032  2.281  2.025625
3       4    0.100   0.32808   55.949   55.970157   1.271  3.389  2.270853
4       5    0.125   0.41010   66.339   66.368135   1.897  4.667  2.858299
5       6    0.150   0.49212   93.819   93.853141   1.899  5.469

In [32]:
# Data Cleaning and Validation
import sqlite3

# Connect to the SQLite database
# Use the same database file path
conn = sqlite3.connect('Lab 4 Site Investigation.db')
cursor = conn.cursor()

# Remove rows with null values in the 'Rf_pct' and fs_tsf column (assuming 'friction ration' maps to 'Rf_pct')
cursor.execute('DELETE FROM cpt_cook_01 WHERE Rf_pct IS NULL OR fs_tsf IS NULL')
cursor.execute('DELETE FROM cpt_cook_02 WHERE Rf_pct IS NULL OR fs_tsf IS NULL')
cursor.execute('DELETE FROM cpt_cook_03 WHERE Rf_pct IS NULL OR fs_tsf IS NULL')
cursor.execute('DELETE FROM cpt_cook_04 WHERE Rf_pct IS NULL OR fs_tsf IS NULL')
cursor.execute('DELETE FROM cpt_cook_06 WHERE Rf_pct IS NULL OR fs_tsf IS NULL')
cursor.execute('DELETE FROM cpt_cook_07 WHERE Rf_pct IS NULL OR fs_tsf IS NULL')
cursor.execute('DELETE FROM cpt_cook_02_off01 WHERE Rf_pct IS NULL OR fs_tsf IS NULL')
cursor.execute('DELETE FROM cpt_cook_02_off02 WHERE Rf_pct IS NULL OR fs_tsf IS NULL')
cursor.execute('DELETE FROM cpt_cook_06_off01 WHERE Rf_pct IS NULL OR fs_tsf IS NULL')

# Retrieving average tip resistance (qt) for cpt_cook_01
query_avg1 = """
    SELECT AVG(qt_tsf) AS avg_qt
    FROM cpt_cook_01
"""
query_avg2 = """
    SELECT AVG(qt_tsf) AS avg_qt
    FROM cpt_cook_02
"""
query_avg3 = """
    SELECT AVG(qt_tsf) AS avg_qt
    FROM cpt_cook_03
"""
query_avg4 = """
    SELECT AVG(qt_tsf) AS avg_qt
    FROM cpt_cook_04
"""
query_avg5 = """
    SELECT AVG(qt_tsf) AS avg_qt
    FROM cpt_cook_06
"""
query_avg6 = """
    SELECT AVG(qt_tsf) AS avg_qt
    FROM cpt_cook_07
"""
query_avg7 = """
    SELECT AVG(qt_tsf) AS avg_qt
    FROM cpt_cook_02_off01
"""
query_avg8 = """
    SELECT AVG(qt_tsf) AS avg_qt
    FROM cpt_cook_02_off02
"""
query_avg9 = """
    SELECT AVG(qt_tsf) AS avg_qt
    FROM cpt_cook_06_off01
"""

avg_qt1 = pd.read_sql(query_avg1, con=engine)
avg_qt2 = pd.read_sql(query_avg2, con=engine)
avg_qt3 = pd.read_sql(query_avg3, con=engine)
avg_qt4 = pd.read_sql(query_avg4, con=engine)
avg_qt5 = pd.read_sql(query_avg5, con=engine)
avg_qt6 = pd.read_sql(query_avg6, con=engine)
avg_qt7 = pd.read_sql(query_avg7, con=engine)
avg_qt8 = pd.read_sql(query_avg8, con=engine)
avg_qt9 = pd.read_sql(query_avg9, con=engine)

print("Average qt (tsf):", avg_qt1['avg_qt'][0])
print("Average qt (tsf):", avg_qt2['avg_qt'][0])
print("Average qt (tsf):", avg_qt3['avg_qt'][0])
print("Average qt (tsf):", avg_qt4['avg_qt'][0])
print("Average qt (tsf):", avg_qt5['avg_qt'][0])
print("Average qt (tsf):", avg_qt6['avg_qt'][0])
print("Average qt (tsf):", avg_qt7['avg_qt'][0])
print("Average qt (tsf):", avg_qt8['avg_qt'][0])
print("Average qt (tsf):", avg_qt9['avg_qt'][0])

# Finding rows with negative friction ratio
query_neg_rf1 = """
    SELECT *
    FROM cpt_cook_01
    WHERE rf_pct < 0
"""
query_neg_rf2 = """
    SELECT *
    FROM cpt_cook_02
    WHERE rf_pct < 0
"""
query_neg_rf3 = """
    SELECT *
    FROM cpt_cook_03
    WHERE rf_pct < 0
"""
query_neg_rf4 = """
    SELECT *
    FROM cpt_cook_04
    WHERE rf_pct < 0
"""
query_neg_rf5 = """
    SELECT *
    FROM cpt_cook_06
    WHERE rf_pct < 0
"""
query_neg_rf6 = """
    SELECT *
    FROM cpt_cook_07
    WHERE rf_pct < 0
"""
query_neg_rf7 = """
    SELECT *
    FROM cpt_cook_02_off01
    WHERE rf_pct < 0
"""
query_neg_rf8 = """
    SELECT *
    FROM cpt_cook_02_off02
    WHERE rf_pct < 0
"""
query_neg_rf9 = """
    SELECT *
    FROM cpt_cook_06_off01
    WHERE rf_pct < 0
"""
negatives1 = pd.read_sql(query_neg_rf1, con=engine)
negatives2 = pd.read_sql(query_neg_rf2, con=engine)
negatives3 = pd.read_sql(query_neg_rf3, con=engine)
negatives4 = pd.read_sql(query_neg_rf4, con=engine)
negatives5 = pd.read_sql(query_neg_rf5, con=engine)
negatives6 = pd.read_sql(query_neg_rf6, con=engine)
negatives7 = pd.read_sql(query_neg_rf7, con=engine)
negatives8 = pd.read_sql(query_neg_rf8, con=engine)
negatives9 = pd.read_sql(query_neg_rf9, con=engine)

print("Negative friction ratio rows:\n", negatives1)
print("Negative friction ratio rows:\n", negatives2)
print("Negative friction ratio rows:\n", negatives3)
print("Negative friction ratio rows:\n", negatives4)
print("Negative friction ratio rows:\n", negatives5)
print("Negative friction ratio rows:\n", negatives6)
print("Negative friction ratio rows:\n", negatives7)
print("Negative friction ratio rows:\n", negatives8)
print("Negative friction ratio rows:\n", negatives9)


# Remove duplicate rows based on a specific column - Changed 'friction ration' to 'Rf_pct'
cursor.execute('''
    DELETE FROM cpt_cook_01
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_01
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_02
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_02
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_03
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_03
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_04
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_04
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_06
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_06
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_07
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_07
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_02_off01
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_02_off01
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_02_off02
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_02_off02
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_06_off01
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_06_off01
        GROUP BY Rf_pct
    )
''')

 # Fetch cleaned data into DataFrames for printing and further processing
cleaned_dataframe1 = pd.read_sql('SELECT * FROM cpt_cook_01', con=engine)
cleaned_dataframe2 = pd.read_sql('SELECT * FROM cpt_cook_02', con=engine)
cleaned_dataframe3 = pd.read_sql('SELECT * FROM cpt_cook_03', con=engine)
cleaned_dataframe4 = pd.read_sql('SELECT * FROM cpt_cook_04', con=engine)
cleaned_dataframe6 = pd.read_sql('SELECT * FROM cpt_cook_06', con=engine)
cleaned_dataframe7 = pd.read_sql('SELECT * FROM cpt_cook_07', con=engine)
cleaned_dataframe2_off01 = pd.read_sql('SELECT * FROM cpt_cook_02_off01', con=engine)
cleaned_dataframe2_off02 = pd.read_sql('SELECT * FROM cpt_cook_02_off02', con=engine)
cleaned_dataframe6_off01 = pd.read_sql('SELECT * FROM cpt_cook_06_off01', con=engine)

# Commit the changes
conn.commit()


print("Duplicate rows removed successfully.")
print("Data cleaning and validation completed.")

# Print Cleaned Dataframes
print(cleaned_dataframe1)
print(cleaned_dataframe2)
print(cleaned_dataframe3)
print(cleaned_dataframe4)
print(cleaned_dataframe6)
print(cleaned_dataframe7)
print(cleaned_dataframe2_off01)
print(cleaned_dataframe2_off02)

Average qt (tsf): 100.09606497446707
Average qt (tsf): 100.09606497446707
Average qt (tsf): 100.09606497446707
Average qt (tsf): 100.09606497446707
Average qt (tsf): 100.09606497446707
Average qt (tsf): 100.09606497446707
Average qt (tsf): 100.09606497446707
Average qt (tsf): 100.09606497446707
Average qt (tsf): 100.09606497446707
Negative friction ratio rows:
 Empty DataFrame
Columns: [Layer, Depth_m, Depth_ft, qc_tsf, qt_tsf, fs_tsf, u_ft, Rf_pct]
Index: []
Negative friction ratio rows:
 Empty DataFrame
Columns: [Layer, Depth_m, Depth_ft, qc_tsf, qt_tsf, fs_tsf, u_ft, Rf_pct]
Index: []
Negative friction ratio rows:
 Empty DataFrame
Columns: [Layer, Depth_m, Depth_ft, qc_tsf, qt_tsf, fs_tsf, u_ft, Rf_pct]
Index: []
Negative friction ratio rows:
 Empty DataFrame
Columns: [Layer, Depth_m, Depth_ft, qc_tsf, qt_tsf, fs_tsf, u_ft, Rf_pct]
Index: []
Negative friction ratio rows:
 Empty DataFrame
Columns: [Layer, Depth_m, Depth_ft, qc_tsf, qt_tsf, fs_tsf, u_ft, Rf_pct]
Index: []
Negative fri

In [None]:
# Combine cleaned dataframes
combined_data = pd.concat([
    cleaned_dataframe1,
    cleaned_dataframe2,
    cleaned_dataframe3,
    cleaned_dataframe4,
    cleaned_dataframe6,
    cleaned_dataframe7,
    cleaned_dataframe2_off01,
    cleaned_dataframe2_off02,
    cleaned_dataframe6_off01
])
print(combined_data)

    Layer  Depth_m  Depth_ft   qc_tsf      qt_tsf  fs_tsf   u_ft    Rf_pct
0       1    0.025   0.08202   13.162   13.162893   0.159  0.143  1.207941
1       2    0.050   0.16404   41.110   41.115450   0.758  0.873  1.843589
2       3    0.075   0.24606   50.933   50.947240   1.032  2.281  2.025625
3       4    0.100   0.32808   55.949   55.970157   1.271  3.389  2.270853
4       5    0.125   0.41010   66.339   66.368135   1.897  4.667  2.858299
..    ...      ...       ...      ...         ...     ...    ...       ...
32     33    0.825   2.70666  165.495  165.516812   1.585  3.494  0.957607
33     34    0.850   2.78868  194.289  194.311580   0.000  3.617  0.000000
34     35    0.875   2.87070  234.192  234.222883   0.000  4.947  0.000000
35     36    0.900   2.95272  252.265  252.299260   0.000  5.488  0.000000
36     37    0.925   3.03474  292.813  292.851980   0.000  6.244  0.000000

[333 rows x 8 columns]


In [None]:
# Statistical Summary
# Run summary statisitcs for each file
import pandas as pd
import sqlite3

summary_statistics = combined_data.describe()
summary_statistics1 = cleaned_dataframe1.describe()
summary_statistics2 = cleaned_dataframe2.describe()
summary_statistics3 = cleaned_dataframe3.describe()
summary_statistics4 = cleaned_dataframe4.describe()
summary_statistics6 = cleaned_dataframe6.describe()
summary_statistics7 = cleaned_dataframe7.describe()
summary_statistics2_off01 = cleaned_dataframe2_off01.describe()
summary_statistics2_off02 = cleaned_dataframe2_off02.describe()
summary_statistics6_off01 = cleaned_dataframe6_off01.describe()

# Define key columns for statistical analysis. Depth.1 is in ft.
key_columns = [ "Depth.1", "qc", "qt", "fs", "u", "Rf",]

print(summary_statistics)
print(summary_statistics1)
print(summary_statistics2)
print(summary_statistics3)
print(summary_statistics4)
print(summary_statistics6)
print(summary_statistics7)
print(summary_statistics2_off01)
print(summary_statistics2_off02)
print(summary_statistics6_off01)

            Layer     Depth_m    Depth_ft      qc_tsf      qt_tsf      fs_tsf  \
count  333.000000  333.000000  333.000000  333.000000  333.000000  333.000000   
mean    19.000000    0.475000    1.558380  113.009432  113.044333    1.223000   
std     10.693146    0.267329    0.877052   61.025649   61.027489    0.737575   
min      1.000000    0.025000    0.082020   13.162000   13.162893    0.000000   
25%     10.000000    0.250000    0.820200   73.284000   73.322330    0.822000   
50%     19.000000    0.475000    1.558380   93.406000   93.441727    1.232000   
75%     28.000000    0.700000    2.296560  132.125000  132.144165    1.731000   
max     37.000000    0.925000    3.034740  292.813000  292.851980    3.669000   

             u_ft      Rf_pct  
count  333.000000  333.000000  
mean     5.590649    1.373041  
std      1.960417    0.963863  
min      0.143000    0.000000  
25%      4.667000    0.957607  
50%      6.251000    1.207941  
75%      7.098000    1.477400  
max      7.952

In [42]:
# Note of any differences between soundings (deeper phreatic) Cpt_Cook_07 and CPT_Cook_01
# Assign a unique sounding ID to each DataFrame
cleaned_dataframe1['sounding_id'] = 'CP24-COOK-01-BSC'
cleaned_dataframe2['sounding_id'] = 'SP24-COOK-02-BSC'
cleaned_dataframe3['sounding_id'] = 'CP24-COOK-03-BSC'
cleaned_dataframe4['sounding_id'] = 'CP24-COOK-04-BSC'
cleaned_dataframe6['sounding_id'] = 'CP24-COOK-06-BSC'
cleaned_dataframe7['sounding_id'] = 'SP24-COOK-07-BSC'
cleaned_dataframe2_off01['sounding_id'] = 'SP24-COOK-02-OFF01-BSC'
cleaned_dataframe2_off02['sounding_id'] = 'SP24-COOK-02-OFF02-BSC'
cleaned_dataframe6_off01['sounding_id'] = 'CP24-COOK-06-OFF01-BSC'

# Now, concatenate the DataFrames
combined_data = pd.concat([
    cleaned_dataframe1,
    cleaned_dataframe2,
    cleaned_dataframe3,
    cleaned_dataframe4,
    cleaned_dataframe6,
    cleaned_dataframe7,
    cleaned_dataframe2_off01,
    cleaned_dataframe2_off02,
    cleaned_dataframe6_off01
])

# Perform the groupby operation
soundings = combined_data.groupby('sounding_id')

# Print the result
print(soundings)

# Access the descriptive statistics for each sounding using a loop
for sounding_id, group_data in soundings:
    print(f"Descriptive Statistics for {sounding_id}:\n")
    print(group_data.describe())
    print("\n")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7948b51a7850>
Descriptive Statistics for CP24-COOK-01-BSC:

           Layer    Depth_m   Depth_ft      qc_tsf      qt_tsf     fs_tsf  \
count  34.000000  34.000000  34.000000   34.000000   34.000000  34.000000   
mean   17.500000   0.437500   1.435350  100.061147  100.096065   1.330912   
std     9.958246   0.248956   0.816775   44.474026   44.476746   0.678347   
min     1.000000   0.025000   0.082020   13.162000   13.162893   0.000000   
25%     9.250000   0.231250   0.758685   71.291250   71.335227   0.892000   
50%    17.500000   0.437500   1.435350   90.181500   90.224672   1.249000   
75%    25.750000   0.643750   2.112015  120.215250  120.253905   1.768500   
max    34.000000   0.850000   2.788680  216.572000  216.619932   3.669000   

            u_ft     Rf_pct  
count  34.000000  34.000000  
mean    5.593382   1.494191  
std     2.066476   0.923042  
min     0.143000   0.000000  
25%     3.879500   1.068862  
50%     

In [None]:
# Visulation of Soil Type Interpretation Robertson et al 1986
# Import necessary libraries
import numpy as np

# Define the main function for the algorithm
def robertson_algorithm(data, parameters):
    """
    Implements the algorithm described by Robertson et al. (1986).

    Parameters:
    data (array-like): Input data for the algorithm.
    parameters (dict): Dictionary of parameters required by the algorithm.

    Returns:
    result: The result of the algorithm.
    """
    # Initialize variables
    result = []

    # Example of processing data
    for item in data:
        processed_item = item * parameters.get('multiplier', 1)
        result.append(processed_item)

    # Example of a more complex operation
    result = np.array(result) ** parameters.get('exponent', 1)

    return result

# Example usage
if __name__ == "__main__":
    # Example data and parameters
    data = [1, 2, 3, 4, 5]
    parameters = {
        'multiplier': 2,
        'exponent': 3
    }

    # Run the algorithm
    result = robertson_algorithm(data, parameters)
    print("Result:", result)

Result: [   8   64  216  512 1000]


In [45]:
# Boxplot for each key column comparing to each file

import matplotlib.pyplot as plt
import pandas as pd

# Updated key columns to match the ones used in summary statistics
key_columns = ["Depth.1", "qc", "qt", "fs", "u", "Rf"]

# **Create a dictionary to store the cleaned DataFrames**
cleaned_dataframes = {
    'CP24-COOK-01-BSC': cleaned_dataframe1,
    'SP24-COOK-02-BSC': cleaned_dataframe2,
    'CP24-COOK-03-BSC': cleaned_dataframe3,
    'CP24-COOK-04-BSC': cleaned_dataframe4,
    'CP24-COOK-06-BSC': cleaned_dataframe6,
    'SP24-COOK-07-BSC': cleaned_dataframe7,
    'SP24-COOK-02-OFF01-BSC': cleaned_dataframe2_off01,
    'SP24-COOK-02-OFF02-BSC': cleaned_dataframe2_off02,
    'CP24-COOK-06-OFF01-BSC': cleaned_dataframe6_off01
}

# Create a combined boxplot for each key column across all datasets
for column in key_columns:
    data = {}

    # Collect and convert data from all datasets where the column exists
    for file_name, df_cleaned in cleaned_dataframes.items():
        if column in df_cleaned.columns:
            df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='coerce')  # Convert to numeric, setting errors to NaN
            data[file_name] = df_cleaned[column].dropna()  # Drop NaNs for plotting

    if data:  # Only plot if there is valid numeric data
        plt.figure(figsize=(12, 6))
        plt.boxplot(data.values(), labels=data.keys(), vert=True)
        plt.title(f"Box Plot Comparison for {column}")
        plt.ylabel("Value")
        plt.xlabel("Dataset")
        plt.xticks(rotation=30)
        plt.grid(True)
        plt.show()