<a href="https://colab.research.google.com/github/iGeology-Illinois/geol-581-module-3-principles-of-uncertainty-chapter-4-seanb7/blob/main/Python%20Lab%204.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [88]:
# Database Setup & Data Import
# Create SQLite data tables with CPT data collected from W. Roosevelt and S. Clark Site
import os
import pandas as pd
from sqlalchemy import create_engine

# 1) Create an engine and connect to 'cpt_data.db' (it will be created if not existing)
engine = create_engine('sqlite:///Lab 4 Site Investigation.db')

# 2) Read Excel files into a DataFrame
# Get the current working directory
current_directory = os.getcwd()

# List of Excel file names
file_names = [
    '24-61-27761_CP24-COOK-01-BSC.XLS',
    '24-61-27761_CP24-COOK-02-BSC.XLS',
    '24-61-27761_CP24-COOK-03-BSC.XLS',
    '24-61-27761_CP24-COOK-04-BSC.XLS',
    '24-61-27761_CP24-COOK-06-BSC.XLS',
    '24-61-27761_CP24-COOK-07-BSC.XLS',
    '24-61-27761_CP24-COOK-02-OFF01-BSC.XLS',
    '24-61-27761_CP24-COOK-02-OFF02-BSC.XLS',
    '24-61-27761_CP24-COOK-06-OFF01-BSC.XLS'
]

# Loop through the file names and read each Excel file
for file_name in file_names:
    # Construct the full file path
    file_path = os.path.join(current_directory, file_name)

    # Check if the file exists
    if os.path.exists(file_path):
        # Read the Excel file into a DataFrame
        df = pd.read_excel(file_path, sheet_name='Sheet1')
        print(f"Successfully read: {file_name}")  # Print a success message
    else:
        print(f"File not found: {file_name}")  # Print an error message

# Example: rename columns or filter out header rows, if needed
# Suppose we skip the first 39 rows with metadata, and then read the main data
df_main = pd.read_excel('24-61-27761_CP24-COOK-01-BSC.XLS', sheet_name='Sheet1', skiprows=39)
df_main.columns = ['Layer', 'Depth_m', 'Depth_ft', 'qc_tsf', 'qt_tsf', 'fs_tsf', 'u_ft', 'Rf_pct']

# 3) Write to SQL table
df_main.to_sql('cpt_cook_01', con=engine, if_exists='replace', index=False)
df_main.to_sql('cpt_cook_02', con=engine, if_exists='replace', index=False)
df_main.to_sql('cpt_cook_03', con=engine, if_exists='replace', index=False)
df_main.to_sql('cpt_cook_04', con=engine, if_exists='replace', index=False)
df_main.to_sql('cpt_cook_06', con=engine, if_exists='replace', index=False)
df_main.to_sql('cpt_cook_07', con=engine, if_exists='replace', index=False)
df_main.to_sql('cpt_cook_02_off01', con=engine, if_exists='replace', index=False)
df_main.to_sql('cpt_cook_02_off02', con=engine, if_exists='replace', index=False)
df_main.to_sql('cpt_cook_06_off01', con=engine, if_exists='replace', index=False)

# 4) Run a SELECT query to see results
# Correct the query to select from one table and apply the LIMIT clause correctly
query1 = "SELECT * FROM cpt_cook_01 LIMIT 5"
query2 = "SELECT * FROM cpt_cook_02 LIMIT 5"
query3 = "SELECT * FROM cpt_cook_03 LIMIT 5"
query4 = "SELECT * FROM cpt_cook_04 LIMIT 5"
query5 = "SELECT * FROM cpt_cook_06 LIMIT 5"
query6 = "SELECT * FROM cpt_cook_07 LIMIT 5"
query7 = "SELECT * FROM cpt_cook_02_off01 LIMIT 5"
query8 = "SELECT * FROM cpt_cook_02_off02 LIMIT 5"
query9 = "SELECT * FROM cpt_cook_06_off01 LIMIT 5"
result1 = pd.read_sql(query1, con=engine)
result2 = pd.read_sql(query2, con=engine)
result3 = pd.read_sql(query3, con=engine)
result4 = pd.read_sql(query4, con=engine)
result5 = pd.read_sql(query5, con=engine)
result6 = pd.read_sql(query6, con=engine)
result7 = pd.read_sql(query7, con=engine)
result8 = pd.read_sql(query8, con=engine)
result9 = pd.read_sql(query9, con=engine)
print(result1)
print(result2)
print(result3)
print(result4)
print(result5)
print(result6)
print(result7)
print(result8)
print(result9)

Successfully read: 24-61-27761_CP24-COOK-01-BSC.XLS
File not found: 24-61-27761_CP24-COOK-02-BSC.XLS
Successfully read: 24-61-27761_CP24-COOK-03-BSC.XLS
Successfully read: 24-61-27761_CP24-COOK-04-BSC.XLS
Successfully read: 24-61-27761_CP24-COOK-06-BSC.XLS
File not found: 24-61-27761_CP24-COOK-07-BSC.XLS
File not found: 24-61-27761_CP24-COOK-02-OFF01-BSC.XLS
File not found: 24-61-27761_CP24-COOK-02-OFF02-BSC.XLS
Successfully read: 24-61-27761_CP24-COOK-06-OFF01-BSC.XLS
   Layer  Depth_m  Depth_ft  qc_tsf     qt_tsf  fs_tsf   u_ft      Rf_pct
0      1    0.025   0.08202   0.036   0.036730   0.038  0.117  103.456543
1      2    0.050   0.16404  21.657  21.678931   0.080  3.513    0.369022
2      3    0.075   0.24606  41.663  41.696486   0.097  5.364    0.232634
3      4    0.100   0.32808  64.817  64.861517   0.201  7.131    0.309891
4      5    0.125   0.41010  82.667  82.724046   0.490  9.138    0.592331
   Layer  Depth_m  Depth_ft  qc_tsf     qt_tsf  fs_tsf   u_ft      Rf_pct
0      1

In [89]:
# Data Cleaning and Validation
import sqlite3

# Connect to the SQLite database
# Use the same database file path
conn = sqlite3.connect('Lab 4 Site Investigation.db') # Fixed filename
cursor = conn.cursor()

# Remove rows with null values in the 'Rf_pct' column (assuming 'friction ration' maps to 'Rf_pct')
cursor.execute('DELETE FROM cpt_cook_01 WHERE Rf_pct IS NULL')
cursor.execute('DELETE FROM cpt_cook_02 WHERE Rf_pct IS NULL')
cursor.execute('DELETE FROM cpt_cook_03 WHERE Rf_pct IS NULL')
cursor.execute('DELETE FROM cpt_cook_04 WHERE Rf_pct IS NULL')
cursor.execute('DELETE FROM cpt_cook_06 WHERE Rf_pct IS NULL')
cursor.execute('DELETE FROM cpt_cook_07 WHERE Rf_pct IS NULL')
cursor.execute('DELETE FROM cpt_cook_02_off01 WHERE Rf_pct IS NULL')
cursor.execute('DELETE FROM cpt_cook_06_off01 WHERE Rf_pct IS NULL')

# Commit the changes after cleaning
conn.commit()

# Remove duplicate rows based on a specific column - Changed 'friction ration' to 'Rf_pct'
cursor.execute('''
    DELETE FROM cpt_cook_01
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_01
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_02
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_02
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_03
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_03
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_04
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_04
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_06
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_06
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_07
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_07
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_02_off01
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_02_off01
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_02_off02
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_02_off02
        GROUP BY Rf_pct
    )
''')
cursor.execute('''
    DELETE FROM cpt_cook_06_off01
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM cpt_cook_06_off01
        GROUP BY Rf_pct
    )
''')
# Commit the changes after cleaning
conn.commit()


In [93]:
#Statistical Summary
import sqlite3
import pandas as pd
summary_statistics = ()
key_columns = ['Layer', 'Depth_m', 'Depth_ft', 'qc_tsf', 'qt_tsf', 'fs_tsf', 'u_ft', 'Rf_pct']
plot_data = ('cpt_cook_01', 'cpt_cook_02', 'cpt_cook_03', 'cpt_cook_04', 'cpt_cook_06', 'cpt_cook_07', 'cpt_cook_02_off01', 'cpt_cook_02_off02', 'cpt_cook_06_off01')

Summary Statistics:
