In [1]:
import pandas as pd
import numpy as np
from scipy import stats

def process_json_data(json_file, population_size):
    # Read JSON file into DataFrame
    df = pd.read_json(json_file)
    
    # Ensure physicalDescriptions are strings
    df['physicalDescriptions'] = df['physicalDescriptions'].astype(str)

    # Extract the year from the 'date' column and store it in a new 'year' column
    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
    
    # Filter out rows without numerical values in physicalDescriptions
    df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
    
    # Extract and process weight (g and kg)
    df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False).fillna(0).astype(float)
    df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
    
    # Extract and process length (mm, cm, m)
    df['length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) mm', expand=False).astype(float)
    
    # Apply cm to mm conversion only where length is missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) cm', expand=False).fillna(0).astype(float) * 10
    
    # Apply m to mm conversion only where length is still missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) m', expand=False).fillna(0).astype(float) * 1000
    
    # Replace 0 values with NaN in weight and length
    df['weight_g'].replace(0, np.nan, inplace=True)
    df['length_mm'].replace(0, np.nan, inplace=True)
    
    # Sample size
    n = len(df)
    
    # Sample mean and standard deviation for weight
    sample_mean_weight = df['weight_g'].mean()
    sample_std_weight = df['weight_g'].std(ddof=1)
    
    # Standard error of the mean for weight
    sem_weight = sample_std_weight / np.sqrt(n)
    
    # Confidence level
    confidence_level = 0.95
    
    # Degrees of freedom
    df_degrees = n - 1
    
    # t-score for the confidence level
    t_score = stats.t.ppf((1 + confidence_level) / 2, df_degrees)
    
    # Margin of error for weight
    margin_of_error_weight = t_score * sem_weight
    
    # Confidence interval for weight
    confidence_interval_weight = (sample_mean_weight - margin_of_error_weight, sample_mean_weight + margin_of_error_weight)
    
    print(f"I am 95% sure the average weight of {json_file} is between: {confidence_interval_weight} grams for total population of {population_size}")
    
    # Sample mean and standard deviation for length
    sample_mean_length = df['length_mm'].mean()
    sample_std_length = df['length_mm'].std(ddof=1)
    
    # Standard error of the mean for length
    sem_length = sample_std_length / np.sqrt(n)
    
    # Margin of error for length
    margin_of_error_length = t_score * sem_length
    
    # Confidence interval for length
    confidence_interval_length = (sample_mean_length - margin_of_error_length, sample_mean_length + margin_of_error_length)
    
    print(f"I am 95% sure the average length of {json_file} is between: {confidence_interval_length} mm for total population of {population_size}")
    
    # Export the DataFrame as a JSON array instead of newline-delimited JSON
    output_file = json_file.replace('.json', '_norm.json')
    df.to_json(output_file, orient='records', lines=False)  # Remove lines=True to generate proper JSON array
    print(f"Normalized data saved to: {output_file}")

# Example usage
process_json_data('mammals_rand_2.json', 542039)
df = pd.read_json('mammals_rand_2_norm.json', orient='records', lines=False)

I am 95% sure the average weight of mammals_rand_2.json is between: (776.8295924453171, 985.1758975441455) grams for total population of 542039
I am 95% sure the average length of mammals_rand_2.json is between: (442.5705479528069, 510.12521619144184) mm for total population of 542039
Normalized data saved to: mammals_rand_2_norm.json


  df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weight_g'].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['length_mm'].replace(0, np.nan, inplace=True)


In [3]:
import pandas as pd
import numpy as np
from scipy import stats

def process_json_data(json_file, population_size):
    # Read JSON file into DataFrame
    df = pd.read_json(json_file)
    
    # Ensure physicalDescriptions are strings
    df['physicalDescriptions'] = df['physicalDescriptions'].astype(str)

    # Extract the year from the 'date' column and store it in a new 'year' column
    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
    
    # Filter out rows without numerical values in physicalDescriptions
    df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
    
    # Extract and process weight (g and kg)
    df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False).fillna(0).astype(float)
    df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
    
    # Extract and process length (mm, cm, m)
    df['length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) mm', expand=False).astype(float)
    
    # Apply cm to mm conversion only where length is missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) cm', expand=False).fillna(0).astype(float) * 10
    
    # Apply m to mm conversion only where length is still missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) m', expand=False).fillna(0).astype(float) * 1000
    
    # Replace 0 values with NaN in weight and length
    df['weight_g'].replace(0, np.nan, inplace=True)
    df['length_mm'].replace(0, np.nan, inplace=True)
    
    # Sample size
    n = len(df)
    
    # Sample mean and standard deviation for weight
    sample_mean_weight = df['weight_g'].mean()
    sample_std_weight = df['weight_g'].std(ddof=1)
    
    # Standard error of the mean for weight
    sem_weight = sample_std_weight / np.sqrt(n)
    
    # Confidence level
    confidence_level = 0.95
    
    # Degrees of freedom
    df_degrees = n - 1
    
    # t-score for the confidence level
    t_score = stats.t.ppf((1 + confidence_level) / 2, df_degrees)
    
    # Margin of error for weight
    margin_of_error_weight = t_score * sem_weight
    
    # Confidence interval for weight
    confidence_interval_weight = (sample_mean_weight - margin_of_error_weight, sample_mean_weight + margin_of_error_weight)
    
    print(f"I am 95% sure the average weight of {json_file} is between: {confidence_interval_weight} grams for total population of {population_size}")
    
    # Sample mean and standard deviation for length
    sample_mean_length = df['length_mm'].mean()
    sample_std_length = df['length_mm'].std(ddof=1)
    
    # Standard error of the mean for length
    sem_length = sample_std_length / np.sqrt(n)
    
    # Margin of error for length
    margin_of_error_length = t_score * sem_length
    
    # Confidence interval for length
    confidence_interval_length = (sample_mean_length - margin_of_error_length, sample_mean_length + margin_of_error_length)
    
    print(f"I am 95% sure the average length of {json_file} is between: {confidence_interval_length} mm for total population of {population_size}")
    
    # Export the DataFrame as a JSON array instead of newline-delimited JSON
    output_file = json_file.replace('.json', '_norm.json')
    df.to_json(output_file, orient='records', lines=False)  # Remove lines=True to generate proper JSON array
    print(f"Normalized data saved to: {output_file}")

# Example usage
process_json_data('fishes_rand_2.json', 10705)
df = pd.read_json('fishes_rand_2_norm.json', orient='records', lines=False)

I am 95% sure the average weight of fishes_rand_2.json is between: (nan, nan) grams for total population of 10705
I am 95% sure the average length of fishes_rand_2.json is between: (96.20364166632231, 101.6979589434338) mm for total population of 10705
Normalized data saved to: fishes_rand_2_norm.json


  df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
  df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False).fillna(0).astype(float)
  df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weight_g'].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using '

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

def process_json_data(json_file, population_size):
    # Read JSON file into DataFrame
    df = pd.read_json(json_file)
    
    # Ensure physicalDescriptions are strings
    df['physicalDescriptions'] = df['physicalDescriptions'].astype(str)

    # Extract the year from the 'date' column and store it in a new 'year' column
    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
    
    # Filter out rows without numerical values in physicalDescriptions
    df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
    
    # Extract and process weight (g and kg)
    df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False).fillna(0).astype(float)
    df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
    
    # Extract and process length (mm, cm, m)
    df['length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) mm', expand=False).astype(float)
    
    # Apply cm to mm conversion only where length is missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) cm', expand=False).fillna(0).astype(float) * 10
    
    # Apply m to mm conversion only where length is still missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) m', expand=False).fillna(0).astype(float) * 1000
    
    # Replace 0 values with NaN in weight and length
    df['weight_g'].replace(0, np.nan, inplace=True)
    df['length_mm'].replace(0, np.nan, inplace=True)
    
    # Sample size
    n = len(df)
    
    # Sample mean and standard deviation for weight
    sample_mean_weight = df['weight_g'].mean()
    sample_std_weight = df['weight_g'].std(ddof=1)
    
    # Standard error of the mean for weight
    sem_weight = sample_std_weight / np.sqrt(n)
    
    # Confidence level
    confidence_level = 0.95
    
    # Degrees of freedom
    df_degrees = n - 1
    
    # t-score for the confidence level
    t_score = stats.t.ppf((1 + confidence_level) / 2, df_degrees)
    
    # Margin of error for weight
    margin_of_error_weight = t_score * sem_weight
    
    # Confidence interval for weight
    confidence_interval_weight = (sample_mean_weight - margin_of_error_weight, sample_mean_weight + margin_of_error_weight)
    
    print(f"I am 95% sure the average weight of {json_file} is between: {confidence_interval_weight} grams for total population of {population_size}")
    
    # Sample mean and standard deviation for length
    sample_mean_length = df['length_mm'].mean()
    sample_std_length = df['length_mm'].std(ddof=1)
    
    # Standard error of the mean for length
    sem_length = sample_std_length / np.sqrt(n)
    
    # Margin of error for length
    margin_of_error_length = t_score * sem_length
    
    # Confidence interval for length
    confidence_interval_length = (sample_mean_length - margin_of_error_length, sample_mean_length + margin_of_error_length)
    
    print(f"I am 95% sure the average length of {json_file} is between: {confidence_interval_length} mm for total population of {population_size}")
    
    # Export the DataFrame as a JSON array instead of newline-delimited JSON
    output_file = json_file.replace('.json', '_norm.json')
    df.to_json(output_file, orient='records', lines=False)  # Remove lines=True to generate proper JSON array
    print(f"Normalized data saved to: {output_file}")

# Example usage
process_json_data('inv_rand_2.json', 45245)
df = pd.read_json('inv_rand_2_norm.json', orient='records', lines=False)

I am 95% sure the average weight of inv_rand_2.json is between: (nan, nan) grams for total population of 45245
I am 95% sure the average length of inv_rand_2.json is between: (28.417793918070984, 37.22003216888554) mm for total population of 45245
Normalized data saved to: inv_rand_2_norm.json


  df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
  df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False).fillna(0).astype(float)
  df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
  df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) cm', expand=False).fillna(0).astype(float) * 10
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weight_g'].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the int

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

def process_json_data(json_file, population_size):
    # Read JSON file into DataFrame
    df = pd.read_json(json_file)
    
    # Ensure physicalDescriptions are strings
    df['physicalDescriptions'] = df['physicalDescriptions'].astype(str)

    # Extract the year from the 'date' column and store it in a new 'year' column
    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
    
    # Filter out rows without numerical values in physicalDescriptions
    df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
    
    # Extract and process weight (g and kg)
    df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False).fillna(0).astype(float)
    df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
    
    # Extract and process length (mm, cm, m)
    df['length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) mm', expand=False).astype(float)
    
    # Apply cm to mm conversion only where length is missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) cm', expand=False).fillna(0).astype(float) * 10
    
    # Apply m to mm conversion only where length is still missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) m', expand=False).fillna(0).astype(float) * 1000
    
    # Replace 0 values with NaN in weight and length
    df['weight_g'].replace(0, np.nan, inplace=True)
    df['length_mm'].replace(0, np.nan, inplace=True)
    
    # Sample size
    n = len(df)
    
    # Sample mean and standard deviation for weight
    sample_mean_weight = df['weight_g'].mean()
    sample_std_weight = df['weight_g'].std(ddof=1)
    
    # Standard error of the mean for weight
    sem_weight = sample_std_weight / np.sqrt(n)
    
    # Confidence level
    confidence_level = 0.95
    
    # Degrees of freedom
    df_degrees = n - 1
    
    # t-score for the confidence level
    t_score = stats.t.ppf((1 + confidence_level) / 2, df_degrees)
    
    # Margin of error for weight
    margin_of_error_weight = t_score * sem_weight
    
    # Confidence interval for weight
    confidence_interval_weight = (sample_mean_weight - margin_of_error_weight, sample_mean_weight + margin_of_error_weight)
    
    print(f"I am 95% sure the average weight of {json_file} is between: {confidence_interval_weight} grams for total population of {population_size}")
    
    # Sample mean and standard deviation for length
    sample_mean_length = df['length_mm'].mean()
    sample_std_length = df['length_mm'].std(ddof=1)
    
    # Standard error of the mean for length
    sem_length = sample_std_length / np.sqrt(n)
    
    # Margin of error for length
    margin_of_error_length = t_score * sem_length
    
    # Confidence interval for length
    confidence_interval_length = (sample_mean_length - margin_of_error_length, sample_mean_length + margin_of_error_length)
    
    print(f"I am 95% sure the average length of {json_file} is between: {confidence_interval_length} mm for total population of {population_size}")
    
    # Export the DataFrame as a JSON array instead of newline-delimited JSON
    output_file = json_file.replace('.json', '_norm.json')
    df.to_json(output_file, orient='records', lines=False)  # Remove lines=True to generate proper JSON array
    print(f"Normalized data saved to: {output_file}")


# Example usage
process_json_data('birds_rand_2.json', 558383)
df = pd.read_json('birds_rand_2_norm.json', orient='records', lines=False)

I am 95% sure the average weight of birds_rand_2.json is between: (81.11527238796427, 120.64582486889358) grams for total population of 558383
I am 95% sure the average length of birds_rand_2.json is between: (142.10368839942882, 157.04248474988108) mm for total population of 558383
Normalized data saved to: birds_rand_2_norm.json


  df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
  df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weight_g'].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplac

In [12]:
import pandas as pd
import numpy as np
from scipy import stats

def process_json_data(json_file, population_size):
    # Read JSON file into DataFrame
    df = pd.read_json(json_file)
    
    # Ensure physicalDescriptions are strings
    df['physicalDescriptions'] = df['physicalDescriptions'].astype(str)

    # Extract the year from the 'date' column and store it in a new 'year' column
    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
    
    # Filter out rows without numerical values in physicalDescriptions
    df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
    
    # Extract and process weight (g and kg)
    df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False).fillna(0).astype(float)
    df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
    
    # Extract and process length (mm, cm, m)
    df['length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) mm', expand=False).astype(float)
    
    # Apply cm to mm conversion only where length is missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) cm', expand=False).fillna(0).astype(float) * 10
    
    # Apply m to mm conversion only where length is still missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) m', expand=False).fillna(0).astype(float) * 1000
    
    # Replace 0 values with NaN in weight and length
    df['weight_g'].replace(0, np.nan, inplace=True)
    df['length_mm'].replace(0, np.nan, inplace=True)
    
    # Sample size
    n = len(df)
    
    # Sample mean and standard deviation for weight
    sample_mean_weight = df['weight_g'].mean()
    sample_std_weight = df['weight_g'].std(ddof=1)
    
    # Standard error of the mean for weight
    sem_weight = sample_std_weight / np.sqrt(n)
    
    # Confidence level
    confidence_level = 0.95
    
    # Degrees of freedom
    df_degrees = n - 1
    
    # t-score for the confidence level
    t_score = stats.t.ppf((1 + confidence_level) / 2, df_degrees)
    
    # Margin of error for weight
    margin_of_error_weight = t_score * sem_weight
    
    # Confidence interval for weight
    confidence_interval_weight = (sample_mean_weight - margin_of_error_weight, sample_mean_weight + margin_of_error_weight)
    
    print(f"I am 95% sure the average weight of {json_file} is between: {confidence_interval_weight} grams for total population of {population_size}")
    
    # Sample mean and standard deviation for length
    sample_mean_length = df['length_mm'].mean()
    sample_std_length = df['length_mm'].std(ddof=1)
    
    # Standard error of the mean for length
    sem_length = sample_std_length / np.sqrt(n)
    
    # Margin of error for length
    margin_of_error_length = t_score * sem_length
    
    # Confidence interval for length
    confidence_interval_length = (sample_mean_length - margin_of_error_length, sample_mean_length + margin_of_error_length)
    
    print(f"I am 95% sure the average length of {json_file} is between: {confidence_interval_length} mm for total population of {population_size}")
    
    # Export the DataFrame as a JSON array instead of newline-delimited JSON
    output_file = json_file.replace('.json', '_norm.json')
    df.to_json(output_file, orient='records', lines=False)  # Remove lines=True to generate proper JSON array
    print(f"Normalized data saved to: {output_file}")


# Example usage
process_json_data('herp_10k_rand_1.json', 2341)
df = pd.read_json('herp_10k_rand_1_norm.json', orient='records', lines=False)

I am 95% sure the average weight of herp_10k_rand_1.json is between: (9.94748373171621, 12.544933850701371) grams for population of 2341
I am 95% sure the average length of herp_10k_rand_1.json is between: (196.0430569479021, 253.3864167363084) mm for population of 2341
Normalized data saved to: herp_10k_rand_1_norm.json


  df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
  df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weight_g'].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplac

In [5]:
import pandas as pd
import numpy as np
from scipy import stats

def process_json_data(json_file, population_size):
    # Read JSON file into DataFrame
    df = pd.read_json(json_file)
    
    # Ensure physicalDescriptions are strings
    df['physicalDescriptions'] = df['physicalDescriptions'].astype(str)

    # Extract the year from the 'date' column and store it in a new 'year' column
    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
    
    # Filter out rows without numerical values in physicalDescriptions
    df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
    
    # Extract and process weight (g and kg)
    df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False).fillna(0).astype(float)
    df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
    
    # Extract and process length (mm, cm, m)
    df['length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) mm', expand=False).astype(float)
    
    # Apply cm to mm conversion only where length is missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) cm', expand=False).fillna(0).astype(float) * 10
    
    # Apply m to mm conversion only where length is still missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) m', expand=False).fillna(0).astype(float) * 1000

    # BEGIN: Identify and print count of unique instances of numbers followed by anything other than mm, m, cm, g, or kg, and what they are
    other_units = df['physicalDescriptions'].str.extractall(r'(\d+\.?\d*)\s?([^\d\s]+)').droplevel(0)
    other_units = other_units[~other_units[1].isin(['mm', 'cm', 'm', 'g', 'kg'])]
    unique_other_units = other_units[1].value_counts()
    if unique_other_units.empty:
        print("No outlier measurements found")
    else:
        print("Count of unique instances of numbers followed by anything other than mm, m, cm, g, or kg:")
        print(unique_other_units)
    # END
    
    # Replace 0 values with NaN in weight and length
    df['weight_g'].replace(0, np.nan, inplace=True)
    df['length_mm'].replace(0, np.nan, inplace=True)
    
    # Sample size
    n = len(df)
    
    # Sample mean and standard deviation for weight
    sample_mean_weight = df['weight_g'].mean()
    sample_std_weight = df['weight_g'].std(ddof=1)
    
    # Standard error of the mean for weight
    sem_weight = sample_std_weight / np.sqrt(n)
    
    # Confidence level
    confidence_level = 0.95
    
    # Degrees of freedom
    df_degrees = n - 1
    
    # t-score for the confidence level
    t_score = stats.t.ppf((1 + confidence_level) / 2, df_degrees)
    
    # Margin of error for weight
    margin_of_error_weight = t_score * sem_weight
    
    # Confidence interval for weight
    confidence_interval_weight = (sample_mean_weight - margin_of_error_weight, sample_mean_weight + margin_of_error_weight)
    
    print(f"I am 95% sure the average weight of {json_file} is between: {confidence_interval_weight} grams for total population of {population_size}")
    
    # Sample mean and standard deviation for length
    sample_mean_length = df['length_mm'].mean()
    sample_std_length = df['length_mm'].std(ddof=1)
    
    # Standard error of the mean for length
    sem_length = sample_std_length / np.sqrt(n)
    
    # Margin of error for length
    margin_of_error_length = t_score * sem_length
    
    # Confidence interval for length
    confidence_interval_length = (sample_mean_length - margin_of_error_length, sample_mean_length + margin_of_error_length)
    
    print(f"I am 95% sure the average length of {json_file} is between: {confidence_interval_length} mm for total population of {population_size}")
    
    # Export the DataFrame as a JSON array instead of newline-delimited JSON
    output_file = json_file.replace('.json', '_norm.json')
    df.to_json(output_file, orient='records', lines=False)  # Remove lines=True to generate proper JSON array
    print(f"Normalized data saved to: {output_file}")

    print(f"Total count of rows: {len(df)}")


# Example usage
process_json_data('anthro_rand_2.json', 3000000)
df = pd.read_json('anthro_rand_2_norm.json', orient='records', lines=False)

Count of unique instances of numbers followed by anything other than mm, m, cm, g, or kg:
1
cm',    4340
cm']    3234
mm',     540
g']      277
g',      261
mm']     218
m']       15
m',       14
Name: count, dtype: int64
I am 95% sure the average weight of anthro_rand_2.json is between: (23.963245429128563, 32.23694468493988) grams for total population of 3000000
I am 95% sure the average length of anthro_rand_2.json is between: (393.8682118794665, 459.678878396451) mm for total population of 3000000
Normalized data saved to: anthro_rand_2_norm.json
Total count of rows: 3744


  df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
  df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weight_g'].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplac

In [16]:
import pandas as pd
import numpy as np
from scipy import stats

def process_json_data(json_file, population_size):
    # Read JSON file into DataFrame
    df = pd.read_json(json_file)
    
    # Ensure physicalDescriptions are strings
    df['physicalDescriptions'] = df['physicalDescriptions'].astype(str)

    # Extract the year from the 'date' column and store it in a new 'year' column
    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
    
    # Filter out rows without numerical values in physicalDescriptions
    df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
    
    # Extract and process weight (g and kg)
    df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False).fillna(0).astype(float)
    df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
    
    # Extract and process length (mm, cm, m)
    df['length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*)mm', expand=False).astype(float)
    
    # Apply cm to mm conversion only where length is missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*)cm', expand=False).fillna(0).astype(float) * 10
    
    # Extract and process length from '36mm x 15mm' format
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*)mm\s*x\s*\d+\.?\d*mm', expand=False).astype(float)
    
    # Apply m to mm conversion only where length is still missing
    df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*)m', expand=False).fillna(0).astype(float) * 1000
    
    # Replace 0 values with NaN in weight and length
    df['weight_g'].replace(0, np.nan, inplace=True)
    df['length_mm'].replace(0, np.nan, inplace=True)
    
    # Sample size
    n = len(df)
    
    # Sample mean and standard deviation for weight
    sample_mean_weight = df['weight_g'].mean()
    sample_std_weight = df['weight_g'].std(ddof=1)
    
    # Standard error of the mean for weight
    sem_weight = sample_std_weight / np.sqrt(n)
    
    # Confidence level
    confidence_level = 0.95
    
    # Degrees of freedom
    df_degrees = n - 1
    
    # t-score for the confidence level
    t_score = stats.t.ppf((1 + confidence_level) / 2, df_degrees)
    
    # Margin of error for weight
    margin_of_error_weight = t_score * sem_weight
    
    # Confidence interval for weight
    confidence_interval_weight = (sample_mean_weight - margin_of_error_weight, sample_mean_weight + margin_of_error_weight)
    
    print(f"I am 95% sure the average weight of {json_file} is between: {confidence_interval_weight} grams for total population of {population_size}")
    
    # Sample mean and standard deviation for length
    sample_mean_length = df['length_mm'].mean()
    sample_std_length = df['length_mm'].std(ddof=1)
    
    # Standard error of the mean for length
    sem_length = sample_std_length / np.sqrt(n)
    
    # Margin of error for length
    margin_of_error_length = t_score * sem_length
    
    # Confidence interval for length
    confidence_interval_length = (sample_mean_length - margin_of_error_length, sample_mean_length + margin_of_error_length)
    
    print(f"I am 95% sure the average length of {json_file} is between: {confidence_interval_length} mm for total population of {population_size}")
    
    # Export the DataFrame as a JSON array instead of newline-delimited JSON
    output_file = json_file.replace('.json', '_norm.json')
    df.to_json(output_file, orient='records', lines=False)  # Remove lines=True to generate proper JSON array
    print(f"Normalized data saved to: {output_file}")

    print(f"Total count of rows: {len(df)}")


# Example usage
process_json_data('ento_rand_2.json', 193424)
df = pd.read_json('ento_rand_2_norm.json', orient='records', lines=False)

I am 95% sure the average weight of ento_rand_2.json is between: (nan, nan) grams for total population of 193424
I am 95% sure the average length of ento_rand_2.json is between: (38.275533516111615, 39.860830120252025) mm for total population of 193424
Normalized data saved to: ento_rand_2_norm.json
Total count of rows: 88


  df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
  df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False).fillna(0).astype(float)
  df['weight_g'] += df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000
  df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*)cm', expand=False).fillna(0).astype(float) * 10
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weight_g'].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inte

No outlier measurements found
I am 95% sure the average weight of minsci_rand_2.json is between: (nan, nan) grams for total population of 3000000
I am 95% sure the average length of minsci_rand_2.json is between: (nan, nan) mm for total population of 3000000
Normalized data saved to: minsci_rand_2_norm.json
Total count of rows: 0


  df = df[df['physicalDescriptions'].str.contains(r'\b\d+\.?\d*\s?(mm|cm|m)\b', na=False)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weight_g'].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['length_mm'].replace(0, np.nan, inplace=True)
