## Import JSON for NHMS - Mammals

In [1]:
import pandas as pd

# Load the JSON file into a DataFrame
# df = pd.read_json('normal_test.json')
df = pd.read_json('mammals_10k_rand_1.json')

# Print the head of the DataFrame
# print(df.head())

## Explore physical descriptions
For sort by: most relevant in mammals collection only

In [2]:
# create dataset where physicalDescriptions strings includes at least one number. Print head and total count. 
df['physicalDescriptions'] = df['physicalDescriptions'].astype(str)
df = df[df['physicalDescriptions'].str.contains(r'\d', na=False)]
# print(df['physicalDescriptions'].head(20))
# print(df['physicalDescriptions'].count())

1     ['35 g', '256 mm', '152 mm', '26 mm', '19 mm',...
2     ['35 g', '127 mm', '48 mm', '14 mm', '17 mm', ...
6     ['213 mm', '71 mm', '29 mm', 'Skin', 'Skull', ...
11    ['35 g', '256 mm', '152 mm', '26 mm', '19 mm',...
12    ['35 g', '127 mm', '48 mm', '14 mm', '17 mm', ...
16    ['213 mm', '71 mm', '29 mm', 'Skin', 'Skull', ...
20    ['4 g', '83 mm', '35 mm', '9 mm', '13 mm', 'Sk...
27    ['230 mm', '101 mm', '29 mm', 'Skin', 'Skull',...
31    ['15 g', '82 mm', '21 mm', '13 mm', '23 mm', '...
32    ['1180 mm', '620 mm', '130 mm', '40 mm', 'Anat...
33    ['3.4 g', '83 mm', '35 mm', '11 mm', 'Skin', '...
34    ['221 mm', '68 mm', '29 mm', 'Skin', 'Skull', ...
35    ['225 mm', '105 mm', '36 mm', 'Skin', 'Skull',...
41    ['114 mm', '43 mm', '315 mm', 'Skin', 'Skull',...
42    ['81 mm', '35 mm', '8 mm', 'Skin', 'Skull', 'M...
43    ['111 mm', '21 mm', '20 mm', 'Skin', 'Skull', ...
47    ['40 g', '250 mm', '140 mm', '28 mm', '17 mm',...
48    ['7 g', '91 mm', '33 mm', '8 mm', '16 mm',

# With 405 records out of 1010 with numbers in physicalDescriptions
we can see that 40% of the records have numbers in the physicalDescriptions field.

Repeat this process for the first number that is followed by ' mm' and add it to a new column called "length", and turn that number into an integer.
If ' mm' is not found but a ' cm' is found, convert the ' cm' to ' mm' and add it to the "length" column.
astly, if there are no instances of ' g' or ' mm' in the physicalDescription, add a NaN to the weight_g and length columns.

In [3]:
# for each row and in physicalDescription, identify the first number that is followed by ' g' and add it to a new column called "weight_g", and turn that number into an integer.
# Adjust regex to capture decimal and integer values followed by ' g'
df['weight_g'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) g', expand=False)
df['weight_g'] = df['weight_g'].fillna(0).astype(float)
df['weight_g'] = df['weight_g'] + df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) kg', expand=False).fillna(0).astype(float) * 1000

In [4]:
# for physicalDescription rows that have missing values for length, if the first number found in the physicalDescription is followed by ' cm', multiply that number by 10 and add it to the length column.
# and if the first number found in the physicalDescription is followed by ' m', multiply that number by 1000 and add it to the length column.
# Extract mm values first
df['length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) mm', expand=False).astype(float)

# Apply cm to mm conversion only where length is missing
df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) cm', expand=False).fillna(0).astype(float) * 10

# Apply m to mm conversion only where length is still missing
df.loc[df['length_mm'].isna(), 'length_mm'] = df['physicalDescriptions'].str.extract(r'(\d+\.?\d*) m', expand=False).fillna(0).astype(float) * 1000


In [5]:
import numpy as np

df['weight_g'].replace(0, np.nan, inplace=True)
df['length_mm'].replace(0, np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weight_g'].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['length_mm'].replace(0, np.nan, inplace=True)


# Attempt 1 of confidence interval for rest of population   

In [6]:
from scipy import stats;
import numpy as np

# Sample size
n = len(df)

# Population size
N = 500000

# Sample mean and standard deviation
sample_mean = df['weight_g'].mean()
sample_std = df['weight_g'].std(ddof=1)

# Standard error of the mean
sem = sample_std / np.sqrt(n)

# Confidence level

confidence_level = 0.95

# Degrees of freedom
df_degrees = n - 1

# t-score for the confidence level
t_score = stats.t.ppf((1 + confidence_level) / 2, df_degrees)

# Margin of error
margin_of_error = t_score * sem

# Confidence interval
confidence_interval = (sample_mean - margin_of_error, sample_mean + margin_of_error)

print(f"I am 95% sure the average weight of this dataset is between: {confidence_interval} grams")

I am 95% sure the average weight of this dataset is between: (3690.6833354853925, 5552.366453671234) grams


In [7]:
# Sample mean and standard deviation for length
sample_mean_length = df['length_mm'].mean()
sample_std_length = df['length_mm'].std(ddof=1)

# Standard error of the mean for length
sem_length = sample_std_length / np.sqrt(n)

# t-score for the confidence level (already defined as t_score)
# Margin of error for length
margin_of_error_length = t_score * sem_length

# Confidence interval for length
confidence_interval_length = (sample_mean_length - margin_of_error_length, sample_mean_length + margin_of_error_length)

print(f"I am 95% sure the average length of this dataset is between: {confidence_interval_length} grams")

I am 95% sure the average length of this dataset is between: (329.5507903976483, 370.698943832259) grams


In [8]:
# Export the DataFrame as a new JSON document with "norm" appended to the original name
df.to_json('mammals_10k_rand_1_norm.json', orient='records', lines=True)