In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [3]:
# Path to the raw crocodile observations CSV, relative to the notebook's working directory.
CROCODILE_DATA = "../data/raw/crocodile_dataset.csv"

# Read the whole CSV into a DataFrame; every later cell works from `df`.
df = pd.read_csv(filepath_or_buffer=CROCODILE_DATA)

In [5]:
# Preview the first rows to confirm the CSV parsed into the expected columns.
df.head()

Unnamed: 0,Observation ID,Common Name,Scientific Name,Family,Genus,Observed Length (m),Observed Weight (kg),Age Class,Sex,Date of Observation,Country/Region,Habitat Type,Conservation Status,Observer Name,Notes
0,1,Morelet's Crocodile,Crocodylus moreletii,Crocodylidae,Crocodylus,1.9,62.0,Adult,Male,31-03-2018,Belize,Swamps,Least Concern,Allison Hill,Cause bill scientist nation opportunity.
1,2,American Crocodile,Crocodylus acutus,Crocodylidae,Crocodylus,4.09,334.5,Adult,Male,28-01-2015,Venezuela,Mangroves,Vulnerable,Brandon Hall,Ago current practice nation determine operatio...
2,3,Orinoco Crocodile,Crocodylus intermedius,Crocodylidae,Crocodylus,1.08,118.2,Juvenile,Unknown,07-12-2010,Venezuela,Flooded Savannas,Critically Endangered,Melissa Peterson,Democratic shake bill here grow gas enough ana...
3,4,Morelet's Crocodile,Crocodylus moreletii,Crocodylidae,Crocodylus,2.42,90.4,Adult,Male,01-11-2019,Mexico,Rivers,Least Concern,Edward Fuller,Officer relate animal direction eye bag do.
4,5,Mugger Crocodile (Marsh Crocodile),Crocodylus palustris,Crocodylidae,Crocodylus,3.75,269.4,Adult,Unknown,15-07-2019,India,Rivers,Vulnerable,Donald Reid,Class great prove reduce raise author play mov...


In [7]:
# Count observations per 'Common Name' value; this column is used as the
# classification target further down, so the counts show the class balance.
df['Common Name'].value_counts()

Common Name
New Guinea Crocodile                         68
Borneo Crocodile (disputed)                  67
American Crocodile                           66
Morelet's Crocodile                          64
Cuban Crocodile                              59
Saltwater Crocodile                          58
Philippine Crocodile                         58
Orinoco Crocodile                            58
West African Dwarf Crocodile                 57
Central African Slender-snouted Crocodile    56
West African Slender-snouted Crocodile       55
West African Crocodile                       52
Hall's New Guinea Crocodile                  49
Congo Dwarf Crocodile                        48
Nile Crocodile                               48
Mugger Crocodile (Marsh Crocodile)           47
Siamese Crocodile                            45
Freshwater Crocodile (Johnstone's)           45
Name: count, dtype: int64

In [9]:
# Print column dtypes and non-null counts to check for missing values before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Observation ID        1000 non-null   int64  
 1   Common Name           1000 non-null   object 
 2   Scientific Name       1000 non-null   object 
 3   Family                1000 non-null   object 
 4   Genus                 1000 non-null   object 
 5   Observed Length (m)   1000 non-null   float64
 6   Observed Weight (kg)  1000 non-null   float64
 7   Age Class             1000 non-null   object 
 8   Sex                   1000 non-null   object 
 9   Date of Observation   1000 non-null   object 
 10  Country/Region        1000 non-null   object 
 11  Habitat Type          1000 non-null   object 
 12  Conservation Status   1000 non-null   object 
 13  Observer Name         1000 non-null   object 
 14  Notes                 1000 non-null   object 
dtypes: float64(2), int64(1), object(12)

In [None]:
# Predictors: the two numeric body measurements recorded for every observation.
features = ['Observed Length (m)', 'Observed Weight (kg)']
X = df.loc[:, features]

# Response: the species label, kept as the raw string from 'Common Name'.
y = df.loc[:, 'Common Name']

# Fit the scaler on X first, then apply it, so `scaler` can be reused later
# to transform new observations with the same per-column statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [14]:
# Multiclass logistic regression over all 'Common Name' labels.
# `multi_class='multinomial'` is deliberately not passed: the argument is
# deprecated in recent scikit-learn releases, and with solver='lbfgs' on a
# target with more than two classes the default already fits the multinomial
# (softmax) model, so the fitted model is unchanged.
# random_state is kept for parity with the original cell.
model_multi = LogisticRegression(solver='lbfgs', random_state=42)

# Train on the standardized features. As the last expression of the cell,
# this also displays the fitted estimator.
model_multi.fit(X_scaled, y)

In [None]:
# Sanity check on a known row: the default values below are copied from the
# second row shown by df.head() (Observation ID 2, American Crocodile,
# 4.09 m / 334.5 kg). That row is part of the data the model was fitted on,
# so this is an in-sample check, not an estimate of accuracy on unseen animals.

# --- 1. Define the new observation data ---
# REPLACE THESE VALUES with the actual length and weight you want to predict
new_length = 4.09   # meters
new_weight = 334.5  # kilograms

# Build a one-row DataFrame for the new input.
# The column labels are taken from `features` (same names, same order) because
# the scaler was fitted on a DataFrame with exactly those columns; values must
# be listed in that order: length first, then weight.
new_data = pd.DataFrame([[new_length, new_weight]], columns=features)

print(f"New Observation: Length={new_length}m, Weight={new_weight}kg")

# --- 2. Scale the input data ---
# IMPORTANT: reuse the scaler fitted on the training data (X); never refit it
# on the new observation.
new_data_scaled = scaler.transform(new_data)

# --- 3. Run the Prediction (Predict the Class) ---
# predict() returns an array with one label per input row; take the only one.
predicted_name = model_multi.predict(new_data_scaled)[0]

# --- 4. Run Prediction Probabilities (Predict the Likelihood for each class) ---
# One probability per class, aligned with model_multi.classes_.
predicted_probas = model_multi.predict_proba(new_data_scaled)[0]

# --- 5. Display Results ---
print("\n--- Prediction Results ---")

# Predicted Class
print(f"Predicted Common Name: {predicted_name}")

# Predicted Probabilities, most likely species first
probas_df = pd.DataFrame({
    'Common Name': model_multi.classes_,
    'Probability': predicted_probas.round(4)
}).sort_values(by='Probability', ascending=False)

print("\nProbabilities for each class:")
display(probas_df)

New Observation: Length=4.09m, Weight=334.5kg

--- Prediction Results ---
Predicted Common Name: American Crocodile

Probabilities for each class:


Unnamed: 0,Common Name,Probability
0,American Crocodile,0.273
11,Orinoco Crocodile,0.2169
10,Nile Crocodile,0.1353
8,Mugger Crocodile (Marsh Crocodile),0.1275
13,Saltwater Crocodile,0.0731
17,West African Slender-snouted Crocodile,0.0582
1,Borneo Crocodile (disputed),0.037
15,West African Crocodile,0.0259
14,Siamese Crocodile,0.0159
4,Cuban Crocodile,0.0117
