In [55]:
import json
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [56]:
# Load yak data from the JSON file
file_path = "D:\Datasets\Yak_Webshop_1\json_files\sample_diverse_data_recommendation.json"
with open(file_path, "r") as json_file:
    yak_data = json.load(json_file)
print("File uploaded successfully")
df = pd.DataFrame(yak_data)

File uploaded successfully


In [57]:
# Convert 'last_checkup_date' to datetime and calculate days since last checkup
current_date = datetime.strptime("2023-11-25", "%Y-%m-%d")  # Assuming current date for reference
df['last_checkup_date'] = pd.to_datetime(df['last_checkup_date'])
df['days_since_checkup'] = (current_date - df['last_checkup_date']).dt.days

df.head(5)

Unnamed: 0,name,age,health,behavior,weight_kg,last_checkup_date,days_since_checkup
0,Betty-1,100.0,special_health,special_behavior,96.19,2023-10-19,37
1,Betty-2,5.9,excellent,positive,156.81,2023-10-25,31
2,Betty-3,18.2,fair,stubborn,53.22,2023-06-16,162
3,Betty-4,23.3,fair,stubborn,109.01,2023-06-15,163
4,Betty-5,26.8,poor,negative,61.67,2023-10-01,55


In [58]:
# Convert 'behavior' column into one-hot encoded columns
df = pd.get_dummies(df, columns=['behavior'], prefix='behavior')

In [59]:
df.head(3)

Unnamed: 0,name,age,health,weight_kg,last_checkup_date,days_since_checkup,behavior_calm,behavior_negative,behavior_playful,behavior_positive,behavior_special_behavior,behavior_stubborn
0,Betty-1,100.0,special_health,96.19,2023-10-19,37,False,False,False,False,True,False
1,Betty-2,5.9,excellent,156.81,2023-10-25,31,False,False,False,True,False,False
2,Betty-3,18.2,fair,53.22,2023-06-16,162,False,False,False,False,False,True


In [60]:
df.columns

Index(['name', 'age', 'health', 'weight_kg', 'last_checkup_date',
       'days_since_checkup', 'behavior_calm', 'behavior_negative',
       'behavior_playful', 'behavior_positive', 'behavior_special_behavior',
       'behavior_stubborn'],
      dtype='object')

In [61]:
# Select features and target variable
X = df[['age',
         'weight_kg', 
         'days_since_checkup', 
         'behavior_calm', 
         'behavior_playful', 
         'behavior_positive',
         'behavior_negative', 
         'behavior_special_behavior', 
         'behavior_stubborn']]
y = df['health']

In [62]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Decision Tree Classifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [63]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy and other evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_rep)

Model Accuracy: 1.00

Classification Report:
                precision    recall  f1-score   support

     excellent       1.00      1.00      1.00         3
          fair       1.00      1.00      1.00         7
          good       1.00      1.00      1.00         1
          poor       1.00      1.00      1.00         3
special_health       1.00      1.00      1.00         6

      accuracy                           1.00        20
     macro avg       1.00      1.00      1.00        20
  weighted avg       1.00      1.00      1.00        20



Making predictions

In [68]:
import pandas as pd
import random
from datetime import datetime

# List of possible behavior values
behaviors = ['negative', 'calm', 'playful', 'positive', 'special_behavior', 'stubborn']

# Create a DataFrame with rows for each behavior
test_data = []
for behavior in behaviors:
    new_yak = {
        'age': random.randint(1, 30),
        'weight_kg': round(random.uniform(50, 200), 2),
        'last_checkup_date': datetime.strptime(f"2023-{random.randint(1, 12)}-{random.randint(1, 28)}", "%Y-%m-%d"),
        'behavior': behavior
    }
    test_data.append(new_yak)

# Create DataFrame for test data
test_df = pd.DataFrame(test_data)
test_df


Unnamed: 0,age,weight_kg,last_checkup_date,behavior
0,27,76.26,2023-01-24,negative
1,6,169.93,2023-03-28,calm
2,30,105.51,2023-02-19,playful
3,28,173.86,2023-02-18,positive
4,12,145.13,2023-12-26,special_behavior
5,25,150.15,2023-09-23,stubborn


In [69]:
# Calculate days since last checkup
current_date = datetime.strptime("2023-11-29", "%Y-%m-%d")  # Assuming current date for reference
test_df['last_checkup_date'] = pd.to_datetime(test_df['last_checkup_date'])
test_df['days_since_checkup'] = (current_date - test_df['last_checkup_date']).dt.days

# Make predictions for the test data
# (Assuming 'model' has already been trained)
# Use the model to predict and provide recommendations
test_df['prediction'] = model.predict(test_df[['age', 'weight_kg', 'days_since_checkup']])
test_df['recommendation'] = ''

for behavior in behaviors:
    idx = test_df[test_df['behavior'] == behavior].index
    recommendation = ""
    if behavior == 'excellent':
        recommendation = "The yak's health is predicted to be excellent. Maintain regular care and monitoring."
    elif behavior == 'good':
        recommendation = "The yak's health is predicted to be good. Maintain regular care and observe for any changes."
    elif behavior == 'fair':
        recommendation = "The yak's health is predicted to be fair. Monitor closely and consider adjusting care if needed."
    elif behavior == 'poor':
        recommendation = "The yak's health is predicted to be poor. Immediate specialized care and attention are recommended."
    elif behavior == 'special_health':
        recommendation = "The yak's health is predicted to be special. Specialized care and constant monitoring are necessary."
    else:
        recommendation = "The yak's health prediction is inconclusive. Further evaluation may be necessary."

    test_df.loc[idx, 'recommendation'] = recommendation

print(test_df)


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- behavior_calm
- behavior_negative
- behavior_playful
- behavior_positive
- behavior_special_behavior
- ...


In [66]:
# making data accurate - MISTAKE IN THE DATA SET
row_to_change = 4
random_df.loc[row_to_change, 'age'] = 100
random_df.tail()

Unnamed: 0,age,weight_kg,last_checkup_date,behavior
5,9,190.39,2023-02-17,special_behavior
6,11,164.81,2023-01-03,positive
7,13,77.73,2023-02-16,positive
8,27,195.97,2023-03-25,calm
9,27,89.2,2023-03-08,negative


In [70]:
# Calculate days since last checkup
current_date = datetime.strptime("2023-11-25", "%Y-%m-%d")  # Assuming current date for reference
test_df['days_since_checkup'] = (current_date - test_df['last_checkup_date']).dt.days

# Convert 'behavior' for the new yaks into one-hot encoded columns
test_df = pd.get_dummies(test_df, columns=['behavior'], prefix='behavior')

# Make predictions for the random samples
new_data = test_df[['age', 'weight_kg', 'days_since_checkup'] + [col for col in test_df.columns if col.startswith('behavior_')]]
predictions = model.predict(new_data)

# Mapping predicted health to recommendation
recommendations = []
for prediction in predictions:
    if prediction == 'excellent':
        recommendations.append("The yak's health is predicted to be excellent. Maintain regular care and monitoring.")
    elif prediction == 'good':
        recommendations.append("The yak's health is predicted to be good. Maintain regular care and observe for any changes.")
    elif prediction == 'fair':
        recommendations.append("The yak's health is predicted to be fair. Monitor closely and consider adjusting care if needed.")
    elif prediction == 'poor':
        recommendations.append("The yak's health is predicted to be poor. Immediate specialized care and attention are recommended.")
    elif prediction == 'special_health':
        recommendations.append("The yak's health is predicted to be special. Specialized care and constant monitoring are necessary.")
    else:
        recommendations.append("The yak's health prediction is inconclusive. Further evaluation may be necessary.")

# Print predictions and recommendations for each random sample
for i, (prediction, recommendation) in enumerate(zip(predictions, recommendations)):
    print(f"Prediction for Yak {i + 1}: {prediction}")
    print(f"Recommendation for Yak {i + 1}: {recommendation}")
    print()


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
