In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats, fft
from datetime import timedelta



  from pandas.core import (


**Load the dataset**

In [2]:
def load_dataset(file_path):
    """Read a CSV file and parse its ``date`` column as datetimes.

    Parameters
    ----------
    file_path
        Anything accepted by ``pandas.read_csv`` (path or file-like object).

    Returns
    -------
    pandas.DataFrame
        The table read from ``file_path`` with its ``date`` column converted
        by ``pandas.to_datetime``.

    Raises
    ------
    KeyError
        If the file has no ``date`` column.
    """
    raw = pd.read_csv(file_path)
    return raw.assign(date=pd.to_datetime(raw['date']))

### **Dataset First View**

In [None]:
# Preview the first rows of the loaded DataFrame.
# NOTE(review): `df` is only assigned in a later cell of this notebook, so this
# cell fails on a fresh kernel unless that cell has been run first.
df.head()

### **Dataset Rows & Columns count**

In [7]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact file location exists.
df = load_dataset(r'C:/Users/lamia/Downloads/dataset4_truncated.csv')

# Report the table dimensions: rows first, then columns.
n_rows, n_cols = df.shape
print("Number of rows are: ", n_rows)
print("Number of columns are: ", n_cols)

Number of rows are:  179759
Number of columns are:  19


**Displaying Column Names**


In [None]:
# Show the column labels of the loaded DataFrame (printed as a pandas Index).
print(df.columns)

**Define behavioral and health-related features**

In [None]:
# Column names grouped by role so later cells can select them by list.
behavioral_features = [
    'IN_ALLEYS',
    'REST',
    'EAT',
    'ACTIVITY_LEVEL',
]

# Indicator columns describing health states and external events,
# plus the 'OK' label.
health_indicators = [
    'oestrus',
    'calving',
    'lameness',
    'mastitis',
    'other_disease',
    'accidents',
    'disturbance',
    'mixing',
    'management_changes',
    'OK',
]

**Distribution of health states**

In [None]:
# Indicator columns summarised in this figure.
health_states = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease', 'OK']

# Column sums expressed as a percentage of all rows, most frequent first.
df_health = df[health_states]
health_counts = df_health.sum().sort_values(ascending=False)
total = len(df)
health_percent = (health_counts / total) * 100

# A log axis cannot draw a zero-height bar, so exact zeros are plotted at 0.01
# while the text labels below still show the true percentage.
safe_health_percent = health_percent.where(health_percent != 0, 0.01)

plt.figure(figsize=(12, 6))
bars = plt.bar(safe_health_percent.index, safe_health_percent.values, color='lightcoral')

# Annotate each bar with the real (unclamped) percentage.
for bar, original in zip(bars, health_percent.values):
    x_center = bar.get_x() + bar.get_width()/2.
    plt.text(x_center, bar.get_height(), f'{original:.2f}%', ha='center', va='bottom')

plt.yscale('log')
plt.title('Proportion of Health States (Log Scale)', fontweight='bold')
plt.xlabel('Health States', fontweight='bold')
plt.ylabel('Percentage (log scale)', fontweight='bold')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7, which='both')
plt.tight_layout()
plt.show()

**Unique Value Counts per Event**


In [None]:
# Event / health-state indicator columns whose value frequencies are listed.
target_columns = [
    'oestrus', 'calving', 'lameness', 'mastitis',
    'LPS', 'acidosis', 'other_disease', 'accidents',
    'disturbance', 'mixing', 'management_changes', 'OK',
]

print("Unique Value Counts per Event:")
for col in target_columns:
    value_counts = df[col].value_counts()
    # One print call emits the header, the counts and the separator line,
    # each on its own line.
    print(f"{col}:", value_counts, "---------------------------------", sep="\n")

**Count the number of cows in the dataset**

In [None]:
# Number of distinct values in the 'cow' column
# (Series.nunique excludes missing values by default).
num_cows = df['cow'].nunique()
print(f"Total number of unique cows: {num_cows}")

**Display the IDs of the cows**

In [None]:
# Distinct cow identifiers, in order of first appearance in the data.
unique_cow_ids = df['cow'].unique()
print("List of unique cow IDs:")
print(unique_cow_ids)  # Prints all IDs as an array

# Same identifiers in ascending order (reuses the array computed above).
sorted_cow_ids = sorted(unique_cow_ids)
print("\nSorted list of cow IDs:")
print(sorted_cow_ids)

**Duplicate Values**


In [None]:
# Dataset duplicate value count: number of rows that are exact copies of an
# earlier row (DataFrame.duplicated marks every repeat after the first occurrence).
dup = df.duplicated().sum()
print(f'number of duplicated rows are {dup}')

**Missing Values**

In [None]:
# Missing values / null values count: one total per column, shown as the
# cell's output.
df.isnull().sum()

**Matrix of missing values**

In [None]:
# Missing-value matrix: one row per record, one column per variable.
#
# The already-loaded `df` is plotted because no `path` variable is defined in
# the visible part of this notebook (the previous `pd.read_csv(path)` raised
# NameError on a fresh kernel).
# NOTE(review): confirm that `df` is the file the matrix was meant to show.
#
# `msno.matrix` opens its own figure, so the size is passed through its
# `figsize` argument; calling `plt.figure` beforehand only left an empty extra
# figure behind.
msno.matrix(df, figsize=(15, 8), color=(0, 0.5, 0.5))
plt.title("Matrice des valeurs manquantes", fontsize=18, fontweight='bold')
plt.show()

**Outliers detection**

Détails du calcul théorique :

1. Plage minimale ( -828 ) :

Si une vache passe 1 heure entière (3600 secondes) au repos : -0.23 * 3600 + 0.16 * 0 + 0.42 * 0 = -828

2. Plage maximale ( 1512 ) :

Si une vache passe 1 heure entière (3600 secondes) à manger : -0.23 * 0 + 0.16 * 0 + 0.42 * 3600 = 1512

In [None]:

# Theoretical ACTIVITY_LEVEL bounds taken from the calculation in the text
# above: -0.23 * 3600 = -828 and 0.42 * 3600 = 1512.
min_theorique = -828
max_theorique = 1512

# Observed minimum and maximum.
min_obs = df['ACTIVITY_LEVEL'].min()
max_obs = df['ACTIVITY_LEVEL'].max()

print(f"Plage théorique : [{min_theorique}, {max_theorique}]")
print(f"Plage observée  : [{min_obs:.2f}, {max_obs:.2f}]")

# Rows whose value lies outside the theoretical range (missing values are
# not flagged, since both comparisons are False for NaN).
outliers = df[(df['ACTIVITY_LEVEL'] < min_theorique) | (df['ACTIVITY_LEVEL'] > max_theorique)]
num_outliers = len(outliers)


plt.figure(figsize=(10, 6))
plt.hist(df['ACTIVITY_LEVEL'], bins=50, color='skyblue', edgecolor='black')
plt.axvline(x=min_theorique, color='red', linestyle='--', label='Borne min théorique')
plt.axvline(x=max_theorique, color='green', linestyle='--', label='Borne max théorique')
plt.xlabel('Niveau d\'activité')
plt.ylabel('Fréquence')
plt.title('Distribution des niveaux d\'activité')
plt.legend()
# Fixed window slightly wider than the theoretical range.
# NOTE(review): values beyond [-1000, 1600] are cut off from the view.
plt.xlim(-1000, 1600)
plt.show()


print("\n[ Rapport d'anomalies ]")
# Guard the percentage against an empty DataFrame (division by zero).
outlier_share = num_outliers / len(df) * 100 if len(df) else 0.0
print(f"- Valeurs hors plage théorique : {num_outliers} ({outlier_share:.2f}% du dataset)")

if num_outliers > 0:
    print("- Exemples de lignes problématiques :")
    # The notebook never uses a 'DATETIME' column elsewhere; rows are
    # identified by 'cow', 'date' and 'hour' in other cells, so those are shown
    # (only the ones actually present, to avoid a KeyError).
    id_cols = [c for c in ('cow', 'date', 'hour') if c in outliers.columns]
    print(outliers[id_cols + ['ACTIVITY_LEVEL']].head())
else:
    print("- Aucune valeur hors plage détectée.")

In [None]:
# Plot histograms for the selected columns to understand their distributions.
# NOTE(review): `cols` was never defined in the visible notebook (NameError on
# a fresh kernel). The behavioural measurements are assumed to be the intended
# selection — adjust the list if other columns were meant.
cols = ['IN_ALLEYS', 'REST', 'EAT', 'ACTIVITY_LEVEL']
df[cols].hist(bins=20, figsize=(12, 10))
plt.suptitle("Data Distribution")
plt.show()


**Visualization of Health States Proportions**

In [None]:
# NOTE(review): this rebinds `df` to a different CSV than the one loaded at the
# top of the notebook; every later cell sees this table.
df = pd.read_csv("dataset3-1 (1).csv")

# Indicator columns summarised in this figure.
health_states = [
    'oestrus', 'calving', 'lameness', 'mastitis',
    'LPS', 'acidosis', 'other_disease', 'OK',
]

# Column sums expressed as a percentage of all rows, most frequent first.
df_health = df[health_states]
health_counts = df_health.sum().sort_values(ascending=False)
total = len(df)
health_percent = (health_counts / total) * 100

plt.figure(figsize=(12, 6))
bars = plt.bar(health_percent.index, health_percent.values, color='lightcoral')

# Label every bar with its height, formatted to two decimal places.
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width()/2.
    plt.text(x_center, height, f'{height:.2f}%', ha='center', va='bottom')

plt.title('Proportion of Health States (in %)', fontweight='bold')
plt.xlabel('Health States', fontweight='bold')
plt.ylabel('Percentage', fontweight='bold')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


**Missing hours**

In [None]:
# Reload the table and make sure 'date' holds datetimes.
df = pd.read_csv("dataset3-1 (1).csv")
df['date'] = pd.to_datetime(df['date'])

# Rows recorded per cow per day.
# NOTE(review): naming the count 'hour_count' assumes one row per cow per
# hour — confirm against the data.
obs_counts = (
    df.groupby(['cow', 'date'])
      .size()
      .reset_index(name='hour_count')
)

# Cow-days with fewer than 24 rows.
missing_obs = obs_counts[obs_counts['hour_count'] < 24]

# Shortfall (24 minus rows present) summed per cow, largest first.
missing_per_cow = (
    (24 - missing_obs['hour_count'])
    .groupby(missing_obs['cow'])
    .sum()
    .sort_values(ascending=False)
)

plt.figure(figsize=(12, 6))
missing_per_cow.plot(kind='bar')
plt.title("Total Missing Hours per Cow (only cows with missing 24h records)")
plt.xlabel("Cow ID")
plt.ylabel("Total Missing Hours")
plt.tight_layout()
plt.show()


**Check Date Range and Missing Dates**

In [None]:
# Make sure 'date' holds datetimes before comparing against a calendar.
df['date'] = pd.to_datetime(df['date'])

# First and last date present in the data.
min_date, max_date = df['date'].min(), df['date'].max()
print(f"Date range: {min_date} to {max_date}")

# Every calendar day between the two extremes, inclusive.
all_dates = pd.date_range(start=min_date, end=max_date, freq='D')
print(f"Total expected days: {len(all_dates)}")

# Calendar days that never occur in the 'date' column.
observed_dates = df['date'].unique()
missing_dates = all_dates.difference(observed_dates)
missing_labels = ', '.join(missing_dates.strftime('%Y-%m-%d'))
print(f"\nMissing dates {len(missing_dates)}:", missing_labels)


**Analysis of Cows with Multiple Diseases**

In [None]:
# Indicator columns counted as co-occurring events.
maladie_cols = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease']

# Restrict to rows not labelled OK, then count the indicators set on each row.
df_non_ok = df[df['OK'] == 0].copy()
df_non_ok['nb_maladies'] = df_non_ok[maladie_cols].sum(axis=1)

# Keep only rows with more than one indicator set.
multi_maladie = df_non_ok[df_non_ok['nb_maladies'] > 1].copy()


def get_maladies_presentes(row):
    """Return the names in ``maladie_cols`` whose value in ``row`` equals 1."""
    return [col for col in maladie_cols if row[col] == 1]


# Build the list column explicitly instead of using DataFrame.apply(axis=1):
# on an empty frame `apply` returns an empty DataFrame, and assigning that to a
# single column raises ValueError. This form also works when no row qualifies.
multi_maladie['maladies_presentes'] = pd.Series(
    [get_maladies_presentes(row) for _, row in multi_maladie.iterrows()],
    index=multi_maladie.index,
    dtype=object,
)

# Display results
print(f"Number of cases with multiple diseases: {len(multi_maladie)}")
print(multi_maladie[['cow', 'date', 'hour', 'maladies_presentes']])


**Check the consistency between the "OK" label and the other health states**



In [None]:
# Indicators that are expected to be 0 whenever a row is labelled OK.
health_labels = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease', 'accidents']

# A violation is a row labelled OK that also has at least one indicator set.
is_ok = df['OK'] == 1
has_event = df[health_labels].sum(axis=1) > 0
violations = df[is_ok & has_event]

if not violations.empty:
    print("⚠️ Des cas incohérents ont été trouvés ! Voici les lignes concernées :")
    print(violations)
else:
    print("✅ Aucun cas trouvé où OK == 1 et un état de santé est aussi à 1. Tout est cohérent.")


**Correlation Analysis**

In [None]:
# Remove identifier columns and the variables left out of the correlation study.
# errors="ignore" keeps the cell re-runnable: a second execution no longer
# raises KeyError because the columns are already gone.
df = df.drop(columns=["cow", "date", "LPS", "acidosis", "ACTIVITY_LEVEL"], errors="ignore")

# Define columns
behavioral_features = ['IN_ALLEYS', 'REST', 'EAT']
health_features = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease',
                   'accidents', 'disturbance', 'mixing', 'management_changes', 'OK']

# --- 1. Spearman Correlation between Behavioral Features ---
behavior_corr = df[behavioral_features].corr(method='spearman')

plt.figure(figsize=(6, 4))
sns.heatmap(behavior_corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Spearman Correlation - Behavioral Features")
plt.show()

# --- 2. Pearson Correlation (Phi) between Health Features ---
health_corr = df[health_features].corr(method='pearson')

plt.figure(figsize=(10, 8))
sns.heatmap(health_corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Phi Correlation (Pearson) - Health Features")
plt.show()

# --- 3. Point-Biserial Correlation: Behavioral vs Health States ---
# Start from an all-NaN float matrix; cells stay NaN when the health column is
# constant, because the coefficient is undefined in that case.
point_biserial_matrix = pd.DataFrame(
    np.nan, index=health_features, columns=behavioral_features, dtype=float
)

for h in health_features:
    for b in behavioral_features:
        valid_data = df[[b, h]].dropna()
        if valid_data[h].nunique() > 1:  # Ensure variable is not constant
            # `pointbiserialr` lives in scipy.stats; the notebook imports only
            # the `stats` module, so it must be called through it (the bare
            # name raised NameError).
            corr, _p_value = stats.pointbiserialr(valid_data[h], valid_data[b])
            point_biserial_matrix.loc[h, b] = corr

plt.figure(figsize=(8, 6))
sns.heatmap(point_biserial_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Point-Biserial Correlation - Behavioral vs Health Features")
plt.xlabel("Behavioral Features")
plt.ylabel("Health Features")
plt.show()

In [None]:


# Collect one record per (health state, behavioural feature) pair.
# Relies on `health_features` and `behavioral_features` from the previous cell.
correlation_data = []

for h in health_features:
    for b in behavioral_features:
        valid_data = df[[b, h]].dropna()
        if valid_data[h].nunique() > 1:  # coefficient undefined for a constant column
            # Called through `stats` because only that module is imported;
            # the bare name `pointbiserialr` raised NameError.
            corr, pval = stats.pointbiserialr(valid_data[h], valid_data[b])
            correlation_data.append({
                'Health State': h,
                'Behavioral Feature': b,
                'Correlation': corr,
                'p-value': pval
            })

# Convert to DataFrame. Explicit columns keep the lookups below valid even
# when no pair qualified and the record list is empty.
corr_df = pd.DataFrame(
    correlation_data,
    columns=['Health State', 'Behavioral Feature', 'Correlation', 'p-value'],
)

# Plot: one group of bars per health state, labelled "<feature> (<state>)".
plt.figure(figsize=(8, 5))
for health_state in corr_df["Health State"].unique():
    subset = corr_df[corr_df["Health State"] == health_state]
    plt.bar(subset["Behavioral Feature"] + f" ({health_state})", subset["Correlation"], label=health_state)

plt.axhline(0, color='gray', linestyle='--')
plt.title("Point-Biserial Correlation: Behavioral vs Health States")
plt.ylabel("Correlation Coefficient")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend()
plt.show()


In [None]:
# Pearson correlation of two 0/1 columns (labelled "phi" in the figure);
# the off-diagonal entry of the 2x2 matrix is the coefficient of interest.
pair_corr = df[["oestrus", "OK"]].corr(method="pearson")
phi_corr = pair_corr.iloc[0, 1]

# Single labelled point to plot.
x = ["oestrus vs OK"]
y = [phi_corr]

plt.figure(figsize=(6, 3))

# One marker on a fixed [-1, 1] axis with a dashed zero reference line.
plt.plot(x, y, marker='o', linestyle='-', color='steelblue')
plt.axhline(0, color='gray', linestyle='--')
plt.ylim(-1, 1)
plt.grid(axis='y', linestyle=':')
plt.title("Phi Correlation Between Binary Health Features")
plt.ylabel("Correlation Coefficient")
plt.tight_layout()
plt.show()
