In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Path of the outbreak records CSV, resolved against the current working directory.
data_path = 'outbreaks.csv'
# Read the whole file into the frame that the later cells work from.
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Columns the time-trend cells cannot run without.
required_columns = ['Year', 'Month', 'Illnesses']
# Find the first absent column (checked in the order listed) and fail fast on it.
col = next((name for name in required_columns if name not in df.columns), None)
if col is not None:
    raise ValueError(f"Missing required column: {col}")

In [None]:
# Replace missing illness counts with 0; every other column is left untouched.
df = df.fillna({'Illnesses': 0})

In [None]:
# English month names in calendar order; the position (1-based) is the month number.
_month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Lookup from month name to month number (January -> 1 ... December -> 12).
month_mapping = {name: number for number, name in enumerate(_month_names, start=1)}


In [None]:
# Convert month names to month numbers when the Month column holds text.
# Checking `dtype == object` alone misses pandas' dedicated string dtypes (the
# default for text columns from pandas 3.0); such a column would keep its names
# and could not be assembled into a date afterwards. Both representations are
# therefore accepted; numeric and categorical columns are still left as they are.
# Names that are not keys of month_mapping become NaN.
month_column = df['Month']
if month_column.dtype == object or pd.api.types.is_string_dtype(month_column):
    df['Month'] = month_column.map(month_mapping)

In [None]:
# Assemble a timestamp per row from its year and month, pinned to the 1st of the month.
date_parts = df[['Year', 'Month']].assign(DAY=1)
df['Date'] = pd.to_datetime(date_parts)

In [None]:
# Total illnesses across all rows sharing the same month-start date,
# returned as a two-column frame (Date, Illnesses).
illnesses_by_date = df.groupby('Date')['Illnesses'].sum()
time_data = illnesses_by_date.reset_index()

In [None]:
# Line chart of the monthly illness totals over the whole period.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(
    data=time_data,
    x='Date',
    y='Illnesses',
    marker='o',
    label='Total Illnesses',
    ax=ax,
)
ax.set_title('Trend of Foodborne Illnesses Over Time', fontsize=16)
ax.set_xlabel('Time (Year-Month)', fontsize=12)
ax.set_ylabel('Total Illnesses', fontsize=12)
# Tilt the date labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45)
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# 6-month rolling average.
# NOTE(review): the window counts 6 consecutive rows of time_data; that equals
# six calendar months only if every month has a row — confirm against the data.
# The first five rows have no full window and are NaN.
six_row_window = time_data['Illnesses'].rolling(window=6)
time_data['Rolling_Avg'] = six_row_window.mean()

# Overlay the smoothed series on the faded raw series.
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(
    data=time_data, x='Date', y='Illnesses',
    label='Original', alpha=0.4, linestyle='--', ax=ax,
)
sns.lineplot(
    data=time_data, x='Date', y='Rolling_Avg',
    label='6-Month Rolling Average', color='red', ax=ax,
)
ax.set_title('Trend with 6-Month Rolling Average', fontsize=16)
ax.set_xlabel('Time (Year-Month)', fontsize=12)
ax.set_ylabel('Total Illnesses', fontsize=12)
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Total illnesses per calendar year, as a two-column frame (Year, Illnesses).
illnesses_by_year = df.groupby('Year')['Illnesses'].sum()
yearly_data = illnesses_by_year.reset_index()

# One bar per year.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=yearly_data, x='Year', y='Illnesses', palette='viridis', ax=ax)
ax.set_title('Total Illnesses by Year', fontsize=16)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Total Illnesses', fontsize=12)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Distribution of monthly illness totals across years, one box per calendar month.
#
# Month is turned into an ordered 1..12 categorical on a temporary copy, so the
# shared `df` keeps its original Month column and this cell can be re-run safely.
month_numbers = pd.Categorical(df['Month'], categories=range(1, 13), ordered=True)
monthly_data = (
    df.assign(Month=month_numbers)
    # observed=True sums only the (Year, Month) pairs present in the data.
    # Without it, grouping on a categorical adds a row for every absent pair
    # with a sum of 0, which drags the boxes towards zero.
    .groupby(['Year', 'Month'], observed=True)[['Illnesses']]
    .sum()
    .reset_index()
)
# Map month back to names; the categorical order (January..December) is preserved.
monthly_data['Month'] = monthly_data['Month'].map({v: k for k, v in month_mapping.items()})

plt.figure(figsize=(14, 7))
sns.boxplot(x='Month', y='Illnesses', data=monthly_data, palette='coolwarm')
plt.title('Monthly Variation in Foodborne Illnesses', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Illnesses', fontsize=12)
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

In [None]:
# Columns examined as candidate contributing factors.
factors = ['Food', 'Ingredient', 'Species', 'Serotype/Genotype']
# factor name -> table of illness totals per distinct value, largest first.
factor_analysis = {}

for factor in factors:
    illness_totals = df.groupby(factor)[['Illnesses']].sum().reset_index()
    grouped_data = illness_totals.sort_values(by='Illnesses', ascending=False)
    factor_analysis[factor] = grouped_data
    # Show the ten largest contributors for this factor.
    print(f"\nTop factors for {factor} contributing to illnesses:")
    print(grouped_data.head(10))

In [None]:
# Draw one top-10 chart per factor.
# Relying on the `factor` / `grouped_data` variables left over from an earlier
# loop would chart only the last factor, so iterate over the stored per-factor
# tables in `factor_analysis` instead.
for factor, grouped_data in factor_analysis.items():
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Illnesses', y=factor, data=grouped_data.head(10), palette='viridis')
    plt.title(f'Top 10 {factor} Contributing to Illnesses', fontsize=16)
    plt.xlabel('Total Illnesses', fontsize=12)
    plt.ylabel(factor, fontsize=12)
    plt.grid(True)
    plt.show()

In [None]:
# factor name -> illnesses summed over every row where that factor has a value
# (groupby drops rows whose factor is missing, so they do not count).
factor_totals = {
    name: df.groupby(name)['Illnesses'].sum().sum()
    for name in factors
}

In [None]:
# Turn the factor -> total mapping into a two-column table, largest total first.
factor_rows = list(factor_totals.items())
factor_comparison = pd.DataFrame(factor_rows, columns=['Factor', 'Total_Illnesses'])
factor_comparison = factor_comparison.sort_values(by='Total_Illnesses', ascending=False)

print("\nTotal Illnesses by Factor:")
print(factor_comparison)

In [None]:
# Horizontal bars comparing the illness totals of the factors.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=factor_comparison,
    x='Total_Illnesses',
    y='Factor',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Prominent Factor Contributing to Illnesses', fontsize=16)
ax.set_xlabel('Total Illnesses', fontsize=12)
ax.set_ylabel('Factor', fontsize=12)
ax.grid(True)
plt.show()

In [None]:
# factor name -> table of hospitalization totals per distinct value, largest first.
# Kept in its own dict: storing these tables in `factor_totals` would overwrite
# the scalar illness totals held there and corrupt any re-run of the cells that
# build the illness comparison from it.
hospitalization_analysis = {}

for factor in factors:
    grouped_data = df.groupby(factor)['Hospitalizations'].sum().reset_index().sort_values(by='Hospitalizations', ascending=False)
    hospitalization_analysis[factor] = grouped_data
    # Show the ten largest contributors for this factor.
    print(f"\nTop factors for {factor} contributing to hospitalizations:")
    print(grouped_data.head(10))

In [None]:
# Draw one top-10 hospitalization chart per factor.
# Relying on the `factor` / `grouped_data` variables left over from an earlier
# loop would chart only the last factor, so the per-factor table is rebuilt
# here for each entry of `factors`.
for factor in factors:
    grouped_data = (
        df.groupby(factor)['Hospitalizations']
        .sum()
        .reset_index()
        .sort_values(by='Hospitalizations', ascending=False)
    )
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Hospitalizations', y=factor, data=grouped_data.head(10), palette='viridis')
    plt.title(f'Top 10 {factor} Contributing to Hospitalizations', fontsize=16)
    plt.xlabel('Total Hospitalizations', fontsize=12)
    plt.ylabel(factor, fontsize=12)
    plt.grid(True)
    plt.show()

In [None]:
# factor name -> hospitalizations summed over every row where that factor has a value.
total_hospitalizations_by_factor = {}
for factor in factors:
    total_hospitalizations_by_factor[factor] = df.groupby(factor)['Hospitalizations'].sum().sum()

# Two-column comparison table, largest total first.
hospitalization_rows = list(total_hospitalizations_by_factor.items())
comparison_df = pd.DataFrame(hospitalization_rows, columns=['Factor', 'Total_Hospitalizations'])
comparison_df = comparison_df.sort_values(by='Total_Hospitalizations', ascending=False)

# Visualize the comparison
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=comparison_df,
    x='Total_Hospitalizations',
    y='Factor',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Prominent Factor Contributing to Hospitalizations', fontsize=16)
ax.set_xlabel('Total Hospitalizations', fontsize=12)
ax.set_ylabel('Factor', fontsize=12)
ax.grid(True)
plt.show()

In [None]:
# factor name -> fatalities summed over every row where that factor has a value
# (groupby drops rows whose factor is missing, so they do not count).
total_fatalities_by_factor = {
    name: df.groupby(name)['Fatalities'].sum().sum()
    for name in factors
}

In [None]:
# Two-column table of fatality totals per factor, largest total first.
fatality_rows = list(total_fatalities_by_factor.items())
comparison_df = pd.DataFrame(fatality_rows, columns=['Factor', 'Total_Fatalities'])
comparison_df = comparison_df.sort_values(by='Total_Fatalities', ascending=False)

In [None]:
print("\nTotal Fatalities by Factor:")
print(comparison_df)

# Visualize the comparison
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=comparison_df,
    x='Total_Fatalities',
    y='Factor',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Prominent Factors Contributing to Fatalities', fontsize=16)
ax.set_xlabel('Total Fatalities', fontsize=12)
ax.set_ylabel('Factor', fontsize=12)
ax.grid(True)
plt.show()

In [None]:
# Total illnesses per Location value, largest first.
location_data = df.groupby('Location')['Illnesses'].sum().reset_index().sort_values(by='Illnesses', ascending=False)

# Display top locations: only the ten largest rows, matching the chart below,
# rather than dumping the whole table into the cell output.
print("\nTop Locations Contributing to Foodborne Illnesses:")
print(location_data.head(10))

# Visualize the data
plt.figure(figsize=(12, 6))
sns.barplot(x='Illnesses', y='Location', data=location_data.head(10), palette='coolwarm')
plt.title('Top Locations Contributing to Foodborne Illnesses', fontsize=16)
plt.xlabel('Total Illnesses', fontsize=12)
plt.ylabel('Location', fontsize=12)
plt.grid(True)
plt.show()

In [None]:
# Total illnesses per State value, largest first.
illnesses_by_state = df.groupby('State')['Illnesses'].sum().reset_index()
state_data = illnesses_by_state.sort_values(by='Illnesses', ascending=False)
top_states = state_data.head(10)

# Display top states
print("\nTop States Contributing to Foodborne Illnesses:")
print(top_states)

# Visualize the data
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_states, x='Illnesses', y='State', palette='viridis', ax=ax)
ax.set_title('Top States Contributing to Foodborne Illnesses', fontsize=16)
ax.set_xlabel('Total Illnesses', fontsize=12)
ax.set_ylabel('State', fontsize=12)
ax.grid(True)
plt.show()