In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw avocado data set.
avocado = pd.read_csv('./avocado.csv')

# Per-column count of missing cells, as a one-column frame ('Missing_Count')
# ordered so that the columns with the most gaps come first.
missing_analysis = (
    avocado.isna()
    .sum()
    .to_frame(name='Missing_Count')
    .sort_values('Missing_Count', ascending=False)
)

# All fills are applied to a separate copy, leaving `avocado` as loaded.
updated_data = avocado.copy()

# AveragePrice: gaps are replaced with the column median, taken as a
# reasonable stand-in for a typical price.
price_is_missing = updated_data['AveragePrice'].isna()
if price_is_missing.any():
    price_median = updated_data['AveragePrice'].median()
    updated_data['AveragePrice'] = updated_data['AveragePrice'].mask(price_is_missing, price_median)

# Volume-like columns: sales seem to vary by region and type, so a gap is
# filled with the mean of the rows that share the same Region and Type.
volume_cols = ['TotalVolume', 'Small', 'Large', 'TotalBags']
for col in volume_cols:
    if not updated_data[col].isna().any():
        continue

    # Region and Type are only imputed further down, so they can still be NaN
    # at this point. groupby() drops NaN keys and transform() yields NaN for
    # those rows, so assigning the transform result straight back to the column
    # would erase values that were actually observed. Using the group mean
    # purely as a fill value leaves every existing value untouched.
    group_mean = updated_data.groupby(['Region', 'Type'])[col].transform('mean')
    updated_data[col] = updated_data[col].fillna(group_mean)

    # Fallback: rows with no usable group mean (missing Region/Type, or a group
    # whose values are all NaN) receive the overall column median.
    if updated_data[col].isna().any():
        overall_median = updated_data[col].median()
        updated_data[col] = updated_data[col].fillna(overall_median)
        
# Date: a gap takes the date of the closest preceding row.
# NOTE(review): this presumes the rows are already in chronological order — confirm.
date_gaps = updated_data['Date'].isna()
if date_gaps.any():
    date_missing = date_gaps.sum()
    updated_data['Date'] = updated_data['Date'].ffill()

# Region: a gap takes the most frequent region (the dominant market).
region_gaps = updated_data['Region'].isna()
if region_gaps.any():
    region_mode = updated_data['Region'].mode().iloc[0]
    region_missing = region_gaps.sum()
    updated_data['Region'] = updated_data['Region'].fillna(region_mode)

# Type: a gap takes the most frequent type.
type_gaps = updated_data['Type'].isna()
if type_gaps.any():
    type_mode = updated_data['Type'].mode().iloc[0]
    type_missing = type_gaps.sum()
    updated_data['Type'] = updated_data['Type'].fillna(type_mode)

# Year: a gap takes the median year, which avoids leaning towards the most
# recent or the oldest years.
# NOTE(review): a median can fall between two years (e.g. x.5) — verify the
# filled values are whole years.
year_gaps = updated_data['Year'].isna()
if year_gaps.any():
    year_median = updated_data['Year'].median()
    year_missing = year_gaps.sum()
    updated_data['Year'] = updated_data['Year'].fillna(year_median)

# AllSizes: a gap is read as "no sales" and becomes 0.
allsizes_gaps = updated_data['AllSizes'].isna()
if allsizes_gaps.any():
    allsizes_missing = allsizes_gaps.sum()
    updated_data['AllSizes'] = updated_data['AllSizes'].fillna(0)

# Missing-value counts before any fill was applied.
print(missing_analysis)

# Same summary recomputed on the imputed copy: one 'Missing_Count' column,
# largest counts first.
updated_missing_analysis = (
    updated_data.isna()
    .sum()
    .to_frame(name='Missing_Count')
    .sort_values('Missing_Count', ascending=False)
)

# Missing-value counts after the empty cells were handled.
print(updated_missing_analysis)


              Missing_Count
Type                    204
Year                    196
Small                   194
TotalVolume             192
AveragePrice            184
AllSizes                184
TotalBags               184
Large                   178
Date                    176
Region                  169
              Missing_Count
Date                      0
AveragePrice              0
TotalVolume               0
Small                     0
Large                     0
AllSizes                  0
TotalBags                 0
Type                      0
Year                      0
Region                    0


In [50]:
# Store the low-cardinality columns of the raw frame as categoricals.
# NOTE(review): this cell works on `avocado`, not on the imputed `updated_data`
# built earlier — confirm that using the un-imputed frame is intended.
for categorical_col in ('Type', 'Year', 'Region'):
    avocado[categorical_col] = avocado[categorical_col].astype('category')

# Drop the 'TotalUS' and 'West' rows, parse the dates, order the rows by date,
# and add a numeric copy of Year so it can be compared against plain numbers.
excluded_regions = ['TotalUS', 'West']
keep_rows = ~avocado['Region'].isin(excluded_regions)
avocado_filtered = (
    avocado.loc[keep_rows]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values('Date')
    .assign(Year_numeric=lambda frame: pd.to_numeric(frame['Year'], errors='coerce'))
)

# Mean price per year of interest; .mean() skips missing prices.
is_2016 = avocado_filtered['Year_numeric'] == 2016
is_2017 = avocado_filtered['Year_numeric'] == 2017
mean_2016 = avocado_filtered.loc[is_2016, 'AveragePrice'].mean()
mean_2017 = avocado_filtered.loc[is_2017, 'AveragePrice'].mean()

print("mean of avocado price comparison:")
print(f"2016: ${mean_2016:.2f}")
print(f"2017: ${mean_2017:.2f}")

mean of avocado price comparison:
2016: $1.34
2017: $1.52


In [60]:
# Total sales volume per region, smallest first so that the largest bar ends
# up at the top of the horizontal bar chart.
volume_by_region = (
    avocado_filtered
    .groupby('Region', observed=True)['TotalVolume']
    .sum()
    .sort_values(ascending=True)
)

# Number of regions being plotted; drives the figure height below.
total_regions = len(volume_by_region)

# Horizontal bar plot: 0.4 units of height per region, never shorter than 8.
bar_fig, bar_ax = plt.subplots(figsize=(12, max(8, total_regions * 0.4)))
volume_by_region.plot(kind='barh', ax=bar_ax)
bar_ax.set_xlabel('Total Volume')
bar_ax.set_ylabel('Region')
bar_ax.set_title('All Regions by Total Avocado Sales Volume')
bar_fig.tight_layout()
bar_fig.savefig('./4_3_barplot_volume_region.png', dpi=300, bbox_inches='tight')
plt.close(bar_fig)

highest_volume_region = volume_by_region.idxmax()
print(f"Region with highest volume: {highest_volume_region}")

# Histogram of prices, restricted to the region with the highest total volume.
in_top_region = avocado_filtered['Region'] == highest_volume_region
state_data = avocado_filtered.loc[in_top_region]

hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(state_data['AveragePrice'], bins=20, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Average Price ($)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title(f'Distribution of Avocado Prices in {highest_volume_region}')
hist_fig.savefig('./4_3_histogram_price.png', dpi=300, bbox_inches='tight')
plt.close(hist_fig)

print(f"  Mean price: ${state_data['AveragePrice'].mean():.2f}")


# Correlation between price and volume within that same region.
price_volume_corr = state_data['AveragePrice'].corr(state_data['TotalVolume'])
print(f"correlation between average price and total volume: {price_volume_corr:.4f}")

Region with highest volume: California
  Mean price: $1.40
correlation between average price and total volume: -0.7914
