# Processing real data with Pandas

In [None]:
import pandas as pd

In [None]:
# Load the population table from CSV; the bare name below displays it as the cell output
data = pd.read_csv(filepath_or_buffer="data/pop_year.csv")
data

In [None]:
# Verify that every row's total equals male + female.
# Series.any() does the reduction inside pandas instead of iterating the
# Series element by element through the Python builtin any().
mismatched = data.total != (data.male + data.female)
if mismatched.any():
    print("ERRORS")
else:
    print("No Errors")

In [None]:
# Largest total population.
# Series.max() skips missing values; the builtin max() compares element by
# element, so a NaN in the column can make its result depend on row order.
data.total.max()

In [None]:
# Largest male population.
# Series.max() skips missing values, unlike the builtin max(), whose result
# can depend on row order when a NaN is present.
data.male.max()

In [None]:
# Smallest male population.
# Series.min() skips missing values, unlike the builtin min(), whose result
# can depend on row order when a NaN is present.
data.male.min()

In [None]:
# Row of the year with the largest total population.
# argmax() returns a position, hence the positional .iloc lookup.
peak_position = data.total.argmax()
data.iloc[peak_position]

In [None]:
# Largest row-to-row change in male population (current row minus previous row).
# diff() leaves NaN in the first row; Series.max() skips it, so no dropna() is
# needed, and a table with fewer than two rows yields NaN instead of the
# ValueError the builtin max() raises on an empty sequence.
data.male.diff().max()

In [None]:
# Between which two consecutive rows did the total population change the most?
# diff(1) subtracts the previous row from each row, so the position returned by
# argmax() is the later row of the pair and position - 1 is the earlier one.
total_change = data.total.diff(1)
idx_of_max = total_change.argmax()
row_before, row_after = data.iloc[idx_of_max - 1], data.iloc[idx_of_max]
row_before.year, row_after.year

In [None]:
# Largest absolute difference between the male and female population in any row.
# Series.max() skips missing values, unlike the builtin max(), whose result
# can depend on row order when a NaN is present.
(data.male - data.female).abs().max()

Now we can also create all sorts of fancy graphs with matplotlib and seaborn.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One line per gender over the years, drawn on the same axes
for column, legend_label in (('male', 'Male'), ('female', 'Female')):
    sns.lineplot(x='year', y=column, data=data, label=legend_label)

plt.title('Population Comparison Over the Years')
plt.xlabel('Year')
plt.ylabel('Population')
plt.legend()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's white-grid look for the figure created below
sns.set(style="whitegrid")

# Stacked area plot: male at the bottom, female stacked on top, by year
fig, ax = plt.subplots(figsize=(6, 4))
ax.stackplot(
    data['year'],
    data['male'],
    data['female'],
    labels=['Male', 'Female'],
    alpha=0.7,
)
ax.set_xlabel('Year')
ax.set_ylabel('Population')
ax.legend(loc='upper left')
ax.set_title('Population Distribution Over the Years (Male vs. Female)')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Years to show, one pie chart each; the 2x2 grid below holds exactly four.
years_to_plot = [1960, 1980, 2000, 2020]

# 2x2 grid of axes, one panel per year
fig, axes = plt.subplots(2, 2, figsize=(7, 7))
fig.suptitle('Population Distribution by Gender for Different Years')

labels = ['Male', 'Female']

# axes.flat walks the grid row by row, so panels fill left-to-right, top-to-bottom
for ax, year in zip(axes.flat, years_to_plot):
    ax.set_title(f'Year {year}')

    # Select the rows for the current year and sum each gender column
    year_data = data[data['year'] == year]
    male_population = year_data['male'].sum()
    female_population = year_data['female'].sum()

    # A year missing from the data sums to zero, and a pie cannot be
    # normalised from an all-zero total, so mark the panel instead of plotting.
    if not (male_population + female_population) > 0:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center',
                transform=ax.transAxes)
        ax.set_axis_off()
        continue

    # Male/female share of the population for the current year
    sizes = [male_population, female_population]
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)

# Tighten the layout, then leave room at the top for the figure title
plt.tight_layout()
plt.subplots_adjust(top=0.85)

plt.show()
