In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Data reading

In [None]:
# Path to the world GDP dataset (relative path, resolved against the working directory)
file_path = 'gdp_world.csv'

# Only these columns are loaded from the CSV
selected_columns = [
    'year',
    'country',
    'pop',
    'labor_force',
    'gdp',
    'unemployment_r',
    'pop_over_65',
    'working_age_pop_pct',
    'gdp_over_pop',
]

# Load the chosen columns into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path, usecols=selected_columns)

In [None]:
# Keep only the "United States" rows, then discard every row that has a missing value
us_df = df.loc[df['country'].eq('United States')].dropna()

# Report the (rows, columns) shape of the filtered frame
print("Shape of the 'United States' DataFrame:", us_df.shape)

In [None]:
# Keep the "World" rows whose year also occurs in us_df, then discard rows with missing values.
# Both conditions are combined into one boolean mask over df.
world_df = df.loc[df['country'].eq('World') & df['year'].isin(us_df['year'])].dropna()

# Report the (rows, columns) shape of the filtered frame
print("Shape of the 'World' DataFrame:", world_df.shape)

Q2. (35 pts) Pivot table and line plots

In [None]:
# Stack the United States rows on top of the World rows
concatenated_df = pd.concat(objs=[us_df, world_df])

# One row per year and one column per (metric, country) pair;
# duplicate (year, country) entries are combined by pivot_table's default aggregation.
pivot_table = pd.pivot_table(
    concatenated_df,
    values=['pop', 'gdp_over_pop'],
    index='year',
    columns='country',
)

In [None]:
# Rebuild the combined United States + World frame used for plotting
concatenated_df = pd.concat(objs=[us_df, world_df])

# Draw one GDP line per country with 'year' on the X-axis.
# sort=False keeps the countries in order of first appearance, so the legend order is stable.
fig, ax = plt.subplots(figsize=(10, 6))
for country, country_data in concatenated_df.groupby('country', sort=False):
    ax.plot(country_data['year'], country_data['gdp'], label=country)

# Legend, title, and axis labels
ax.legend()
ax.set_title('GDP Over Years for World and United States')
ax.set_xlabel('Year')
ax.set_ylabel('GDP')

# Render the figure
plt.show()

In [None]:
# Draw one 'gdp_over_pop' line per country with 'year' on the X-axis.
# sort=False keeps the countries in order of first appearance, so the legend order is stable.
fig, ax = plt.subplots(figsize=(10, 6))
for country, country_data in concatenated_df.groupby('country', sort=False):
    ax.plot(country_data['year'], country_data['gdp_over_pop'], label=country)

# Legend, title, and axis labels
ax.legend()
ax.set_title('GDP per Capita Over Years for World and United States')
ax.set_xlabel('Year')
ax.set_ylabel('GDP per Capita')

# Render the figure
plt.show()

Q3. (40 pts) Correlation analysis

In [None]:
# Drop 'year' and 'country' columns
us_df_no_year_country = us_df.drop(['year', 'country'], axis=1)
# Create a pair plot
plt.figure(figsize=(15, 10))
sns.pairplot(us_df_no_year_country)
plt.suptitle('Pair Plot for United States Data', y=1.02)
plt.show()

The histogram of "pop_over_65" shows that the population of the United States over the age of 65 is increasing.
The aging population of the United States is having a significant impact on the economy and society. As the population over 65 grows,
there is a greater demand for healthcare and social services.


In [None]:
# Compute the correlation matrix
correlation_matrix = us_df_no_year_country.corr()
# Create a heatmap for the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap for United States Data')
plt.show()

The following pairs of variables are highly correlated:
Population and labor force 0.99
Population and GDP 0.98
Labor force and GDP 0.96
Population over 65 and population 0.81
Population over 65 and labor force 0.76
Population over 65 and GDP 0.90
GDP over population and population 0.98
GDP over population and labor force 0.97

In [None]:
# Rows of the full dataset in which both plotted columns are present
subset_df = df.loc[:, ['unemployment_r', 'labor_force']].dropna()

# Filled 2-D kernel density estimate of the two columns
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=subset_df, x='unemployment_r', y='labor_force', fill=True, cmap='viridis', ax=ax)
ax.set_title('2-D KDE Plot for Unemployment Rate vs Labor Force')
ax.set_xlabel('Unemployment Rate')
ax.set_ylabel('Labor Force')
plt.show()

There is a general negative correlation between unemployment rates and labor force.
There is considerable variation in the unemployment rates of areas with similar labor force participation rates.
Some areas with very low labor force participation rates also have very high unemployment rates. This suggests that these areas may be facing significant challenges in attracting and retaining workers.

In [None]:
# Unemployment-rate time series for a single country.
# The full frame holds several countries; plotting all of them as one line would
# connect points across countries, so the data is restricted to the United States
# (NOTE(review): assumed to be the intended series for this section - confirm)
# and ordered by year.
subset_df = (
    df.loc[df['country'] == 'United States', ['year', 'unemployment_r']]
    .dropna()
    .sort_values('year')
)

# Line plot of 'unemployment_r' with 'year' on the X-axis
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(subset_df['year'], subset_df['unemployment_r'], marker='o', linestyle='-')

# Highest observed rate; the annotation labels are placed just above it
peak_rate = subset_df['unemployment_r'].max()
# Year -> rate lookup so each arrow points at the value actually observed in that year
rate_by_year = subset_df.drop_duplicates('year').set_index('year')['unemployment_r']

# (label, year the arrow points at, x position of the label, colour)
recessions = [
    ('The Great Recession', 2009, 2007, 'red'),
    ('Covid-19 Recession', 2020, 2021, 'blue'),
]
for label, year, text_x, color in recessions:
    # Skip the annotation when the year is not present in the data
    if year not in rate_by_year.index:
        continue
    ax.annotate(
        label,
        xy=(year, rate_by_year.loc[year]),
        xytext=(text_x, peak_rate + 2),
        arrowprops=dict(facecolor=color, shrink=0.05),
        fontsize=10,
        color=color,
    )

ax.set_title('United States Unemployment Rate Over Years')
ax.set_xlabel('Year')
ax.set_ylabel('Unemployment Rate')

# Show the plot
plt.show()