# Loading data and libraries

In [None]:
import pandas as pd

# Locations of the two source CSV files (adjust to where the data actually lives)
gdp_path = './Global_GDP.csv'
population_path = './Global_Population.csv'

# Read both files into DataFrames: GDP first, then population
gdp_df, population_df = (
    pd.read_csv(csv_path) for csv_path in (gdp_path, population_path)
)


In [None]:
# Grab the first few rows of each dataset for a quick look
gdp_head, population_head = gdp_df.head(), population_df.head()

# Last expression of the cell: both previews are displayed together as a tuple
(gdp_head, population_head)


In [None]:
# Fill missing values in the GDP data with the mean of each numeric column.
# numeric_only=True restricts the means to numeric columns: when the frame also
# holds text columns (presumably 'Country Name' and similar - confirm against
# the CSV), a bare DataFrame.mean() raises TypeError on pandas >= 2.0.
# Columns without a computed mean are left untouched, because fillna with a
# Series only fills the columns that the Series has an entry for.
gdp_df = gdp_df.fillna(gdp_df.mean(numeric_only=True))


Calculating the Mean GDP (2001-2020)

In [None]:
# Column labels of the averaging window: the years 2001 through 2020 as strings
years = list(map(str, range(2001, 2021)))

# Per-row mean across those year columns, stored as a new column
gdp_df['Mean GDP 2001-2020'] = gdp_df.loc[:, years].mean(axis=1)


Displaying Cleaned GDP Data

In [None]:
# Preview the country label next to its 2001-2020 mean GDP (first few rows only)
cleaned_gdp_head = (
    gdp_df
    .loc[:, ['Country Name', 'Mean GDP 2001-2020']]
    .head()
)
cleaned_gdp_head


# Task A: Correlation Analysis
Correlation analysis measures the strength and direction of a relationship between two variables.

In [None]:
# Only the GDP side is analysed here, because the population data was not
# cleaned successfully; 'gdp_df' holds the cleaned GDP figures.

# Two-column frame (country, mean GDP) that serves as input for the correlation analysis
gdp_for_correlation = gdp_df.loc[:, ['Country Name', 'Mean GDP 2001-2020']]

# Show the first few rows as the cell output
gdp_for_correlation.head()



### Fixing the DataFrame issues

In [None]:
# Rebuild the year labels (2001 through 2020) and recompute the per-row mean GDP
years = [f"{year}" for year in range(2001, 2021)]
gdp_df['Mean GDP 2001-2020'] = gdp_df[years].mean(axis='columns')

## Plotting GDP Data for Visualisation

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-country mean GDP, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(gdp_for_correlation['Mean GDP 2001-2020'], bins=30, color='blue')

# Label the figure so it can be read on its own
ax.set_xlabel('Mean GDP 2001-2020 (US$)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Mean GDP 2001-2020')
plt.show()


GDP Distribution: The histogram provides a visual representation of how mean GDP values are distributed across different countries.

## Insights from the Histogram

**Skewness:** If the histogram shows a concentration of values towards the left and a long tail to the right, it indicates a right-skewed distribution. This suggests that most countries have a lower GDP, with a few countries having significantly higher GDP.

**Outliers and Economic Disparities:** The presence of outliers, visible as separate bars on the far right, highlights countries with exceptionally high GDP. This reflects economic disparities between countries.

**Central Tendency and Spread:** The histogram also gives an idea of the central tendency (like mean or median) and the spread (range, variance) of the GDP values.

**Economic Insight:** This distribution can lead to discussions about global economic structures, with a focus on why some countries have much higher GDPs compared to others.

This histogram is a valuable tool for initiating discussions on global economic patterns and the disparities in economic prosperity among countries.

# Task B: Regression

Regression analysis models and predicts the relationship between a dependent variable and independent variables.

### **Next Steps for Task B:**
* **Match Years with GDP Data**: Since the GDP data covers 2001 to 2020, we will use population data from approximately the same range (the 2000, 2010, 2015 and 2020 population snapshots).
* **Calculate Mean Population**: Compute an approximate mean population for each country by averaging those snapshot years.
* **Merge with GDP Data**: Merge this population data with the GDP data.
* **Perform Linear Regression**: Use the mean population as the independent variable and mean GDP as the dependent variable.


In [None]:
import pandas as pd

# Path of the replacement population dataset (adjust to your own file location)
new_population_path = '/content/world_population.csv'

# Read the replacement population data into its own DataFrame
new_population_df = pd.read_csv(filepath_or_buffer=new_population_path)

# Any further processing of new_population_df (mean population, missing values,
# and so on) has to happen before it is merged with the GDP data.


In [None]:
# Report whether the averaged-population column is already present in new_population_df
print(
    "Column exists."
    if 'Approx Mean Population 2000-2020' in new_population_df.columns
    else "Column does not exist. Needs to be created."
)


In [None]:
# Population snapshot columns that are averaged into one approximate figure
selected_years_population = [
    '2020 Population',
    '2015 Population',
    '2010 Population',
    '2000 Population',
]

# Per-row mean of the selected snapshots, stored as a new column
new_population_df['Approx Mean Population 2000-2020'] = (
    new_population_df.loc[:, selected_years_population].mean(axis='columns')
)

# Confirm that the new column is now part of the DataFrame
print(
    "Column successfully created."
    if 'Approx Mean Population 2000-2020' in new_population_df.columns
    else "Column creation failed."
)


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Requires gdp_df (with 'Mean GDP 2001-2020') and new_population_df
# (with 'Approx Mean Population 2000-2020') from the earlier cells.

# Step 1: Merge the DataFrames on the country name.
# The two sources use different column names for it ('Country Name' vs
# 'Country/Territory'). pd.merge performs an inner join by default, so a
# country whose name is not spelled identically in both sources is left out
# of regression_df.
regression_df = pd.merge(gdp_df[['Country Name', 'Mean GDP 2001-2020']],
                         new_population_df[['Country/Territory', 'Approx Mean Population 2000-2020']],
                         left_on='Country Name',
                         right_on='Country/Territory')

# Step 2: Keep only the rows where both regression variables are present.
# LinearRegression.fit raises ValueError on NaN input, so rows with a missing
# value are excluded from the fit. regression_df itself is left unchanged for
# the cells that follow; when nothing is missing the fit is identical.
regression_fit_df = regression_df.dropna(
    subset=['Approx Mean Population 2000-2020', 'Mean GDP 2001-2020']
)

# Step 3: Perform Regression Analysis
# 'Mean GDP 2001-2020' is the dependent variable and
# 'Approx Mean Population 2000-2020' is the independent variable.
# X stays a one-column DataFrame because scikit-learn expects 2-D features.
model = LinearRegression()
X = regression_fit_df[['Approx Mean Population 2000-2020']]
y = regression_fit_df['Mean GDP 2001-2020']

# Fitting the model
model.fit(X, y)

# Plotting the data together with the fitted regression line
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue')  # Actual data points
plt.plot(X, model.predict(X), color='red')  # Regression line
plt.xlabel('Approx Mean Population 2000-2020')
plt.ylabel('Mean GDP 2001-2020 (US$)')
plt.title('Linear Regression: Population vs GDP')
plt.show()


### Merge with GDP Data and Perform Regression

In [None]:
# Count the missing values per column of regression_df and print them, so the
# result of the check is actually visible in the cell output.
nan_check = regression_df.isna().sum()
print(nan_check)

# Rows with missing values could either be filled (e.g. with the mean) or
# dropped; here every row containing a NaN is dropped.
regression_df_cleaned = regression_df.dropna()

# Re-running the linear regression on the cleaned data
# (this rebinds 'model' to a freshly fitted estimator).
model = LinearRegression()
X_cleaned = regression_df_cleaned[['Approx Mean Population 2000-2020']]
y_cleaned = regression_df_cleaned['Mean GDP 2001-2020']
model.fit(X_cleaned, y_cleaned)

# Plotting the cleaned data together with the fitted regression line
plt.figure(figsize=(10, 6))
plt.scatter(X_cleaned, y_cleaned, color='blue')  # Actual data points
plt.plot(X_cleaned, model.predict(X_cleaned), color='red')  # Regression line
plt.xlabel('Approx Mean Population 2000-2020')
plt.ylabel('Mean GDP 2001-2020 (US$)')
plt.title('Linear Regression: Population vs GDP (Cleaned Data)')
plt.show()


### Insights

**Positive Correlation:** Larger populations generally correlate with higher GDPs, as indicated by the upward slope of the regression line.

**Variability:** The spread of data points suggests varying degrees of economic efficiency across countries.

**Outliers:** Points far from the trend line highlight unique economic conditions or productivity levels in some countries.

**General Trend:** The plot reveals a broad trend but doesn't capture all factors influencing GDP, highlighting the model's limitations.

### Using Plotly to visualise the population to GDP relationships

In [None]:

import plotly.express as px

# Requires the merged DataFrame 'regression_df' with the columns
# 'Country Name', 'Mean GDP 2001-2020' and 'Approx Mean Population 2000-2020'.

# Interactive scatter plot: population on the x-axis, GDP on the y-axis,
# with each point annotated by its country name
fig = px.scatter(
    data_frame=regression_df,
    title='Interactive Plot: Population vs GDP',
    x='Approx Mean Population 2000-2020',
    y='Mean GDP 2001-2020',
    text='Country Name',
    labels={
        'Approx Mean Population 2000-2020': 'Approx Mean Population (2000-2020)',
        'Mean GDP 2001-2020': 'Mean GDP (2001-2020, US$)'
    }
)

# Readability tweaks: place each label above its marker and hide the legend
# (update_traces returns the figure, which allows the chained call)
fig.update_traces(textposition='top center').update_layout(showlegend=False)

# Render the figure
fig.show()


Double-click the points above to view a detailed section.