In [None]:
# Load relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
import warnings

warnings.filterwarnings("ignore")  # Suppress all warnings
# reference code used: https://towardsdatascience.com/an-extensive-guide-to-exploratory-data-analysis-ddd99a03199e
# https://towardsdatascience.com/exploratory-data-analysis-eda-python-87178e35b14

In [None]:
# Read the greenhouse gas dataset; the relative path is resolved against the
# notebook's working directory.
GREENHOUSE_CSV = "greenhouse_gases.csv"
greenhouse_data = pd.read_csv(GREENHOUSE_CSV)

In [None]:
# Preview the first five rows to check that the file was parsed as expected.
greenhouse_data.head(n=5)

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
greenhouse_data.info()

## Univariate Analysis

In [None]:
# Univariate analysis: relative frequency of each IncomeGroup value.
# normalize=True returns fractions of rows (summing to 1), not raw counts.
greenhouse_income_groups = greenhouse_data["IncomeGroup"]
greenhouse_income_groups.value_counts(normalize=True)

In [None]:
# Horizontal bar chart of the fraction of rows that fall in each income group.
# The original passed title=1, a placeholder rather than a meaningful chart
# title; give the figure a descriptive title and label the value axis so the
# chart can be read on its own.
greenhouse_income_share = greenhouse_data["IncomeGroup"].value_counts(normalize=True)
ax = greenhouse_income_share.plot.barh(
    title="Greenhouse gas data: share of rows by income group"
)
ax.set_xlabel("Fraction of rows")
plt.show()

In [None]:
# Univariate analysis: relative frequency of each region value.
# normalize=True returns fractions of rows (summing to 1), not raw counts.
greenhouse_regions = greenhouse_data["region"]
greenhouse_regions.value_counts(normalize=True)

In [None]:
# Vertical bar chart of the fraction of rows that fall in each region.
# A title and a value-axis label are added so the figure stands alone.
greenhouse_region_share = greenhouse_data["region"].value_counts(normalize=True)
ax = greenhouse_region_share.plot.bar(
    title="Greenhouse gas data: share of rows by region"
)
ax.set_ylabel("Fraction of rows")
plt.show()

In [None]:
# Univariate summary of the frame.
# A notebook cell only displays its last expression, so the distinct-value
# counts are printed explicitly; as a bare statement they were discarded.
print(greenhouse_data.nunique(axis=0))

# describe() statistics of the numeric columns, rendered as fixed-point
# strings (format spec 'f') so no value is shown in scientific notation.
greenhouse_data.describe().apply(lambda s: s.apply(lambda x: format(x, 'f')))

## Bivariate Analysis

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns by
# default and raises when it meets text columns (this frame presumably holds
# text in columns such as CountryName and region), so restrict the input to
# numeric/boolean columns first. This works on older pandas versions as well.
greenhouse_numeric = greenhouse_data.select_dtypes(include=["number", "bool"])
corr = greenhouse_numeric.corr()

# Plot the heatmap with a blue-to-red diverging palette.
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    cmap=sns.diverging_palette(240, 20, as_cmap=True),
)

In [None]:
# Correlation matrix shown as a table. Only numeric/boolean columns are kept,
# because DataFrame.corr() raises on text columns in pandas >= 2.0.
greenhouse_data.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Annotated heatmap of the correlation matrix of the numeric columns of
# greenhouse_data, using the sequential 'Reds' colormap.
# Only numeric/boolean columns are kept, because DataFrame.corr() raises on
# text columns in pandas >= 2.0.
greenhouse_corr = greenhouse_data.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(greenhouse_corr, annot=True, cmap='Reds')
plt.show()

In [None]:
# Mean tonnes_pc within each subregion, shown as a bar chart.
mean_tonnes_by_subregion = greenhouse_data.groupby('subregion')['tonnes_pc'].mean()
mean_tonnes_by_subregion.plot(kind='bar')

In [None]:
# Mean IndicatorValue for each IndicatorName, shown as a bar chart.
mean_value_by_indicator = greenhouse_data.groupby('IndicatorName')['IndicatorValue'].mean()
mean_value_by_indicator.plot(kind='bar')

In [None]:
# Grid of pairwise plots across the variables of greenhouse_data.
sns.pairplot(data=greenhouse_data)

## Multivariate Analysis

In [None]:
# Country-by-indicator table of tonnes_pc: one row per CountryName, one column
# per IndicatorName. pivot_table aggregates duplicate pairs with the mean.
result = greenhouse_data.pivot_table(
    values='tonnes_pc',
    index='CountryName',
    columns='IndicatorName',
)
print(result)

# Annotated heatmap of the country-by-indicator table on a tall canvas.
# NOTE(review): center=0.117 looks like a value carried over from the
# reference code — confirm it is a sensible midpoint for tonnes_pc.
fig, ax = plt.subplots(figsize=(13, 20))
sns.heatmap(result, ax=ax, annot=True, linewidths=.5, cmap="PiYG", center=0.117)
plt.show()

In [None]:
# Distribution of IndicatorValue as a 50-bin histogram.
greenhouse_data['IndicatorValue'].plot.hist(
    bins=50,
    figsize=(12, 6),
    facecolor='grey',
    edgecolor='black',
)

In [None]:
# Distribution of tonnes_pc as a 50-bin histogram.
greenhouse_data['tonnes_pc'].plot.hist(
    bins=50,
    figsize=(12, 6),
    facecolor='grey',
    edgecolor='black',
)

In [None]:
# Box plot of tonnes_pc to show its spread and outliers.
greenhouse_data.boxplot(column='tonnes_pc')

In [None]:
# Read the PM dataset; the relative path is resolved against the notebook's
# working directory.
PM_CSV = "pm.csv"
pm_data = pd.read_csv(PM_CSV)

In [None]:
# Preview the first five rows to check that the file was parsed as expected.
pm_data.head(n=5)

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
pm_data.info()

## Univariate Analysis (PM dataset)

In [None]:
# Univariate analysis: relative frequency of each IncomeGroup value.
# normalize=True returns fractions of rows (summing to 1), not raw counts.
pm_income_groups = pm_data["IncomeGroup"]
pm_income_groups.value_counts(normalize=True)

In [None]:
# Horizontal bar chart of the fraction of rows that fall in each income group.
# The original passed title=1, a placeholder rather than a meaningful chart
# title; give the figure a descriptive title and label the value axis so the
# chart can be read on its own.
pm_income_share = pm_data["IncomeGroup"].value_counts(normalize=True)
ax = pm_income_share.plot.barh(title="PM data: share of rows by income group")
ax.set_xlabel("Fraction of rows")
plt.show()

In [None]:
# Vertical bar chart of the fraction of rows that fall in each region.
# A title and a value-axis label are added so the figure stands alone.
pm_region_share = pm_data["region"].value_counts(normalize=True)
ax = pm_region_share.plot.bar(title="PM data: share of rows by region")
ax.set_ylabel("Fraction of rows")
plt.show()

In [None]:
# Univariate analysis: relative frequency of each region value.
# normalize=True returns fractions of rows (summing to 1), not raw counts.
pm_regions = pm_data["region"]
pm_regions.value_counts(normalize=True)

In [None]:
# Univariate summary of the frame.
# A notebook cell only displays its last expression, so the distinct-value
# counts are printed explicitly; as a bare statement they were discarded.
print(pm_data.nunique(axis=0))

# describe() statistics of the numeric columns, rendered as fixed-point
# strings (format spec 'f') so no value is shown in scientific notation.
pm_data.describe().apply(lambda s: s.apply(lambda x: format(x, 'f')))

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns by
# default and raises when it meets text columns (this frame presumably holds
# text in columns such as CountryName and region), so restrict the input to
# numeric/boolean columns first. This works on older pandas versions as well.
pm_numeric = pm_data.select_dtypes(include=["number", "bool"])
corr = pm_numeric.corr()

# Plot the heatmap with a blue-to-red diverging palette.
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    cmap=sns.diverging_palette(220, 20, as_cmap=True),
)

In [None]:
# Grid of pairwise plots across the variables of pm_data.
sns.pairplot(data=pm_data)

In [None]:
# Independent (deep) copy of pm_data, so that filtering pm25 leaves the
# original frame untouched.
pm25 = pm_data.copy(deep=True)

In [None]:
# Keep only the rows whose IndicatorName is exactly "PM2.5".
# The subset is derived from pm_data with a non-mutating query instead of
# inplace=True, so the cell gives the same result however often it is re-run
# and never depends on a previously mutated pm25. The trailing copy() makes
# pm25 an independent frame.
pm25 = pm_data.query('IndicatorName == "PM2.5"').copy()

In [None]:
# Distribution of IndicatorValue in the pm25 subset as a 50-bin histogram.
pm25['IndicatorValue'].plot.hist(
    bins=50,
    figsize=(12, 6),
    facecolor='grey',
    edgecolor='black',
)

In [None]:
# Country-by-region table of IndicatorValue for the pm25 subset: one row per
# CountryName, one column per region. pivot_table aggregates duplicate pairs
# with the mean.
result = pm25.pivot_table(
    values='IndicatorValue',
    index='CountryName',
    columns='region',
)
print(result)

# Annotated heatmap of the country-by-region table on a very tall canvas.
# NOTE(review): center=0.117 looks like a value carried over from the
# reference code — confirm it is a sensible midpoint for IndicatorValue.
fig, ax = plt.subplots(figsize=(10, 40))
sns.heatmap(result, ax=ax, annot=True, linewidths=.5, cmap='RdYlGn', center=0.117)
plt.show()

In [None]:
# Independent (deep) copy of pm_data, so that filtering pm25_who leaves the
# original frame untouched.
pm25_who = pm_data.copy(deep=True)

In [None]:
# Keep only the rows whose IndicatorName is exactly "PM2.5_WHO".
# The subset is derived from pm_data with a non-mutating query instead of
# inplace=True, so the cell gives the same result however often it is re-run
# and never depends on a previously mutated pm25_who. The trailing copy()
# makes pm25_who an independent frame.
pm25_who = pm_data.query('IndicatorName == "PM2.5_WHO"').copy()

In [None]:
# Distribution of IndicatorValue in the pm25_who subset as a 50-bin histogram.
pm25_who['IndicatorValue'].plot.hist(
    bins=50,
    figsize=(12, 6),
    facecolor='grey',
    edgecolor='black',
)