# CO<sub>2</sub> Levels in Atmosphere

Observed at the Mauna Loa Observatory (Hawaii), at a latitude of 19.5°, a longitude of -155.6°, and an elevation of 3397 meters.

# Sources to read

- https://en.wikipedia.org/wiki/Carbon_dioxide_in_Earth%27s_atmosphere [Wikipedia]
- https://climate.nasa.gov/climate_resources/24/ [NASA]
- https://www.epa.gov/climate-change-science/causes-climate-change [US EPA]

In [None]:
# Libraries used throughout the notebook: pandas for the tabular data,
# Plotly (offline mode) for the charts, and datetime for building time axes.
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import datetime as dt

# Set up Plotly's offline mode so figures render inline in the notebook.
# NOTE(review): connected=True is documented to load plotly.js from a CDN
# instead of embedding it in the notebook - confirm for the installed version.
py.init_notebook_mode(connected=True)

Okay, let's just load the data

In [None]:
# Read the CO2 archive into a DataFrame and preview its first 11 rows
# (the trailing expression is what the notebook displays).
co2data = pd.read_csv(filepath_or_buffer="../input/archive.csv")
co2data.iloc[:11]

Looks like there are blank values in the dataset. Let's clean the dataset by filtering out these records.

In [None]:
# Report, column by column, whether any value is missing.
print(co2data.isnull().any())

# Drop every row that has at least one missing value, remembering the row
# count beforehand so the number of discarded rows can be reported.
rows_before = len(co2data)
co2data = co2data.dropna()

print("\nIgnored records: ", rows_before - len(co2data))

# 1. Variations in CO<sub>2</sub> concentration from 1958-2017 (~60 years)

In [None]:
# Yearly mean CO2 concentration: one bar per year, averaging all rows of that
# year. The column is selected *before* aggregating so that only the plotted
# column is averaged (and unrelated columns cannot interfere with the mean).
grp1 = co2data.groupby("Year")["Carbon Dioxide (ppm)"].mean()
trace1 = go.Bar(x=grp1.index, y=grp1.values)
layout = go.Layout(
    # Each bar is a per-year average, so the title says "per year".
    title="Average CO<sub>2</sub> Levels in Atmosphere per year",
    # The y-axis is clipped to 300-420 ppm instead of starting at zero so the
    # year-to-year growth is visible.
    yaxis=dict(title="Parts per million (PPM)", range=(300, 420)),
    xaxis=dict(title="Year"),
)
figure = go.Figure(data=[trace1], layout=layout)
py.iplot(figure, filename="co2-ppm-year")

- Historically, atmospheric CO<sub>2</sub> concentrations have varied widely, ranging from as high as 7000 ppm to as low as 180 ppm [Wikipedia]

# 2. Seasonal fluctuations of CO<sub>2</sub> levels

In [None]:
# Mean CO2 concentration for every (Year, Month) pair. The column is selected
# before aggregating so only the series we need is averaged, instead of
# averaging every column of the frame and discarding all but one.
grp2 = co2data.groupby(["Year", "Month"])["Carbon Dioxide (ppm)"].mean()

# One timestamp per (year, month) group, pinned to the 15th of the month so
# each point sits mid-month on the time axis.
x = [dt.datetime(year=year, month=month, day=15) for year, month in grp2.index]

# Mean values.
y1 = grp2.values

# Rolling window average over 3 consecutive months; min_periods=1 keeps the
# first points instead of producing NaN for them.
y2 = grp2.rolling(3, min_periods=1).mean().values

# Exponentially weighted moving average with a span of 3 months.
y3 = grp2.ewm(span=3, min_periods=1).mean().values

In [None]:
# Three views of the same monthly series: the raw monthly means as markers,
# plus the rolling and exponentially weighted averages as lines.
trace2 = go.Scatter(x=x, y=y1, mode="markers", name="Actual value")
trace3 = go.Scatter(x=x, y=y2, line=dict(color="red"), name="Rolling average")
trace4 = go.Scatter(x=x, y=y3, line=dict(color="green"), name="EWM average")

# Initial axis ranges: the figure opens on 2008-2017 and 380-410 ppm so the
# within-year cycle is visible. All data points are still passed to the figure.
default_period = (dt.datetime(2008, 1, 1), dt.datetime(2017, 12, 1))
default_ppm_range = (380, 410)
layout = go.Layout(
    # Spelling fixed: "fluctuations" (was "fluctations").
    title="Seasonal fluctuations of CO<sub>2</sub> levels in atmosphere",
    yaxis=dict(title="Parts per million (PPM)", range=default_ppm_range),
    xaxis=dict(title="Year", range=default_period),
)

figure = go.Figure(data=[trace2, trace3, trace4], layout=layout)
py.iplot(figure, filename="co2-ppm-seasonal")

<b>Trends</b>

Every year, CO2 concentration levels tend to increase from January to May and then tend to decrease until next January

# 3. Predicting CO<sub>2</sub> concentration

In [None]:
# Fit a linear model on (year, month, month^2, year^2) features.
from sklearn import linear_model
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split

# One feature row per observed month. The squared terms let the linear model
# follow a curved trend. The column order must match the rows built for
# prediction elsewhere in the notebook.
x_learn = [(ts.year, ts.month, ts.month ** 2, ts.year ** 2) for ts in x]
# Targets are the plain monthly means (y1), not a smoothed series.
y_learn = list(y1)

# Hold out 40% of the months for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_learn, y_learn, test_size=0.40, random_state=45)
clf = linear_model.LinearRegression().fit(x_train, y_train)
# LinearRegression.score returns the coefficient of determination (R^2) on
# the held-out data, not a classification accuracy.
print("R^2 score: ", clf.score(x_test, y_test))

# Model output for every observed month.
y4 = clf.predict(x_learn)

In [None]:
# Years to predict for: the range starts before the observed data and extends
# into the future (1950 through 2054), one prediction per calendar month.
pred_years = range(1950, 2055)
pred_months = range(1, 13)

# Feature rows in the same column order the model was trained on:
# [year, month, month^2, year^2].
x_pred = [
    [year, month, month ** 2, year ** 2]
    for year in pred_years
    for month in pred_months
]

# Predict values
y_pred = clf.predict(x_pred)

# Plot the predicted values against mid-month timestamps.
x_plt = [dt.datetime(row[0], row[1], 15) for row in x_pred]
trace5 = go.Scatter(x=x_plt, y=y_pred, line=dict(color="red"), name="Predicted value")

# Initial axis ranges for the figure: 1956-2050 and 300-500 ppm.
default_period = (dt.datetime(1956, 1, 1), dt.datetime(2050, 12, 1))
default_ppm_range = (300, 500)
layout = go.Layout(
    title="Predicted Vs. Actual CO<sub>2</sub> Concentration levels",
    yaxis=dict(title="Parts per million (PPM)", range=default_ppm_range),
    xaxis=dict(title="Year", range=default_period),
)
# trace2 (the observed monthly means) is overlaid for comparison.
figure = go.Figure(data=[trace2, trace5], layout=layout)
py.iplot(figure, filename="co2-ppm-prediction")

If the current trend continues
## In 18 years, the concentration levels will cross 450 parts per million. 

### During the last 60 years, the concentration levels increased by 30%
### There is a slow non-linear trend we can observe here!

TODO: Think about tuning the regression model.