In [1]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline
# Emit inline figures as SVG (vector graphics) so they stay sharp when zoomed
%config InlineBackend.figure_format = 'svg'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
from itertools import dropwhile
from tqdm import tqdm
import re

# local imports
from helpers.sql import connect_to_db, update_database, read_db
from helpers.ar_model import transform, transformation, forecast_residual

# Apply matplotlib's "ggplot" style sheet to figures created after this point
plt.style.use("ggplot")


# Unexpected climate change
This script generates the measure of unexpected climate change perception using an AR(1) model.


In [3]:
# Flip this to True to update the database locally before running the analysis
update = False

# Open the database connection (per the original note, the database is
# downloaded first if it does not exist yet)
engine = connect_to_db()

if update:
    update_database()


In [4]:
# Load the raw Twitter data
df = read_db(engine=engine, statement="select * from twitter_kaggle")

# Index the tweets by their parsed creation timestamp and discard the helper
# columns afterwards.
# TODO: time zones are not handled here and may need attention.
df = (
    df.assign(datetime=pd.to_datetime(df["created_at"]))
    .set_index("datetime", drop=True)  # "datetime" becomes the index and leaves the columns
    .rename_axis(None)  # keep the index unnamed
    .drop(columns=["created_at"])  # raw timestamp string is no longer needed
)


# AR(1) specification

We have now constructed a measure of daily climate change perception using sentiment data from tweets.
However, we are still interested in a proxy for unexpected changes in climate change perception, so we apply an AR model for this purpose.
\begin{align*}
    \Delta C P_t-\mathbb{E}\left[\Delta C P_t \mid I_{t-1}\right]
\end{align*}

We proxy this unexpected climate change news by extracting the residual/error term from this AR model.

We plan to set up an ARX model (AR model with control variables - for now just AR(1)): 
\begin{align*}
    FCP_t=\mu+\rho FCP_{t-1}+\gamma^{\prime} \mathbf{x}_{t-1}+\underbrace{\epsilon_t}_{UCP_t}
\end{align*}
Note that here $\mathbf{x}_{t-1}$ is a vector of (lagged) explanatory variables. Including this should mitigate problems with endogeneity by capturing potential confounders that could affect the index.

# AR(1) table with estimates (this has been replaced by a joint F-test)

This chunk of code stores p-values and estimates from the $t$ AR regressions

In [5]:
# # will be overwritten in loop
# topic = "aggregate"

# daily_sum_estimates = transform(df, transform_type="sum", mapping="log", topic=topic)

# daily_sum_estimates = pd.merge(
#     left=daily_sum_estimates,
#     right=forecast_residual(
#         daily_sum_estimates[f"{topic}_transformed"], return_estimates=True
#     ),
#     how="left",
#     left_index=True,
#     right_index=True,
# )


# for topic in df["topic"].unique():
#     daily_topic_sum_estimates = transform(
#         df, transform_type="sum", mapping="log", topic=topic
#     )

#     daily_topic_sum_estimates = pd.merge(
#         left=daily_topic_sum_estimates,
#         right=forecast_residual(
#             daily_topic_sum_estimates[f"{topic}_transformed"], return_estimates=True
#         ),
#         how="left",
#         left_index=True,
#         right_index=True,
#     )

#     daily_sum_estimates = pd.merge(
#         left=daily_sum_estimates,
#         right=daily_topic_sum_estimates,
#         how="left",
#         left_index=True,
#         right_index=True,
#     )

# daily_sum_estimates.columns = daily_sum_estimates.columns.map(
#     lambda x: x.lower().replace("/ ", "").replace(" ", "_")
# )


# topics = [
#     "aggregate_transformed",
#     "weather_extremes_transformed",
#     "importance_of_human_intervantion_transformed",
#     "seriousness_of_gas_emissions_transformed",
#     "ideological_positions_on_global_warming_transformed",
#     "impact_of_resource_overconsumption_transformed",
#     "global_stance_transformed",
#     "politics_transformed",
#     "significance_of_pollution_awareness_events_transformed",
#     "donald_trump_versus_science_transformed",
#     "undefined_one_word_hashtags_transformed",
# ]

# output = {}


# for topic in topics:
#     desc = daily_sum_estimates[[f'{topic}_ar_1_coef', f'{topic}_ar_1_pval']]
#     desc = desc.dropna()

#     output.update(
#         {topic: {
#             '25': desc[f'{topic}_ar_1_coef'].quantile(0.25),
#             '75': desc[f'{topic}_ar_1_coef'].quantile(0.75),
#             'percent_sig': (desc[f'{topic}_ar_1_pval'] < 0.05).sum() / desc[f'{topic}_ar_1_pval'].shape[0],
#             'mean': desc[f'{topic}_ar_1_coef'].mean(),
#             'n': desc[f'{topic}_ar_1_coef'].shape[0]
#         }}
#     )

#     #break

# output = pd.DataFrame(output).T

# display(output)


# def print_latex_table(df):
#     latex = df.to_latex(
#         index=True,
#         escape=True,
#         sparsify=True,
#         multirow=True,
#         multicolumn=True,
#         bold_rows=True,
#         na_rep="",
#         multicolumn_format="c",
#         float_format="{:.4f}".format,
#         position="H",
#     )

#     latex = re.sub(r"\\(mid|top|bottom)rule", "", latex)
#     print(latex)

#     return


# output = output[['mean', '25', '75', 'percent_sig', 'n']]

# output.index = output.index.map(lambda x: x.replace('_transformed', ''))
# output['percent_sig'] = output['percent_sig'].apply(lambda x: f'{x:.2%}')
# output['n'] = output['n'].apply(lambda x: f'{x:.0f}')

# output.index = output.index.map(lambda x: x.replace('_', ' '))
# output.index = output.index.map(lambda x: x.title())


# print_latex_table(output)


# Actual AR model

This is the actual code we use to upload the sentiment.

In [6]:
# Build one wide frame of sum-based series: the "aggregate" series is processed
# first and becomes the base frame, then every individual topic is merged onto it.
daily_sum = None

for topic in ["aggregate", *df["topic"].unique()]:
    daily_topic_sum = transform(df, transform_type="sum", mapping="log", topic=topic)

    # AR output for the transformed series (the displayed frame shows
    # predictions, residuals, specification-test statistics and lags)
    ar_output = forecast_residual(
        daily_topic_sum[f"{topic}_transformed"],
        return_spec_test=True,
        auto_lag=True,
    )
    daily_topic_sum = pd.merge(
        left=daily_topic_sum,
        right=ar_output,
        how="left",
        left_index=True,
        right_index=True,
    )

    if daily_sum is None:
        # first pass ("aggregate"): nothing to merge onto yet
        daily_sum = daily_topic_sum
    else:
        daily_sum = pd.merge(
            left=daily_sum,
            right=daily_topic_sum,
            how="left",
            left_index=True,
            right_index=True,
        )

# Normalise column names: lower-case, drop "/ ", spaces -> underscores
daily_sum.columns = (
    daily_sum.columns.str.lower()
    .str.replace("/ ", "", regex=False)
    .str.replace(" ", "_", regex=False)
)


In [7]:
# Build one wide frame of "mean_n" series: the "aggregate" series is processed
# first and becomes the base frame, then every individual topic is merged onto it.
daily_mean_n = None

for topic in ["aggregate", *df["topic"].unique()]:
    daily_topic_mean_n = transform(
        df, transform_type="mean_n", mapping="log", topic=topic
    )

    # AR output for the transformed series (the displayed frame shows
    # predictions, residuals, specification-test statistics and lags)
    ar_output = forecast_residual(
        daily_topic_mean_n[f"{topic}_transformed"],
        return_spec_test=True,
        auto_lag=True,
    )
    daily_topic_mean_n = pd.merge(
        left=daily_topic_mean_n,
        right=ar_output,
        how="left",
        left_index=True,
        right_index=True,
    )

    if daily_mean_n is None:
        # first pass ("aggregate"): nothing to merge onto yet
        daily_mean_n = daily_topic_mean_n
    else:
        daily_mean_n = pd.merge(
            left=daily_mean_n,
            right=daily_topic_mean_n,
            how="left",
            left_index=True,
            right_index=True,
        )

# Normalise column names: lower-case, drop "/ ", spaces -> underscores
daily_mean_n.columns = (
    daily_mean_n.columns.str.lower()
    .str.replace("/ ", "", regex=False)
    .str.replace(" ", "_", regex=False)
)


In [8]:
# Display the merged sum-based frame (bare last expression, so the notebook renders it)
daily_sum

Unnamed: 0,aggregate_average,aggregate_sum,aggregate_count,aggregate_transformed,aggregate_transformed_predictions,aggregate_transformed_observed,aggregate_transformed_residuals,aggregate_transformed_dw_statistic,aggregate_transformed_f_statistic,aggregate_transformed_f_pval,...,undefined_one_word_hashtags_sum,undefined_one_word_hashtags_count,undefined_one_word_hashtags_transformed,undefined_one_word_hashtags_transformed_predictions,undefined_one_word_hashtags_transformed_observed,undefined_one_word_hashtags_transformed_residuals,undefined_one_word_hashtags_transformed_dw_statistic,undefined_one_word_hashtags_transformed_f_statistic,undefined_one_word_hashtags_transformed_f_pval,undefined_one_word_hashtags_transformed_lags
2006-06-06,-0.097180,-0.097180,1,-0.092743,,,,,,,...,,,,,,,,,,
2006-06-07,,0.000000,0,0.000000,,,,,,,...,,,,,,,,,,
2006-06-08,,0.000000,0,0.000000,,,,,,,...,,,,,,,,,,
2006-06-09,,0.000000,0,0.000000,,,,,,,...,,,,,,,,,,
2006-06-10,,0.000000,0,0.000000,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-09-27,-0.067066,-85.442356,1274,-4.459478,-3.124256,-4.459478,-1.335222,2.010298,0.567680,0.724847,...,8.825044,26.0,2.284935,2.351837,2.284935,-0.066903,2.016272,1.046434,0.351575,2.0
2019-09-28,0.019610,21.845393,1114,3.128749,-3.179015,3.128749,6.307765,2.006343,0.545173,0.742102,...,4.896136,14.0,1.774297,2.366585,1.774297,-0.592287,2.015699,1.036731,0.354996,2.0
2019-09-29,0.059978,46.063407,768,3.851496,0.705938,3.851496,3.145557,2.009506,0.549795,0.738565,...,1.877766,20.0,1.057014,1.971559,1.057014,-0.914545,2.014222,0.978695,0.376163,2.0
2019-09-30,-0.167222,-174.078142,1041,-5.165232,1.550074,-5.165232,-6.715307,2.007633,0.577885,0.717004,...,1.894076,13.0,1.062666,1.330619,1.062666,-0.267953,2.006905,0.796763,0.451074,2.0


In [9]:
# Display the merged mean_n-based frame (bare last expression, so the notebook renders it)
daily_mean_n

Unnamed: 0,aggregate_average,aggregate_sum,aggregate_count,aggregate_count_transformed,aggregate_transformed,aggregate_transformed_predictions,aggregate_transformed_observed,aggregate_transformed_residuals,aggregate_transformed_dw_statistic,aggregate_transformed_f_statistic,...,undefined_one_word_hashtags_count,undefined_one_word_hashtags_count_transformed,undefined_one_word_hashtags_transformed,undefined_one_word_hashtags_transformed_predictions,undefined_one_word_hashtags_transformed_observed,undefined_one_word_hashtags_transformed_residuals,undefined_one_word_hashtags_transformed_dw_statistic,undefined_one_word_hashtags_transformed_f_statistic,undefined_one_word_hashtags_transformed_f_pval,undefined_one_word_hashtags_transformed_lags
2006-06-06,-0.097180,-0.097180,1,0.693147,-0.067360,,,,,,...,,,,,,,,,,
2006-06-07,,0.000000,0,0.000000,0.000000,,,,,,...,,,,,,,,,,
2006-06-08,,0.000000,0,0.000000,0.000000,,,,,,...,,,,,,,,,,
2006-06-09,,0.000000,0,0.000000,0.000000,,,,,,...,,,,,,,,,,
2006-06-10,,0.000000,0,0.000000,0.000000,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-09-27,-0.067066,-85.442356,1274,7.150701,-0.479570,-0.331355,-0.479570,-0.148216,2.007809,0.417581,...,26.0,3.295837,1.118689,1.070639,1.118689,0.048050,2.004494,0.748919,0.523072,3.0
2019-09-28,0.019610,21.845393,1114,7.016610,0.137595,-0.366468,0.137595,0.504063,2.005986,0.407484,...,14.0,2.708050,0.947070,1.110104,0.947070,-0.163034,2.007872,0.707352,0.547705,3.0
2019-09-29,0.059978,46.063407,768,6.645091,0.398562,0.043965,0.398562,0.354596,2.008096,0.411058,...,20.0,3.044522,0.285845,1.010453,0.285845,-0.724608,1.997953,0.757939,0.517837,3.0
2019-09-30,-0.167222,-174.078142,1041,6.948897,-1.162009,0.176293,-1.162009,-1.338301,2.003567,0.438758,...,13.0,2.639057,0.384506,0.513717,0.384506,-0.129211,2.009564,0.779152,0.505682,3.0


# Test that changes to ar_model have no effect on the stored UCP

In [10]:
# Reload the series currently stored in the database, indexed by date
sentiment_sum_ar1 = read_db(
    engine=engine,
    statement="select * from climate_sum_ar1",
    idx_col="date",
)
sentiment_mean_n_ar1 = read_db(
    engine=engine,
    statement="select * from climate_mean_n_ar1",
    idx_col="date",
)

# Each entry: (freshly computed frame, stored frame, error raised on mismatch)
regression_checks = (
    (daily_sum, sentiment_sum_ar1, "daily_sum and sentiment_sum_ar1 are not equal"),
    (
        daily_mean_n,
        sentiment_mean_n_ar1,
        "daily_mean_n and sentiment_mean_n_ar1 are not equal",
    ),
)

# Stop at the first frame that no longer matches what is stored
for computed, stored, message in regression_checks:
    if not computed.equals(stored):
        raise ValueError(message)


In [11]:
####
# Write the sum-based series to the database, replacing the existing table.
# NOTE(review): the original note read "comment out if we update this :)" -
# confirm when this cell is meant to be skipped.
####

sentiment = daily_sum.copy()

# Put the dates into a leading "date" column, then switch to a regular
# incremental index
sentiment.insert(0, "date", sentiment.index)
sentiment = sentiment.reset_index(drop=True)

sentiment.to_sql(name="climate_sum_ar1", con=engine, if_exists="replace")


4866

In [12]:
####
# Write the mean_n-based series to the database, replacing the existing table.
# NOTE(review): the original note read "comment out if we update this :)" -
# confirm when this cell is meant to be skipped.
####

sentiment = daily_mean_n.copy()

# Put the dates into a leading "date" column, then switch to a regular
# incremental index
sentiment.insert(0, "date", sentiment.index)
sentiment = sentiment.reset_index(drop=True)

sentiment.to_sql(name="climate_mean_n_ar1", con=engine, if_exists="replace")


4866

In [13]:
# Sync the database after the tables above were rewritten; in the recorded run
# the local database was newer than the S3 copy, so it was uploaded
update_database()


Local database newer than S3 database, uploading...
