In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import numpy as np
from scipy import sparse

from sklearn.linear_model import Lasso, LinearRegression
from gensim.utils import simple_preprocess
from threading import Thread, RLock

import xgboost
import shap

from matplotlib import pyplot as plt

import seaborn as sns
sns.set(style="whitegrid")

In [None]:
%%time
# Load the raw articles table; later cells read its `month`, `year` and `article` columns.
article_df = pd.read_csv("data//articles.csv")
# Preview the first rows.
article_df.head()

In [None]:
# (n_rows, n_columns) of the loaded articles table.
article_df.shape

In [None]:
# Map the month number to its quarter (1-3 -> 1.0, 4-6 -> 2.0, ...), computed on the whole column at once.
article_df["quarter"] = np.ceil(article_df.month / 3)
article_df.head()

In [None]:
# Load the aspect table. A forward slash is used so the path resolves on every OS
# (the previous backslash separator only worked on Windows).
# NOTE(review): later cells iterate each `words` entry as a list of tokens, while
# read_csv returns plain strings -- confirm the column is parsed into lists before use.
aspects_words_df = pd.read_csv("data/aspects.csv")
aspects_words_df

In [None]:
# Flatten the per-aspect word collections into one de-duplicated vocabulary.
# NOTE(review): assumes every `words` entry is a list of tokens -- confirm upstream parsing.
unique_words = {word for words in aspects_words_df.words for word in words}

# Unigrams are kept as they are.
aspects_words2 = [word for word in unique_words if "_" not in word]

# Multi-word terms are stored joined by "_"; they are rewritten with spaces here.
bigrams_list = [" ".join(word.split("_")) for word in unique_words if "_" in word]

# Sorted so that the vocabulary (and therefore every downstream column order)
# is identical from one kernel run to the next; set iteration order is not.
aspects_words = sorted(set(aspects_words2 + bigrams_list))


In [None]:
# Size of the aspect vocabulary (unigrams plus space-separated multi-word terms).
len(aspects_words)

In [None]:
# Fixed TF-IDF vocabulary: each aspect term is mapped to its position in `aspects_words`.
aspects_words_dict = {word: i for i, word in enumerate(aspects_words)}
vectorizer_aspects_words = TfidfVectorizer(
    vocabulary=aspects_words_dict,
    ngram_range=(1, 2),
    tokenizer=simple_preprocess,
)

# Load the sentiment lexicon (forward slash keeps the path portable across OSes).
feel_df = pd.read_csv("data/FEEL_modified.csv", encoding="utf-8")

# Map each lexicon word to its row position in `feel_df`, so that the columns of the
# count matrix line up with the rows of `feel_df.sentiment`.
sentiment_dict = dict(zip(feel_df.word, range(len(feel_df))))

# Counts occurrences of lexicon words only (fixed vocabulary).
vectorizer_sent = CountVectorizer(vocabulary=sentiment_dict)


def GetGlobalSent_WordFreq(articles, bigrams=None):
    """
    Compute a global sentiment score and the aspect-term TF-IDF weights per article.

    Parameters
    ----------
    articles : iterable of str
        Article texts, one entry per article.
    bigrams : optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per article (fresh 0..n-1 index): a ``global_sent`` column followed
        by one TF-IDF column per entry of ``aspects_words``.
    """
    # Local import: keeps this fix self-contained within the function.
    from sklearn.base import clone

    # Fit on a private, unfitted copy. This function is called from several threads
    # at once, and fit_transform on the shared vectorizer would let one thread
    # overwrite the IDF weights another thread is about to apply.
    tfidf_vectorizer = clone(vectorizer_aspects_words)
    article_vector_mat = tfidf_vectorizer.fit_transform(articles)
    article_vector_df = pd.DataFrame(article_vector_mat.toarray(), columns=aspects_words)

    # Occurrence counts of each sentiment-lexicon word, one row per article.
    article_sentiment_mat = vectorizer_sent.transform(articles)

    # Number of recognised lexicon words per article; the small constant avoids
    # dividing by zero for articles without any lexicon word.
    nbwords_article_vector = np.asarray(article_sentiment_mat.sum(axis=1)).ravel() + 0.0001

    # Polarity of every lexicon word, in the same order as the count-matrix columns.
    # The length is taken from the data instead of a hard-coded lexicon size.
    polarity_vector = feel_df.sentiment.to_numpy(dtype=float)

    # Sum of polarities of the words found, averaged over the recognised words.
    global_sent_vector = article_sentiment_mat.dot(polarity_vector) / nbwords_article_vector
    global_sent_df = pd.DataFrame({"global_sent": global_sent_vector})

    return pd.concat([global_sent_df, article_vector_df], axis=1)


    

#### Parallelization

In [None]:

def GetGlobalSent_WordFreqYear(year):
    """
    Run ``GetGlobalSent_WordFreq`` on each of the four quarters of ``year``.

    Parameters
    ----------
    year : int
        Year whose articles are selected from ``article_df``.

    Returns
    -------
    pandas.DataFrame
        The quarterly results stacked vertically, with columns ordered as
        ``year``, ``quarter``, ``global_sent`` and then every aspect term.
    """
    column_order = ["year", "quarter", "global_sent"] + list(aspects_words)

    # Collect the quarterly frames and concatenate once at the end: growing a frame
    # with repeated concat calls onto an empty placeholder is quadratic and lets the
    # placeholder's object dtype leak into the result.
    quarter_frames = []

    for quarter in range(1, 5):

        print("Year {} : quarter {}".format(year, quarter))

        in_period = (article_df.quarter == quarter) & (article_df.year == year)
        result_df = GetGlobalSent_WordFreq(article_df.loc[in_period, "article"])
        result_df["year"] = year
        result_df["quarter"] = quarter

        quarter_frames.append(result_df)

    results_df = pd.concat(quarter_frames, axis=0, sort=False)

    # Put the period keys first, as callers expect.
    return results_df.reindex(columns=column_order)


# Guards updates to the shared module-level `results_df`.
lock = RLock()


class ThreadGetGlobalSent(Thread):
    """Worker thread that computes one year's results and appends them to `results_df`."""

    def __init__(self, year):
        super().__init__()
        self.year = year

    def run(self):
        """Compute the frame for `self.year`, keep it on the instance, then merge it."""
        global results_df

        self.results_df = GetGlobalSent_WordFreqYear(self.year)

        # Only one thread at a time may rebind the shared accumulator.
        with lock:
            frames = [results_df, self.results_df]
            results_df = pd.concat(frames, axis=0, sort=False)
        
    

In [None]:
%%time

# Shared accumulator that each worker thread appends its yearly frame to.
results_df = pd.DataFrame(columns=["year", "quarter", "global_sent"] + list(aspects_words))
years = range(2000, 2021)

# One worker per year.
Thread_dict = {}
for year in years:
    Thread_dict[year] = ThreadGetGlobalSent(year)

# Launch every worker before waiting on any of them.
for year, worker in Thread_dict.items():
    worker.start()

# Block until all yearly computations are finished.
for year, worker in Thread_dict.items():
    worker.join()



In [None]:
# Threads finish in arbitrary order: put the rows back in chronological order
# and replace the duplicated per-quarter indices with a clean 0..n-1 index.
results_df = results_df.sort_values(by=["year", "quarter"])
results_df = results_df.reset_index(drop=True)
results_df

In [None]:
# (n_articles, n_columns) of the combined results table.
results_df.shape

In [None]:
# Distribution of the per-article global sentiment score.
results_df.global_sent.hist()

In [None]:
# Persist the results without the row index (`index` is documented as a boolean).
results_df.to_csv("data/results.csv", index=False)

# Aggregating words' TF-IDF per aspect (topic)

In [None]:
#%%time
#stop_words = ["ça", "dès","oui","né","mêmes","mis","michel","façon","ci"]

n_aspects = aspects_words_df.shape[0]
aspect_list = list(aspects_words_df.aspect_name.values)

# Start from the period keys and add one real column per aspect. Writing through
# `.loc[:, col]` into pre-created empty (object-dtype) columns can leave the aspect
# columns as object dtype, which later numeric consumers reject.
aspects_tfid_df = results_df[["year", "quarter"]].copy()

for i in range(n_aspects):
    words = aspects_words_df.words[i]
    # Same "_" -> " " rewriting as the vocabulary, so names match the results columns.
    words = [" ".join(word.split("_")) for word in words if word]
    aspects_words_df.loc[i, "n_final_nwords"] = len(words)

    # Aspect score of an article = mean TF-IDF of the aspect's terms.
    # astype(float) guards against object-dtype columns inherited from concatenation.
    aspects_tfid_df[aspect_list[i]] = results_df[words].astype(float).mean(axis=1)



In [None]:
# Preview the aspect table, including the `n_final_nwords` count added above.
aspects_words_df.head()

In [None]:
#%%time

nrows = 4
ncols = 4

# Every column except the period keys is an aspect. Dropping by name (instead of
# slicing off the first two columns) stays correct if the frame is already indexed
# by (year, quarter).
aspect_list = aspects_tfid_df.columns.drop(["year", "quarter"], errors="ignore")
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 14))

# Mean aspect score per (year, quarter).
aspects_tfid_groupby_df = aspects_tfid_df.groupby(["year", "quarter"]).mean()
year_quarter_list = ["%s-%s" % (year, quarter) for (year, quarter) in aspects_tfid_groupby_df.index]

# Flat panel position; replaces separate line/col counters, which tracked the same cell.
pos = 0

for aspect in aspect_list:

    # Panel 12 (first cell of the last row) is deliberately left empty.
    if pos == 12:
        pos += 1

    panel = ax.flat[pos]
    panel.plot(aspects_tfid_groupby_df[aspect].values)
    panel.set_title(aspect)
    panel.label_outer()

    pos += 1

# A separate loop variable keeps `ax` bound to the axes grid.
for axis in fig.axes:
    plt.sca(axis)
    # One tick per year: every 4th quarter from 2000 to 2020.
    plt.xticks(ticks=range(0, 81, 4), labels=range(2000, 2021), rotation=90)
    plt.yticks(rotation=0)

# Forward slashes keep the output path portable across OSes.
plt.savefig("Redaction/pics/aspect_tfidf.png")
plt.show()

In [None]:
# Average of all aspect scores for each (year, quarter).
mean_tfidf_per_period = aspects_tfid_groupby_df.mean(axis=1).values

plt.figure(figsize=(14, 8))
plt.plot(mean_tfidf_per_period)
# One tick per year: every 4th quarter from 2000 to 2020.
plt.xticks(ticks=range(0, 81, 4), labels=range(2000, 2021), rotation=90)
plt.yticks(rotation=0)
plt.show()

In [None]:
# (n_articles, n_columns) of the per-aspect TF-IDF table.
aspects_tfid_df.shape

# Sentiment decomposition with Shapley values

#### Global sentiment representation

In [None]:
def rescale(x):
    """Standardise `x`: subtract its mean and divide by its standard deviation.

    `x` must expose `.mean()` and `.std()`; the library's own default for
    `.std()` is used.
    """
    centered = x - x.mean()
    return centered / x.std()

In [None]:
plt.figure(figsize=(20, 10))

# Mean sentiment per (year, quarter), standardised. The column is selected before
# aggregating so that only `global_sent` is averaged, not every term column.
agg_global_sent_scale = rescale(results_df.groupby(["year", "quarter"])["global_sent"].mean().values)
plt.plot(agg_global_sent_scale)

# Highlighted periods, on the quarter axis (0 = Q1-2000):
# (span start, span end, shade alpha, label x, label y, label, label colour)
highlighted_periods = [
    (0, 16, 0.1, 7, 0.58, "Early 2000s Recession", "red"),
    (28, 36, 0.2, 31, -2, "Great Recession", "red"),
    (40, 48, 0.1, 43, 0, "Euro Crisis", "red"),
    (72, 74, 0.2, 72.5, -1, "World Cup", "blue"),
    (75, 79, 0.1, 76, -1.5, "Yellow vest", "red"),
    (79, 84, 0.2, 80, 0, "Covid-19", "red"),
]

for start, end, alpha, text_x, text_y, label, text_color in highlighted_periods:
    plt.axvspan(start, end, facecolor='black', alpha=alpha)
    plt.text(text_x, text_y, label, color=text_color, rotation=90, fontdict={"size": 20})

# One tick per year, placed on the first quarter.
plt.xticks(range(0, 82, 4), ["Q1-%s" % year for year in range(2000, 2021)], rotation='vertical')

# Forward slashes keep the output path portable across OSes.
plt.savefig("Redaction/pics/global_sent.png")
plt.show()

In [None]:
# Standardise the per-article sentiment over the whole corpus (zero mean, unit std).
sent_series = results_df.global_sent
results_df["global_sent_rescaled"] = (sent_series - sent_series.mean()) / sent_series.std()
results_df["global_sent_rescaled"].head()

In [None]:
%%time
# SHAP values of one boosted-tree model per (year, quarter), keyed as "year-quarter".
shap_values_dict = {}

for year in range(2000, 2021):
    print(year)
    for quarter in range(1, 5):

        # Articles belonging to this period.
        select = (results_df.year == year) & (results_df.quarter == quarter)

        # Features: aspect scores (all columns after year/quarter); target: standardised sentiment.
        X = aspects_tfid_df.iloc[:, 2:][select]
        y = results_df.global_sent_rescaled.loc[select]

        dtrain = xgboost.DMatrix(X, label=y)
        model = xgboost.train(params={"learning_rate": 0.01}, dtrain=dtrain, num_boost_round=1000)

        # Per-article contribution of each aspect to the model's prediction.
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)

        shap_values_dict["%s-%s" % (year, quarter)] = shap_values

In [None]:
# Display the raw SHAP arrays, keyed by "year-quarter".
# NOTE(review): this prints every array; consider inspecting a single key instead.
shap_values_dict

In [None]:
%%time
# Put the SHAP values of every period into a single data frame.
# Frames are collected in a list and concatenated once: growing a frame inside the
# loop is quadratic, and starting from an empty placeholder can distort dtypes.
period_frames = []

for year in range(2000, 2021):
    print(year)
    for quarter in range(1, 5):

        shap_values_df = pd.DataFrame(shap_values_dict["%s-%s" % (year, quarter)], columns=aspect_list)
        shap_values_df["year"] = year
        shap_values_df["quarter"] = quarter

        period_frames.append(shap_values_df)

# Columns: one per aspect, followed by `year` and `quarter`.
shap_values_df_all = pd.concat(period_frames, axis=0)

In [None]:
# Index the SHAP table by period. The guard makes the cell safe to re-run:
# once the keys are in the index, a second set_index would raise KeyError.
if {"year", "quarter"}.issubset(shap_values_df_all.columns):
    shap_values_df_all = shap_values_df_all.set_index(keys=["year", "quarter"])
shap_values_df_all

In [None]:
# Index the aspect table by period. The guard makes the cell safe to re-run:
# once the keys are in the index, a second set_index would raise KeyError.
if {"year", "quarter"}.issubset(aspects_tfid_df.columns):
    aspects_tfid_df = aspects_tfid_df.set_index(keys=["year", "quarter"])

#### Decomposition representation

In [None]:
#%%time

nrows = 4
ncols = 5

fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(24, 24))

# Mean SHAP value of each aspect, per year.
shap_year_dict = {}

# Text is rendered through LaTeX from here on (needed for \textbf below).
plt.rc('text', usetex=True)

# One panel per year, filled row by row.
for panel, year in zip(ax.flat, range(2000, 2020)):

    # All rows of this year (every quarter).
    shap_year = shap_values_df_all.loc[(year, ), :]
    shap_year_dict[year] = shap_year.mean(axis=0)

    # Negative contributions in red, the others in blue.
    color = ["r" if value < 0.0 else "b" for value in shap_year_dict[year].values]

    panel.bar(aspect_list, shap_year_dict[year].values, color=color)
    panel.set_title(year)

    # Annotate the panel with the year's mean standardised sentiment.
    select = (results_df.year == year)
    globa_sent = results_df.loc[select, "global_sent_rescaled"].mean()
    panel.text(4., shap_year_dict[year].min() - 0.00005, r"\textbf{GlobalSent = %s}" % round(globa_sent, 2), fontsize=15)

# A separate loop variable keeps `ax` bound to the axes grid.
for axis in fig.axes:
    plt.sca(axis)
    plt.xticks(rotation=40, ha="right")
    plt.yticks(rotation=0)

# Forward slashes: the previous literal relied on invalid escapes ("\p", "\s")
# and only resolved on Windows.
plt.savefig("Redaction/pics/sent_year_decomp.png")
plt.show()

In [None]:
%%time

nrows = 2
ncols = 2

fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(14, 8))

# Mean SHAP value of each aspect, per (year, quarter).
shap_period_dict = {}

year = 2020
# Text is rendered through LaTeX from here on (needed for \textbf below).
plt.rc('text', usetex=True)

# One panel per quarter, filled row by row.
for panel, quarter in zip(ax.flat, range(1, 5)):

    period = (year, quarter)

    shap_period = shap_values_df_all.loc[period, :]
    shap_period_dict[period] = shap_period.mean(axis=0).values

    # Negative contributions in red, the others in blue.
    color = ["r" if value < 0.0 else "b" for value in shap_period_dict[period]]

    panel.bar(aspect_list, shap_period_dict[period], color=color)
    panel.set_title(f"Q{quarter}-{year}")
    panel.label_outer()

    # Annotate the panel with the quarter's mean standardised sentiment.
    select = (results_df.year == year) & (results_df.quarter == quarter)
    globa_sent = results_df.loc[select, "global_sent_rescaled"].mean()

    panel.text(4.5, shap_period_dict[period].min() - 0.00005, r"\textbf{GlobalSent = %s}" % round(globa_sent, 2), fontsize=15)

# A separate loop variable keeps `ax` bound to the axes grid.
for axis in fig.axes:
    plt.sca(axis)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)

# Forward slashes keep the output path portable across OSes.
plt.savefig("Redaction/pics/sent_quarter_decomp.png")
plt.show()

# Some graphics

In [None]:
# NOTE(review): `rescale` is already defined earlier in this notebook with the same
# body; consider keeping a single definition.
def rescale(x):
    """Return `x` minus its mean, divided by its standard deviation (`x.std()`)."""
    mean, std = x.mean(), x.std()
    return (x - mean) / std

In [None]:
# Load the GDP series and keep only the period and value columns.
# `rename` returns a fresh frame, so the column added below is not written onto a
# slice of the loaded table. Forward slash keeps the path portable across OSes.
gdp_df = (
    pd.read_csv("data/gdp_france.csv", sep=";")[["TIME", "Value"]]
    .rename(columns={"TIME": "year_quarter", "Value": "gdp_growth"})
)
# Standardised growth, so it can share an axis with the other indicators.
gdp_df["gdp_growth_norm"] = rescale(gdp_df.gdp_growth)
gdp_df.head()

In [None]:
cli_oecd_df = pd.read_csv("data/composite_oecd.csv", sep=";")

# TIME is read as text: its first four characters give the year, its last two the month.
cli_oecd_df["month"] = cli_oecd_df.TIME.str[-2:].astype(int)
cli_oecd_df["year"] = cli_oecd_df.TIME.str[:4].astype(int)

# Month -> quarter (1-3 -> 1, ..., 10-12 -> 4), same rule as for the articles.
# Computed inline: the `get_quarter` helper previously called here is not defined
# anywhere in this notebook, so the cell failed on a fresh kernel.
cli_oecd_df["quarter"] = np.ceil(cli_oecd_df["month"] / 3).astype(int)

# Express the index as a relative deviation from its base level of 100.
cli_oecd_df["Value"] = (cli_oecd_df.Value - 100) / 100

# Quarterly mean, standardised, plus the same series lagged by one quarter.
cli_oecd_norm_quarter = cli_oecd_df.groupby(["year", "quarter"])["Value"].mean().to_frame()
cli_oecd_norm_quarter["cli_norm"] = rescale(cli_oecd_norm_quarter.Value)
cli_oecd_norm_quarter["cli_norm_1"] = cli_oecd_norm_quarter["cli_norm"].shift(1)
cli_oecd_norm_quarter.head()

In [None]:
plt.figure(figsize=(12, 6))

# Standardised composite leading indicator against standardised GDP growth.
plt.plot(cli_oecd_norm_quarter.cli_norm.values, color="blue", label="CLI")
plt.plot(gdp_df.gdp_growth_norm.values, color="black", label="GDP growth rate")

# One tick per year, placed on the first quarter.
plt.xticks(range(0, 82, 4), ["Q1-%s" % year for year in range(2000, 2021)], rotation='vertical')
plt.legend()

# Forward slashes: the previous literal relied on invalid escapes ("\p", "\o")
# and only resolved on Windows.
plt.savefig("Redaction/pics/oecd_indexes.png")
plt.show()

In [None]:
# Load the unemployment series (semicolon-separated); its `unemp` column is plotted below.
unemp_df = pd.read_csv("data/unemployment_rate.csv", sep=";")
unemp_df.head()

In [None]:
plt.figure(figsize=(12, 6))

# Unemployment series, one point per row of `unemp_df`.
plt.plot(unemp_df.unemp.values, color="blue")

# One tick per year, placed on the first quarter.
plt.xticks(range(0, 82, 4), ["Q1-%s" % year for year in range(2000, 2021)], rotation='vertical')

# Forward slashes: the previous literal relied on an invalid escape ("\p")
# and only resolved on Windows.
plt.savefig("Redaction/pics/unemp.png")
plt.show()

In [None]:
# Last rows of the unemployment table.
unemp_df.tail()