In [1]:
##### Purpose:  featurize the sentences, and make a naive bayes classifier
##### Author: Julia Cope
##### Creation Date: 05/03/23
##### Project: A2 NLP - capturing climate claims 
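##### Inputs: 03_Outputs/11_climate_dataset.csv
##### Output: 03_Outputs/12_topics_climate_df.csv (all sentences with their top LDA topic)
##### Output: 03_Outputs/12_sample_topic_sents.csv (440-sentence sample stratified by topic and year)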

In [31]:
### libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import LdaModel
from gensim.corpora import Dictionary

from gensim.models.coherencemodel import CoherenceModel
from sklearn.model_selection import train_test_split


In [3]:
# Load the climate sentence dataset; the 'Date' column is parsed as datetimes
climate_dataset_path = '03_Outputs/11_climate_dataset.csv'
climate_df = pd.read_csv(climate_dataset_path, parse_dates=['Date'])


In [4]:
# Fold the sentences to lower-case ASCII:
#   1. NFKD-normalise (decomposes accented characters),
#   2. encode to ASCII, silently dropping anything that does not fit,
#   3. decode back to str and lower-case.
sentences_nfkd = climate_df['sentences'].str.normalize('NFKD')
sentences_ascii = sentences_nfkd.str.encode('ascii', errors='ignore').str.decode('utf-8')
climate_df['sentences'] = sentences_ascii.str.lower()

In [5]:
# Join multi-word climate terms with an underscore so the later tokenisation
# step treats each of them as a single token (e.g. 'climate_change').
# Sentences were lower-cased in the previous cell, so lower-case phrases suffice.
climate_phrases = [
    'carbon capture',
    'climate change',
    'global warming',
    'crude oil',
    'energy efficiency',
    'natural gas',
    'renewable energy',
]
for phrase in climate_phrases:
    # regex=False: these are literal phrases, and the default value of `regex`
    # differs between pandas versions, so state the intent explicitly.
    climate_df['sentences'] = climate_df['sentences'].str.replace(
        phrase, phrase.replace(' ', '_'), regex=False
    )



In [14]:
# Display the DataFrame to inspect the loaded sentences
climate_df

Unnamed: 0.1,Unnamed: 0,index,Date,company,sentences,Year,X1
0,1,1655,2002-12-17,Exxon,factors that could cause actual results to dif...,2002,1
1,2,401,2003-02-03,Chevron,"""while offering no guarantees that they'll be ...",2003,1
2,3,133,2003-05-14,Chevron,and it's significant that this year's event ta...,2003,1
3,4,1875,2003-10-01,Exxon,importance will be attached to secure a balanc...,2003,1
4,5,1860,2004-04-27,Chevron,the u.s. department of energy (doe) today anno...,2004,1
...,...,...,...,...,...,...,...
63610,63611,8665,2012-07-31,Chevron,"karen hinton, the spokesperson for the ecuador...",2012,1
63611,63612,3794,2020-12-04,Valero,a few notable trends may include increasing rd...,2020,1
63612,63613,17354,2020-02-13,Chevron,"in fact, as much as the price erosion in crude...",2020,1
63613,63614,5501,2005-02-18,Chevron,future full-scale hydrogen energy stations cou...,2005,1


In [6]:
## strip_accents = 'unicode', stop_words='english')



In [7]:
# Tokenise every sentence: strip gensim's stopwords first, then let
# simple_preprocess split what is left into a list of tokens
data_processed = []
for sentence in climate_df['sentences']:
    without_stopwords = remove_stopwords(sentence)
    data_processed.append(simple_preprocess(without_stopwords))

In [8]:
#data_processed

In [9]:
# Build the gensim Dictionary from the tokenised sentences, then convert each
# token list into its bag-of-words representation (one entry per sentence)
dictionary = Dictionary(documents=data_processed)
corpus = list(map(dictionary.doc2bow, data_processed))

In [10]:
# Define model parameters
# Number of topics; passed to LdaModel as num_topics when the model is instantiated
num_topics = 9


In [15]:
# Fit the LDA topic model on the bag-of-words corpus; random_state is fixed
# so that repeated runs use the same seed
# TODO: consider shrinking the vocabulary first (e.g. dictionary.filter_extremes)
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=1,
)

In [16]:
# Extract output: one (topic id, 'weight*"word" + ...') entry per topic
topic_words = lda_model.print_topics()

In [17]:
# Display the weighted words describing each topic
topic_words

[(0,
  '0.062*"energy" + 0.032*"power" + 0.021*"inc" + 0.017*"fuels" + 0.016*"products" + 0.016*"company" + 0.015*"corporation" + 0.014*"chevron" + 0.013*"solutions" + 0.012*"natural_gas"'),
 (1,
  '0.030*"energy" + 0.026*"fuel" + 0.015*"technology" + 0.011*"hydrogen" + 0.010*"carbon" + 0.010*"emissions" + 0.009*"new" + 0.009*"technologies" + 0.009*"low" + 0.008*"world"'),
 (2,
  '0.019*"environmental" + 0.015*"chevron" + 0.015*"oil" + 0.014*"company" + 0.012*"climate" + 0.011*"climate_change" + 0.010*"companies" + 0.008*"exxonmobil" + 0.008*"exxon" + 0.008*"today"'),
 (3,
  '0.017*"including" + 0.013*"factors" + 0.013*"environmental" + 0.012*"changes" + 0.011*"ability" + 0.011*"risks" + 0.009*"business" + 0.009*"conditions" + 0.009*"results" + 0.009*"capital"'),
 (4,
  '0.042*"ii" + 0.032*"emissions" + 0.020*"gas" + 0.019*"table" + 0.014*"oil" + 0.014*"carbon" + 0.014*"greenhouse" + 0.012*"includes" + 0.012*"oils" + 0.010*"iii"'),
 (5,
  '0.060*"market" + 0.016*"global" + 0.016*"deman

In [26]:
# Save the top (highest-probability) topic number of every document to climate_df['topic'].
# corpus holds one bag-of-words per row of climate_df, in the same order, so the
# list below lines up with the DataFrame rows by position.
top_topics = [
    max(lda_model.get_document_topics(doc), key=lambda pair: pair[1])[0]
    for doc in corpus
]
# Assign the whole column in one step, aligned on climate_df's own index, rather
# than writing row by row with .at[i, ...] (which treats the loop counter as an
# index label and therefore only works for a default 0..n-1 index).
# float64 keeps the stored values (e.g. 3.0) the same as the row-by-row version.
climate_df['topic'] = pd.Series(top_topics, index=climate_df.index, dtype='float64')
    
    
    

In [28]:
# Display the DataFrame again, now with the 'topic' column
climate_df

Unnamed: 0.1,Unnamed: 0,index,Date,company,sentences,Year,X1,topic
0,1,1655,2002-12-17,Exxon,factors that could cause actual results to dif...,2002,1,3.0
1,2,401,2003-02-03,Chevron,"""while offering no guarantees that they'll be ...",2003,1,1.0
2,3,133,2003-05-14,Chevron,and it's significant that this year's event ta...,2003,1,1.0
3,4,1875,2003-10-01,Exxon,importance will be attached to secure a balanc...,2003,1,6.0
4,5,1860,2004-04-27,Chevron,the u.s. department of energy (doe) today anno...,2004,1,1.0
...,...,...,...,...,...,...,...,...
63610,63611,8665,2012-07-31,Chevron,"karen hinton, the spokesperson for the ecuador...",2012,1,2.0
63611,63612,3794,2020-12-04,Valero,a few notable trends may include increasing rd...,2020,1,5.0
63612,63613,17354,2020-02-13,Chevron,"in fact, as much as the price erosion in crude...",2020,1,4.0
63613,63614,5501,2005-02-18,Chevron,future full-scale hydrogen energy stations cou...,2005,1,1.0


In [33]:
### draw a small sample of sentences, stratified jointly by topic and year
# NOTE: train_size is an absolute number of rows. Stratifying keeps each
# (topic, Year) group's share of the sample roughly proportional to its share
# of the full dataset; it does not guarantee a fixed number of sentences per year.
sample_size = 440
strata = climate_df[['topic', 'Year']]
train, test = train_test_split(
    climate_df,
    train_size=sample_size,
    stratify=strata,
    random_state=7,
)

# Print the shape of the training and testing sets
print('Training set shape:', train.shape)
print('Testing set shape:', test.shape)

Training set shape: (440, 8)
Testing set shape: (63175, 8)


In [27]:
#print(dictionary.id2token)



In [30]:
# Write the full dataset (with its 'topic' column) to CSV, leaving out the row index
topics_csv_path = '03_Outputs/12_topics_climate_df.csv'
climate_df.to_csv(topics_csv_path, index=False)

In [34]:
# Write the stratified sample (train) to its own CSV, leaving out the row index
sample_csv_path = '03_Outputs/12_sample_topic_sents.csv'
train.to_csv(sample_csv_path, index=False)

Unnamed: 0.1,Unnamed: 0,index,Date,company,sentences,Year,X1,topic
47170,47171,26289,2020-01-22,Exxon,generation revenues increased 96% to $14.0 mil...,2020,1,7.0
28136,28137,13078,2011-12-08,Exxon,according to a team of german scientists the c...,2011,1,2.0
38870,38871,1610,2020-03-04,Marathon,factors that could cause our actual results to...,2020,1,3.0
4890,4891,8256,2013-03-20,Exxon,"over the past decade, the site has invested ov...",2013,1,4.0
45140,45141,16507,2021-12-15,Chevron,"if established, additional details about the h...",2021,1,1.0
...,...,...,...,...,...,...,...,...
30935,30936,22249,2016-10-17,Exxon,"as part of this partnership, last month we ann...",2016,1,1.0
52322,52323,16529,2022-03-11,Chevron,"in contrast, current technologies in use for c...",2022,1,1.0
57925,57926,16481,2021-10-01,Chevron,"san ramon, calif- chevron corporation (nyse: c...",2021,1,2.0
31433,31434,8739,2011-11-14,Exxon,accrued environmental reserves at the end of t...,2011,1,7.0
