# Beginning
+ header
+ libraries
+ read in data 



In [1]:
##### Purpose: take in sentences from press releases and select a stratified random sample for hand coding.
##### Author: Julia Cope
##### Creation Date: 4/20/23
##### Project: A2 NLP - capturing climate claims 
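##### Inputs (read by the current version): 03_Outputs/04B_unlabeled_data.csv
##### Inputs (earlier runs, now commented out): 02_textsentences.csv
##### Inputs (earlier runs, now commented out): 02_PR_sentences.csv


##### Output (earlier runs - TODO confirm still produced): 04_traindata
##### Output (earlier runs - TODO confirm still produced): 04_sampledata
##### Output (written by the current version): 03_Outputs/04C_labeled_sampledata.csv, 03_Outputs/04C_unlabeled_data.csv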

In [2]:
### libraries
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
import random


In [3]:
### read in data
# Earlier runs sampled from other sentence files; kept here for reference:
#   03_Outputs/02_textsentences.csv
#   03_Outputs/02_PR_sentences.csv
# The current run reads the unlabeled pool and parses 'Date' as a datetime so the
# calendar year can be derived from it.
sent_df = (
    pd.read_csv('03_Outputs/04B_unlabeled_data.csv', parse_dates=['Date'])
      .assign(Year=lambda frame: frame['Date'].dt.year)
)


In [4]:
# keep only the rows that actually have sentence text
sent_df = sent_df[sent_df['sentences'].notna()]

In [5]:
# make sure every sentence is a Python string before measuring / vectorizing it
sent_df = sent_df.assign(sentences=sent_df['sentences'].astype(str))

In [6]:
# Inspect the cleaned sentence table (the notebook renders a truncated preview)
sent_df

Unnamed: 0,index,Date,company,sentences,Year
0,15613,2020-04-03,Chevron,Key Topics Covered:\n\n1 INTRODUCTION\n\n2 RES...,2020
1,9012,2011-12-15,Chevron,"Contact: Karen Hinton, 703-798-3109, karen@hin...",2011
2,16074,2018-02-07,Exxon,This includes c. US$280 million cash received ...,2018
3,15509,2017-04-25,Exxon,Through innovative technologies and improved e...,2017
4,9427,2013-09-13,Chevron,"Today, Zacks Equity Research discusses the U.S...",2013
...,...,...,...,...,...
1241198,5938,2009-09-18,Exxon,The study includes profiles of major producers...,2009
1241199,24210,2018-08-08,Exxon,"Tags: , , , ,...",2018
1241200,39,2002-11-04,Chevron,For project details view the entire article at...,2002
1241201,790,2018-07-18,Marathon,"Praxair produces, sells and distributes atmosp...",2018


# Get climate score 

+ save vocab
+ use count vectorizer to get score from vocab list 
+ score the sentences



In [7]:
# Hand-built climate vocabulary, passed further down as the fixed `vocabulary` of the
# CountVectorizer. Entries are lowercase unigrams, bigrams and one trigram; the order is
# kept stable because it determines the column order of the document-term matrix.
#
# NOTE(review): a few entries look like stems ('ocean acid', 'recycle', 'reforest').
# If matching is done on whole tokens they will not hit 'ocean acidification',
# 'recycling' or 'reforestation' - confirm whether full word forms should be added.
vocab_climate_long = [
    # --- core climate-change terms ---
    'atmosphere', 'carbon', 'carbon dioxide',
    'climate', 'climate change', 'co2', 'dioxide', 'earth', 'emissions',
    'global warming', 'greenhouse', 'greenhouse gases', 'human', 'ice',
    'level', 'ocean', 'rise', 'scientists', 'sea', 'sea level', 'surface',
    'temperature', 'temperatures', 'warming',

    # --- extended glossary terms ---
    'aerosol', 'alternative energy', 'biofuel', 'black carbon',
    'carbon capture', 'carbon footprint', 'carbon sequestration',
    'ch4', 'climate feedback', 'climate lag', 'climate science',
    'climate sensitivity', 'coal',
    'coral bleaching',  # fixed: was 'coal bleaching', which is not a climate term and could never match
    'deforestation', 'desertification', 'ecology', 'ecosystem', 'emission',
    'energy efficiency', 'enteric fermentation', 'environment', 'environmental', 'extreme weather',
    'fluorinated gas', 'fluorocarbon', 'fossil fuel', 'fuel switching', 'geothermal', 'ghg',
    'global temperature', 'greenhouse gas', 'environmentally',
    'greenhouse effect', 'halocarbon', 'hcfcs',
    'heat dome', 'hfcs', 'hydrochlorofluorocarbon', 'hydrofluorocarbon', 'hydropower', 'ice loss',
    'ipcc', 'landfill', 'methane', 'municipal solid waste', 'n2o', 'nitrogen oxide',
    'nitrous oxide', 'nox', 'ocean acid', 'ozone',
    'particulate matter', 'perfluorocarbon', 'pfcs',
    'pollution', 'recycle', 'reforest', 'resource management', 'sf6', 'solar',
    'sulfur hexafluoride', 'sustainability',
    'sustainable', 'renewable', 'trace gas', 'unfccc',
]

In [8]:
# Same frame as displayed before the vocabulary cell; nothing has modified it in between
sent_df

Unnamed: 0,index,Date,company,sentences,Year
0,15613,2020-04-03,Chevron,Key Topics Covered:\n\n1 INTRODUCTION\n\n2 RES...,2020
1,9012,2011-12-15,Chevron,"Contact: Karen Hinton, 703-798-3109, karen@hin...",2011
2,16074,2018-02-07,Exxon,This includes c. US$280 million cash received ...,2018
3,15509,2017-04-25,Exxon,Through innovative technologies and improved e...,2017
4,9427,2013-09-13,Chevron,"Today, Zacks Equity Research discusses the U.S...",2013
...,...,...,...,...,...
1241198,5938,2009-09-18,Exxon,The study includes profiles of major producers...,2009
1241199,24210,2018-08-08,Exxon,"Tags: , , , ,...",2018
1241200,39,2002-11-04,Chevron,For project details view the entire article at...,2002
1241201,790,2018-07-18,Marathon,"Praxair produces, sells and distributes atmosp...",2018


# Split and Export


In [9]:
# Quick look at the first five rows before sampling
sent_df.head()

Unnamed: 0,index,Date,company,sentences,Year
0,15613,2020-04-03,Chevron,Key Topics Covered:\n\n1 INTRODUCTION\n\n2 RES...,2020
1,9012,2011-12-15,Chevron,"Contact: Karen Hinton, 703-798-3109, karen@hin...",2011
2,16074,2018-02-07,Exxon,This includes c. US$280 million cash received ...,2018
3,15509,2017-04-25,Exxon,Through innovative technologies and improved e...,2017
4,9427,2013-09-13,Chevron,"Today, Zacks Equity Research discusses the U.S...",2013


In [10]:
# length (in characters) of the longest entry in the 'sentences' column
max_len = max(len(text) for text in sent_df['sentences'])

print(max_len)

34542


In [11]:
### draw a 600-row sample for hand coding, stratified jointly by Year and company
# `train` is the sample; `test` is everything that was not drawn.
# random_state is fixed so the same split comes back on every run.
train, test = train_test_split(
    sent_df,
    train_size=600,
    stratify=sent_df[['Year','company']],
    random_state=7,
)

# Report how many rows / columns ended up in each piece
for label, part in (('Training set shape:', train), ('Testing set shape:', test)):
    print(label, part.shape)

Training set shape: (600, 5)
Testing set shape: (1240603, 5)


In [12]:
### score the sample in the next cells; it is exported for hand coding in the final cell

In [13]:
## Count how often each entry of the hand-built climate vocabulary occurs in every sampled sentence.
## A fixed `vocabulary` is supplied, so the matrix columns are exactly the vocabulary entries.
## CountVectorizer ignores min_df / max_df when a vocabulary is given, so those arguments
## (previously min_df=0.10, max_df=0.8) had no effect and are no longer passed.
## ngram_range=(1,3) lets the two- and three-word vocabulary entries match.

vectorizer = CountVectorizer(vocabulary = vocab_climate_long,
                             ngram_range = (1,3),
                             strip_accents = 'unicode',
                             stop_words='english')
dtm = vectorizer.fit_transform(train['sentences'])

In [14]:
## sum each row of the DTM to get a climate score (total vocabulary hits per sentence)
# The row sum of a sparse matrix comes back as an (n, 1) matrix; flatten it to one
# integer per row instead of storing one-element lists and exploding them afterwards.
# This keeps `climate_score` a numeric column rather than an object column.
array_score = np.asarray(dtm.sum(axis=1)).ravel()

# `assign` builds a new frame, so no values are written into the frame object
# that train_test_split returned.
# NOTE(review): `> 1` flags sentences with at least two vocabulary hits, while the
# commented-out check at the end of the notebook filters on `> 0` - confirm the
# intended threshold.
train = train.assign(
    climate_score=array_score,
    likely=(array_score > 1).astype(int),
)

In [15]:
# Inspect the sample, which now carries the climate_score and likely columns
train

Unnamed: 0,index,Date,company,sentences,Year,climate_score,likely
690786,20799,2019-10-10,Exxon,PR Newswire\n\nMarine Grease market worldwide ...,2019,0,0
388431,14890,2018-07-31,Exxon,Impact on demand for LNG from LNG HDVs consume...,2018,0,0
519782,12579,2017-04-18,Chevron,The market size of marine lubricants is projec...,2017,0,0
685918,1266,2000-01-19,Exxon,While the Fund's investments are not restricte...,2000,0,0
839295,1324,2021-06-14,Marathon,"""We anticipate effecting a smooth transition o...",2021,0,0
...,...,...,...,...,...,...,...
495146,25565,2020-08-11,Exxon,The increasing demand for rubber chemicals has...,2020,0,0
1028637,20325,2019-05-08,Exxon,"PR Newswire\n\nThe""China Filling Station and G...",2019,0,0
35369,6198,2012-08-07,Chevron,ARP is also the largest sponsor of natural gas...,2012,0,0
463660,24965,2021-01-20,Exxon,If you purchased securities of Exxon Mobil ple...,2021,0,0


In [16]:
### export file
# Project root that contains the 03_Outputs folder. The default is the original
# author's machine; set the NLP_PROJECT_DIR environment variable to run elsewhere
# without editing the notebook.
path = os.environ.get("NLP_PROJECT_DIR", "C:\\Users\\julia\\OneDrive\\Desktop\\NLP")

# Change the directory so the relative output paths below resolve inside the project
os.chdir(path)
# to_csv does not create missing folders, so make sure the output folder exists
os.makedirs('03_Outputs', exist_ok=True)

# `train` = sample for hand coding (with climate_score / likely), `test` = remaining rows.
# index=False leaves the DataFrame's row labels out of the files.
train.to_csv('03_Outputs/04C_labeled_sampledata.csv',index=False)
test.to_csv('03_Outputs/04C_unlabeled_data.csv',index=False)

In [17]:
#train[train['climate_score'] > 0]