In [1]:
##### Purpose: to process text data into sentencec level with index. 
##### Author: Julia Cope
##### Creation Date: 4/20/23
##### Project: A2 NLP - capturing climate claims 
##### Inputs: 
##### Inputs: 03_Outputs/01_text_metadata.csv

##### Inputs: 03_Outputs/01_exxon_metadata.csv
##### Inputs: 03_Outputs/01_chevron_metadata.csv
##### Inputs: 03_Outputs/01_marathon_metadata.csv
##### Inputs: 03_Outputs/01_phillips_metadata.csv
##### Inputs: 03_Outputs/01_valero_metadata.csv

##### Output:  
##### Output: 02_textsentences.csv
##### Output: 

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import itertools
random.seed(10)

In [3]:
### split into train, test, dev
def func_split_three_dfs(df):
    
    df['randNumCol'] = pd.Series(range(1,4)).sample(16190, replace=True,random_state = 10).array
    df = df.reset_index()
    
    ## df has 16190 rows
    traindf = df[df['randNumCol'] == 1]
    ## traindf has 5359 rows
    devdf = df[df['randNumCol'] == 2]
    ##dev df has 5396 rows
    testdf = df[df['randNumCol'] == 3]
    ## testdf has 5435 rows
    
    return traindf, devdf, testdf

In [4]:
def func_split_metadata_text(df_ninecols):
    text_df = df_ninecols[['index','Date','text','company']]
    ## keep index
    
    return text_df

In [5]:
#def func_clean_data(text_df):
    ## remove the date first words
    
    ## separate paragraphs
    ## separate sentences
    
#cleaned_dev_text = func_clean_data(dev_text_df)
#text = dev_text_df["text"][200]

In [6]:
## read in data 
#text_metadata_df = pd.read_csv('03_Outputs/01_text_metadata.csv')


exx_df = pd.read_csv('03_Outputs/01_exxon_metadata.csv')
che_df = pd.read_csv('03_Outputs/01_chevron_metadata.csv')
mar_df = pd.read_csv('03_Outputs/01_marathon_metadata.csv')
phi_df = pd.read_csv('03_Outputs/01_phillips_metadata.csv')
val_df = pd.read_csv('03_Outputs/01_valero_metadata.csv')



exx_df['company'] = 'Exxon'
che_df['company'] = 'Chevron'
mar_df['company'] = 'Marathon'
phi_df['company'] = 'Phillips'
val_df['company'] = 'Valero'



In [7]:
text_metadata_df = pd.concat([exx_df, che_df, mar_df, phi_df, val_df])

In [8]:
text_metadata_df

Unnamed: 0,Filename,Publication,Section,Date,Title,Author,LNID,text,company
0,business-wire-alaska-north-slope-7134cb7e-dfdc...,Business Wire,,2000-12-06,Alaska North Slope Gas Producers Announce Agre...,,41V6-6VD0-00RH-4557-00000-00,The three major Alaska North Slope Gas produce...,Exxon
1,business-wire-equityoutlook.com-announces-stoc...,Business Wire,,2002-05-07,EquityOutlook.com Announces Stock Evaluation R...,,45SF-5590-010G-0532-00000-00,EquityOutlook.Com (CRD#118718) - www.equityout...,Exxon
2,business-wire-on24-video-investor-713933bc-dfd...,Business Wire,,2000-08-01,"ON24 Video Investor Alert: Summer Sets in, Mar...",,40W3-XMK0-00RH-426G-00000-00,The markets appear to have finally settled int...,Exxon
3,business-wire-exxonmobil-and-teach-713c1424-df...,Business Wire,,2000-08-23,ExxonMobil and Teach the Children Pump Student...,,411T-8HS0-00RH-4100-00000-00,Exxon Mobil Corporation is helping to prepare ...,Exxon
4,business-wire-insight_s-dick-powers-713f5012-d...,Business Wire,,2003-04-03,INSIGHT's Dick Powers Wins iSource 'Pros to Kn...,,4891-FH20-01KN-12X9-00000-00,"INSIGHT, Inc., a top international provider of...",Exxon
...,...,...,...,...,...,...,...,...,...
3889,newstex-blogs-zacks-investment-research-zacks....,Newstex Blogs Zacks Investment Research,,2022-02-23,Zacks.com featured highlights include Valero E...,Zacks Equity Research,64VH-NWG1-JCMN-Y4MX-00000-00,"\n\nFeb 23, 2022( Zacks Investment Research: /...",Valero
3890,newstex-blogs-zacks-investment-research-zacks....,Newstex Blogs Zacks Investment Research,,2022-11-11,"Zacks.com featured highlights Arch Resources, ...",Zacks Equity Research,66V4-PTJ1-F03R-N3VB-00000-00,"\n\nNov 11, 2022( Zacks Investment Research: /...",Valero
3891,newstex-blogs-zacks-investment-research-the-za...,Newstex Blogs Zacks Investment Research,,2022-06-08,"The Zacks Analyst Blog Highlights Nucor, Steel...",Santanu Roy,65MX-D8R1-JCMN-Y32P-00000-00,"\n\nJun 08, 2022( Zacks Investment Research: /...",Valero
3892,newstex-blogs-zacks-investment-research-zacks....,Newstex Blogs Zacks Investment Research,,2022-04-12,Zacks.com featured highlights include National...,Zacks Equity Research,656S-3NK1-JCMN-Y2RG-00000-00,"\n\nApr 12, 2022( Zacks Investment Research: /...",Valero


In [9]:

text_metadata_df['text'] = text_metadata_df['text'].astype(str)
#traindf, devdf, testdf = func_split_three_dfs(text_metadata_df)
#dev_text_df = func_split_metadata_text(devdf)
text_metadata_df = text_metadata_df.reset_index()
full_text_df = func_split_metadata_text(text_metadata_df)

In [10]:
full_text_df

# Function to split text into sentences
def split_sentences(text):
    return nltk.sent_tokenize(text)

# Apply the function to the text column
#dev_text_df['sentences'] = dev_text_df['text'].apply(split_sentences)
full_text_df['sentences'] = full_text_df['text'].apply(lambda x: nltk.sent_tokenize(x))




del full_text_df['text']

## takes about 4 to 5 minutes
full_text_df = full_text_df.explode('sentences')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_text_df['sentences'] = full_text_df['text'].apply(lambda x: nltk.sent_tokenize(x))


In [11]:
full_text_df

Unnamed: 0,index,Date,company,sentences
0,0,2000-12-06,Exxon,The three major Alaska North Slope Gas produce...
0,0,2000-12-06,Exxon,"Ultimately, such a project would involve a pip..."
0,0,2000-12-06,Exxon,The agreement announced today initiates the fi...
0,0,2000-12-06,Exxon,The key program activities over the next year ...
0,0,2000-12-06,Exxon,The focus will be on route evaluation and sele...
...,...,...,...,...
54334,3893,2022-06-22,Valero,"Newstex Authoritative Content is not ""read and..."
54334,3893,2022-06-22,Valero,"Accordingly, neither Newstex nor its re-distri..."
54334,3893,2022-06-22,Valero,The Newstex Authoritative Content shall be con...
54334,3893,2022-06-22,Valero,"Accordingly, no warranties or other guarantees..."


In [12]:
#text_metadata_df

In [15]:
### export file

import os

path = "C:\\Users\\julia\\OneDrive\\Desktop\\NLP"

# Change the directory
os.chdir(path)
full_text_df.to_csv('03_Outputs/02_PR_sentences.csv',index=False)
#full_text_df.to_csv('03_Outputs/02_textsentences.csv',index=False)

In [14]:
### Below is experimentation...