In [1]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize
import numpy as np
import pandas as pd
import pymongo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import string
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, AveragePooling1D, Conv1D, Flatten, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/josephdixon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/josephdixon/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Open a client on the local MongoDB server and select the corpus collection
MONGO_URI = "mongodb://localhost:27017/"
DB_NAME = "msds696"
COLLECTION_NAME = "climate_score_corpus"

client = pymongo.MongoClient(MONGO_URI)
mydb = client[DB_NAME]
mycol = mydb[COLLECTION_NAME]

In [9]:
# Materialise every document in the collection as one DataFrame row
results = mycol.find()
records = list(results)
df = pd.DataFrame(records)
del df['_id']  # MongoDB's internal key is not a feature

# Keep only the rows whose label is something other than 'Test'
is_test_row = df['label'] == 'Test'
df = df[~is_test_row]

In [10]:
# Loop-invariant resources: built once per cell run rather than once per
# document, since clean_text is applied to every row of the frame.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_LEMMATIZER = WordNetLemmatizer()
try:
    _STOP_WORDS = frozenset(stopwords.words('english'))
except LookupError:
    # Fetch the 'stopwords' corpus on demand so a fresh environment does not
    # fail here; it is only downloaded when it is actually missing.
    nltk.download('stopwords')
    _STOP_WORDS = frozenset(stopwords.words('english'))


def clean_text(corpus):
    """Normalise one raw document into a space-separated string of lemmas.

    Processing steps, in order:
      1. tokenize with ``wordpunct_tokenize`` and lowercase each token;
      2. delete every ``string.punctuation`` character from each token;
      3. keep only tokens made purely of ASCII letters (this also drops the
         empty strings left behind by punctuation-only tokens);
      4. drop English stop words;
      5. lemmatize each remaining word (``lemmatize`` is called without a
         part-of-speech tag).

    Parameters
    ----------
    corpus : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when no token
        survives the filters.
    """
    lowered = (token.lower() for token in wordpunct_tokenize(corpus))
    stripped = (token.translate(_PUNCT_TABLE) for token in lowered)
    words = [
        word
        for word in stripped
        # ASCII-only alphabetic tokens, i.e. "English-looking" words
        if word.isascii() and word.isalpha() and word not in _STOP_WORDS
    ]
    return ' '.join(_LEMMATIZER.lemmatize(word) for word in words)

df['corpus_cleaned'] = df['corpus'].apply(clean_text)

In [11]:
# Eyeball the cleaned text for each document
print(df.corpus_cleaned)

0     environmental policy joe biden administration ...
1     pete buttigieg transportation climate solution...
2     climate change senator edward markey massachus...
3     green new deal bernie sander official website ...
4     climate representative ocasio cortez skip main...
5     combatting climate crisis congresswoman diana ...
6     energy environment governor jay inslee skip ma...
7     climate justice michelle wu bostonskip main co...
8     office city county denver open new tab window ...
9     heading second term fed chair jerome powell bu...
10    forbidden forbidden nginx delimit remark secre...
11    joe manchin popularity west virginia soar bide...
12    sinema backed bill protects strengthens invest...
13    energy environment representative jared golden...
14    adam kinzinger league conservation voter score...
15    leading climate phil scott governor toggle nav...
16    avoiding word climate change desantis say glob...
17    resilience officer slate jacksonville clim

In [6]:
# Count the distinct whitespace-separated tokens across all cleaned documents;
# this number is used as the hashing space for one-hot encoding.
token_lists = df['corpus_cleaned'].str.lower().str.split()
results = set().union(*token_lists)
vocab_size = len(results)
print(vocab_size)

44986


In [14]:
# Fixed seed so the train/test partition (and therefore every reported metric)
# is the same on each Restart & Run All.
SEED = 42
# Width of the one-hot label vectors; must match the model's softmax layer.
NUM_CLASSES = 3

# Map the string labels to integer class ids
lb_make = LabelEncoder()
labels = lb_make.fit_transform(df['label'])
corpora = df['corpus_cleaned']

# Fail early with a readable message instead of an IndexError inside
# to_categorical if the data contains more classes than expected.
assert len(lb_make.classes_) <= NUM_CLASSES, (
    "found %d label classes, expected at most %d"
    % (len(lb_make.classes_), NUM_CLASSES)
)

# Hold out 25% of the documents for evaluation.
# NOTE(review): with a corpus this small, consider stratify=labels so each
# class is represented in both splits -- confirm every class has >= 2 rows.
x_train, x_test, y_train, y_test = train_test_split(
    corpora, labels, test_size=0.25, random_state=SEED
)

# One-hot encode the integer class ids for categorical cross-entropy
y_train = to_categorical(y_train, NUM_CLASSES)
y_test = to_categorical(y_test, NUM_CLASSES)

In [15]:
# Characters removed from each document before hashing. Written as a raw
# string: the value is unchanged, but the previous non-raw literal contained
# the invalid escape sequence `\]`, which newer Python versions warn about.
ONE_HOT_FILTERS = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'


def _encode_docs(docs):
    """Hash every document in ``docs`` into a list of integer word ids.

    Each document is lowercased, split on single spaces and encoded with
    Keras ``one_hot`` using ``vocab_size`` as the size of the hashing space.

    NOTE(review): per the Keras docs, ``one_hot`` relies on a hash function
    that is not stable across interpreter runs, and distinct words may collide
    on the same id -- confirm whether reproducible encodings are required.

    Parameters
    ----------
    docs : iterable of str
        Cleaned documents.

    Returns
    -------
    list of list of int
        One integer sequence per input document, in the same order.
    """
    return [
        one_hot(doc, vocab_size, filters=ONE_HOT_FILTERS, lower=True, split=' ')
        for doc in docs
    ]


x_train = _encode_docs(x_train)
x_test = _encode_docs(x_test)

In [16]:
# Every integer sequence is brought to this common length; padding is added
# at the end of each sequence.
max_length = 50000
x_train, x_test = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (x_train, x_test)
)

In [23]:
# Embedding -> 1D convolution -> pooling -> dense classifier over 3 classes.
model = Sequential()
# Learn a 16-dimensional vector for each hashed word id
model.add(Embedding(input_dim=vocab_size, output_dim=16, input_length=max_length))
# No input_shape here: the layer infers its input from the Embedding output.
# The previous input_shape=(5000,16) contradicted max_length and was ignored
# anyway (the summary shows 49996 steps, i.e. max_length - 5 + 1).
model.add(Conv1D(32, 5, activation='relu'))
model.add(AveragePooling1D())
# NOTE(review): Flatten produces 799,936 features, which makes the first Dense
# layer ~200M parameters; a global pooling layer would shrink this drastically.
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(16, activation='relu'))
# One probability per class
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50000, 16)         719776    
                                                                 
 conv1d_2 (Conv1D)           (None, 49996, 32)         2592      
                                                                 
 average_pooling1d_2 (Averag  (None, 24998, 32)        0         
 ePooling1D)                                                     
                                                                 
 flatten_2 (Flatten)         (None, 799936)            0         
                                                                 
 dense_5 (Dense)             (None, 250)               199984250 
                                                                 
 dense_6 (Dense)             (None, 100)               25100     
                                                      

In [24]:
# Train for five epochs. The held-out split is also passed as validation
# data, so val_accuracy and the final score are measured on the same rows.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=5,
    batch_size=30,
    verbose=2,
)
# evaluate() yields the loss followed by the compiled metrics (accuracy)
scores = model.evaluate(x_test, y_test, verbose=0)
# Report label-prediction accuracy on the held-out documents as a percentage
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

Epoch 1/5


2022-08-08 08:31:59.261632: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


2/2 - 13s - loss: 1.4608 - accuracy: 0.1892 - val_loss: 1.5055 - val_accuracy: 0.3846 - 13s/epoch - 6s/step
Epoch 2/5


2022-08-08 08:32:11.225672: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


2/2 - 11s - loss: 1.2790 - accuracy: 0.4324 - val_loss: 1.1811 - val_accuracy: 0.2308 - 11s/epoch - 6s/step
Epoch 3/5
2/2 - 9s - loss: 1.1621 - accuracy: 0.2703 - val_loss: 1.0827 - val_accuracy: 0.4615 - 9s/epoch - 5s/step
Epoch 4/5
2/2 - 1s - loss: 1.0794 - accuracy: 0.4054 - val_loss: 1.0883 - val_accuracy: 0.4615 - 1s/epoch - 702ms/step
Epoch 5/5
2/2 - 1s - loss: 1.0729 - accuracy: 0.4054 - val_loss: 1.0889 - val_accuracy: 0.4615 - 1s/epoch - 748ms/step
Accuracy: 46.15%
