# LITMUS-Based COVID-19 Misinformation Classification
## Eric Hsieh and Dongsuk Lim
### CS 6353 Spring 2021

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show what is in the working directory so the input CSV files can be confirmed.
working_dir_entries = os.listdir(".")
print(working_dir_entries)

['post_data.csv', 'post_data_full.csv', '.DS_Store', 'saved_model', 'data_collect.py', 'Applying Kaggle Spam Detection Model to Our Data.ipynb', 'data_mining.ipynb', 'wc_misinf.png', 'README.md', 'enterprise_app', 'Covid_Classification.ipynb', '.ipynb_checkpoints', '.git', 'post_data_source.csv', 'wc_inf.png']


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import re
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding, Flatten, SimpleRNN, GRU

Using TensorFlow backend.


In [3]:
# Load the full post dataset from the working directory.
data = pd.read_csv(filepath_or_buffer="post_data_full.csv")

In [4]:
# Drop the columns that are not used for classification.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone from `data`.
data = data.drop(
    columns=['url', 'subreddit', 'score', 'permalink', 'Source'],
    errors='ignore',
)

In [5]:
# Preview the first five rows to check which columns remain.
data.head(n=5)

Unnamed: 0,class,title
0,1,COVID-19: Cases of vaccine-resistant variant m...
1,1,Should A COVID Vaccine Be Part Of The Back-To-...
2,1,UK hits 30 million first vaccine doses - 57% o...
3,1,EU medical regulator says AstraZeneca COVID va...
4,1,Moderna begins testing next-generation coronav...


In [6]:
def normalize_text(s):
    """Lower-case a title and strip punctuation that is not inside a word.

    Runs of non-word characters are replaced by a single space whenever they
    touch whitespace or the start/end of the string, so word-internal hyphens
    and apostrophes ("covid-19", "don't") survive while "world!" becomes
    "world". Whitespace is then collapsed and trimmed from both ends.

    Args:
        s: The text to normalize (a ``str``).

    Returns:
        The normalized string.
    """
    s = s.lower()
    # Raw-string patterns: in a plain literal '\s' and '\W' are invalid escape
    # sequences, which current Python versions warn about.
    # Punctuation run that follows whitespace or opens the string.
    s = re.sub(r'(?:^|\s)\W+', ' ', s)
    # Punctuation run that precedes whitespace or closes the string.
    s = re.sub(r'\W+(?:\s|$)', ' ', s)
    # Make sure the substitutions did not leave any repeated whitespace.
    s = re.sub(r'\s+', ' ', s)
    # The edge substitutions above can leave one space at either end.
    return s.strip()

In [7]:
# Store a normalized copy of every title in a new 'text' column.
data['text'] = list(map(normalize_text, data['title']))

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state pins the shuffle.
split_parts = train_test_split(
    data['text'],
    data['class'],
    test_size=0.2,
    random_state=1000,
)
sentences_train, sentences_test, classes_train, classes_test = split_parts

In [10]:
# Multinomial Naive Bayes baseline on word counts

# The vocabulary is learned from the training titles only, then both splits
# are converted to count matrices with that same vocabulary.
vectorizer = CountVectorizer().fit(sentences_train)
x_train = vectorizer.transform(sentences_train)
x_test = vectorizer.transform(sentences_test)

# Encoded labels (y_train / y_test) are produced here; the encoder is fitted
# on the training labels.
encoder = LabelEncoder().fit(classes_train)
y_train = encoder.transform(classes_train)
y_test = encoder.transform(classes_test)

# Note: the classifier is trained and scored on the original labels
# (classes_*), not on the encoded y_* arrays.
nb = MultinomialNB().fit(x_train, classes_train)
nb.score(x_test, classes_test)

0.8940269749518305

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the word index from the training titles only (capped via num_words).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

# Size of the embedding table: one more than the number of indexed words.
# presumably the +1 accounts for index 0 never being assigned to a word — confirm
vocab_size = 1 + len(tokenizer.word_index)

# Every title is encoded as word indices, then padded/truncated at the end
# ('post') to exactly `maxlen` entries.
maxlen = 50
x_train_tok = pad_sequences(
    tokenizer.texts_to_sequences(sentences_train), padding='post', maxlen=maxlen)
x_test_tok = pad_sequences(
    tokenizer.texts_to_sequences(sentences_test), padding='post', maxlen=maxlen)

In [12]:
# TensorFlow / Keras model definition

# Seed the random number generators before any layer is created so that the
# run is as repeatable as possible.
# NOTE(review): Keras may draw from NumPy's global RNG for weight
# initialisation and batch shuffling, hence the NumPy seed — confirm for the
# installed Keras version.
np.random.seed(1)
# tf.set_random_seed is the TF 1.x spelling and no longer exists in TF 2.x,
# where the equivalent is tf.random.set_seed; use whichever is available.
if hasattr(tf, 'random') and hasattr(tf.random, 'set_seed'):
    tf.random.set_seed(1)
else:
    tf.set_random_seed(1)

embedding_dim = 50

# Embedding -> Flatten -> Dense(64, relu) -> Dense(1, sigmoid): a single
# sigmoid output trained with binary cross-entropy.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [13]:
# Train for 5 epochs in mini-batches of 8.
# Note: validation_data is the held-out test split, so the per-epoch
# validation metrics are computed on the same data later reported as
# testing accuracy.
history = model.fit(
    x_train_tok,
    y_train,
    batch_size=8,
    epochs=5,
    validation_data=(x_test_tok, y_test),
)


Train on 2076 samples, validate on 519 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# Report accuracy on the training split first, then on the test split.
# `loss` and `accuracy` end up holding the test-split values, as before.
evaluation_sets = (
    ("Training Accuracy: {:.4f}", x_train_tok, y_train),
    ("Testing Accuracy:  {:.4f}", x_test_tok, y_test),
)
for report_template, features, targets in evaluation_sets:
    loss, accuracy = model.evaluate(features, targets)
    print(report_template.format(accuracy))

Training Accuracy: 0.9976
Testing Accuracy:  0.8940


In [15]:
# Persist the trained model under 'saved_model' in the working directory.
model.save(filepath='saved_model')