# Connect to Drive

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Mount Google Drive at /content/drive so the notebook can read the dataset files.
# This cell only works inside Google Colab (it imports google.colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Move into the Drive project folder that holds Twitter15.csv and Twitter16.csv
# (both are read later with relative paths), then list the folder contents.
%cd '/content/drive/MyDrive/F22/COSC_Project_Jane_Atul/4426/rumor_detection_acl2017'
%ls

/content/drive/MyDrive/F22/COSC_Project_Jane_Atul/4426/rumor_detection_acl2017
cleandf.csv  README.txt  [0m[01;34mtwitter15[0m/  Twitter15.csv  [01;34mtwitter16[0m/  Twitter16.csv


In [4]:
# Load the Twitter15 tweets; the relative path depends on the working directory set by %cd.
tweet15 = pd.read_csv("Twitter15.csv")
# Show (rows, columns) of the raw frame.
tweet15.shape

(1490, 11)

# Twitter15 Dataset

In [5]:
# Keep only the columns whose name does not start with "Unnamed"
# (the name pandas gives to CSV columns that have no header).
tweet15 = tweet15.loc[:, ~tweet15.columns.str.contains('^Unnamed')]
tweet15.head()

Unnamed: 0,ID,LABEL,CONTENT
0,731166399389962242,unverified,\t🔥ca kkk grand wizard 🔥 endorses @hillaryclin...
1,714598641827246081,unverified,\tan open letter to trump voters from his top ...
2,691809004356501505,non-rumor,"\t""america is a nation of second chances"" —@po..."
3,693204708933160960,non-rumor,"\tbrandon marshall visits and offers advice, s..."
4,551099691702956032,true,\trip elly may clampett


In [6]:
#REMOVE LEADING/TRAILING WHITESPACE (the raw CONTENT values start with a tab character)
tweet15["CONTENT"] = tweet15["CONTENT"].str.strip()
tweet15.head()

Unnamed: 0,ID,LABEL,CONTENT
0,731166399389962242,unverified,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...
1,714598641827246081,unverified,an open letter to trump voters from his top st...
2,691809004356501505,non-rumor,"""america is a nation of second chances"" —@potu..."
3,693204708933160960,non-rumor,"brandon marshall visits and offers advice, sup..."
4,551099691702956032,true,rip elly may clampett


In [7]:
#REMOVE PUNCTUATION
import string
string.punctuation

def removePunctuation(txt):
  """Return ``txt`` with every character listed in ``string.punctuation`` removed.

  Only ASCII punctuation is covered; any other character (letters, digits,
  whitespace, non-ASCII symbols) is kept in its original order.
  """
  kept = filter(lambda ch: ch not in string.punctuation, txt)
  return "".join(kept)

# Strip ASCII punctuation from every tweet (the helper is handed to apply directly).
tweet15["CONTENT"] = tweet15["CONTENT"].apply(removePunctuation)
tweet15.head()

Unnamed: 0,ID,LABEL,CONTENT
0,731166399389962242,unverified,🔥ca kkk grand wizard 🔥 endorses hillaryclinton...
1,714598641827246081,unverified,an open letter to trump voters from his top st...
2,691809004356501505,non-rumor,america is a nation of second chances —potus o...
3,693204708933160960,non-rumor,brandon marshall visits and offers advice supp...
4,551099691702956032,true,rip elly may clampett


In [8]:
#REMOVE LINKS
def clean_data(dataframe):
    """Replace every http/https URL in ``dataframe['CONTENT']`` with a single space.

    The column is overwritten on the frame that is passed in; nothing is returned.

    Args:
        dataframe: pandas DataFrame with a string column named ``CONTENT``.

    Note:
        The pattern needs the literal ``://`` to recognise a link, so this has
        to run before punctuation is stripped from the text.
    """
    # Raw string keeps the regex escapes intact (no invalid-escape warning).
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Work on the argument rather than a notebook global, and state regex=True
    # explicitly because the default of Series.str.replace differs between pandas versions.
    dataframe['CONTENT'] = dataframe['CONTENT'].str.replace(url_pattern, ' ', regex=True)

# NOTE(review): punctuation (including ':' and '/') was already stripped in the previous
# cell, so no "://" is left in CONTENT for the URL pattern to match at this point.
clean_data(tweet15)
tweet15.head()

  tweet15['CONTENT'] = tweet15['CONTENT'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')


Unnamed: 0,ID,LABEL,CONTENT
0,731166399389962242,unverified,🔥ca kkk grand wizard 🔥 endorses hillaryclinton...
1,714598641827246081,unverified,an open letter to trump voters from his top st...
2,691809004356501505,non-rumor,america is a nation of second chances —potus o...
3,693204708933160960,non-rumor,brandon marshall visits and offers advice supp...
4,551099691702956032,true,rip elly may clampett


In [9]:
#REMOVE EMOJIS
# Encoding to ASCII with errors='ignore' drops every non-ASCII character, so besides
# emojis this also removes accented letters and Unicode punctuation such as the dash.
tweet15["CONTENT"] = tweet15['CONTENT'].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
tweet15.head()

Unnamed: 0,ID,LABEL,CONTENT
0,731166399389962242,unverified,ca kkk grand wizard endorses hillaryclinton n...
1,714598641827246081,unverified,an open letter to trump voters from his top st...
2,691809004356501505,non-rumor,america is a nation of second chances potus on...
3,693204708933160960,non-rumor,brandon marshall visits and offers advice supp...
4,551099691702956032,true,rip elly may clampett


In [10]:
# The tweet ID is not used as a model input below, so drop the column (in place).
# Re-running this cell on its own raises KeyError because "ID" is already gone.
tweet15.drop("ID", axis=1, inplace = True)
tweet15.head()

Unnamed: 0,LABEL,CONTENT
0,unverified,ca kkk grand wizard endorses hillaryclinton n...
1,unverified,an open letter to trump voters from his top st...
2,non-rumor,america is a nation of second chances potus on...
3,non-rumor,brandon marshall visits and offers advice supp...
4,true,rip elly may clampett


In [11]:
# Number of tweets per original label (four classes).
tweet15['LABEL'].value_counts()

unverified    374
non-rumor     374
true          372
false         370
Name: LABEL, dtype: int64

In [12]:
def labelToBin(x):
  """Binarise a label: 1 when it equals 'non-rumor', 0 for every other value."""
  return 1 if x == 'non-rumor' else 0

# Binary target: 'non-rumor' -> 1; 'true', 'false' and 'unverified' all collapse to 0.
tweet15['LABELBIN'] = tweet15['LABEL'].apply(labelToBin)
tweet15.head()

Unnamed: 0,LABEL,CONTENT,LABELBIN
0,unverified,ca kkk grand wizard endorses hillaryclinton n...,0
1,unverified,an open letter to trump voters from his top st...,0
2,non-rumor,america is a nation of second chances potus on...,1
3,non-rumor,brandon marshall visits and offers advice supp...,1
4,true,rip elly may clampett,0


In [13]:
# LABELBIN replaces the original four-class column, so LABEL is dropped (in place).
tweet15.drop("LABEL", axis=1, inplace=True)
tweet15.head()

Unnamed: 0,CONTENT,LABELBIN
0,ca kkk grand wizard endorses hillaryclinton n...,0
1,an open letter to trump voters from his top st...,0
2,america is a nation of second chances potus on...,1
3,brandon marshall visits and offers advice supp...,1
4,rip elly may clampett,0


In [14]:
#TOKENIZATION
import re
def tokenize(txt):
  """Split ``txt`` into word tokens on runs of non-word characters.

  Args:
    txt: string to split.

  Returns:
    List of the non-empty tokens, in order of appearance.
  """
  # Raw string: '\W' in a plain literal is an invalid escape sequence.
  # re.split yields '' when the text starts or ends with a separator (or is empty);
  # those empty strings are not words, so they are filtered out.
  return [token for token in re.split(r'\W+', txt) if token]

# Lower-case each tweet and split it into tokens; CONTENT now holds a list of strings per row.
tweet15['CONTENT'] = tweet15['CONTENT'].apply(lambda x: tokenize(x.lower()))
tweet15.head()


Unnamed: 0,CONTENT,LABELBIN
0,"[ca, kkk, grand, wizard, endorses, hillaryclin...",0
1,"[an, open, letter, to, trump, voters, from, hi...",0
2,"[america, is, a, nation, of, second, chances, ...",1
3,"[brandon, marshall, visits, and, offers, advic...",1
4,"[rip, elly, may, clampett]",0


In [15]:
#REMOVE STOPWORDS
import nltk
nltk.download('stopwords')
# A set gives constant-time membership tests; the list returned by NLTK
# would be scanned from the start for every single token.
stopwords = set(nltk.corpus.stopwords.words('english'))

def remove_stopwords(txt):
  """Return the tokens of ``txt`` that are not in the English stop-word collection.

  Args:
    txt: iterable of token strings.

  Returns:
    New list without the stop words; the order of the remaining tokens is kept.
  """
  return [word for word in txt if word not in stopwords]

tweet15['CONTENT'] = tweet15['CONTENT'].apply(remove_stopwords)
tweet15.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,CONTENT,LABELBIN
0,"[ca, kkk, grand, wizard, endorses, hillaryclin...",0
1,"[open, letter, trump, voters, top, strategistt...",0
2,"[america, nation, second, chances, potus, new,...",1
3,"[brandon, marshall, visits, offers, advice, su...",1
4,"[rip, elly, may, clampett]",0


In [16]:
#LEMMATIZATION
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
  
wn = nltk.WordNetLemmatizer()
def lemmatization(txt):
  """Lemmatize every token in ``txt`` with the shared WordNet lemmatizer ``wn``.

  No part-of-speech tag is passed, so NLTK's default applies to each word.
  Returns a new list with the same length and order as ``txt``.
  """
  return list(map(wn.lemmatize, txt))

tweet15['CONTENT'] = tweet15['CONTENT'].apply(lemmatization)
tweet15.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,CONTENT,LABELBIN
0,"[ca, kkk, grand, wizard, endorses, hillaryclin...",0
1,"[open, letter, trump, voter, top, strategisttu...",0
2,"[america, nation, second, chance, potus, new, ...",1
3,"[brandon, marshall, visit, offer, advice, supp...",1
4,"[rip, elly, may, clampett]",0


In [17]:
from sklearn.model_selection import train_test_split
# 80/20 hold-out split with a fixed seed (no stratification is requested).
# Features are the token lists in CONTENT, the target is LABELBIN.
train_set, test_set = train_test_split(tweet15, random_state=42, test_size=0.2)
x_train, y_train = train_set["CONTENT"].copy(), train_set["LABELBIN"].copy()
x_test, y_test = test_set["CONTENT"].copy(), test_set["LABELBIN"].copy()

In [18]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Vocabulary cap handed to the Keras Tokenizer as num_words.
numbers=4000
tokenizer=Tokenizer(num_words=numbers)
# The word index is built from the training texts only; the test texts are never seen here.
tokenizer.fit_on_texts(x_train)

In [19]:
# Convert each token list into a list of integer word indices using the fitted tokenizer.
tokenize_train = tokenizer.texts_to_sequences(x_train)
tokenize_test = tokenizer.texts_to_sequences(x_test)

In [20]:
# Pad the training sequences to a common length (no maxlen given, so the longest one sets it).
X_train=pad_sequences(tokenize_train)
# Force the test sequences to the same width as the training matrix.
X_test=pad_sequences(tokenize_test, maxlen=X_train.shape[1])

DecisionTreeClassifier Model

In [21]:
#DECISION TREE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# random_state pins the feature permutation scikit-learn applies at every split, so the
# fitted tree and the scores below are identical on every run (same seed as the
# train/test split and the random forest).
# NOTE(review): the inputs are padded word-index sequences, so the tree splits on
# vocabulary ids by position — confirm this is the intended feature representation.
classifier = DecisionTreeClassifier(criterion='entropy', max_depth=200, random_state=42)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
# Rows are the true classes, columns the predicted classes (0 = other labels, 1 = non-rumor).
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[180  37]
 [ 59  22]]


0.6778523489932886

RandomForestClassifier Model

In [22]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Fit a seeded 100-tree forest on the padded index sequences and score it on the test split.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rfc = forest_clf.predict(X_test)
cm_rfc = confusion_matrix(y_true=y_test, y_pred=y_pred_rfc)
print(cm_rfc)
accuracy_score(y_true=y_test, y_pred=y_pred_rfc)

[[206  11]
 [ 72   9]]


0.7214765100671141

LSTM Model

In [23]:
from keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPooling1D
from keras.models import Model

In [24]:
# Build the LSTM classifier with the Keras functional API.
words_index=tokenizer.word_index
unique_words=len(words_index)
# One integer per time step; the width matches the padded training matrix.
i = Input(shape=(X_train.shape[1],))
# The embedding table is sized from the full word index (+1), 20 dimensions per word.
x=Embedding(unique_words+1, 20)(i)
# return_sequences=True keeps the output of every time step so it can be pooled next.
x=LSTM(15, return_sequences=True)(x)
# Collapse the time axis by taking the maximum of each LSTM unit.
x=GlobalMaxPooling1D()(x)
x=Dense(32, activation='relu')(x)
# Single sigmoid unit for the binary LABELBIN target.
x=Dense(1, activation='sigmoid')(x)

lstm_model=Model(i, x)


In [25]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as the metric.
lstm_model.compile(loss="binary_crossentropy",optimizer="adam",
              metrics=["accuracy"])

# Train for 30 epochs on the training split; no validation data is passed to fit.
lstm_model.fit(X_train,y_train,epochs= 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f54c20f17f0>

In [26]:
# Score on the held-out test split; the result is [loss, accuracy] as configured in compile.
lstm_model.evaluate(X_test, y_test)



[0.6284453868865967, 0.8389261960983276]

# Twitter16 Dataset

In [29]:
# Load the Twitter16 tweets from the current working directory.
tweet16 = pd.read_csv("Twitter16.csv")
# Show (rows, columns) of the raw frame.
tweet16.shape

(818, 14)

In [30]:
# Discard every column whose name starts with "Unnamed" (header-less CSV columns).
tweet16 = tweet16.loc[:, ~tweet16.columns.str.contains('^Unnamed')]
tweet16.head()

Unnamed: 0,ID,LABEL,CONTENT
0,656955120626880512,false,\tcorrect predictions in back to the future ii...
1,615689290706595840,true,\t.@whitehouse in rainbow colors for #scotusma...
2,613404935003217920,false,\tcops bought the alleged church shooter burge...
3,731166399389962242,unverified,\t🔥ca kkk grand wizard 🔥 endorses @hillaryclin...
4,714598641827246081,unverified,\tan open letter to trump voters from his top ...


In [31]:
#REMOVE LEADING/TRAILING WHITESPACE (the raw CONTENT values start with a tab character)
tweet16["CONTENT"] = tweet16["CONTENT"].str.strip()
tweet16.head()

Unnamed: 0,ID,LABEL,CONTENT
0,656955120626880512,false,correct predictions in back to the future ii URL
1,615689290706595840,true,.@whitehouse in rainbow colors for #scotusmarr...
2,613404935003217920,false,cops bought the alleged church shooter burger ...
3,731166399389962242,unverified,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...
4,714598641827246081,unverified,an open letter to trump voters from his top st...


In [32]:
#REMOVE PUNCTUATION
import string
string.punctuation

def removePunctuation(txt):
  """Drop all ``string.punctuation`` characters from ``txt`` and return the rest.

  Characters outside that ASCII punctuation set are passed through unchanged
  and keep their relative order.
  """
  kept_chars = []
  for ch in txt:
    if ch not in string.punctuation:
      kept_chars.append(ch)
  return "".join(kept_chars)

# Strip ASCII punctuation from every tweet (the helper is handed to apply directly).
tweet16["CONTENT"] = tweet16["CONTENT"].apply(removePunctuation)
tweet16.head()

Unnamed: 0,ID,LABEL,CONTENT
0,656955120626880512,false,correct predictions in back to the future ii URL
1,615689290706595840,true,whitehouse in rainbow colors for scotusmarriag...
2,613404935003217920,false,cops bought the alleged church shooter burger ...
3,731166399389962242,unverified,🔥ca kkk grand wizard 🔥 endorses hillaryclinton...
4,714598641827246081,unverified,an open letter to trump voters from his top st...


In [33]:
#REMOVE LINKS
def clean_data(dataframe):
    """Blank out http/https URLs in the ``CONTENT`` column of ``dataframe``.

    Each URL is replaced by one space. The column is reassigned on the frame
    that was passed in and the function returns nothing.

    Args:
        dataframe: pandas DataFrame with a string column named ``CONTENT``.

    Note:
        A link is only recognised while its ``://`` is still present, so call
        this before punctuation is removed from the text.
    """
    # Raw string so the backslash escapes reach the regex engine untouched.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Use the argument instead of a notebook global; regex=True is explicit because
    # the default of Series.str.replace is not the same in every pandas version.
    dataframe['CONTENT'] = dataframe['CONTENT'].str.replace(url_pattern, ' ', regex=True)

# NOTE(review): punctuation was stripped in the previous cell, so "://" can no longer occur
# in CONTENT here; the sample rows also show links already given as the literal token "URL".
clean_data(tweet16)
tweet16.head()

  tweet16['CONTENT'] = tweet16['CONTENT'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')


Unnamed: 0,ID,LABEL,CONTENT
0,656955120626880512,false,correct predictions in back to the future ii URL
1,615689290706595840,true,whitehouse in rainbow colors for scotusmarriag...
2,613404935003217920,false,cops bought the alleged church shooter burger ...
3,731166399389962242,unverified,🔥ca kkk grand wizard 🔥 endorses hillaryclinton...
4,714598641827246081,unverified,an open letter to trump voters from his top st...


In [34]:
#REMOVE EMOJIS
# The ASCII round-trip with errors='ignore' discards all non-ASCII characters,
# which covers emojis but also any other non-ASCII letter or symbol.
tweet16["CONTENT"] = tweet16['CONTENT'].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
tweet16.head()

Unnamed: 0,ID,LABEL,CONTENT
0,656955120626880512,false,correct predictions in back to the future ii URL
1,615689290706595840,true,whitehouse in rainbow colors for scotusmarriag...
2,613404935003217920,false,cops bought the alleged church shooter burger ...
3,731166399389962242,unverified,ca kkk grand wizard endorses hillaryclinton n...
4,714598641827246081,unverified,an open letter to trump voters from his top st...


In [35]:
# The tweet ID is not used as a model input below, so drop the column (in place).
# Re-running this cell on its own raises KeyError because "ID" is already gone.
tweet16.drop("ID", axis=1, inplace = True)
tweet16.head()

Unnamed: 0,LABEL,CONTENT
0,false,correct predictions in back to the future ii URL
1,true,whitehouse in rainbow colors for scotusmarriag...
2,false,cops bought the alleged church shooter burger ...
3,unverified,ca kkk grand wizard endorses hillaryclinton n...
4,unverified,an open letter to trump voters from his top st...


In [36]:
# Number of tweets per original label (four classes).
tweet16['LABEL'].value_counts()

true          207
false         205
non-rumor     205
unverified    201
Name: LABEL, dtype: int64

In [37]:
def labelToBin(x):
  """Return 1 for the label 'non-rumor' and 0 for any other value."""
  if x == 'non-rumor':
    return 1
  return 0

# Binary target: 'non-rumor' -> 1; 'true', 'false' and 'unverified' all collapse to 0.
tweet16['LABELBIN'] = tweet16['LABEL'].apply(labelToBin)
tweet16.head()

Unnamed: 0,LABEL,CONTENT,LABELBIN
0,false,correct predictions in back to the future ii URL,0
1,true,whitehouse in rainbow colors for scotusmarriag...,0
2,false,cops bought the alleged church shooter burger ...,0
3,unverified,ca kkk grand wizard endorses hillaryclinton n...,0
4,unverified,an open letter to trump voters from his top st...,0


In [38]:
# LABELBIN replaces the original four-class column, so LABEL is dropped (in place).
tweet16.drop("LABEL", axis=1, inplace=True)
tweet16.head()

Unnamed: 0,CONTENT,LABELBIN
0,correct predictions in back to the future ii URL,0
1,whitehouse in rainbow colors for scotusmarriag...,0
2,cops bought the alleged church shooter burger ...,0
3,ca kkk grand wizard endorses hillaryclinton n...,0
4,an open letter to trump voters from his top st...,0


In [39]:
#TOKENIZATION
import re
def tokenize(txt):
  """Break ``txt`` into word tokens, splitting on runs of non-word characters.

  Args:
    txt: string to split.

  Returns:
    List of the non-empty tokens, in their original order.
  """
  # r'\W+' instead of '\W+': the backslash must reach the regex engine, and a plain
  # literal with '\W' is an invalid escape sequence.
  pieces = re.split(r'\W+', txt)
  # Leading/trailing separators (and an empty input) make re.split return '' entries;
  # they carry no word, so they are left out.
  return [piece for piece in pieces if piece]

# Lower-case each tweet and split it into tokens; CONTENT now holds a list of strings per row.
tweet16['CONTENT'] = tweet16['CONTENT'].apply(lambda x: tokenize(x.lower()))
tweet16.head()


Unnamed: 0,CONTENT,LABELBIN
0,"[correct, predictions, in, back, to, the, futu...",0
1,"[whitehouse, in, rainbow, colors, for, scotusm...",0
2,"[cops, bought, the, alleged, church, shooter, ...",0
3,"[ca, kkk, grand, wizard, endorses, hillaryclin...",0
4,"[an, open, letter, to, trump, voters, from, hi...",0


In [40]:
#REMOVE STOPWORDS
import nltk
nltk.download('stopwords')
# Stored as a set so each membership test is constant time instead of a scan
# through the whole list that NLTK returns.
stopwords = set(nltk.corpus.stopwords.words('english'))

def remove_stopwords(txt):
  """Filter the English stop words out of a token sequence.

  Args:
    txt: iterable of token strings.

  Returns:
    New list holding the tokens that are not stop words, in their original order.
  """
  return [word for word in txt if word not in stopwords]

tweet16['CONTENT'] = tweet16['CONTENT'].apply(remove_stopwords)
tweet16.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,CONTENT,LABELBIN
0,"[correct, predictions, back, future, ii, url]",0
1,"[whitehouse, rainbow, colors, scotusmarriage, ...",0
2,"[cops, bought, alleged, church, shooter, burge...",0
3,"[ca, kkk, grand, wizard, endorses, hillaryclin...",0
4,"[open, letter, trump, voters, top, strategistt...",0


In [41]:
#LEMMATIZATION
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
  
wn = nltk.WordNetLemmatizer()
def lemmatization(txt):
  """Run the WordNet lemmatizer ``wn`` over each token of ``txt``.

  The lemmatizer is called with the word only (no part-of-speech argument).
  The result is a new list, one entry per input token, in the same order.
  """
  lemmas = []
  for word in txt:
    lemmas.append(wn.lemmatize(word))
  return lemmas

tweet16['CONTENT'] = tweet16['CONTENT'].apply(lemmatization)
tweet16.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,CONTENT,LABELBIN
0,"[correct, prediction, back, future, ii, url]",0
1,"[whitehouse, rainbow, color, scotusmarriage, h...",0
2,"[cop, bought, alleged, church, shooter, burger...",0
3,"[ca, kkk, grand, wizard, endorses, hillaryclin...",0
4,"[open, letter, trump, voter, top, strategisttu...",0


In [42]:
from sklearn.model_selection import train_test_split
# 80/20 hold-out split with a fixed seed (no stratification is requested).
# Features are the token lists in CONTENT, the target is LABELBIN.
train_set, test_set = train_test_split(tweet16, random_state=42, test_size=0.2)
x_train, y_train = train_set["CONTENT"].copy(), train_set["LABELBIN"].copy()
x_test, y_test = test_set["CONTENT"].copy(), test_set["LABELBIN"].copy()

In [43]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Vocabulary cap handed to the Keras Tokenizer as num_words.
numbers=4000
# A fresh tokenizer replaces the one fitted on Twitter15.
tokenizer=Tokenizer(num_words=numbers)
# The word index is built from the Twitter16 training texts only.
tokenizer.fit_on_texts(x_train)

In [44]:
# Convert each token list into a list of integer word indices using the fitted tokenizer.
tokenize_train = tokenizer.texts_to_sequences(x_train)
tokenize_test = tokenizer.texts_to_sequences(x_test)

In [45]:
# Pad the training sequences to a common length (no maxlen given, so the longest one sets it).
X_train=pad_sequences(tokenize_train)
# Force the test sequences to the same width as the training matrix.
X_test=pad_sequences(tokenize_test, maxlen=X_train.shape[1])

DecisionTreeClassifier Model

In [46]:
#DECISION TREE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# A fixed random_state makes the fit deterministic: scikit-learn permutes the features at
# every split, so without a seed the tree and its scores can change between runs.
# NOTE(review): the inputs are padded word-index sequences, so the tree splits on
# vocabulary ids by position — confirm this is the intended feature representation.
classifier = DecisionTreeClassifier(criterion='entropy', max_depth=200, random_state=42)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
# Rows are the true classes, columns the predicted classes (0 = other labels, 1 = non-rumor).
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[96 20]
 [40  8]]


0.6341463414634146

RandomForestClassifier Model

In [47]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Fit a seeded 100-tree forest on the padded index sequences and score it on the test split.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rfc = forest_clf.predict(X_test)
cm_rfc = confusion_matrix(y_true=y_test, y_pred=y_pred_rfc)
print(cm_rfc)
accuracy_score(y_true=y_test, y_pred=y_pred_rfc)

[[113   3]
 [ 43   5]]


0.7195121951219512

LSTM Model

In [48]:
from keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPooling1D
from keras.models import Model

In [49]:
# Build the LSTM classifier for Twitter16 with the Keras functional API.
words_index=tokenizer.word_index
unique_words=len(words_index)
# One integer per time step; the width matches the padded training matrix.
i = Input(shape=(X_train.shape[1],))
# The embedding table is sized from the full word index (+1), 20 dimensions per word.
x=Embedding(unique_words+1, 20)(i)
# return_sequences=True keeps the output of every time step so it can be pooled next.
x=LSTM(15, return_sequences=True)(x)
# Collapse the time axis by taking the maximum of each LSTM unit.
x=GlobalMaxPooling1D()(x)
x=Dense(32, activation='relu')(x)
# Single sigmoid unit for the binary LABELBIN target.
x=Dense(1, activation='sigmoid')(x)

lstm_model=Model(i, x)

In [50]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as the metric.
lstm_model.compile(loss="binary_crossentropy",optimizer="adam",
              metrics=["accuracy"])

# Train for 30 epochs on the training split; no validation data is passed to fit.
lstm_model.fit(X_train,y_train,epochs= 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f54c6f976d0>

In [54]:
# Score on the held-out test split; the result is [loss, accuracy] as configured in compile.
lstm_model.evaluate(X_test, y_test)



[0.7513077259063721, 0.7865853905677795]