In [1]:
# Mount Google Drive into the Colab runtime at /content/drive/ so the
# notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## How to download from Kaggle
reference: https://www.kaggle.com/discussions/general/74235

In [2]:
# !pip install kaggle

In [3]:
# %cd drive/MyDrive
# ! pip install -q kaggle
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json

In [4]:
# %cd Neuromatch/datasets
# !kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

In [5]:
# %cd datasets
# !unzip fake-and-real-news-dataset.zip

## Load Datasets

In [6]:
%cd drive/MyDrive/Neuromatch

/content/drive/MyDrive/Neuromatch


In [7]:
import pandas as pd

# Read the two source CSV files (relative to the current working directory)
# into separate frames, in the order True.csv, Fake.csv.
true_df, fake_df = (
    pd.read_csv(csv_path)
    for csv_path in ('datasets/True.csv', 'datasets/Fake.csv')
)

In [8]:
# Preview the first rows loaded from True.csv.
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [9]:
# Preview the first rows loaded from Fake.csv.
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [10]:
# List the column names of fake_df.
fake_df.columns

Index(['title', 'text', 'subject', 'date'], dtype='object')

In [11]:
# (number of rows, number of columns) of fake_df.
fake_df.shape

(23481, 4)

In [12]:
# Tag every row with its class before the frames are merged:
# 1 for rows of fake_df, 0 for rows of true_df.
for frame, class_id in ((fake_df, 1), (true_df, 0)):
    frame['label'] = class_id

## Merge the datasets

In [13]:
# Stack the fake rows on top of the true rows and renumber the index from 0.
labelled_frames = [fake_df, true_df]
df = pd.concat(labelled_frames, ignore_index=True)

In [14]:
# First rows of the merged frame (fake_df rows come first in the concat).
df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [15]:
# Show the full article body stored in the row labelled 0.
df.loc[0, 'text']

'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t ev

## Data Cleaning

In [16]:
# df = df.drop(['subject', 'date'], axis =1)

In [17]:
# Last rows of the merged frame (true_df rows were appended last).
df.tail()

Unnamed: 0,title,text,subject,date,label
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0
44897,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",0


In [18]:
# df = df.dropna()
# df = df.reset_index(drop=True)

### Using NLTK

In [None]:
!pip install nltk

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download only the resources the preprocessing below relies on instead of
# the complete NLTK data collection ('all'), which is very large and slow to
# fetch on every fresh runtime:
#   stopwords          -> stopwords.words('english')
#   punkt / punkt_tab  -> word_tokenize (which one is required depends on the
#                         installed NLTK version)
#   wordnet / omw-1.4  -> WordNetLemmatizer
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

def preprocess_text(df, column_name):
    """Turn one text column of ``df`` into lists of normalized tokens.

    Every string in ``df[column_name]`` goes through the following steps:

    1. lower-casing;
    2. removal of every character that is neither a word character nor
       whitespace (regex ``[^\\w\\s]``);
    3. removal of NLTK's English stop words;
    4. tokenization with ``word_tokenize``;
    5. lemmatization with ``WordNetLemmatizer``;
    6. stemming with ``PorterStemmer``.

    Args:
        df: pandas DataFrame containing the column. It is modified in place:
            the column is overwritten with the token lists.
        column_name: Name of the column to process.

    Returns:
        The same ``df`` object, with ``df[column_name]`` holding one list of
        tokens per row.

    Notes:
        * Values that are already lists are left untouched, so calling the
          function again on a processed column is a no-op instead of a
          ``TypeError``.
        * Any other non-string value (for example ``NaN`` for a missing
          entry) becomes an empty token list.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    # The same words recur across rows, so remember the lemma+stem of each
    # distinct word instead of recomputing it on every occurrence.
    normalized = {}

    def normalize(word):
        """Return the stemmed lemma of ``word`` (memoized)."""
        if word not in normalized:
            normalized[word] = stemmer.stem(lemmatizer.lemmatize(word))
        return normalized[word]

    def to_tokens(value):
        """Run the whole cleaning pipeline on a single cell value."""
        if isinstance(value, list):
            # Already tokenized by an earlier call; keep re-runs idempotent.
            return value
        if not isinstance(value, str):
            # Missing entry (NaN/None): nothing to tokenize.
            return []
        cleaned = re.sub(r'[^\w\s]', '', value.lower())
        kept = ' '.join(word for word in cleaned.split() if word not in stop_words)
        return [normalize(token) for token in word_tokenize(kept)]

    # Single pass over the column instead of one pass per pipeline step.
    df[column_name] = df[column_name].apply(to_tokens)
    return df


In [21]:
# Overwrite the 'title' column with the token lists produced by preprocess_text.
df = preprocess_text(df, 'title')

In [22]:
# Inspect the frame after preprocessing 'title'.
df.head()

Unnamed: 0,title,text,subject,date,label
0,"[donald, trump, send, embarrass, new, year, ev...",Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,"[drunk, brag, trump, staffer, start, russian, ...",House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,"[sheriff, david, clark, becom, internet, joke,...","On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,"[trump, obsess, even, obama, name, code, websi...","On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,"[pope, franci, call, donald, trump, christma, ...",Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [23]:
# Overwrite the 'text' column (article bodies) with the token lists produced
# by preprocess_text.
df = preprocess_text(df, 'text')

In [24]:
# Inspect the frame after preprocessing 'text'.
df.head()

Unnamed: 0,title,text,subject,date,label
0,"[donald, trump, send, embarrass, new, year, ev...","[donald, trump, wish, american, happi, new, ye...",News,"December 31, 2017",1
1,"[drunk, brag, trump, staffer, start, russian, ...","[hous, intellig, committe, chairman, devin, nu...",News,"December 31, 2017",1
2,"[sheriff, david, clark, becom, internet, joke,...","[friday, reveal, former, milwauke, sheriff, da...",News,"December 30, 2017",1
3,"[trump, obsess, even, obama, name, code, websi...","[christma, day, donald, trump, announc, would,...",News,"December 29, 2017",1
4,"[pope, franci, call, donald, trump, christma, ...","[pope, franci, use, annual, christma, day, mes...",News,"December 25, 2017",1


In [25]:
# NOTE(review): 'subject' holds a category label; sending it through the
# tokenize/lemmatize/stem pipeline turns e.g. 'worldnews' into ['worldnew']
# (visible in the df.tail() output further down) — confirm this is intended.
df = preprocess_text(df, 'subject')

In [26]:
# Inspect the frame after preprocessing 'subject'.
df.head()

Unnamed: 0,title,text,subject,date,label
0,"[donald, trump, send, embarrass, new, year, ev...","[donald, trump, wish, american, happi, new, ye...",[news],"December 31, 2017",1
1,"[drunk, brag, trump, staffer, start, russian, ...","[hous, intellig, committe, chairman, devin, nu...",[news],"December 31, 2017",1
2,"[sheriff, david, clark, becom, internet, joke,...","[friday, reveal, former, milwauke, sheriff, da...",[news],"December 30, 2017",1
3,"[trump, obsess, even, obama, name, code, websi...","[christma, day, donald, trump, announc, would,...",[news],"December 29, 2017",1
4,"[pope, franci, call, donald, trump, christma, ...","[pope, franci, use, annual, christma, day, mes...",[news],"December 25, 2017",1


In [27]:
# NOTE(review): this tokenizes and stems the date string
# ('December 31, 2017' -> ['decemb', '31', '2017']) rather than parsing it
# as a date — confirm this is intended.
df = preprocess_text(df, 'date')

In [28]:
# Inspect the frame after preprocessing 'date'.
df.head()

Unnamed: 0,title,text,subject,date,label
0,"[donald, trump, send, embarrass, new, year, ev...","[donald, trump, wish, american, happi, new, ye...",[news],"[decemb, 31, 2017]",1
1,"[drunk, brag, trump, staffer, start, russian, ...","[hous, intellig, committe, chairman, devin, nu...",[news],"[decemb, 31, 2017]",1
2,"[sheriff, david, clark, becom, internet, joke,...","[friday, reveal, former, milwauke, sheriff, da...",[news],"[decemb, 30, 2017]",1
3,"[trump, obsess, even, obama, name, code, websi...","[christma, day, donald, trump, announc, would,...",[news],"[decemb, 29, 2017]",1
4,"[pope, franci, call, donald, trump, christma, ...","[pope, franci, use, annual, christma, day, mes...",[news],"[decemb, 25, 2017]",1


In [29]:
# Last rows of the frame after 'title', 'text', 'subject' and 'date' have
# been preprocessed.
df.tail()

Unnamed: 0,title,text,subject,date,label
44893,"[fulli, commit, nato, back, new, u, approach, ...","[brussel, reuter, nato, alli, tuesday, welcom,...",[worldnew],"[august, 22, 2017]",0
44894,"[lexisnexi, withdrew, two, product, chines, ma...","[london, reuter, lexisnexi, provid, legal, reg...",[worldnew],"[august, 22, 2017]",0
44895,"[minsk, cultur, hub, becom, author]","[minsk, reuter, shadow, disus, sovietera, fact...",[worldnew],"[august, 22, 2017]",0
44896,"[vatican, upbeat, possibl, pope, franci, visit...","[moscow, reuter, vatican, secretari, state, ca...",[worldnew],"[august, 22, 2017]",0
44897,"[indonesia, buy, 114, billion, worth, russian,...","[jakarta, reuter, indonesia, buy, 11, sukhoi, ...",[worldnew],"[august, 22, 2017]",0




No charts were generated by quickchart




No charts were generated by quickchart
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


### Using spaCy

In [None]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

2023-07-13 17:43:09.375326: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-13 17:43:11.992222: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-13 17:43:11.992656: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [None]:
# # # import spacy
# # # from spacy.lang.en.stop_words import STOP_WORDS
# # # from spacy.tokenizer import Tokenizer

# # # # Load the English language model in Spacy
# # # nlp = spacy.load('en_core_web_sm')

# # # # Define a custom tokenizer using Spacy
# # # tokenizer = Tokenizer(nlp.vocab)

# # # # Convert to lowercase, remove stopwords, and lemmatize
# # # df['text'] = df['text'].apply(lambda x: ' '.join([token.lemma_ for token in tokenizer(x.lower()) if token.text.lower() not in STOP_WORDS]))

# # # # Stemming (using Lemmatization with POS Tagging)
# # # df['text'] = df['text'].apply(lambda x: ' '.join([token.lemma_ if token.pos_ != 'VERB' else token.lemma_ for token in tokenizer(x)]))

# # import spacy
# # from spacy.lang.en.stop_words import STOP_WORDS

# # # Load the English language model in Spacy
# # nlp = spacy.load('en_core_web_sm')

# # # Define a function to preprocess text using Spacy
# # def preprocess_text(text):
# #     doc = nlp(text)
# #     tokens = [token.lemma_ for token in doc if not token.is_stop]
# #     return ' '.join(tokens)

# # # Apply the preprocessing function to the 'text' column
# # df['text'] = df['text'].apply(preprocess_text)

# import spacy
# from spacy.lang.en.stop_words import STOP_WORDS
# import string

# # Load the English language model in Spacy
# nlp = spacy.load('en_core_web_sm')

# # Custom preprocessing function
# def preprocess_text(text):
#     doc = nlp(text)
#     tokens = [token.lemma_ for token in doc if token.text.lower() not in STOP_WORDS and token.text not in string.punctuation]
#     return ' '.join(tokens)

# # Apply preprocessing to the 'text' column
# df['text'] = df['text'].apply(preprocess_text)


In [None]:
# df['text'] = df['text'].str.lower()

In [None]:
# import re
# # Remove punctuation
# df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))


In [None]:
# df.head()

Unnamed: 0,title,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,donald trump couldn t wish americans happy new...,1
1,Drunk Bragging Trump Staffer Started Russian ...,house intelligence committee chairman devin nu...,1
2,Sheriff David Clarke Becomes An Internet Joke...,friday reveal milwaukee sheriff david clarke c...,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,christmas day donald trump announce work f...,1
4,Pope Francis Just Called Out Donald Trump Dur...,pope francis annual christmas day message rebu...,1


In [None]:
# df.tail()

Unnamed: 0,title,text,label
44893,'Fully committed' NATO backs new U.S. approach...,brussels reuters nato ally tuesday welcome pre...,0
44894,LexisNexis withdrew two products from Chinese ...,london reuters lexisnexis provider legal regul...,0
44895,Minsk cultural hub becomes haven from authorities,minsk reuters shadow disuse soviet era factory...,0
44896,Vatican upbeat on possibility of Pope Francis ...,moscow reuters vatican secretary state cardina...,0
44897,Indonesia to buy $1.14 billion worth of Russia...,jakarta reuters indonesia buy 11 sukhoi fighte...,0


## Save the cleaned dataset

In [30]:
from pathlib import Path

# Destination of the cleaned dataset on the mounted Drive.
filepath = Path('/content/drive/MyDrive/Neuromatch/cleaned-dataset/cleaned-datasets.csv')

# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
filepath.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the index is written as an unnamed first column and the
# list-valued cells are written as their string representation — confirm
# that whatever reads this file back expects both.
df.to_csv(filepath)