In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
%cd drive/MyDrive/Neuromatch

/content/drive/MyDrive/Neuromatch


In [3]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import re

In [4]:
true_data = pd.read_csv('datasets/True.csv')
fake_data = pd.read_csv('datasets/Fake.csv')
true_data.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
# Adding labels
true_data["label"] = np.ones(len(true_data),dtype=int)
fake_data["label"] = np.zeros(len(fake_data),dtype=int)

true_data.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [6]:
data = pd.concat((true_data,fake_data),axis=0)
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB
None


In [7]:
data = data.sample(frac=1)
data.head(10)

Unnamed: 0,title,text,subject,date,label
17878,Soon-to-go-free jail convicts snared in French...,PARIS (Reuters) - Two French prison inmates wh...,worldnews,"October 10, 2017",1
14830,Hariri to return to Lebanon in next two days,BEIRUT (Reuters) - Saad al-Hariri will return ...,worldnews,"November 14, 2017",1
19632,Puerto Rico power grid faces generational thre...,(Reuters) - Hurricane Maria was on course Tues...,worldnews,"September 19, 2017",1
16119,WATCH TUCKER CARLSON’S Heated Debate With Delu...,"Tucker Carlson interviews Jose Antonio Vargas,...",Government News,"May 3, 2017",0
6887,Five Women Donald Trump Allegedly Slept With ...,Donald Trump obviously has a knack for purchas...,News,"April 15, 2016",0
11875,French Socialists sell historic headquarters f...,PARIS (Reuters) - France s struggling Socialis...,worldnews,"December 19, 2017",1
12854,Turkey's Erdogan says U.S. Jerusalem decision ...,ATHENS (Reuters) - Turkish President Tayyip Er...,worldnews,"December 7, 2017",1
15748,U.S. official met Syrian security chief in Dam...,BEIRUT (Reuters) - A senior U.S. official met ...,worldnews,"November 3, 2017",1
17763,"WHERE’S THE MEDIA? BLM Blocked Streets, Storme...","In September of 2016, 3 separate reports of ra...",left-news,"Oct 25, 2017",0
3151,STUNNING New Report Shows HUGE Number Of Inel...,A new report shows that we have at least 50 el...,News,"January 5, 2017",0


In [8]:
data = data.drop("date",axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 17878 to 8403
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   label    44898 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [9]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

True

In [10]:
data.head()

Unnamed: 0,title,text,subject,label
17878,Soon-to-go-free jail convicts snared in French...,PARIS (Reuters) - Two French prison inmates wh...,worldnews,1
14830,Hariri to return to Lebanon in next two days,BEIRUT (Reuters) - Saad al-Hariri will return ...,worldnews,1
19632,Puerto Rico power grid faces generational thre...,(Reuters) - Hurricane Maria was on course Tues...,worldnews,1
16119,WATCH TUCKER CARLSON’S Heated Debate With Delu...,"Tucker Carlson interviews Jose Antonio Vargas,...",Government News,0
6887,Five Women Donald Trump Allegedly Slept With ...,Donald Trump obviously has a knack for purchas...,News,0


In [11]:
# Tokenize and preprocess text
lemma = WordNetLemmatizer()
pattern = "[^a-zA-Z]"
def preprocess_text(text):
    text = re.sub(pattern, " ", text)  # Cleaning
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    lemmaized_tokens  = [lemma.lemmatize(word) for word in filtered_tokens]
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    return stemmed_tokens

In [12]:
# Train Word2Vec model
all_tokens = [preprocess_text(text) for text in data['text']] + [preprocess_text(title) for title in data['title']]
model = Word2Vec(all_tokens, vector_size=100, window=5, min_count=1, workers=4)

In [13]:
# Function to vectorize the text
def vectorize_text(text, model):
    vectors = [model.wv[word] for word in text if word in model.wv]
    if not vectors:
        # If none of the words are in the model's vocabulary, return None
        return None
    # Average the word vectors to get the text representation
    avg_vector = sum(vectors) / len(vectors)
    return avg_vector

In [14]:
# Vectorize the "text" column
data['text_vector'] = data['text'].apply(lambda x: vectorize_text(preprocess_text(x), model))

# Vectorize the "title" column
data['title_vector'] = data['title'].apply(lambda x: vectorize_text(preprocess_text(x), model))

# Print the new DataFrame
print(data)

                                                   title  \
17878  Soon-to-go-free jail convicts snared in French...   
14830       Hariri to return to Lebanon in next two days   
19632  Puerto Rico power grid faces generational thre...   
16119  WATCH TUCKER CARLSON’S Heated Debate With Delu...   
6887    Five Women Donald Trump Allegedly Slept With ...   
...                                                  ...   
19984  BREAKING: MORE HACKED E-MAILS From DNC Release...   
12038  Yemen air strike kills eight women, two childr...   
7986   Michigan Attorney General says weak Detroit sc...   
6043   Republicans to kill U.S. rules on corruption, ...   
8403     Clinton details plans to boost small businesses   

                                                    text          subject  \
17878  PARIS (Reuters) - Two French prison inmates wh...        worldnews   
14830  BEIRUT (Reuters) - Saad al-Hariri will return ...        worldnews   
19632  (Reuters) - Hurricane Maria was on course

In [15]:
data.head()

Unnamed: 0,title,text,subject,label,text_vector,title_vector
17878,Soon-to-go-free jail convicts snared in French...,PARIS (Reuters) - Two French prison inmates wh...,worldnews,1,"[0.23415925, -0.5290911, 0.016925426, 0.157360...","[0.46603268, -0.3129969, 0.483145, 0.054324996..."
14830,Hariri to return to Lebanon in next two days,BEIRUT (Reuters) - Saad al-Hariri will return ...,worldnews,1,"[-0.045355745, -0.66362226, -0.39455786, 0.246...","[0.16019563, -1.8293892, -0.56820095, 0.276806..."
19632,Puerto Rico power grid faces generational thre...,(Reuters) - Hurricane Maria was on course Tues...,worldnews,1,"[0.20887998, -0.70190006, -0.14172056, 0.47725...","[0.26503304, -1.4047214, 0.29446974, 0.2743730..."
16119,WATCH TUCKER CARLSON’S Heated Debate With Delu...,"Tucker Carlson interviews Jose Antonio Vargas,...",Government News,0,"[0.7143223, -0.23713715, 0.23288968, 0.0328230...","[0.61964226, 0.12819193, 1.2273825, -0.5127372..."
6887,Five Women Donald Trump Allegedly Slept With ...,Donald Trump obviously has a knack for purchas...,News,0,"[0.7464145, -0.40416723, -0.047101095, 0.38588...","[0.9208329, -1.1799678, 0.3656034, -0.08450867..."


In [16]:
# Drop rows with missing vector representations
data = data.dropna()

In [17]:
data.to_csv('/content/drive/MyDrive/Neuromatch/ANN/temp/preprocessed_dataset.csv')