In [46]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
tqdm.pandas()

In [47]:
# Load the two source files (one per class) from the working directory,
# in the same order as before: True.csv first, then Fake.csv.
true, fake = (pd.read_csv(csv_path) for csv_path in ('./True.csv', './Fake.csv'))

In [48]:
# Tag every row with the class of the file it was loaded from.
true = true.assign(type='true')
fake = fake.assign(type='fake')

In [49]:
df = pd.concat([true, fake])

In [50]:
df

Unnamed: 0,title,text,subject,date,type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",true
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",true
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",true
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",true
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",true
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",fake
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",fake
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",fake
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",fake


In [51]:
train, test = train_test_split(df, test_size=0.2, shuffle = True)

In [52]:
import re

def clean_text(text):
    """Normalize a raw string before tokenization.

    Steps, in order: lower-case, remove URLs, remove HTML-like tags, remove
    punctuation, remove every token that contains a digit, collapse runs of
    whitespace and trim the ends.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the NaN pandas produces
        for an empty CSV cell) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or an empty string for non-string input.
    """
    # Missing cells arrive as float NaN and have no .lower(); map them to ''.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Remove links. The scheme is matched as http or https, and bare "www."
    # hosts are covered too; otherwise the punctuation step below would fuse
    # such links into junk tokens like "httpexamplecom".
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove anything that looks like an HTML/XML tag.
    text = re.sub(r'<.*?>+', '', text)

    # Remove punctuation: everything that is neither a word character nor
    # whitespace (underscores count as word characters and are kept).
    text = re.sub(r'[^\w\s]', '', text)

    # Remove whole tokens that contain at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)

    # The removals above leave gaps; squeeze them to single spaces.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
    

In [53]:
# Run the text normalizer element-wise over both free-text columns.
df = df.assign(
    title=df['title'].map(clean_text),
    text=df['text'].map(clean_text),
)

In [54]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK resources for the preprocessing cells below ('punkt' is
# presumably for word_tokenize and 'stopwords' for stopwords.words — confirm
# against the installed NLTK version).
# NOTE(review): in the recorded run all three downloads failed with a name
# resolution error, so later cells presumably ran against previously cached
# copies of the data — confirm before relying on a fresh environment.
# The value returned by the last call is what this cell displays.
nltk.download('punkt')
# NOTE(review): nothing in the visible notebook uses WordNet (stemming is done
# with PorterStemmer) — confirm whether this download is still needed.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


False

In [55]:
# Holds the stemmer and stop-word set after the first call, so they are not
# rebuilt for every document.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return ``(stemmer, stop_words)``, building them on first use only."""
    if not _PREPROCESS_CACHE:
        # Extra words removed on top of NLTK's English stop-word list.
        additional_stop_words = [
            'fake', 'news', 'false', 'hoax', 'said', 'misinformation',
            'disinformation', 'rumor', 'unverified', 'not', 'true',
            'misleading', 'bogus', 'fabricated', 'baseless',
        ]
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
        _PREPROCESS_CACHE['stop_words'] = frozenset(
            stopwords.words('english') + additional_stop_words
        )
    return _PREPROCESS_CACHE['stemmer'], _PREPROCESS_CACHE['stop_words']


def preprocess_text(text):
    """Tokenize ``text``, remove stop words and stem the remaining tokens.

    Stop words are matched against the raw token *before* stemming, because
    the stop-word list holds unstemmed words: a stemmed token such as "thi"
    (from "this") or "mislead" (from "misleading") would never match it.
    The stem is checked against the same list as well, so a token that only
    becomes a stop word after stemming (e.g. "rumors" -> "rumor") is still
    removed.

    Parameters
    ----------
    text : str
        Text to process. Stop-word matching is case-sensitive and the list
        is lower-case, so the input should already be lower-cased.

    Returns
    -------
    str
        The surviving stems joined by single spaces.
    """
    stemmer, stop_words = _preprocess_resources()

    kept_stems = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        stem = stemmer.stem(token)
        if stem in stop_words:
            continue
        kept_stems.append(stem)

    return ' '.join(kept_stems)

In [56]:
# Tokenize / stem / stop-word-filter both free-text columns, title first.
df = df.assign(**{
    column: df[column].map(preprocess_text)
    for column in ('title', 'text')
})

In [57]:
df.drop(columns=['date','subject'], inplace=True)

In [147]:

from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping first, then overwrite the column with the
# integer codes. LabelEncoder numbers classes in sorted order, so the
# 'fake' / 'true' labels assigned earlier become 0 / 1 respectively.
encoder = LabelEncoder().fit(df['type'])
df['type'] = encoder.transform(df['type'])


In [148]:
# Target: the encoded class label.
Y = df.loc[:, 'type']

# Features: article body and headline joined into one document per row,
# separated by a single space.
# NOTE(review): every 'true' row in the preview above opens with a
# "(Reuters)" dateline; if that holds for the whole file it may leak the label
# into the features — confirm before trusting the scores.
X = df['text'].add(' ').add(df['title'])

In [149]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3, shuffle=true, random_state=42)

In [150]:
# TF-IDF features capped at 5000 terms, with the vectorizer's built-in
# English stop-word list applied.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# The vocabulary and IDF weights are learned from the training documents only;
# the test documents are projected onto that fitted vocabulary afterwards.
X_train_vect, X_test_vect = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [151]:
from sklearn.metrics import classification_report

def fit_model(model, X_train, X_test, y_train, y_test):
    """Train ``model`` and print a classification report for the test data.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features passed to ``model.predict`` and the labels the
        predictions are compared against.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)

    print(report)

In [152]:
# Candidate classifiers. The estimators that draw random numbers during
# training get the same fixed seed as the train/test split, so repeated runs
# produce the same fitted models.
model_svc = LinearSVC(penalty = 'l2', loss = 'squared_hinge', random_state = 42)
# No seed here: the default solver is assumed not to use random numbers —
# TODO confirm if a different solver is configured.
model_log = LogisticRegression(penalty = 'l2')
model_gbc = GradientBoostingClassifier(loss ='log_loss', learning_rate = 0.01, random_state = 42)

In [157]:
fit_model(model_svc, X_train_vect, X_test_vect, y_train, y_test)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      6996
           1       0.99      1.00      0.99      6474

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [158]:
fit_model(model_log, X_train_vect, X_test_vect, y_train, y_test)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      6996
           1       0.98      0.99      0.98      6474

    accuracy                           0.98     13470
   macro avg       0.98      0.98      0.98     13470
weighted avg       0.98      0.98      0.98     13470



In [159]:
fit_model(model_gbc, X_train_vect, X_test_vect, y_train, y_test)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      6996
           1       0.99      1.00      0.99      6474

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470

