French Fake News Detection baseline model 

This notebook contains : 
- Preparation input data TF-IDF
- Training baseline Sequence Classification (using "LogisticRegression")
- Evaluation

Works on Google Colab


## Google Drive mount

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Installation

## Import

In [2]:
%matplotlib inline

# for figure
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import seaborn as sns
sns.set(color_codes=True, font_scale=1.33)

# for training
#from simpletransformers.classification import ClassificationModel

# to save 
import pickle
from sklearn.externals import joblib

# useful
import pandas as pd
import numpy as np
import shutil
import os
import re

# for text manipulation
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from collections import defaultdict
from nltk.stem.snowball import FrenchStemmer
#from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# for model
from sklearn.linear_model import LogisticRegression

# import user module
#from my_text_utils import myTokenizer



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Definitions

In [0]:
# your data folder
PATH_FOLDER_SAVE = '/content/drive/My Drive/OpenClassRooms/IML_Projet_7/data'

# % of data used for training
train_percent = 0.7

# path folder saved model
PATH_FOLDER_MODEL_SAVED = '/content/drive/My Drive/outputs_proj7'
 
# save df news
PATH_DF_NEWS_SAVE = PATH_FOLDER_SAVE + '/df_news.pkl' 

## Useful functions

In [0]:
# prepare dictionnary of translation to suppress ponctuation
replace_punctuation = str.maketrans(string.punctuation,
                                    ' '*len(string.punctuation))
def cleaning_text(questions_curr):

    # delete newlines
    questions_curr = re.sub(r'\s+', ' ', questions_curr)
    # delete single quotes
    questions_curr = re.sub(r"\'", " ", questions_curr)
    # delete double quotes
    questions_curr = re.sub(r'\"', " ", questions_curr)
    # delete tags
    questions_curr = re.sub('<[^<]+?>',' ', questions_curr)
    # delete numbers (forming group = word with only numbers 
    # example : delete "123" but not "a123")
    questions_curr = re.sub(r'\b\d+\b','', questions_curr) 
    # delete ponctuation (replace by space)
    questions_curr = questions_curr.translate(replace_punctuation)
    # keep only word
    questions_curr = re.sub(r'\W',  ' ', questions_curr)
    # lower case
    questions_curr = ' '.join([w.lower() for w in \
                               nltk.word_tokenize(questions_curr,
                                                  language='french') \
                              if not w.lower() in list(sw)])

    return questions_curr

def myTokenizer(text):
    '''
    Create tokens from text (French words > 3 letters)
    '''
    def stem_tokens(tokens, stemmer):
        '''
        Stem words in tokens.
        and suppress word < 3 characters
        '''
        stemmed = []
        for item in tokens:
            if re.match('[a-zA-Z0-9]{3,}',item):
                stemmed.append(stemmer.stem(item))
        return stemmed

    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, FrenchStemmer())
    return stems

In [83]:
cleaning_text("<a> dede<a/> l'aube est arrivée : “war horse” chien")

'dede aube arrivée war horse chien'

In [59]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Prepare Data 

### Load data

In [0]:
df_news = joblib.load(PATH_DF_NEWS_SAVE)

In [7]:
df_news.head()

Unnamed: 0,url,source,author,title,theme,description,date_published,body,fake,text,train,nb_words
1,https://www.liberation.fr/france/2019/12/15/le...,Liberation,Nathalie Raulin,Les chefs de service hospitaliers en appellent...,france,Jugeant très insuffisante la réponse du gouver...,15/12/2019,C’est une menace d’une démission collective qu...,0,Les chefs de service hospitaliers en appellent...,False,789
2,https://www.futura-sciences.com/sante/actualit...,Futura Sciences,Futura avec Relaxnews,L'origine des comportements alimentaires ident...,sante,D'après une étude publiée dans la revue,16/12/2019,D'après une étude publiée dans la revue Natur...,0,L'origine des comportements alimentaires ident...,True,499
3,https://www.futura-sciences.com/tech/actualite...,Futura Sciences,Louis Neveu,Microsoft alerte sur de nouvelles techniques d...,tech,Le géant de l'informatique Microsoft vient de ...,16/12/2019,Le géant de l'informatique Microsoft vient de ...,0,Microsoft alerte sur de nouvelles techniques d...,True,412
4,https://www.futura-sciences.com/sciences/actua...,Futura Sciences,Rémy Decourt,L'Europe se dote d'une infrastructure de téléc...,sciences,Pour garantir la sécurité de la libre circulat...,16/12/2019,Pour garantir la sécurité de la libre circulat...,0,L'Europe se dote d'une infrastructure de téléc...,False,1269
6,https://www.futura-sciences.com/sciences/actua...,Futura Sciences,Nathalie Mayer,Science décalée : la créature la plus rapide s...,sciences,Le guépard est l'animal le plus rapide... sur ...,15/12/2019,Le guépard est l'animal le plus rapide... sur ...,0,Science décalée : la créature la plus rapide s...,True,557


In [8]:
df_news.shape

(2432, 12)

### Features




In [0]:
news = df_news.filter(items=["text"]).to_dict()
news = news["text"]

### Stopwords

In [0]:
tokenizer = nltk.RegexpTokenizer(r'\w+')

def freq_stats_corpora(my_corpus):
    '''
    Statistics about words in my_corpus
    '''
    corpora = defaultdict(list)

    # Création d'un corpus de tokens par question
    for id_curr,text_curr in my_corpus.items():        
        try:
            corpora[id_curr] += tokenizer.tokenize(text_curr.lower())
        except:
            print('text_curr {} : Error body empty'.format(id_curr))
            
    stats, freq = dict(), dict()

    for k, v in corpora.items():
        freq[k] = fq = nltk.FreqDist(v)
        stats[k] = {'total': len(v), 'unique': len(fq.keys())}
        
    return (freq, stats, corpora)

# Récupération des comptages
freq, stats, corpora = freq_stats_corpora(news)
#df_stats = pd.DataFrame.from_dict(stats, orient='index')



# Premièrement, on récupère la fréquence totale de chaque mot sur tout le corpus
# de questions
freq_totale = nltk.Counter()
for k, v in news.items():
    freq_totale += freq[k]



In [11]:
freq_totale.most_common(100)

[('de', 60954),
 ('la', 34250),
 ('le', 25787),
 ('l', 24083),
 ('à', 23543),
 ('les', 22745),
 ('des', 20230),
 ('et', 20130),
 ('d', 18655),
 ('en', 17778),
 ('un', 15777),
 ('une', 13251),
 ('a', 12921),
 ('est', 12864),
 ('du', 12003),
 ('pour', 11426),
 ('dans', 10094),
 ('il', 9972),
 ('que', 9310),
 ('qui', 9205),
 ('sur', 7776),
 ('par', 7239),
 ('pas', 6921),
 ('au', 6807),
 ('plus', 6348),
 ('ce', 6313),
 ('qu', 5351),
 ('s', 4885),
 ('ne', 4563),
 ('n', 4551),
 ('on', 4528),
 ('se', 4506),
 ('c', 4281),
 ('avec', 4242),
 ('son', 4195),
 ('ont', 3990),
 ('mais', 3742),
 ('sont', 3652),
 ('islam', 3490),
 ('cette', 3108),
 ('ou', 3104),
 ('été', 3006),
 ('nous', 2961),
 ('aux', 2853),
 ('comme', 2715),
 ('sa', 2582),
 ('elle', 2568),
 ('je', 2507),
 ('vous', 2373),
 ('y', 2363),
 ('aussi', 2339),
 ('ces', 2332),
 ('même', 2296),
 ('être', 2254),
 ('tout', 2207),
 ('leur', 2164),
 ('fait', 2140),
 ('ils', 2093),
 ('ses', 2071),
 ('deux', 1930),
 ('après', 1840),
 ('depuis', 183

In [12]:
# Stopwords : 100 first most used words + typical french stopwords
most_freq = freq_totale.most_common(100)
sw = set()
for tuple_freq in most_freq:
    sw.add(tuple_freq[0])
    
sw.update(tuple(nltk.corpus.stopwords.words('french')))
sw

{'a',
 'ai',
 'aie',
 'aient',
 'aies',
 'ainsi',
 'ait',
 'alors',
 'ans',
 'après',
 'as',
 'au',
 'aura',
 'aurai',
 'auraient',
 'aurais',
 'aurait',
 'auras',
 'aurez',
 'auriez',
 'aurions',
 'aurons',
 'auront',
 'aussi',
 'autres',
 'aux',
 'avaient',
 'avais',
 'avait',
 'avant',
 'avec',
 'avez',
 'aviez',
 'avions',
 'avoir',
 'avons',
 'ayant',
 'ayante',
 'ayantes',
 'ayants',
 'ayez',
 'ayons',
 'bien',
 'c',
 'ce',
 'ces',
 'cette',
 'comme',
 'contre',
 'd',
 'dans',
 'de',
 'depuis',
 'des',
 'deux',
 'dont',
 'du',
 'déjà',
 'elle',
 'en',
 'encore',
 'entre',
 'es',
 'est',
 'et',
 'eu',
 'eue',
 'eues',
 'eurent',
 'eus',
 'eusse',
 'eussent',
 'eusses',
 'eussiez',
 'eussions',
 'eut',
 'eux',
 'eûmes',
 'eût',
 'eûtes',
 'faire',
 'fait',
 'fois',
 'france',
 'français',
 'furent',
 'fus',
 'fusse',
 'fussent',
 'fusses',
 'fussiez',
 'fussions',
 'fut',
 'fûmes',
 'fût',
 'fûtes',
 'gouvernement',
 'il',
 'ils',
 'islam',
 'j',
 'je',
 'jour',
 'l',
 'la',
 'le',

In [13]:
# save stopwords
path_save_sw = PATH_FOLDER_SAVE + "/stop_words_sw.pkl"
joblib.dump(sw, path_save_sw)
print("Stop words Saved here:\n{}".format(path_save_sw))


Stop words Saved here:
/content/drive/My Drive/OpenClassRooms/IML_Projet_7/data/stop_words_sw.pkl


### Clean Text

In [0]:
clean_text_dict = dict()
# for each news, prepare token : lower, no ponctuation
for id_curr, news_curr in news.items():
    try:
        # save cleaned text into dict output
        clean_text_dict[id_curr] = cleaning_text(news_curr) 
    except:
        print('News {} : Error body empty'.format(id_curr))

In [15]:
df_clean = pd.DataFrame.from_dict(clean_text_dict, orient='index', 
                                  columns=["clean_text"])
df_clean.head()

Unnamed: 0,clean_text
1,chefs service hospitaliers appellent démission...
2,origine comportements alimentaires identifiée ...
3,microsoft alerte nouvelles techniques phishing...
4,europe dote infrastructure télécommunications ...
6,science décalée créature rapide terre celle cr...


In [16]:
df_news_clean = df_news.copy()
df_news_clean["clean_text"] = df_clean["clean_text"]
df_news_clean.head()

Unnamed: 0,url,source,author,title,theme,description,date_published,body,fake,text,train,nb_words,clean_text
1,https://www.liberation.fr/france/2019/12/15/le...,Liberation,Nathalie Raulin,Les chefs de service hospitaliers en appellent...,france,Jugeant très insuffisante la réponse du gouver...,15/12/2019,C’est une menace d’une démission collective qu...,0,Les chefs de service hospitaliers en appellent...,False,789,chefs service hospitaliers appellent démission...
2,https://www.futura-sciences.com/sante/actualit...,Futura Sciences,Futura avec Relaxnews,L'origine des comportements alimentaires ident...,sante,D'après une étude publiée dans la revue,16/12/2019,D'après une étude publiée dans la revue Natur...,0,L'origine des comportements alimentaires ident...,True,499,origine comportements alimentaires identifiée ...
3,https://www.futura-sciences.com/tech/actualite...,Futura Sciences,Louis Neveu,Microsoft alerte sur de nouvelles techniques d...,tech,Le géant de l'informatique Microsoft vient de ...,16/12/2019,Le géant de l'informatique Microsoft vient de ...,0,Microsoft alerte sur de nouvelles techniques d...,True,412,microsoft alerte nouvelles techniques phishing...
4,https://www.futura-sciences.com/sciences/actua...,Futura Sciences,Rémy Decourt,L'Europe se dote d'une infrastructure de téléc...,sciences,Pour garantir la sécurité de la libre circulat...,16/12/2019,Pour garantir la sécurité de la libre circulat...,0,L'Europe se dote d'une infrastructure de téléc...,False,1269,europe dote infrastructure télécommunications ...
6,https://www.futura-sciences.com/sciences/actua...,Futura Sciences,Nathalie Mayer,Science décalée : la créature la plus rapide s...,sciences,Le guépard est l'animal le plus rapide... sur ...,15/12/2019,Le guépard est l'animal le plus rapide... sur ...,0,Science décalée : la créature la plus rapide s...,True,557,science décalée créature rapide terre celle cr...


In [17]:
# save clean text
path_save_df_news_clean = PATH_FOLDER_SAVE + "/df_news_clean.pkl"
joblib.dump(df_news_clean, path_save_df_news_clean)
print("df news clean Saved here:\n{}".format(path_save_df_news_clean))

df news clean Saved here:
/content/drive/My Drive/OpenClassRooms/IML_Projet_7/data/df_news_clean.pkl


### Count words

In [18]:
# init features
tf_vectorizer = CountVectorizer(max_df=0.11, min_df=10, max_features=10000,
                                      tokenizer=myTokenizer)

# train counts
X_train_counts = tf_vectorizer.\
    fit_transform(df_news_clean[\
                                   df_news_clean["train"]\
                                   == True]["clean_text"])

X_train_counts.shape

(1703, 3411)

In [19]:
# save CountVectorizer
path_save_count_vect = PATH_FOLDER_SAVE + "/count_vect.pkl"
joblib.dump(tf_vectorizer, path_save_count_vect)
print("tf_vectorizer Saved here:\n{}".format(path_save_count_vect))

tf_vectorizer Saved here:
/content/drive/My Drive/OpenClassRooms/IML_Projet_7/data/count_vect.pkl


In [20]:
# test counts
X_test_counts = tf_vectorizer.transform(
    df_news_clean[df_news_clean["train"] == False]["clean_text"])
X_test_counts.shape

(729, 3411)

### TF-IDF

In [21]:
# calculate tf-idf
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
# train
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

print("X_train_tfidf.shape" , X_train_tfidf.shape)
# test
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print("X_test_tfidf.shape" , X_test_tfidf.shape)

X_train_tfidf.shape (1703, 3411)
X_test_tfidf.shape (729, 3411)


In [22]:
# save TfidfTransformer
path_save_tfidf_transformer = PATH_FOLDER_SAVE + "/tfidf_transformer.pkl"
joblib.dump(tfidf_transformer, path_save_tfidf_transformer)
print("tfidf_transformer Saved here:\n{}".format(path_save_tfidf_transformer))

tfidf_transformer Saved here:
/content/drive/My Drive/OpenClassRooms/IML_Projet_7/data/tfidf_transformer.pkl


# Baseline : LogisticRegression




### Train model

In [23]:
clf_log_reg = LogisticRegression(random_state=0)

clf_log_reg.fit(X_train_tfidf, 
              df_news_clean[df_news_clean["train"] == True]["fake"])
print("Train Score : ",
      clf_log_reg.score(X_train_tfidf, 
                      df_news_clean[df_news_clean["train"] == True]["fake"]))
print("Test Score : ",
      clf_log_reg.score(X_test_tfidf, 
                      df_news_clean[df_news_clean["train"] == False]["fake"]))

Train Score :  0.9782736347621844
Test Score :  0.9012345679012346




In [24]:
# save clf_log_reg
path_save_clf_log_reg = PATH_FOLDER_SAVE + "/clf_log_reg.pkl"
joblib.dump(clf_log_reg, path_save_clf_log_reg)
print("clf_log_reg Saved here:\n{}".format(path_save_clf_log_reg))

clf_log_reg Saved here:
/content/drive/My Drive/OpenClassRooms/IML_Projet_7/data/clf_log_reg.pkl
