In [1]:
!pip install wordcloud



In [2]:
%run ../preprocessing.py

In [3]:
import os
import re
import csv
import json
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import matplotlib.pyplot as plt

In [4]:
ROOT_FOLDER = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + "/"
DATASET_PATH = ROOT_FOLDER + "dataset/"

In [5]:
# NLTK stopwords
stopwords = set(['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez', 'eussent'])

## Import dataset

In [6]:
labels = ['santé', 'science_high-tech', 'sports', 'économie', 'international', 'culture', 'france', 'homepage'] 

In [7]:
def append_(dictionnary, key, value):
    if key in dictionnary:
        dictionnary[key].append(value)
    else:
        dictionnary[key] = [value]

In [8]:
def get_main_category(dictOfNames):
    new_dict = {}
    try:
        for (key,value) in dictOfNames.items():
            #if "score" in key or "applenews" in key or "homepage" in key:
            #    continue
            new_key = re.sub(r'desktop_|mobile_webview_', "", key)
            new_key = re.sub(r'google_', "", new_key)
            if new_key not in labels:
                continue
            if new_key not in new_dict:
                new_dict[new_key] = 0
            new_dict[new_key] += value
        #return [key for key in new_dict.keys()]
        return max(new_dict, key=new_dict.get)
    except ValueError as e :
        return ""

In [9]:
category_count = {} # { category_1: ["article_1"]}

with open(DATASET_PATH+'shuffled_since_january.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    next(reader)
    for i, row in enumerate(reader):
        if len(row) != 5:
            continue

        category = ""
        if row[4] != {} and row[4] != "":
            try:
                category = get_main_category(json.loads(row[4]))
            except Exception as e:
                print(row)
                print(row[4])
                assert False
        if category == "":
            continue

        text = clean_text(row[0], remove_stopwords=True, lower=False)
        append_(category_count, category, text)

In [10]:
categories = [". ".join(val) for key, val in category_count.items()]

In [11]:
len(categories)

8

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(categories)

In [14]:
X.shape

(8, 68175)

In [16]:
X[0][:10].toarray()

array([[0.00050064, 0.00427453, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [21]:
vectorizer.get_feature_names()

['00',
 '000',
 '0000',
 '000e',
 '000m',
 '000pts',
 '000â',
 '001',
 '004',
 '007',
 '01',
 '010',
 '016',
 '019',
 '01e',
 '01hebdo',
 '01live',
 '01net',
 '02',
 '020',
 '03',
 '03100',
 '03200',
 '03470',
 '035',
 '038',
 '04',
 '044',
 '046',
 '05',
 '050',
 '056',
 '06',
 '060',
 '061fr',
 '06581138z',
 '06f',
 '07',
 '077',
 '08',
 '080',
 '081',
 '083',
 '0880',
 '08h49',
 '09',
 '099',
 '09h00',
 '09h34',
 '09h54',
 '0c',
 '0commentaires',
 '0day',
 '0h',
 '0h00',
 '0h15',
 '0h30',
 '0h45',
 '0l',
 '0x80242006',
 '0â',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '10000x',
 '1000e',
 '1000g',
 '1000xm3',
 '1000xm4',
 '1000ème',
 '1001',
 '100e',
 '100eme',
 '100go',
 '100k',
 '100km',
 '100m',
 '100me',
 '100sationnel',
 '100w',
 '100x',
 '100â',
 '100ème',
 '101',
 '102',
 '10210u',
 '103',
 '10300h',
 '103e',
 '103â',
 '104',
 '10400',
 '1043',
 '105',
 '1059',
 '105e',
 '105mm',
 '106',
 '1060',
 '10600',
 '1068g7',
 '106e',
 '106â',
 '106è',
 '107',
 '10700f',
 '10700k',