In [4]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re
import string
import os
import networkx as nx
import codecs
from os import path

## 1. Load Data

In [5]:
def _read_host_text(file_path):
    """Return the flattened, lower-cased content of one host's text file.

    The bytes are decoded as UTF-8 first; content that is not valid UTF-8 is
    decoded as latin-1 instead, which accepts any byte sequence. Only a
    decoding failure triggers the fallback, so unrelated errors (missing
    file, interrupt, ...) propagate instead of being silently retried.
    """
    with open(file_path, 'rb') as f:
        raw = f.read()
    try:
        content = raw.decode('utf-8')
    except UnicodeDecodeError:
        content = raw.decode('latin-1')
    # Newlines are removed outright (not replaced by a space), so the end of
    # one line is joined directly to the start of the next.
    return content.replace("\n", "").lower()


def _pair_hosts_with_text(hosts, text):
    """Build [host, text] pairs, using '' for hosts that have no text file."""
    return [[host, text.get(host, '')] for host in hosts]


def get_data_full_modified(data_dir="./data"):
    """Load the train/test host lists, the training labels and the host texts.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding `train.csv` (one `host,label` row per line),
        `test.csv` (one host per line) and a `text/` sub-directory with one
        file per host, named after the host. Defaults to "./data".

    Returns
    -------
    X_train : list of [host, text]
        One pair per training host; text is '' when no file exists for it.
    y_train : list of str
        Lower-cased label of each training host, aligned with X_train.
    X_test : list of [host, text]
        One pair per test host; text is '' when no file exists for it.
    test_hosts : list of str
        The lines of `test.csv`, in file order.

    Raises
    ------
    ValueError
        If a non-blank row of `train.csv` does not contain exactly one comma.
    """
    with open(os.path.join(data_dir, "train.csv"), 'r') as f:
        train_data = f.read().splitlines()
    with open(os.path.join(data_dir, "test.csv"), 'r') as f:
        test_hosts = f.read().splitlines()

    train_hosts = []
    y_train = []
    for row in train_data:
        if not row.strip():
            # Tolerate blank lines instead of failing on the unpack below.
            continue
        host, label = row.split(",")
        train_hosts.append(host)
        y_train.append(label.lower())

    # Text data
    # Load the textual content of the webpages of each host into the
    # dictionary "text", keyed by file name (which is the host identifier).
    # Per the original note, most of the data is French, hence the explicit
    # UTF-8 decoding with a latin-1 fallback.
    text_dir = os.path.join(data_dir, "text")
    text = {
        filename: _read_host_text(os.path.join(text_dir, filename))
        for filename in os.listdir(text_dir)
    }

    X_train = _pair_hosts_with_text(train_hosts, text)
    # Get textual content of web hosts of the test set
    X_test = _pair_hosts_with_text(test_hosts, text)
    return X_train, y_train, X_test, test_hosts

In [6]:
# Read the training pairs and labels plus the test pairs and host list from disk.
X_train, y_train, X_test, test_hosts = get_data_full_modified() 

In [7]:
# Tabulate the training set; at this point each 'text' cell still holds a
# [host, text] pair as returned by the loader.
data_train = pd.DataFrame(dict(text=X_train, category=y_train))
data_train.head()

Unnamed: 0,text,category
0,"[9032, #polepharma » flux polepharma » flux...",health/medical
1,"[5346, 301 moved p...",entertainment
2,"[18778, (button) fermer en poursuivant vo...",entertainment
3,"[11939, #hal (button) toggle navigation ...",education/research
4,"[17502, user-agent: * disallow: disallow: /...",tech/science


In [8]:
# Split each [host, text] pair into an index ('train_host') and a 'text' column.
# The frame is built directly from X_train / y_train rather than by unpacking
# its own 'text' column in place, so the cell gives the same result however
# many times it is run.
data_train = (
    pd.DataFrame(X_train, columns=['train_host', 'text'])
    .assign(category=y_train)
    .set_index('train_host')
)
data_train.head()

Unnamed: 0_level_0,text,category
train_host,Unnamed: 1_level_1,Unnamed: 2_level_1
9032,#polepharma » flux polepharma » flux des co...,health/medical
5346,301 moved permanen...,entertainment
18778,(button) fermer en poursuivant votre navi...,entertainment
11939,#hal (button) toggle navigation ccsd ...,education/research
17502,user-agent: * disallow: disallow: /publishe...,tech/science


In [9]:
# Sanity check: (rows, columns) of the host-indexed training frame.
data_train.shape

(2125, 2)

In [10]:
# Peek at the text stored for one training host (row label '9032').
print(data_train.loc['9032', 'text'])

   #polepharma » flux polepharma » flux des commentaires alternate   alternate   polepharma   polepharma ﻿ recherche_________ submit   (button) toggle navigation   polepharma     * polepharma          + qui sommes-nous ?          + organigramme          + les chiffres clés          + un réseau     * territoires          + région centre-val de loire          + région normandie          + conseil départemental d’eure&loir          + chartres metropole          + metropole de rouen          + agglo du pays de dreux          + agglo evreux portes de normandie          + agglo seine&eure          + agglo de tours plus     * performance industrielle          + excellence opérationnelle          + maintenance          + packaging          + supply chain          + usine du futur     * talents     * relations publiques     * innovation          + biomédicaments          + fédérer          + collaborer          + promouvoir   adhérer à polepharma   espace adhérent   rechercher un adhérent   je 

In [11]:
# Tabulate the test set; at this point each 'text' cell still holds a
# [host, text] pair as returned by the loader.
data_test = pd.DataFrame(dict(text=X_test))
data_test.head()

Unnamed: 0,text
0,"[27997, iframe: //www.googletagmanager.com/..."
1,"[9316, iframe: https://www.googletagmanager..."
2,"[27045, #toutes les astuces beauté ⋅ astuce..."
3,"[19805, [logodefault.jpg] [etab_juvisy-su..."
4,"[26580, #l'école de demain » flux l'école d..."


In [12]:
# Split each [host, text] pair into an index ('test_host') and a 'text' column.
# The frame is built directly from X_test rather than by unpacking its own
# 'text' column in place, so the cell gives the same result however many
# times it is run.
data_test = (
    pd.DataFrame(X_test, columns=['test_host', 'text'])
    .set_index('test_host')
)
data_test.head()

Unnamed: 0_level_0,text
test_host,Unnamed: 1_level_1
27997,iframe: //www.googletagmanager.com/ns.html?...
9316,iframe: https://www.googletagmanager.com/ns...
27045,#toutes les astuces beauté ⋅ astuces beauté...
19805,[logodefault.jpg] [etab_juvisy-sur-orge.p...
26580,#l'école de demain » flux l'école de demain...


In [13]:
# Sanity check: (rows, columns) of the host-indexed test frame.
data_test.shape

(560, 1)

In [14]:
# The eight target categories, listed in alphabetical order.
labels = [
    'business/finance',
    'education/research',
    'entertainment',
    'health/medical',
    'news/press',
    'politics/government/law',
    'sports',
    'tech/science',
]

## 2. Preprocessing

In [15]:
### Remove HTML tags and URIs from contents
# Case-insensitive pattern for URI-like substrings. A match starts at
# `http://` / `https://`, a `www.`-style prefix (optionally followed by up to
# three digits) or a `domain.tld/` prefix, then consumes non-space characters
# (allowing up to two levels of balanced parentheses) and must not end on
# trailing punctuation or a closing quote.
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

def stripTagsAndUris(x):
    """Return the text of `x` with <code> elements and URIs removed.

    Parameters
    ----------
    x : str
        Markup (or plain text) to clean. Falsy values such as '' or None are
        accepted.

    Returns
    -------
    str
        The text extracted by BeautifulSoup's "html.parser" with every match
        of `uri_re` deleted, or "" when `x` is falsy.
    """
    if not x:
        return ""
    # NOTE(review): BeautifulSoup is not imported in the visible cells —
    # confirm that `from bs4 import BeautifulSoup` runs before this is called.
    soup = BeautifulSoup(x, "html.parser")
    # Strip every <code> element together with its content. `soup.code` only
    # returns the first remaining one, so keep asking until none is left;
    # nested <code> elements disappear with their parent.
    code_tag = soup.code
    while code_tag is not None:
        code_tag.decompose()
        code_tag = soup.code
    # Get all the text out of the markup, then strip out all URIs.
    return re.sub(uri_re, "", soup.get_text())

In [17]:
def removePunctuation(x):
    """Lower-case `x` and blank out non-ASCII characters and ASCII punctuation.

    Parameters
    ----------
    x : str
        Text to normalise.

    Returns
    -------
    str
        `x` lower-cased, with every non-ASCII character and every character
        of `string.punctuation` replaced by a single space. Whitespace is not
        collapsed.
    """
    # Lowercasing all words
    x = x.lower()
    # Replacing non-ASCII chars with spaces.
    # NOTE(review): this also blanks accented letters (e.g. 'école' becomes
    # ' cole') — confirm that is intended for French text.
    x = re.sub(r'[^\x00-\x7f]', r' ', x)
    # Replacing all punctuation with spaces. The punctuation must be escaped:
    # unescaped, its backslash would escape the following ']' inside the
    # character class and the backslash itself would never be matched.
    return re.sub("[" + re.escape(string.punctuation) + "]", " ", x)

In [20]:
#Removing stopwords 
from nltk.corpus import stopwords
import numpy as np
import pickle

# NLTK's French stopword list, held as a set for the membership tests in removeStopwords.
stops = set(stopwords.words('french'))
def removeStopwords(x):
    """Return `x` without the whitespace-separated tokens found in `stops`.

    The comparison is exact (case-sensitive). Surviving tokens are re-joined
    with single spaces, so runs of whitespace in `x` are collapsed.
    """
    kept_tokens = []
    for token in x.split():
        if token in stops:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)