In [None]:
!pip install unidecode
!wget http://embeddings.net/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin

In [6]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re
import string
import os
import networkx as nx
import codecs
from os import path

In [7]:
data=pd.read_csv('../../data/train.csv')  

In [8]:
data.head()

Unnamed: 0,9032,health/medical
0,5346,entertainment
1,18778,entertainment
2,11939,education/research
3,17502,tech/science
4,7904,health/medical


## 1.Load Data

In [9]:
def get_data_full_modified():
    with open("../../data/train.csv", 'r') as f:
        train_data = f.read().splitlines()
    with open("../../data/test.csv", 'r') as f:
        test_hosts = f.read().splitlines()

    train_hosts = list()
    y_train = list()
    for row in train_data:
        host, label = row.split(",")
        train_hosts.append(host)
        y_train.append(label.lower())

    # Text data
    # Load the textual content of a set of webpages for each host into the dictionary "data".
    # The encoding parameter is required since the majority of our data is french.
    text = dict()
    filenames = os.listdir('../../data/text')
    
    for filename in filenames:
        try:
            with codecs.open(path.join('../../data/text/', filename), encoding='utf-8') as f:
                text[filename] = f.read().replace("\n", "").lower()
        except:
            with codecs.open(path.join('../../data/text/', filename), encoding='latin-1') as f:
                text[filename] = f.read().replace("\n", "").lower()
    
    X_train = list()
    for host in train_hosts:
        if host in text:
            X_train.append([host, text[host]])
        else:
            X_train.append([host, ''])
    # Get textual content of web hosts of the test set
    X_test = list()
    for host in test_hosts:
        if host in text:
            X_test.append([host, text[host]])
        else:
            X_test.append([host, ''])
    return X_train, y_train, X_test, test_hosts

In [10]:
X_train, y_train, X_test, test_hosts = get_data_full_modified() 

In [134]:
data_train = pd.DataFrame({'text': X_train,'category': y_train,})
data_train.head()

Unnamed: 0,text,category
0,"[9032, #polepharma » flux polepharma » flux...",health/medical
1,"[5346, 301 moved p...",entertainment
2,"[18778, (button) fermer en poursuivant vo...",entertainment
3,"[11939, #hal (button) toggle navigation ...",education/research
4,"[17502, user-agent: * disallow: disallow: /...",tech/science


In [135]:
data_train[['train_host','text']] = pd.DataFrame(data_train.text.values.tolist(), index= data_train.index)
data_train.set_index('train_host', inplace=True)
data_train.head()

Unnamed: 0_level_0,text,category
train_host,Unnamed: 1_level_1,Unnamed: 2_level_1
9032,#polepharma » flux polepharma » flux des co...,health/medical
5346,301 moved permanen...,entertainment
18778,(button) fermer en poursuivant votre navi...,entertainment
11939,#hal (button) toggle navigation ccsd ...,education/research
17502,user-agent: * disallow: disallow: /publishe...,tech/science


In [46]:
data_train.index

Index(['9032', '5346', '18778', '11939', '17502', '7904', '9232', '21604',
       '26782', '16397',
       ...
       '11890', '17', '2498', '19658', '17706', '957', '26416', '10643',
       '17307', '19307'],
      dtype='object', name='train_host', length=2125)

In [18]:
print(data_train['text'][3])

   #hal   (button) toggle navigation   ccsd     * hal          + hal          + halshs          + tel          + médihal          + liste des portails          +          + auréhal          + api          + data          + documentation     * episciences.org          + episciences.org          + revues          +          + documentation     * sciencesconf.org     * support   (button)  connexion (button)     * connexion     * connexion avec orcid     * se connecter avec fédération     *     * créer un compte     *     * mot de passe oublié ?     * login oublié ?     * fr     * en     * accueil     * dépôt          + je dépose          + questions pratiques          + questions juridiques          + c'est quoi l'oa ?     * consultation          + consultation par période          + consultation par auteur          + consultation par collections          + consultation par type de publication          + consultation par structure          + consultation par discipline          + consulta

In [19]:
data_test = pd.DataFrame({'text': X_test,})
data_test.head()

Unnamed: 0,text
0,"[27997, iframe: //www.googletagmanager.com/..."
1,"[9316, iframe: https://www.googletagmanager..."
2,"[27045, #toutes les astuces beauté ⋅ astuce..."
3,"[19805, [logodefault.jpg] [etab_juvisy-su..."
4,"[26580, #l'école de demain » flux l'école d..."


In [20]:
data_test[['test_host','text']] = pd.DataFrame(data_test.text.values.tolist(), index= data_test.index)
data_test.set_index('test_host', inplace=True)
data_test.head()

Unnamed: 0_level_0,text
test_host,Unnamed: 1_level_1
27997,iframe: //www.googletagmanager.com/ns.html?...
9316,iframe: https://www.googletagmanager.com/ns...
27045,#toutes les astuces beauté ⋅ astuces beauté...
19805,[logodefault.jpg] [etab_juvisy-sur-orge.p...
26580,#l'école de demain » flux l'école de demain...


In [21]:
data_test.shape

(560, 1)

In [22]:
labels = ['business/finance', 'education/research', 'entertainment', 'health/medical',
          'news/press', 'politics/government/law', 'sports', 'tech/science']

## 2.Preprocessing

In [23]:
### Remove html tags and uris from contents
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

def stripTagsAndUris(x):
    if x:
        # BeautifulSoup on content
        soup = BeautifulSoup(x, "html.parser")
        # Stripping all <code> tags with their content if any
        if soup.code:
            soup.code.decompose()
        # Get all the text out of the html
        text =  soup.get_text()
        # Returning text stripping out all uris
        return re.sub(uri_re, "", text)
    else:
        return ""

In [24]:
def removePunctuation(x):
    # Lowercasing all words
    x = x.lower()
    # Removing non ASCII chars
    x = re.sub(r'[^\x00-\x7f]',r' ',x)
    # Removing (replacing with empty spaces actually) all the punctuations
    return re.sub("["+string.punctuation+"]", " ", x)

In [25]:
#Removing stopwords 
from nltk.corpus import stopwords
import numpy as np
import pickle

stops = set(stopwords.words('french'))
def removeStopwords(x):
    # Removing all the stopwords
    filtered_words = [word for word in x.split() if word not in stops]
    return " ".join(filtered_words)

### Text embedding 

In [23]:
import gensim
from unidecode import unidecode

from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec
wv_from_bin = KeyedVectors.load_word2vec_format('frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin', binary=True)

In [26]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize

In [2]:
wv_from_bin

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x2345bdb38d0>

In [27]:
data_train_tokenized = [word_tokenize(doc, language='french') for doc in data_train['text']]

In [28]:
data_test_tokenized = [word_tokenize(doc, language='french') for doc in data_test['text']]

In [32]:
dct_train = Dictionary(data_train_tokenized, prune_at=1000000) 
dct_test = Dictionary(data_test_tokenized, prune_at=1000000) 
corpus_train= [dct_train.doc2bow(doc) for doc in data_train_tokenized]
corpus_test= [dct_test.doc2bow(doc) for doc in data_test_tokenized]

model_train = TfidfModel(corpus_train)
model_test=TfidfModel(corpus_test)


In [122]:
from nltk.corpus import stopwords
import numpy as np
import pickle

st = set(stopwords.words('french'))

doc_embeds = {}
for  fn, doc_train in zip(data_train.index,corpus_train):
    
    print
    doc_embed = np.zeros(200)
    doc_tfidf = model_train[doc_train]
    l = 1
    for word_id, freq in doc_tfidf:
        try:
            word = dct_train[word_id].split("'")[-1]
            if len(word) > 1 and word.isalpha() and word not in st:
                word_embed = wv_from_bin[dct_train[word_id]]
                doc_embed += freq * word_embed
                l += 1
        except:
            pass
        # doc_embeds.append(doc_embed / l)
        
        doc_embeds[fn] = doc_embed / l
       
        
    pickle.dump(doc_embeds, open('doc_embeds.pkl', 'wb'))

In [129]:
data_embed= pd.DataFrame(doc_embeds).T

In [130]:
data_embed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
9032,0.000407,0.0005,0.002512,3.4e-05,-0.000424,-4.5e-05,0.001448,0.00018,0.000722,-0.001588,...,0.000158,-0.001133,-0.004803,-0.002278,-0.000368,0.002254,-0.000196,-0.000155,-0.001363,-0.000389
5346,-0.016258,0.037862,0.183643,-0.047882,-0.388329,0.341453,0.008773,-0.006256,0.024864,-0.007944,...,-0.226902,0.113912,-0.102976,-0.044072,0.06411,-0.141412,0.227155,0.153829,-0.083734,-0.126567
18778,0.004225,0.001992,-0.001086,-0.004034,0.001287,0.001131,0.002279,0.002018,0.001242,-0.002825,...,0.001804,-0.00158,-0.003133,-0.000617,0.001095,0.000351,-0.004243,0.001923,-0.005732,0.000751
11939,-0.007566,-0.013146,0.00521,-0.017795,0.0037,-0.001216,0.001374,-0.00166,0.003447,0.011853,...,-0.003085,0.002724,0.00449,0.006041,-0.006865,0.006479,0.010296,-0.005845,-0.013943,0.012342
17502,0.006932,-0.007299,0.002596,-0.008959,0.007271,-0.001274,-0.00532,-0.000119,-0.003201,0.002527,...,-0.00565,-0.00338,0.00088,-0.001482,-0.002526,0.008101,0.002013,0.006539,-0.007604,-0.004728


In [136]:
data_train_bis=data_train.merge(data_embed, left_on='train_host', right_index=True)

In [137]:
data_train_bis.head()

Unnamed: 0_level_0,text,category,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
train_host,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9032,#polepharma » flux polepharma » flux des co...,health/medical,0.000407,0.0005,0.002512,3.4e-05,-0.000424,-4.5e-05,0.001448,0.00018,...,0.000158,-0.001133,-0.004803,-0.002278,-0.000368,0.002254,-0.000196,-0.000155,-0.001363,-0.000389
5346,301 moved permanen...,entertainment,-0.016258,0.037862,0.183643,-0.047882,-0.388329,0.341453,0.008773,-0.006256,...,-0.226902,0.113912,-0.102976,-0.044072,0.06411,-0.141412,0.227155,0.153829,-0.083734,-0.126567
18778,(button) fermer en poursuivant votre navi...,entertainment,0.004225,0.001992,-0.001086,-0.004034,0.001287,0.001131,0.002279,0.002018,...,0.001804,-0.00158,-0.003133,-0.000617,0.001095,0.000351,-0.004243,0.001923,-0.005732,0.000751
11939,#hal (button) toggle navigation ccsd ...,education/research,-0.007566,-0.013146,0.00521,-0.017795,0.0037,-0.001216,0.001374,-0.00166,...,-0.003085,0.002724,0.00449,0.006041,-0.006865,0.006479,0.010296,-0.005845,-0.013943,0.012342
17502,user-agent: * disallow: disallow: /publishe...,tech/science,0.006932,-0.007299,0.002596,-0.008959,0.007271,-0.001274,-0.00532,-0.000119,...,-0.00565,-0.00338,0.00088,-0.001482,-0.002526,0.008101,0.002013,0.006539,-0.007604,-0.004728


In [138]:
from nltk.corpus import stopwords
import numpy as np
import pickle

st = set(stopwords.words('french'))

doc_embeds = {}
for  fn, doc_test in zip(data_test.index,corpus_test):
    
    print
    doc_embed = np.zeros(200)
    doc_tfidf = model_test[doc_test]
    l = 1
    for word_id, freq in doc_tfidf:
        try:
            word = dct_test[word_id].split("'")[-1]
            if len(word) > 1 and word.isalpha() and word not in st:
                word_embed = wv_from_bin[dct_test[word_id]]
                doc_embed += freq * word_embed
                l += 1
        except:
            pass
        # doc_embeds.append(doc_embed / l)
        
        doc_embeds[fn] = doc_embed / l
       
        
    pickle.dump(doc_embeds, open('doc_embeds.pkl', 'wb'))

In [139]:
data_embed_test= pd.DataFrame(doc_embeds).T

In [142]:
data_embed_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
27997,0.005762,-0.034971,-0.003441,-0.014206,0.004646,0.005454,-0.004475,0.010124,0.008006,0.033156,...,-0.007316,0.013128,0.017282,-0.018139,0.006837,0.012308,0.015443,0.017533,-0.012994,-0.007444
9316,0.006638,-0.024499,0.004011,-0.005909,0.000808,0.001149,-0.003058,0.00122,0.001687,0.019726,...,0.004918,-0.008148,0.011051,-0.014156,-0.003003,0.01851,0.003232,0.011842,-0.006527,-0.00309
27045,0.0039,-0.000173,0.001875,-0.002115,-0.004927,0.003469,-0.000761,-0.00359,0.002484,0.001478,...,0.002075,-0.00173,0.001444,-0.00302,0.00216,-0.001046,0.000228,0.001834,-0.003837,-0.000849
19805,-0.000913,-0.003279,0.004304,-0.000381,0.001662,-0.000818,0.002793,-0.001739,-0.001714,0.001071,...,-0.004635,0.000484,-0.005664,-0.002815,0.003551,-0.001863,-0.003259,0.0014,0.000818,-3.6e-05
26580,0.000103,0.002366,0.000578,-0.000381,0.002157,-0.00078,0.001223,0.00193,0.000667,0.000196,...,0.002187,0.000705,0.001159,0.000935,0.001348,-0.002309,0.000554,-0.00018,-0.002425,0.000813


In [146]:
data_test_bis=data_test.merge(data_embed_test, left_on='test_host', right_index=True)

In [147]:
data_test.head()

Unnamed: 0_level_0,text
test_host,Unnamed: 1_level_1
27997,iframe: //www.googletagmanager.com/ns.html?...
9316,iframe: https://www.googletagmanager.com/ns...
27045,#toutes les astuces beauté ⋅ astuces beauté...
19805,[logodefault.jpg] [etab_juvisy-sur-orge.p...
26580,#l'école de demain » flux l'école de demain...


In [148]:
data_test_bis.head()

Unnamed: 0_level_0,text,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
test_host,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27997,iframe: //www.googletagmanager.com/ns.html?...,0.005762,-0.034971,-0.003441,-0.014206,0.004646,0.005454,-0.004475,0.010124,0.008006,...,-0.007316,0.013128,0.017282,-0.018139,0.006837,0.012308,0.015443,0.017533,-0.012994,-0.007444
9316,iframe: https://www.googletagmanager.com/ns...,0.006638,-0.024499,0.004011,-0.005909,0.000808,0.001149,-0.003058,0.00122,0.001687,...,0.004918,-0.008148,0.011051,-0.014156,-0.003003,0.01851,0.003232,0.011842,-0.006527,-0.00309
27045,#toutes les astuces beauté ⋅ astuces beauté...,0.0039,-0.000173,0.001875,-0.002115,-0.004927,0.003469,-0.000761,-0.00359,0.002484,...,0.002075,-0.00173,0.001444,-0.00302,0.00216,-0.001046,0.000228,0.001834,-0.003837,-0.000849
19805,[logodefault.jpg] [etab_juvisy-sur-orge.p...,-0.000913,-0.003279,0.004304,-0.000381,0.001662,-0.000818,0.002793,-0.001739,-0.001714,...,-0.004635,0.000484,-0.005664,-0.002815,0.003551,-0.001863,-0.003259,0.0014,0.000818,-3.6e-05
26580,#l'école de demain » flux l'école de demain...,0.000103,0.002366,0.000578,-0.000381,0.002157,-0.00078,0.001223,0.00193,0.000667,...,0.002187,0.000705,0.001159,0.000935,0.001348,-0.002309,0.000554,-0.00018,-0.002425,0.000813
