# Imports

In [2]:
import pandas as pd
import numpy as np 
import string
import random
import vbcode

import nltk
from nltk.corpus import brown
from nltk.corpus import reuters

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer

In [3]:
# Count the documents available in the NLTK Reuters news corpus (roughly 10,000 files).
len(reuters.fileids())

10788

In [4]:
# Show the first 201 characters of the raw text of one document.
reuters.raw(fileids=['test/14826'])[0:201]

"ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT\n  Mounting trade friction between the\n  U.S. And Japan has raised fears among many of Asia's exporting\n  nations that the row could inflict far-reaching"

# Tokenization

In [5]:
# Strip every ASCII punctuation character from the raw text of each document.
exclude = set(string.punctuation)
# Translation table mapping each punctuation character to None (i.e. delete it).
strip_punctuation = str.maketrans(dict.fromkeys(exclude))

alldocslist = []
# `index` stays a module-level name after the loop; later cells may still refer to it.
for index, file_id in enumerate(reuters.fileids()):
    alldocslist.append(reuters.raw(fileids=[file_id]).translate(strip_punctuation))

# Preview the second document (position 1) after cleaning.
print(alldocslist[1])

CHINA DAILY SAYS VERMIN EAT 712 PCT GRAIN STOCKS
  A survey of 19 provinces and seven cities
  showed vermin consume between seven and 12 pct of Chinas grain
  stocks the China Daily said
      It also said that each year 1575 mln tonnes or 25 pct of
  Chinas fruit output are left to rot and 21 mln tonnes or up
  to 30 pct of its vegetables The paper blamed the waste on
  inadequate storage and bad preservation methods
      It said the government had launched a national programme to
  reduce waste calling for improved technology in storage and
  preservation and greater production of additives The paper
  gave no further details
  




In [6]:
# Tokenize every cleaned document into a list of word tokens (one list per document,
# in the same order as alldocslist).
tokenized_docs = [word_tokenize(doc) for doc in alldocslist]

# Layout kept for compatibility with the rest of the notebook: every outer slot refers
# to the SAME inner list, so plot_data[k][doc_idx] yields the tokens of document
# doc_idx for any k (the following cells use plot_data[0][doc_idx]).
# It is built explicitly here rather than by appending through a loop counter left
# over from an earlier cell, so this cell no longer depends on hidden kernel state.
plot_data = [tokenized_docs] * len(alldocslist)

# Show all tokens of the second document (position 1) after tokenization.
print(plot_data[0][1])

['CHINA', 'DAILY', 'SAYS', 'VERMIN', 'EAT', '712', 'PCT', 'GRAIN', 'STOCKS', 'A', 'survey', 'of', '19', 'provinces', 'and', 'seven', 'cities', 'showed', 'vermin', 'consume', 'between', 'seven', 'and', '12', 'pct', 'of', 'Chinas', 'grain', 'stocks', 'the', 'China', 'Daily', 'said', 'It', 'also', 'said', 'that', 'each', 'year', '1575', 'mln', 'tonnes', 'or', '25', 'pct', 'of', 'Chinas', 'fruit', 'output', 'are', 'left', 'to', 'rot', 'and', '21', 'mln', 'tonnes', 'or', 'up', 'to', '30', 'pct', 'of', 'its', 'vegetables', 'The', 'paper', 'blamed', 'the', 'waste', 'on', 'inadequate', 'storage', 'and', 'bad', 'preservation', 'methods', 'It', 'said', 'the', 'government', 'had', 'launched', 'a', 'national', 'programme', 'to', 'reduce', 'waste', 'calling', 'for', 'improved', 'technology', 'in', 'storage', 'and', 'preservation', 'and', 'greater', 'production', 'of', 'additives', 'The', 'paper', 'gave', 'no', 'further', 'details']


In [7]:
# Navigation: the first index (0 here) selects the list holding all documents,
# the second index selects one specific document,
# the third index selects words within that document.
print(plot_data[0][1][0:10])

['CHINA', 'DAILY', 'SAYS', 'VERMIN', 'EAT', '712', 'PCT', 'GRAIN', 'STOCKS', 'A']


# Normalisation

In [8]:
# Lowercase every token of every document; each document's token list is replaced
# in place inside plot_data[0].
for doc_idx in range(len(reuters.fileids())):
    plot_data[0][doc_idx] = list(map(str.lower, plot_data[0][doc_idx]))

# Show the first 10 tokens of the second document (position 1).
plot_data[0][1][0:10]

['china',
 'daily',
 'says',
 'vermin',
 'eat',
 '712',
 'pct',
 'grain',
 'stocks',
 'a']

# Suppression des mots vides

In [9]:
# Remove English stop words from every document.
stop_words = set(stopwords.words('english'))

for doc_idx in range(len(reuters.fileids())):
    # Keep only the tokens that are not in the stop-word set.
    plot_data[0][doc_idx] = [
        token for token in plot_data[0][doc_idx] if token not in stop_words
    ]

# Show the first 10 remaining tokens of the third document (position 2).
plot_data[0][2][0:10]

['japan',
 'revise',
 'longterm',
 'energy',
 'demand',
 'downwards',
 'ministry',
 'international',
 'trade',
 'industry']

# Stemming

In [10]:
# Reduce every token of every document to its Porter stem.
# The token lists in plot_data[0] are overwritten, so running this cell again
# stems the already-stemmed tokens a second time.
porter_stemmer = PorterStemmer()
for doc_idx in range(len(reuters.fileids())):
    plot_data[0][doc_idx] = list(map(porter_stemmer.stem, plot_data[0][doc_idx]))

# Show the first 10 stems of the second document (position 1).
print(plot_data[0][1][0:10])

['china', 'daili', 'say', 'vermin', 'eat', '712', 'pct', 'grain', 'stock', 'survey']


# Compression

In [11]:
# Normalize the document IDs: 'test/14826' -> 14826 (drop the split prefix, convert to int).
# The list returned by reuters.fileids() is updated IN PLACE because the encoding cell
# reads the IDs back through reuters.fileids().
# NOTE(review): this relies on fileids() handing back the corpus reader's own list
# object, so the reader itself is altered; keeping the integers in a separate list
# would be cleaner -- confirm nothing else depends on the in-place change first.
# str(...) and [-1] make the cell safe to re-run when the IDs are already integers
# (the previous two-loop version raised AttributeError on a second execution).
file_ids = reuters.fileids()
file_ids[:] = [int(str(file_id).split("/")[-1]) for file_id in file_ids]

## Variable Byte Encoding 

In [12]:
# Encoding: variable-byte encode the list returned by reuters.fileids().
# The values are passed to the encoder as they are; no gap/delta step is applied here.

id_encoding = vbcode.encode(reuters.fileids())

# Decoding: decode the byte string produced above (round trip of the encoding).

id_decoding = vbcode.decode(id_encoding)

In [13]:
# Display the encoded byte string.
# NOTE(review): this prints the entire payload; showing a slice (e.g. the first few
# dozen bytes) plus len(id_encoding) would keep the notebook output short.
print(id_encoding)

b's\xeas\xecs\xeds\xf0s\xf1s\xf7s\xf8s\xf9s\xfas\xfbs\xfct\x81t\x84t\x86t\x8at\x8bt\x8ct\x8dt\x8et\x8ft\x91t\x93t\x98t\x99t\x9bt\x9ct\x9dt\xa1t\xa2t\xa5t\xa6t\xa8t\xaat\xabt\xact\xb3t\xb4t\xb7t\xb8t\xbbt\xbdt\xbft\xc0t\xc1t\xc6t\xc7t\xc9t\xcat\xcbt\xcet\xd0t\xd2t\xd3t\xd4t\xd5t\xd6t\xddt\xdft\xe5t\xe7t\xeat\xedt\xeet\xeft\xf0t\xf2t\xf3t\xf4t\xf5t\xf7t\xf8t\xf9t\xfat\xfbt\xfet\xffu\x82u\x85u\x86u\x87u\x88u\x89u\x8au\x8bu\x8cu\x91u\x93u\x96u\x98u\x99u\x9au\x9cu\x9du\x9eu\xa3u\xa4u\xa5u\xa8u\xa9u\xacu\xafu\xb0u\xb2u\xb3u\xb4u\xb5u\xb7u\xb8u\xb9u\xbdu\xbeu\xc3u\xc5u\xc6u\xc8u\xc9u\xccu\xcdu\xcfu\xd0u\xd4u\xd5u\xd6u\xd7u\xd9u\xdbu\xddu\xdeu\xe2u\xe5u\xe6u\xe7u\xeau\xf2u\xf3u\xf4u\xf5u\xf6u\xf7u\xf8u\xf9u\xffv\x80v\x82v\x83v\x85v\x86v\x87v\x88v\x8ev\x8fv\x90v\x91v\x92v\x94v\x96v\x98v\x99v\x9av\x9cv\xa0v\xa2v\xa5v\xa8v\xa9v\xaav\xadv\xb0v\xb1v\xb2v\xb4v\xb5v\xb9v\xbav\xc3v\xc4v\xc7v\xcbv\xccv\xd1v\xd4v\xd5v\xd6v\xd9v\xdav\xddv\xdev\xe0v\xe4v\xe5v\xe6v\xe7v\xe8v\xeav\xebv\xecv\xedv\xf1v\xf3v\x