# Imports

In [0]:
import pandas as pd
import numpy as np 
import string
import random

import nltk
from nltk.corpus import brown
from nltk.corpus import reuters

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer

In [4]:
#charger 10 000 documents d'actualité reuters
len(reuters.fileids())

10788

In [5]:
#afficher le texte d'un document
reuters.raw(fileids=['test/14826'])[0:201]

"ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT\n  Mounting trade friction between the\n  U.S. And Japan has raised fears among many of Asia's exporting\n  nations that the row could inflict far-reaching"

# Tokenization

In [6]:
# supprimer la ponctuation de tous les DOC 
exclude = set(string.punctuation)
alldocslist = []

for index, i in  enumerate(reuters.fileids()):
    text = reuters.raw(fileids=[i])
    text = ''.join(ch for ch in text if ch not in exclude)
    alldocslist.append(text)
    
#afficher le document 1 après la suppression de ponctuation
print(alldocslist[1])

CHINA DAILY SAYS VERMIN EAT 712 PCT GRAIN STOCKS
  A survey of 19 provinces and seven cities
  showed vermin consume between seven and 12 pct of Chinas grain
  stocks the China Daily said
      It also said that each year 1575 mln tonnes or 25 pct of
  Chinas fruit output are left to rot and 21 mln tonnes or up
  to 30 pct of its vegetables The paper blamed the waste on
  inadequate storage and bad preservation methods
      It said the government had launched a national programme to
  reduce waste calling for improved technology in storage and
  preservation and greater production of additives The paper
  gave no further details
  




In [11]:
#tokenize les mots dans tous les DOCS

tokenized_data = [[]] * len(alldocslist)

for doc in alldocslist:
    text = doc
    tokentext = word_tokenize(text)
    tokenized_data[index].append(tokentext)

# afficher tous les termes du premier document après tokenization
print(tokenized_data[0][1])

['CHINA', 'DAILY', 'SAYS', 'VERMIN', 'EAT', '712', 'PCT', 'GRAIN', 'STOCKS', 'A', 'survey', 'of', '19', 'provinces', 'and', 'seven', 'cities', 'showed', 'vermin', 'consume', 'between', 'seven', 'and', '12', 'pct', 'of', 'Chinas', 'grain', 'stocks', 'the', 'China', 'Daily', 'said', 'It', 'also', 'said', 'that', 'each', 'year', '1575', 'mln', 'tonnes', 'or', '25', 'pct', 'of', 'Chinas', 'fruit', 'output', 'are', 'left', 'to', 'rot', 'and', '21', 'mln', 'tonnes', 'or', 'up', 'to', '30', 'pct', 'of', 'its', 'vegetables', 'The', 'paper', 'blamed', 'the', 'waste', 'on', 'inadequate', 'storage', 'and', 'bad', 'preservation', 'methods', 'It', 'said', 'the', 'government', 'had', 'launched', 'a', 'national', 'programme', 'to', 'reduce', 'waste', 'calling', 'for', 'improved', 'technology', 'in', 'storage', 'and', 'preservation', 'and', 'greater', 'production', 'of', 'additives', 'The', 'paper', 'gave', 'no', 'further', 'details']


In [12]:
# Navigation: le premier index donne tous les documents, 
#le deuxième index donne le document spécifique, 
#le troisième index donne les mots de ce document
print(tokenized_data[0][1][0:10])

['CHINA', 'DAILY', 'SAYS', 'VERMIN', 'EAT', '712', 'PCT', 'GRAIN', 'STOCKS', 'A']


# Normalisation

In [14]:
#rendre tous les mots en minuscules pour tous les documents
for x in range(len(reuters.fileids())):
    lowers = [word.lower() for word in tokenized_data[0][x]]
    tokenized_data[0][x] = lowers


# afficher les 10 premiers termes du premier document
tokenized_data[0][1][0:10]

['china',
 'daily',
 'says',
 'vermin',
 'eat',
 '712',
 'pct',
 'grain',
 'stocks',
 'a']

# Suppression des mots vides

In [17]:
#Supprimer les mots vides(Stop words) de tous les documents
stop_words = set(stopwords.words('english'))

for x in range(len(reuters.fileids())):
    filtered_sentence = [w for w in tokenized_data[0][x] if not w in stop_words]
    tokenized_data[0][x] = filtered_sentence

# afficher les 10 premiers termes du deuxième document
tokenized_data[0][1][0:10]

['china',
 'daily',
 'says',
 'vermin',
 'eat',
 '712',
 'pct',
 'grain',
 'stocks',
 'survey']

# Stemming

In [19]:
#stem words

porter_stemmer = PorterStemmer()
for x in range(len(reuters.fileids())):
    stemmed_sentence = [porter_stemmer.stem(w) for w in tokenized_data[0][x]]
    tokenized_data[0][x] = stemmed_sentence

# afficher les 10 premiers stem words du premier document
print(tokenized_data[0][2][0:10])

['japan', 'revis', 'longterm', 'energi', 'demand', 'downward', 'ministri', 'intern', 'trade', 'industri']


# Création de l'index inversé

In [0]:
# Normalisation des Docs_IDs 
for x in range(len(reuters.fileids())):
    reuters.fileids()[x] = (reuters.fileids()[x].split("/"))[1]

for i in range(len(reuters.fileids())):
    reuters.fileids()[i]=int(reuters.fileids()[i])

In [22]:
# Create inverse index which gives document number for each document 

#first we need to create a list of all words 
l = tokenized_data[0]
flatten = [item for sublist in l for item in sublist]
words = flatten
wordsunique = set(words)
wordsunique = list(wordsunique)

wordsunique

['wednesday',
 '54993',
 'matur',
 'demand',
 '7936',
 'hoax',
 '1456000',
 'parallel',
 'pro',
 'disut',
 'frighten',
 '4685930',
 '7134000',
 'wirelesss',
 'bother',
 'bbd',
 'symposium',
 '34186',
 '299000',
 'witter700550700',
 'gari',
 'cleari',
 '9899038',
 'gooseberri',
 '5294',
 'graham',
 'dd',
 'stubbl',
 '1492000',
 'histori',
 'burdekin',
 'canron',
 'fintech',
 '8696000',
 '3761000',
 '526',
 'ltfput',
 'pst',
 'agrianalysi',
 'behoov',
 '15426',
 'mccormack',
 '4641',
 'laborarori',
 '612',
 'solmec',
 '600010000',
 'richmond',
 'stevenson',
 'junk',
 '9543',
 '2341818',
 'cogeca',
 '3007',
 '81p',
 'gummi',
 '236100',
 'goe',
 'aluv',
 'bottl',
 'conceiv',
 'stopout',
 'ltpae',
 'resumpt',
 'ltlomko',
 'inner',
 'gase',
 'nui',
 '1184000',
 '5200',
 '126749',
 '2885',
 'verzekeringsgroep',
 '876',
 'sesostri',
 '5207',
 'solut',
 'greenwel',
 '8800000',
 'scare',
 'ltcometra',
 '3122983',
 '1308503',
 'detent',
 'daz',
 '1174',
 'ltscovil',
 'oblig',
 'vien',
 'ltccn',
 

In [0]:
# Create dictonary of words
import re
import numpy as np

plottest = tokenized_data[0][0:1000]

worddic = {}

for doc in plottest:
    for i in range(len(wordsunique)):
        word = wordsunique[i]
        if word in doc:
            word = str(word)
            index2 = reuters.fileids()[plottest.index(doc)]
            try:
                worddic[word].append([index2])
            except:
                worddic[word] = []
                worddic[word].append([index2])

In [24]:
worddic

{'newspap': [[14826],
  [14904],
  [14909],
  [14932],
  [15154],
  [15367],
  [15581],
  [16097],
  [16203],
  [16216]],
 'largest': [[14826],
  [14833],
  [14840],
  [14858],
  [14862],
  [14888],
  [14900],
  [14926],
  [14959],
  [15013],
  [15149],
  [15204],
  [15238],
  [15270],
  [15413],
  [15417],
  [15438],
  [15840],
  [15975],
  [16009],
  [16093],
  [16094],
  [16122],
  [16176],
  [16212],
  [16277],
  [16332],
  [16094]],
 'world': [[14826],
  [14833],
  [14840],
  [14852],
  [14892],
  [14900],
  [14928],
  [14987],
  [15063],
  [15095],
  [15154],
  [15194],
  [15254],
  [15287],
  [15290],
  [15310],
  [15352],
  [15367],
  [15372],
  [15386],
  [15394],
  [15424],
  [15447],
  [15472],
  [15539],
  [15543],
  [15551],
  [15556],
  [15617],
  [15653],
  [15798],
  [15906],
  [15910],
  [15914],
  [15921],
  [15922],
  [15923],
  [15925],
  [15928],
  [15939],
  [15973],
  [16075],
  [16093],
  [16139],
  [16190],
  [16196],
  [16200],
  [16206],
  [16213],
  [16216],

In [25]:
dictionary = list(worddic.keys())
dictionary_list_doc = list(worddic.values())
print(dictionary)
print(dictionary_list_doc)

['newspap', 'largest', 'world', 'makoto', 'rift', 'two', 'curb', 'ltjame', 'alleg', 'lttaiwan', 'complet', 'quickli', 'feder', '49', 'account', 'stick', 'asia', 'tokyo', 'public', 'beyond', 'safe', 'whole', 'larg', 'meanwhil', 'directorgener', 'told', 'outweigh', 'japan', 'remov', 'put', 'foreign', 'washington', 'financi', 'cut', 'kind', 'friday', 'reuter', 'stimul', 'hong', 'kong', 'signific', 'time', 'analyst', 'economi', 'coal', 'eros', 'murtha', 'day', 'govern', 'april', 'expand', 'deputi', 'budget', 'row', 'reserv', 'effort', 'manoeuvr', 'repres', 'packag', 'place', 'serious', 'boost', 'mln', 'export', 'domest', 'sentiment', 'worri', 'market', 'spokesman', 'one', 'much', 'mount', 'virtual', 'also', 'billion', 'new', 'manufactur', 'length', 'inflict', 'possibl', 'commerci', 'exchang', 'year', 'trade', 'work', '300', 'co', 'democrat', 'chief', 'ltd', 'standoff', 'restrain', 'michael', 'paul', 'yasuhiro', 'move', 'centr', 'threat', 'firm', 'promot', 'us', 'textil', 'dlr', 'belowcost'

In [0]:
postings = []
for i in range(len(dictionary_list_doc)):
    liste = []
    for j in range(len(dictionary_list_doc[i])):
        liste.append(dictionary_list_doc[i][j][0])
    postings.append(liste)
        

In [27]:
postings

[[14826, 14904, 14909, 14932, 15154, 15367, 15581, 16097, 16203, 16216],
 [14826,
  14833,
  14840,
  14858,
  14862,
  14888,
  14900,
  14926,
  14959,
  15013,
  15149,
  15204,
  15238,
  15270,
  15413,
  15417,
  15438,
  15840,
  15975,
  16009,
  16093,
  16094,
  16122,
  16176,
  16212,
  16277,
  16332,
  16094],
 [14826,
  14833,
  14840,
  14852,
  14892,
  14900,
  14928,
  14987,
  15063,
  15095,
  15154,
  15194,
  15254,
  15287,
  15290,
  15310,
  15352,
  15367,
  15372,
  15386,
  15394,
  15424,
  15447,
  15472,
  15539,
  15543,
  15551,
  15556,
  15617,
  15653,
  15798,
  15906,
  15910,
  15914,
  15921,
  15922,
  15923,
  15925,
  15928,
  15939,
  15973,
  16075,
  16093,
  16139,
  16190,
  16196,
  16200,
  16206,
  16213,
  16216,
  16256,
  16257,
  16513],
 [14826, 14904, 15154],
 [14826],
 [14826,
  14840,
  14843,
  14849,
  14852,
  14865,
  14873,
  14875,
  14888,
  14890,
  14921,
  14931,
  14959,
  14964,
  15043,
  15055,
  15063,
  15065,


# TF IDF

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

list_of_docs = [" ".join(x) for x in tokenized_data[0][0:99]]
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(list_of_docs)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
feature_names

['071',
 '10',
 '100',
 '100000',
 '100p',
 '102',
 '102350',
 '103',
 '104357',
 '105',
 '1072',
 '1073163',
 '109',
 '1093',
 '10month',
 '10year',
 '11',
 '110',
 '1100000',
 '113432',
 '114000',
 '1149000',
 '115',
 '116',
 '118350',
 '119',
 '12',
 '121',
 '122',
 '125',
 '126',
 '127',
 '128',
 '129',
 '13',
 '130',
 '130000',
 '1300000',
 '132',
 '133',
 '134',
 '1358000',
 '137600',
 '13member',
 '14',
 '141',
 '142',
 '1427000',
 '144',
 '1440000',
 '145',
 '14525',
 '14530',
 '14533',
 '1456070',
 '1473',
 '149',
 '1490',
 '14937',
 '15',
 '150',
 '150000',
 '1502',
 '153',
 '153454',
 '1539',
 '154',
 '155',
 '1551000',
 '156',
 '15628',
 '157',
 '1575',
 '1588985',
 '1591',
 '16',
 '16000',
 '1600000',
 '161',
 '1627',
 '1651',
 '165185',
 '1653000',
 '166',
 '167',
 '17',
 '1700',
 '1709',
 '172',
 '174739',
 '175',
 '1750',
 '17583',
 '176',
 '1764',
 '177',
 '1784000',
 '18',
 '1801',
 '1824',
 '184',
 '1843',
 '18500',
 '187',
 '188',
 '1888',
 '189',
 '19',
 '1905',
 '

In [29]:
df

Unnamed: 0,071,10,100,100000,100p,102,102350,103,104357,105,1072,1073163,109,1093,10month,10year,11,110,1100000,113432,114000,1149000,115,116,118350,119,12,121,122,125,126,127,128,129,13,130,130000,1300000,132,133,...,wide,widen,will,william,willing,wilson,wipe,wisdom,wish,withdraw,within,without,wmc,woolworth,work,worker,workforc,workpractic,world,worri,worth,would,wouldnt,wr,written,wrong,wtc,yasuhiro,year,yemen,yen,yesterday,yet,yeutter,yield,york,youd,zealand,zeebregt,zinc
0,0.0,0.028565,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.027522,0.0,0.0,0.0,0.053206,0.038344,0.0,0.056839,0.038344,0.0,0.0,0.0,0.0,0.038344,0.059065,0.0,0.000000,0.025037,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.086730,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.046532,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.055547,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.059604,0.0,0.057651,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.181462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.069880,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.064242,0.000000,0.0,0.045753,0.000000,0.0,0.0,0.0,0.0,0.000000,0.035658,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.378117,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282256,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073959,0.062583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.057862,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.161226,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.124175,0.0,0.000000,0.000000,0.073959,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.341853,0.0,0.0,0.0,0.398018,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
querry = 'asia export fear damag downward ministri'

querry_vector = vectorizer.transform([querry]).todense()
print(querry_vector)

[[0. 0. 0. ... 0. 0. 0.]]


# Cosine similarity

In [31]:
list_cosine_similarity=cosine_similarity(vectors,querry_vector[0])
print(np.argmax(list_cosine_similarity))

0


# Compression de Docs_ids

## Gamma-Gap Encoding

## Gap Encoding

In [0]:
import collections

def gap(list):
    counts=[]
    for i in range(len(list)):
        count = []
        count.append(list[i][0])
        for j in range(1,len(list[i])):
            count.append(list[i][j]-list[i][j-1])
        counts.append(count)
    return counts

In [33]:
gap(postings)

[[14826, 78, 5, 23, 222, 213, 214, 516, 106, 13],
 [14826,
  7,
  7,
  18,
  4,
  26,
  12,
  26,
  33,
  54,
  136,
  55,
  34,
  32,
  143,
  4,
  21,
  402,
  135,
  34,
  84,
  1,
  28,
  54,
  36,
  65,
  55,
  -238],
 [14826,
  7,
  7,
  12,
  40,
  8,
  28,
  59,
  76,
  32,
  59,
  40,
  60,
  33,
  3,
  20,
  42,
  15,
  5,
  14,
  8,
  30,
  23,
  25,
  67,
  4,
  8,
  5,
  61,
  36,
  145,
  108,
  4,
  4,
  7,
  1,
  1,
  2,
  3,
  11,
  34,
  102,
  18,
  46,
  51,
  6,
  4,
  6,
  7,
  3,
  40,
  1,
  256],
 [14826, 78, 250],
 [14826],
 [14826,
  14,
  3,
  6,
  3,
  13,
  8,
  2,
  13,
  2,
  31,
  10,
  28,
  5,
  79,
  12,
  8,
  2,
  30,
  51,
  8,
  34,
  12,
  19,
  2,
  17,
  17,
  15,
  42,
  1,
  1,
  23,
  27,
  3,
  5,
  12,
  16,
  13,
  2,
  21,
  -36,
  62,
  2,
  7,
  10,
  1,
  21,
  8,
  12,
  16,
  4,
  9,
  4,
  13,
  27,
  43,
  35,
  8,
  38,
  42,
  35,
  13,
  24,
  59,
  23,
  8,
  40,
  12,
  2,
  9,
  2,
  6,
  14,
  17,
  26,
  1,
  19,
  16,
  

## Gamma Encoding

In [0]:
from bitarray import bitarray


def gamma_code(n):
    binary_n = format(n, 'b')
    binary_offset = bitarray(binary_n[1::])
    unary_length = bitarray(True for i in range(len(binary_offset))) + bitarray([False])
    return unary_length+binary_offset

## Gamma-Gap Encoding

In [0]:
encoding_gap_ids = gap(postings)
encoding_gamma_gap_ids=[]
for i in range(len(encoding_gap_ids)):
    encoding_gamma_gap_id=[]
    for j in range(len(encoding_gap_ids[i])):
        encoding_gamma_gap_id.append(gamma_code(encoding_gap_ids[i][j]))
    encoding_gamma_gap_ids.append(encoding_gamma_gap_id)

In [40]:
encoding_gamma_gap_ids

[[bitarray('111111111111101100111101010'),
  bitarray('1111110001110'),
  bitarray('11001'),
  bitarray('111100111'),
  bitarray('111111101011110'),
  bitarray('111111101010101'),
  bitarray('111111101010110'),
  bitarray('1111111110000000100'),
  bitarray('1111110101010'),
  bitarray('1110101')],
 [bitarray('111111111111101100111101010'),
  bitarray('11011'),
  bitarray('11011'),
  bitarray('111100010'),
  bitarray('11000'),
  bitarray('111101010'),
  bitarray('1110100'),
  bitarray('111101010'),
  bitarray('11111000001'),
  bitarray('11111010110'),
  bitarray('111111100001000'),
  bitarray('11111010111'),
  bitarray('11111000010'),
  bitarray('11111000000'),
  bitarray('111111100001111'),
  bitarray('11000'),
  bitarray('111100101'),
  bitarray('11111111010010010'),
  bitarray('111111100000111'),
  bitarray('11111000010'),
  bitarray('1111110010100'),
  bitarray('0'),
  bitarray('111101100'),
  bitarray('11111010110'),
  bitarray('11111000100'),
  bitarray('1111110000001'),
  bitarra

In [0]:
import pickle

#Construction du fichier inverse

file = open('important', 'wb')

pickle.dump(encoding_gamma_gap_ids, file)

file.close()