# Gensim Tutorial

## Install gensim

In [3]:
# One-time setup: run this in a shell (or uncomment it with a leading `%` in Jupyter).
# pip install gensim

## Using corpora

In [1]:
import spacy
from gensim import corpora

# Toy corpus: nine one-sentence headlines mixing football and banking vocabulary.
documents = [
    "Football club Arsenal defeat local rivals this weekend.",
    "Weekend football frenzy takes over London.",
    "Bank open for take over bids after losing millions.",
    "London football clubs bid to move to Wembley stadium.",
    "Arsenal bid 50 million pounds for striker Kane.",
    "Financial troubles result in loss of millions for bank.",
    "Western bank files for bankruptcy after financial losses.",
    "London football club is taken over by oil millionaire from Russia.",
    "Banking on finances not working for Russia.",
]

# spaCy 3 removed shortcut links such as "en", so load the model by its package
# name first and only fall back to the legacy shortcut on older installs.
# NOTE(review): assumes the "en" shortcut pointed at the small English model
# (en_core_web_sm) -- confirm against the environment that produced the outputs.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = spacy.load("en")

# One token list per document: keep the lemma of every token that is not a
# stop word, punctuation, or number-like.
texts = []
for document in documents:
    doc = nlp(document)
    text = [w.lemma_ for w in doc if not (w.is_stop or w.is_punct or w.like_num)]
    texts.append(text)

In [2]:
texts

[['football', 'club', 'Arsenal', 'defeat', 'local', 'rival', 'weekend'],
 ['weekend', 'football', 'frenzy', 'take', 'London'],
 ['bank', 'open', 'bid', 'lose', 'million'],
 ['London', 'football', 'club', 'bid', 'Wembley', 'stadium'],
 ['Arsenal', 'bid', 'pound', 'striker', 'Kane'],
 ['financial', 'trouble', 'result', 'loss', 'million', 'bank'],
 ['western', 'bank', 'file', 'bankruptcy', 'financial', 'loss'],
 ['London', 'football', 'club', 'take', 'oil', 'millionaire', 'Russia'],
 ['bank', 'finance', 'work', 'Russia']]

## Using a dictionary

In [3]:
# Assign an integer id to every distinct token found in `texts`.
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

# Bag-of-words view: each document becomes a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, texts))

{'Arsenal': 0, 'club': 1, 'defeat': 2, 'football': 3, 'local': 4, 'rival': 5, 'weekend': 6, 'London': 7, 'frenzy': 8, 'take': 9, 'bank': 10, 'bid': 11, 'lose': 12, 'million': 13, 'open': 14, 'Wembley': 15, 'stadium': 16, 'Kane': 17, 'pound': 18, 'striker': 19, 'financial': 20, 'loss': 21, 'result': 22, 'trouble': 23, 'bankruptcy': 24, 'file': 25, 'western': 26, 'Russia': 27, 'millionaire': 28, 'oil': 29, 'finance': 30, 'work': 31}


In [4]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(3, 1), (6, 1), (7, 1), (8, 1), (9, 1)],
 [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1)],
 [(1, 1), (3, 1), (7, 1), (11, 1), (15, 1), (16, 1)],
 [(0, 1), (11, 1), (17, 1), (18, 1), (19, 1)],
 [(10, 1), (13, 1), (20, 1), (21, 1), (22, 1), (23, 1)],
 [(10, 1), (20, 1), (21, 1), (24, 1), (25, 1), (26, 1)],
 [(1, 1), (3, 1), (7, 1), (9, 1), (27, 1), (28, 1), (29, 1)],
 [(10, 1), (27, 1), (30, 1), (31, 1)]]

## TF-IDF representation

In [6]:
from gensim import models

# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

# Indexing the model with the corpus yields one re-weighted vector per document.
weighted_corpus = tfidf[corpus]
for document in weighted_corpus:
    print(document)

[(0, 0.3292179861221233), (1, 0.24046829370585296), (2, 0.4809365874117059), (3, 0.1774993848325406), (4, 0.4809365874117059), (5, 0.4809365874117059), (6, 0.3292179861221233)]
[(3, 0.24212967666975266), (6, 0.4490913847888623), (7, 0.32802654645398593), (8, 0.6560530929079719), (9, 0.4490913847888623)]
[(10, 0.2184344336379748), (11, 0.29592528218102643), (12, 0.5918505643620529), (13, 0.4051424990000138), (14, 0.5918505643620529)]
[(1, 0.29431054749542984), (3, 0.21724253258131512), (7, 0.29431054749542984), (11, 0.29431054749542984), (15, 0.5886210949908597), (16, 0.5886210949908597)]
[(0, 0.354982288765831), (11, 0.25928712547209604), (17, 0.5185742509441921), (18, 0.5185742509441921), (19, 0.5185742509441921)]
[(10, 0.19610384738673725), (13, 0.3637247180792822), (20, 0.3637247180792822), (21, 0.3637247180792822), (22, 0.5313455887718271), (23, 0.5313455887718271)]
[(10, 0.18286519950508276), (20, 0.3391702611796705), (21, 0.3391702611796705), (24, 0.4954753228542582), (25, 0.4954

## Using n-grams

In [13]:
import gensim
# NOTE(review): wildcard import kept because later cells may rely on names it
# exposes; only `Dictionary` is needed here -- prefer an explicit import.
from gensim.corpora import *

# Learn which adjacent tokens co-occur often enough to be merged into a single
# bigram token, then rewrite every document with the merged tokens.
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]

dictionary = Dictionary(texts)

# Prune the vocabulary BEFORE building the corpus: filter_extremes() drops
# tokens and re-numbers the remaining ids, so a corpus built beforehand would
# no longer line up with the dictionary.
# no_below is an absolute document count; the previous value of 20 exceeds the
# 9 documents available here and therefore removed every token. Raise it again
# when working with a realistically sized corpus.
dictionary.filter_extremes(no_below=1, no_above=0.5)

corpus = [dictionary.doc2bow(text) for text in texts]

In [16]:
texts

[['football', 'club', 'Arsenal', 'defeat', 'local', 'rival', 'weekend'],
 ['weekend', 'football', 'frenzy', 'take', 'London'],
 ['bank', 'open', 'bid', 'lose', 'million'],
 ['London', 'football', 'club', 'bid', 'Wembley', 'stadium'],
 ['Arsenal', 'bid', 'pound', 'striker', 'Kane'],
 ['financial', 'trouble', 'result', 'loss', 'million', 'bank'],
 ['western', 'bank', 'file', 'bankruptcy', 'financial', 'loss'],
 ['London', 'football', 'club', 'take', 'oil', 'millionaire', 'Russia'],
 ['bank', 'finance', 'work', 'Russia']]