# __Word to Word example__

Code from https://datascience.stackexchange.com/questions/40038/how-to-implement-word-to-word-co-occurence-matrix-in-python

Data from https://archive.ics.uci.edu/ml/datasets/Amazon+Commerce+reviews+set#

In [None]:
import numpy as np
import pandas as pd
import string
import re

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from itertools import combinations
from collections import Counter


## Example 1

In [1]:
ctxs = [
    'krayyem like candy crush more then coffe',
    'krayyem plays candy crush all days',
    'krayyem do not invite his friends to play candy crush',
    'krayyem is smart',
]

l_unique = list(set((' '.join(ctxs)).split(' ')))
mat = np.zeros((len(l_unique), len(l_unique)))

nei = []
nei_size = 3

for ctx in ctxs:
    for word in ctx.split(' '):
        nei.append(word)
        if len(nei) > nei_size:
            nei.pop(0)
        for word_1 in nei:
            for word_2 in nei:
                # if word_1 != word_2 -> to avoid diagonal
                mat[l_unique.index(word_1), l_unique.index(word_2)] += 1

mat = pd.DataFrame(mat)
mat.index = l_unique
mat.columns = l_unique
display(mat)

Unnamed: 0,days,all,invite,play,more,krayyem,coffe,crush,his,smart,do,then,plays,to,is,candy,not,friends,like
days,3.0,2.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
all,2.0,3.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
invite,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
play,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,1.0,0.0
more,0.0,0.0,0.0,0.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
krayyem,2.0,1.0,0.0,0.0,0.0,12.0,2.0,2.0,0.0,1.0,2.0,1.0,2.0,0.0,2.0,3.0,1.0,0.0,2.0
coffe,0.0,0.0,0.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
crush,1.0,2.0,0.0,1.0,2.0,2.0,0.0,9.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,6.0,0.0,0.0,1.0
his,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0
smart,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Example 2

In [3]:
sentences = ['i go to london', 'you do not go to london', 'but london goes to you']
vocab = set(word_tokenize(' '.join(sentences)))
print('Vocabulary:\n',vocab,'\n')
token_sent_list = [word_tokenize(sen) for sen in sentences]
print('Each sentence in token form:\n',token_sent_list,'\n')

co_occ = {ii:Counter({jj:0 for jj in vocab if jj!=ii}) for ii in vocab}
k=2

for sen in token_sent_list:
    for ii in range(len(sen)):
        if ii < k:
            c = Counter(sen[0:ii+k+1])
            del c[sen[ii]]
            co_occ[sen[ii]] = co_occ[sen[ii]] + c
        elif ii > len(sen)-(k+1):
            c = Counter(sen[ii-k::])
            del c[sen[ii]]
            co_occ[sen[ii]] = co_occ[sen[ii]] + c
        else:
            c = Counter(sen[ii-k:ii+k+1])
            del c[sen[ii]]
            co_occ[sen[ii]] = co_occ[sen[ii]] + c

# Having final matrix in dict form lets you convert it to different python data structures
co_occ = {ii:dict(co_occ[ii]) for ii in vocab}
display(co_occ)

[nltk_data] Downloading package punkt to /Users/jpinzon/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Vocabulary:
 {'i', 'you', 'do', 'london', 'but', 'to', 'goes', 'not', 'go'} 

Each sentence in token form:
 [['i', 'go', 'to', 'london'], ['you', 'do', 'not', 'go', 'to', 'london'], ['but', 'london', 'goes', 'to', 'you']] 



{'i': {'to': 1, 'go': 1},
 'you': {'do': 1, 'not': 1, 'goes': 1, 'to': 1},
 'do': {'you': 1, 'not': 1, 'go': 1},
 'london': {'to': 3, 'go': 2, 'but': 1, 'goes': 1},
 'but': {'london': 1, 'goes': 1},
 'to': {'i': 1, 'london': 3, 'go': 2, 'not': 1, 'goes': 1, 'you': 1},
 'goes': {'you': 1, 'london': 1, 'but': 1, 'to': 1},
 'not': {'you': 1, 'do': 1, 'to': 1, 'go': 1},
 'go': {'i': 1, 'london': 2, 'to': 2, 'do': 1, 'not': 1}}

## USING A DATASET

In [70]:
# data from: https://data.world/ostp/wsrd-testbed-inventory 
data_inventory = pd.read_csv('data/Testbed_Inventory.csv')

In [71]:
# Remove space at the end of the sentence
data_world = data_inventory['name'].str.replace(r' $', '')
# Create a list of the sentences
text = list(data_world)

In [72]:
# Remove punctuation 
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in text]
# Converting the case to lower
lowered_text = [word.lower() for word in  stripped]
print(lowered_text)

['calit2 ericsson wireless access network research testbed', 'afrl aerial layer networking facilities', 'nrl cognitive radio test laboratory', 'orbit open access next generation wireless network testbed', 'spectrum sharing innovation testbed', 'public safety communications research lab pscr', 'cognitive radio network testbed cornet', 'nrl tactical edge network testbed', 'ornl communications testbed', 'electronic proving ground us army test ranges', 'white sands missile range us army test ranges', 'aberdeen test center us army test ranges', 'redstone test center us army test ranges', 'yuma proving ground us army test ranges', 'idaho national laboratory wireless national testbed', 'cmulab', 'd meas learn', 'digital object registry', 'cloudctl vise', 'bgpmux dtunnels', 'enterprisegeni openflow', 'geni4yr', 'gmoc netkarma kgeni', 'gpeni', 'gush proto', 'instools ism infrastructure', 'kansei otm', 'max', 'measurementsys', 'millionnodegeni security', 'orbit wimax', 'orcaben', 'planetlab scaf

In [114]:
import nltk
from nltk.corpus import stopwords
# Removing STOP WORDS
set(stopwords.words('english'))
stop_words = stopwords.words('english')
stop_words.append('calit2')
filtered_sentence = [w for w in lowered_text if not w in stop_words] 

In [115]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### Example 1

In [98]:
text_list = filtered_sentence
l_unique = list(set((' '.join(text_list)).split(' ')))
mat = np.zeros((len(l_unique), len(l_unique)))
nei = []
nei_size = 3

for ctx in text_list:
    for word in ctx.split(' '):
        nei.append(word)
        if len(nei) > nei_size:
            nei.pop(0)
        for word_1 in nei:
            for word_2 in nei:
                # if word_1 != word_2 -> to avoid diagonal
                mat[l_unique.index(word_1), l_unique.index(word_2)] += 1

mat = pd.DataFrame(mat)
mat.index = l_unique
mat.columns = l_unique
display(mat)

Unnamed: 0,redstone,ism,infrastructure,gush,pigeonnet,hive,radio,army,vmi,geni4yr,...,wimax,provserv,dtunnels,s3geni,testbed,gpeni,quilt,aerial,digital,security
redstone,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ism,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
infrastructure,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gush,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
pigeonnet,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
hive,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
radio,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
army,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
vmi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
geni4yr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
sentences = filtered_sentence
vocab = set(word_tokenize(' '.join(sentences)))
#print('Vocabulary:\n',vocab,'\n')
token_sent_list = [word_tokenize(sen) for sen in sentences]
#print('Each sentence in token form:\n',token_sent_list,'\n')

co_occ = {ii:Counter({jj:0 for jj in vocab if jj!=ii}) for ii in vocab}
k=2

for sen in token_sent_list:
    for ii in range(len(sen)):
        if ii < k:
            c = Counter(sen[0:ii+k+1])
            del c[sen[ii]]
            co_occ[sen[ii]] = co_occ[sen[ii]] + c
        elif ii > len(sen)-(k+1):
            c = Counter(sen[ii-k::])
            del c[sen[ii]]
            co_occ[sen[ii]] = co_occ[sen[ii]] + c
        else:
            c = Counter(sen[ii-k:ii+k+1])
            del c[sen[ii]]
            co_occ[sen[ii]] = co_occ[sen[ii]] + c

# Having final matrix in dict form lets you convert it to different python data structures
co_occ = {ii:dict(co_occ[ii]) for ii in vocab}
display(co_occ)

{'redstone': {'center': 1, 'test': 1},
 'ism': {'infrastructure': 1, 'instools': 1},
 'infrastructure': {'ism': 1, 'instools': 1},
 'gush': {'proto': 1},
 'pigeonnet': {},
 'hive': {'dsl': 1},
 'radio': {'laboratory': 1,
  'test': 1,
  'cognitive': 2,
  'nrl': 1,
  'network': 1,
  'testbed': 1},
 'army': {'ground': 2,
  'ranges': 5,
  'test': 5,
  'us': 5,
  'range': 1,
  'center': 2},
 'vmi': {},
 'geni4yr': {},
 'learn': {'d': 1, 'meas': 1},
 'umlpen': {},
 'open': {'next': 1, 'access': 1, 'orbit': 1},
 'pscr': {'lab': 1, 'research': 1},
 'proto': {'gush': 1},
 'cornet': {'network': 1, 'testbed': 1},
 'public': {'communications': 1, 'safety': 1},
 'crgeni': {},
 'sharing': {'innovation': 1, 'spectrum': 1, 'testbed': 1},
 'ground': {'army': 2, 'electronic': 1, 'proving': 2, 'us': 2, 'yuma': 1},
 'cloudctl': {'vise': 1},
 'instools': {'ism': 1, 'infrastructure': 1},
 'information': {'of': 1, 'design': 1, 'subs': 1},
 'orcaben': {},
 'igeni': {},
 'layer': {'afrl': 1, 'networking': 1, '