<a href="https://colab.research.google.com/github/harnalashok/deeplearning-sequences/blob/main/skipgram_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Last amended: 3rd Nov, 2022
My folder: C:/Users/Ashok/OneDrive/Documents/skipgrams

Ref:
https://ljvmiranda921.github.io/notebook/2021/12/11/word-vectors/#pairs
https://www.kaggle.com/competitions/word2vec-nlp-tutorial/overview/part-2-word-vectors
https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/skipgrams
https://stackoverflow.com/a/1994012

Objectives:
        i)   To get a skipgram paired sequence

"""

## Install software

In [6]:
# Shell escape: upgrade gensim in the notebook runtime.
! pip install --upgrade gensim 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Call libraries

In [7]:
# 1.0 Call libraries
%reset -f
import pandas as pd
import numpy as np

# 1.1 Import tensorflow (its keras preprocessing modules are used below)
import tensorflow as tf


# 1.2 API to manipulate sequences of words
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import skipgrams

# 1.3
import gensim
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from collections import Counter

# 1.4
import os
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [8]:
# 1.5 Download the NLTK stop-word corpus; review_to_wordlist() reads the
#     English list from it when remove_stopwords=True.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# 1.6 Display the output of every command in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Useful functions


In [10]:
# 2.0 Function to clean text (from Kaggle):

def review_to_wordlist(review, remove_stopwords=False, stem=False):
    """Convert a raw document into a list of lower-case word tokens.

    Parameters
    ----------
    review : str
        Document text; may contain HTML markup.
    remove_stopwords : bool, default False
        When True, drop words found in NLTK's English stop-word list.
    stem : bool, default False
        When True, reduce every remaining word with the Porter stemmer.

    Returns
    -------
    list of str
        Tokens in document order.
    """
    # 1. Strip HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replace every non-letter with a space. This also covers digits,
    #    punctuation and newlines, so no separate newline removal is needed.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Lower-case and split on whitespace.
    words = review_text.lower().split()

    # 4. Optionally remove stop words (a set gives fast membership tests).
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 5. Optionally stem each word.
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]

    # 6. Return the list of words.
    return words


## Mount gdrive

In [11]:
# 2.1 Mount Google Drive at /gdrive (Colab only).

from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [12]:
# 3.0 Folder on the mounted drive: the input text is read from here
#     and the processed files are saved back here.

pathToFolder = "/gdrive/MyDrive/Colab_data_files/skipgram/"

## Read data
Read the text file <i>'combined.txt'</i> from the Google Drive folder given by `pathToFolder`.

In [13]:
# 3.1 Read combined.txt (tab-separated, header row inferred):

tx_data = pd.read_csv(pathToFolder+ "combined.txt", sep = "\t", header = 'infer')
tx_data.head()

Unnamed: 0,text
0,"In ordinary language, a crime is an unlawful a..."


In [14]:
# 3.2 Examine the 'text' column, which holds the document:
tx_data['text']


0    In ordinary language, a crime is an unlawful a...
Name: text, dtype: object

In [None]:
# 3.3 Get the 'text' column as a Python list
#     (one string per row of tx_data):

text = list(tx_data['text'])
text

## Process text
Clean text and create word-to-int index

In [16]:
# 4.0 Clean the first document and get its tokens,
#     dropping English stop words (no stemming):

cleaned_tokens = review_to_wordlist(text[0], remove_stopwords=True)

In [17]:
# 4.1 Look at the first ten tokens and the total token count:
cleaned_tokens[:10]
print()
len(cleaned_tokens)

['ordinary',
 'language',
 'crime',
 'unlawful',
 'act',
 'punishable',
 'state',
 'authority',
 'term',
 'crime']




12956

In [18]:
# 4.2 Number of unique words (vocabulary size, not counting any padding index):

max_vocab = len(set(cleaned_tokens))
max_vocab  # 4044

4044

In [None]:
# 5.0 Count how many times each word occurs
#     (Counter maps word -> number of occurrences):

freq_of_words = Counter(cleaned_tokens)
freq_of_words

In [None]:
# 5.1 Order the words by descending frequency.
#     most_common() with no argument returns every (word, count) pair,
#     highest count first; only the words are kept.

vocab = [word for word, _count in freq_of_words.most_common()]

# 5.1.1 Most frequent word first, least frequent last
vocab

In [21]:
# 5.2 Build the word -> integer label mapping.
#     Labels start at 1, so the most frequent word is labelled 1:

word_index = dict(zip(vocab, range(1, len(vocab) + 1)))

In [None]:
# 5.3 Display the word -> int index built above:

word_index

## Get skipgrams
Transform text to int sequence and get skipgram pairs

In [22]:
# 6.0 Function to map a word list to integers
#     as per mapping in word_index dict:

def seq2int(wordList, word_index):
    """Translate a sequence of words into their integer labels.

    Each word is looked up in ``word_index``; a word that is missing
    from the mapping raises ``KeyError``. Returns a new list with one
    label per input word, in the same order.
    """
    encoded = []
    for token in wordList:
        encoded.append(word_index[token])
    return encoded

In [23]:
# 6.1 Encode the cleaned tokens as integers and peek at the first ten:

int_seq = seq2int(cleaned_tokens, word_index)
int_seq[:10]

[753, 362, 16, 1789, 90, 363, 32, 1121, 46, 16]

In [24]:
# Slide a 5-token window over int_seq, one position at a time, and keep each
# window as a tuple. Sequences shorter than five tokens yield an empty list.
# NOTE(review): this rebinds the name `skipgrams`, which the imports cell
# bound to the Keras skipgrams function.
skipgrams = [
    tuple(int_seq[start:start + 5])
    for start in range(len(int_seq) - 4)
]

In [25]:
# Peek at the first five 5-token windows:
skipgrams[:5]

[(753, 362, 16, 1789, 90),
 (362, 16, 1789, 90, 363),
 (16, 1789, 90, 363, 32),
 (1789, 90, 363, 32, 1121),
 (90, 363, 32, 1121, 46)]

In [None]:
# 6.2 Translate the int_seq to (target, context) skipgram pairs.
#     - The function is called through the `sequence` module because an
#       earlier cell rebinds the bare name `skipgrams` to a list, which
#       would make `skipgrams(...)` fail with "list is not callable".
#     - vocabulary_size must be the largest word index + 1. Labels in
#       word_index run from 1 to max_vocab, so max_vocab + 1 lets the
#       last word be drawn as a negative sample too.
#     - negative_samples=1.0 asks for as many random negative pairs as
#       there are positive ones.

pairs = sequence.skipgrams(
    int_seq,
    vocabulary_size=max_vocab + 1,
    window_size=4,
    negative_samples=1.0,
    shuffle=True,
    categorical=False,
    sampling_table=None,
    seed=None,
)

In [None]:
# 6.3 Look at pairs: per the Keras docs the result is (couples, labels),
#     so pairs[0] holds the word-index couples.

pairs[0]

## Save processed data
Save skipgram pairs and word-to-index dict to the Google Drive folder given by `pathToFolder`

In [26]:
# 7.0 Put the 5-token windows into a DataFrame (one column per position)
#     and pickle it to the gdrive folder:

data = pd.DataFrame(skipgrams, columns = ["a", "b", "c","d", "e"])
data.to_pickle(pathToFolder + "skipgrams.pkl")
data.head()

Unnamed: 0,a,b,c,d,e
0,753,362,16,1789,90
1,362,16,1789,90,363
2,16,1789,90,363,32
3,1789,90,363,32,1121
4,90,363,32,1121,46


In [None]:
# 7.0.1 Put the skipgram couples (pairs[0]) into a two-column DataFrame
#       and pickle it to the gdrive folder.
#       NOTE(review): this reuses the name `data` from the previous cell.

data = pd.DataFrame(pairs[0], columns = ["a", "b"])
data.to_pickle(pathToFolder + "seq.pkl")
data.head()

Unnamed: 0,a,b
0,174,2146
1,3743,3067
2,9,917
3,14,759
4,1283,1173


In [27]:
# 7.1 Save word_to_index dict to a text file, written as its str() form.
#     The `with` block closes the handle when done, so the text is flushed
#     to disk and the file is not left open.

with open(pathToFolder + "word_index.txt", 'wt') as filehandler:
    filehandler.write(str(word_index))

69345

## Read back saved files

In [None]:
# 8.0 Read back the pickled skipgram couples:
seq = pd.read_pickle(pathToFolder + "seq.pkl")
seq.head()

In [28]:
# 8.0.1 Read back the pickled 5-token windows.
#       NOTE(review): rebinds `skipgrams` to the loaded DataFrame.
skipgrams = pd.read_pickle(pathToFolder + "skipgrams.pkl")
skipgrams.head()

Unnamed: 0,a,b,c,d,e
0,753,362,16,1789,90
1,362,16,1789,90,363
2,16,1789,90,363,32
3,1789,90,363,32,1121
4,90,363,32,1121,46


In [29]:
# 8.1 Read saved dict (as raw text).
#     The `with` block closes the handle as soon as the text is read.

with open(pathToFolder + "word_index.txt", 'r') as filehandler:
    saved_word_index_text = filehandler.read()

saved_word_index_text

"{'football': 1, 'religion': 2, 'ball': 3, 'hockey': 4, 'rules': 5, 'game': 6, 'law': 7, 'religious': 8, 'played': 9, 'rugby': 10, 'religions': 11, 'first': 12, 'games': 13, 'th': 14, 'century': 15, 'crime': 16, 'one': 17, 'also': 18, 'many': 19, 'world': 20, 'may': 21, 'used': 22, 'ice': 23, 'modern': 24, 'codes': 25, 'word': 26, 'people': 27, 'called': 28, 'england': 29, 'association': 30, 'players': 31, 'state': 32, 'form': 33, 'time': 34, 'known': 35, 'sport': 36, 'english': 37, 'early': 38, 'two': 39, 'various': 40, 'school': 41, 'include': 42, 'use': 43, 'criminal': 44, 'common': 45, 'term': 46, 'public': 47, 'crimes': 48, 'code': 49, 'including': 50, 'however': 51, 'schools': 52, 'playing': 53, 'american': 54, 'countries': 55, 'international': 56, 'field': 57, 'laws': 58, 'sports': 59, 'play': 60, 'states': 61, 'ancient': 62, 'new': 63, 'league': 64, 'culture': 65, 'life': 66, 'practices': 67, 'based': 68, 'goal': 69, 'system': 70, 'even': 71, 'beliefs': 72, 'social': 73, 'examp

In [None]:
#####################

In [None]:

%reset -f
import bs4
import sys
import requests
import  os
# Work inside /content so the downloaded page is saved there.
pathToStoreFiles = "/content"
os.chdir(pathToStoreFiles)
os.listdir()
# Your wikipedia page title:
wiki_page = 'Religion'   # Religion, Crime, Hockey, Football
# Fetch the page. Without a timeout, requests waits indefinitely on a
# stalled connection, so an explicit limit (in seconds) is set.
res = requests.get(f'https://en.wikipedia.org/wiki/{wiki_page}', timeout=30)
# Raise on HTTP errors instead of parsing an error page.
res.raise_for_status()
wiki = bs4.BeautifulSoup(res.text,"html.parser")

# open a file named as your wiki page in write mode
with open(wiki_page+".txt", "w", encoding="utf-8") as f:
    for i in wiki.select('p'):
        # write the text of each <p> paragraph to the file
        f.write(i.getText())

os.listdir()


['.config',
 'seq.pkl',
 'football.txt',
 'Hockey.txt',
 'word_index.txt',
 'Crime.txt',
 'sample_data']

1

10

9

13

7

5

6

10

1

371

999

515

332

919

1542

331

508

650

1709

385

1095

516

199

452

406

820

57

295

66

254

204

141

216

285

142

528

176

171

488

464

750

407

261

428

36

656

692

330

191

317

312

125

227

125

709

663

420

504

587

566

508

183

536

332

255

586

468

293

921

88

1068

821

42

738

205

181

496

798

236

239

528

129

696

144

153

270

137

900

254

407

827

711

539

764

350

145

202

1798

300

369

570

496

337

556

304

338

181

458

425

828

609

718

304

196

483

822

143

['.config',
 'seq.pkl',
 'Religion.txt',
 'football.txt',
 'Hockey.txt',
 'word_index.txt',
 'Crime.txt',
 'sample_data']