<a href="https://colab.research.google.com/github/kleczekr/tolkenizer/blob/master/clusters_forever_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk.data
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from urllib import request

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
url_alice = 'https://www.gutenberg.org/files/11/11-0.txt'
url_moby = 'https://www.gutenberg.org/files/2701/2701-0.txt'
# opening the txt files
response_alice = request.urlopen(url_alice)
response_moby = request.urlopen(url_moby)
# reading the files into raw variables as strings
raw_alice = response_alice.read().decode('utf8')
raw_moby = response_moby.read().decode('utf8')
# Split the raw files into lists of sentences
tokenized_alice = sent_tokenize(raw_alice)
tokenized_moby = sent_tokenize(raw_moby)
# remove the contents
tokenized_alice = tokenized_alice[14:]
tokenized_moby = tokenized_moby[275:]
# join the lists
tokenized_joint = tokenized_alice + tokenized_moby
# split joint list into lists of words
word_split_joint = [sentence.split() for sentence in tokenized_joint]

In [3]:
for sentence in word_split_joint[1000:1010]:
  print(sentence)

['The', 'copyright', 'laws', 'of', 'the', 'place', 'where', 'you', 'are', 'located', 'also', 'govern', 'what', 'you', 'can', 'do', 'with', 'this', 'work.']
['Copyright', 'laws', 'in', 'most', 'countries', 'are', 'in', 'a', 'constant', 'state', 'of', 'change.']
['If', 'you', 'are', 'outside', 'the', 'United', 'States,', 'check', 'the', 'laws', 'of', 'your', 'country', 'in', 'addition', 'to', 'the', 'terms', 'of', 'this', 'agreement', 'before', 'downloading,', 'copying,', 'displaying,', 'performing,', 'distributing', 'or', 'creating', 'derivative', 'works', 'based', 'on', 'this', 'work', 'or', 'any', 'other', 'Project', 'Gutenberg-tm', 'work.']
['The', 'Foundation', 'makes', 'no', 'representations', 'concerning', 'the', 'copyright', 'status', 'of', 'any', 'work', 'in', 'any', 'country', 'outside', 'the', 'United', 'States.']
['1.E.']
['Unless', 'you', 'have', 'removed', 'all', 'references', 'to', 'Project', 'Gutenberg:', '1.E.1.']
['The', 'following', 'sentence,', 'with', 'active', 'link

In [4]:
# pipeline to clean the lists of words (remove uppercase,
# punctuation, words are subsequently stemmed)
import nltk
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

def fantabulize(sentence):
  # lowercase
  sentence = [word.lower() for word in sentence]
  # remove punctuation
  sentence = [word.translate(table) for word in sentence]
  # remove nonalphabetic tokens
  sentence = [word for word in sentence if word.isalpha()]
  # remove stopwords
  sentence = [word for word in sentence if not word in stop_words]
  # stem em words
  sentence = [porter.stem(word) for word in sentence]
  # return em sentence
  return sentence

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# run the fantabulize pipeline on our list of lists of words
count = 0
for sentence in word_split_joint:
  word_split_joint[count] = fantabulize(sentence)
  count += 1

In [6]:
for sentence in word_split_joint[1000:1010]:
  print(sentence)

['copyright', 'law', 'place', 'locat', 'also', 'govern', 'work']
['copyright', 'law', 'countri', 'constant', 'state', 'chang']
['outsid', 'unit', 'state', 'check', 'law', 'countri', 'addit', 'term', 'agreement', 'download', 'copi', 'display', 'perform', 'distribut', 'creat', 'deriv', 'work', 'base', 'work', 'project', 'gutenbergtm', 'work']
['foundat', 'make', 'represent', 'concern', 'copyright', 'statu', 'work', 'countri', 'outsid', 'unit', 'state']
[]
['unless', 'remov', 'refer', 'project', 'gutenberg']
['follow', 'sentenc', 'activ', 'link', 'immedi', 'access', 'full', 'project', 'gutenbergtm', 'licens', 'must', 'appear', 'promin', 'whenev', 'copi', 'project', 'gutenbergtm', 'work', 'work', 'phrase', 'project', 'gutenberg', 'appear', 'phrase', 'project', 'gutenberg', 'associ', 'access', 'display', 'perform', 'view', 'copi', 'distribut', 'ebook', 'use', 'anyon', 'anywher', 'unit', 'state', 'part', 'world', 'cost', 'almost', 'restrict', 'whatsoev']
['may', 'copi', 'give', 'away', 'reus

In [7]:
# let's try removing empty lists:
word_split_joint = [element for element in word_split_joint if element != []]

In [8]:
for sentence in word_split_joint[1000:1010]:
  print(sentence)

['charg', 'fee', 'access', 'view', 'display', 'perform', 'copi', 'distribut', 'project', 'gutenbergtm', 'work', 'unless', 'compli', 'paragraph']
['may', 'charg', 'reason', 'fee', 'copi', 'provid', 'access', 'distribut', 'project', 'gutenbergtm', 'electron', 'work', 'provid', 'pay', 'royalti', 'fee', 'gross', 'profit', 'deriv', 'use', 'project', 'gutenbergtm', 'work', 'calcul', 'use', 'method', 'alreadi', 'use', 'calcul', 'applic', 'tax']
['fee', 'owe', 'owner', 'project', 'gutenbergtm', 'trademark', 'agre', 'donat', 'royalti', 'paragraph', 'project', 'gutenberg', 'literari', 'archiv', 'foundat']
['royalti', 'payment', 'must', 'paid', 'within', 'day', 'follow', 'date', 'prepar', 'legal', 'requir', 'prepar', 'period', 'tax', 'return']
['royalti', 'payment', 'clearli', 'mark', 'sent', 'project', 'gutenberg', 'literari', 'archiv', 'foundat', 'address', 'specifi', 'section', 'inform', 'donat', 'project', 'gutenberg', 'literari', 'archiv', 'foundat']
['provid', 'full', 'refund', 'money', 'pa

In [9]:
from gensim.models import Word2Vec
model = Word2Vec(word_split_joint, min_count=1)
# summarize the loaded model
print(model)

Word2Vec(vocab=11827, size=100, alpha=0.025)
