# Building a Wor2vec model trained on TV Series subtitles:

## Unzip subtitles inside a directory "subs"

In [3]:
!mkdir subs
!unzip how-i-met-your-mother-seventh-season_english-1334377.zip -d subs


mkdir: cannot create directory ‘subs’: File exists
Archive:  how-i-met-your-mother-seventh-season_english-1334377.zip
Subscene
  inflating: subs/How I Met Your Mother S07E01 HDTV.X264-Bzingaz.srt  
  inflating: subs/How I Met Your Mother S07E02 HDTV.X264-Bzingaz.srt  
  inflating: subs/How I Met Your Mother S07E03 HDTV.X264-Bzingaz.srt  
  inflating: subs/How I Met Your Mother S07E04 HDTV.X264-Bzingaz.srt  
  inflating: subs/How I Met Your Mother S07E05 HDTV-DIMENSION.X264-Bzingaz.srt  
  inflating: subs/How.I.Met.Your.Mother.S07E06.720p.HDTV.X264-DIMENSION.srt  
  inflating: subs/How.I.Met.Your.Mother.S07E07.720p.HDTV.X264-DIMENSION.srt  
  inflating: subs/How.I.Met.Your.Mother.S07E08.720p.HDTV.X264-DIMENSION.srt  
  inflating: subs/How.I.Met.Your.Mother.S07E09.720p.HDTV.X264-DIMENSION.srt  
  inflating: subs/How.I.Met.Your.Mother.S07E10.720p.HDTV.X264-DIMENSION.srt  
  inflating: subs/How.I.Met.Your.Mother.S07E11.720p.HDTV.x264-IMMERSE.srt  
  inflating: subs/How.I.Met.Your.Mother.S0

## install pysrt to read and write ".srt" files

In [0]:
! pip install --quiet gensim pysrt

In [0]:
import gensim
import logging
import multiprocessing
import os
import sys
import pysrt
import glob
import nltk


## Download necessary packages of NLTK

In [6]:
nltk.download('popular')


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

## Config Logging messages

In [0]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

## Loading texts from files :

In [0]:
files =  glob.glob("subs/*.srt")

In [9]:
texts = []
for f in files:
    print('Loading '+f)
    subs = pysrt.open(f, encoding='iso-8859-1')
    for s in subs:
        texts.append(s.text)

Loading subs/How.I.Met.Your.Mother.S07E23E24.720p.HDTV.X264-DIMENSION.srt
Loading subs/how.i.met.your.mother.s07e15.720p.hdtv.x264-orenji.bak.srt
Loading subs/How.I.Met.Your.Mother.S07E16.720p.HDTV.X264-DIMENSION.srt
Loading subs/How.I.Met.Your.Mother.S07E11.720p.HDTV.x264-IMMERSE.srt
Loading subs/How.I.Met.Your.Mother.S07E13.720p.HDTV.x264-IMMERSE.srt
Loading subs/How.I.Met.Your.Mother.S07E21.720p.HDTV.X264-DIMENSION.srt
Loading subs/How.I.Met.Your.Mother.S07E18.720p.HDTV.X264-DIMENSION.srt
Loading subs/How.I.Met.Your.Mother.S07E20.HDTV.x264-NYCDream.srt
Loading subs/How I Met Your Mother S07E05 HDTV-DIMENSION.X264-Bzingaz.srt
Loading subs/How I Met Your Mother S07E04 HDTV.X264-Bzingaz.srt
Loading subs/How.I.Met.Your.Mother.S07E12.720p.HDTV.X264-DIMENSION.srt
Loading subs/How.I.Met.Your.Mother.S07E06.720p.HDTV.X264-DIMENSION.srt
Loading subs/How I Met Your Mother S07E03 HDTV.X264-Bzingaz.srt
Loading subs/How.I.Met.Your.Mother.S07E07.720p.HDTV.X264-DIMENSION.srt
Loading subs/How I Met 

In [10]:
print("Number of paragraphs is :",len(texts))

Number of paragraphs is : 12208


In [0]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords


In [0]:
segms = []
for t in texts:
    segms.append(word_tokenize(t))

## Removing none alphanumeric words  then we applying the PorterStemmer

In [13]:
final = []
stemmer = PorterStemmer()

for tokens in segms:
    words = [stemmer.stem(word.lower()) for word in tokens if word.isalpha()]
    print(words[:100])
    final.append(words)

['narrat', 'i', 'kid', 'when', 'aunt', 'lili', 'i', 'went', 'into', 'labor', 'visit', 'to', 'download']
['i', 'marshal', 'wa', 'somewher', 'i', 'slightli', 'inconveni']
['i', 'in', 'atlant', 'citi', 'i', 'with', 'uncl', 'barney']
['i', 'and', 'drunk', 'to', 'the', 'point', 'of', 'i', 'talk', 'like']
['babi', 'come']
['papa', 'gon', 'na', 'be', 'am', 'i']
['hospit', 'must', 'we', 'get', 'now']
['okay', 'there', 'like', 'a', 'wait', 'at', 'the', 'valet', 'stand']
['plu', 'we', 'in', 'no', 'shape', 'to', 'drive', 'so', 'a', 'car', 'out']
['wait', 'do', 'worri', 'we', 'take', 'a', 'cab', 'to', 'the', 'train', 'station']
['man', 'yeah', 'good', 'luck', 'get', 'a', 'cab']
['it', 'imposs', 'becaus', 'of', 'the', 'big', 'thing']
['narrat', 'i', 'kid', 'for', 'the', 'life', 'of', 'i', 'me', 'i', 'ca', 'rememb']
['i', 'what', 'the', 'big', 'thing', 'wa', 'in', 'i', 'atlant', 'citi', 'that']
['son', 'i', 'wa', 'it', 'i', 'a', 'cheerlead', 'convent']
['narrat', 'i', 'no', 'it', 'wa', 'i', 'a', 'po

## Train the model

In [14]:
model = gensim.models.Word2Vec(final,batch_words=8,
                               size=100,iter=20,
                               window=10,
                               min_count=1,
                               workers=multiprocessing.cpu_count())


2018-09-17 18:04:53,698 : INFO : collecting all words and their counts
2018-09-17 18:04:53,701 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-09-17 18:04:53,732 : INFO : PROGRESS: at sentence #10000, processed 56684 words, keeping 3907 word types
2018-09-17 18:04:53,738 : INFO : collected 4339 word types from a corpus of 68986 raw words and 12208 sentences
2018-09-17 18:04:53,745 : INFO : Loading a fresh vocabulary
2018-09-17 18:04:53,759 : INFO : effective_min_count=1 retains 4339 unique words (100% of original 4339, drops 0)
2018-09-17 18:04:53,761 : INFO : effective_min_count=1 leaves 68986 word corpus (100% of original 68986, drops 0)
2018-09-17 18:04:53,778 : INFO : deleting the raw counts dictionary of 4339 items
2018-09-17 18:04:53,780 : INFO : sample=0.001 downsamples 69 most-common words
2018-09-17 18:04:53,783 : INFO : downsampling leaves estimated 48713 word corpus (70.6% of prior 68986)
2018-09-17 18:04:53,806 : INFO : estimated required mem

## Save the model

In [15]:
model.save('srt.model')

2018-09-17 18:05:20,501 : INFO : saving Word2Vec object under srt.model, separately None
2018-09-17 18:05:20,503 : INFO : not storing attribute vectors_norm
2018-09-17 18:05:20,506 : INFO : not storing attribute cum_table
2018-09-17 18:05:20,575 : INFO : saved srt.model


## Show most similar words to "man"

In [16]:
model.wv.most_similar('man')

2018-09-17 18:05:21,603 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('gasp', 0.9633530974388123),
 ('cab', 0.9622167348861694),
 ('luck', 0.9616068601608276),
 ('wow', 0.9598132371902466),
 ('abandon', 0.958922266960144),
 ('avert', 0.9583507180213928),
 ('such', 0.9546405673027039),
 ('marshgammon', 0.9544152617454529),
 ('job', 0.9529415369033813),
 ('fault', 0.9523583054542542)]

## In order to get better results we need a larger dataset : maybe all the subtitles of 2018 TV series.
