In [8]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import pandas as pd
import nltk
import numpy as np

# Seed NumPy's global random number generator so code drawing from it is repeatable.
np.random.seed(400)

# Fetch the WordNet corpus through NLTK's downloader; lemmatize_stemming() relies on
# WordNetLemmatizer, which presumably needs this corpus to be installed locally.
nltk.download('wordnet')

# Shared English Snowball stemmer, used by lemmatize_stemming().
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package wordnet to /home/anvil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a fresh
    ``WordNetLemmatizer``; the resulting lemma is then passed through the
    module-level ``stemmer``.

    Args:
        text: The token to normalize.

    Returns:
        Whatever ``stemmer.stem`` returns for the verb lemma of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

# Tokenize, filter, then lemmatize + stem
def preprocess(text):
    """Turn raw text into a list of normalized tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``. Tokens that
    are gensim stop words or that have three characters or fewer are dropped;
    every surviving token is passed through ``lemmatize_stemming``.

    Args:
        text: The raw document text.

    Returns:
        A list with one normalized token per surviving input token, in the
        order the tokenizer produced them.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stop_words
    ]

In [10]:
document_num = 50
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

# First show the sentence split on single spaces, exactly as written ...
print("Original document: ")
words = doc_sample.split(' ')
print(words)

# ... then the same sentence after it has gone through preprocess().
print("\n\nTokenized and lemmatized document: ")
processed_tokens = preprocess(doc_sample)
print(processed_tokens)

Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


In [25]:
# https://github.com/priya-dwivedi/Deep-Learning/blob/master/topic_modeling/LDA_Newsgroup.ipynb
from sklearn.datasets import fetch_20newsgroups
# Load the 'train' and 'test' subsets of 20 Newsgroups with identical settings.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

In [29]:
# Run every training post through preprocess(); each entry is that post's token list.
processed_docs = [preprocess(raw_post) for raw_post in newsgroups_train.data]

In [37]:
# Build the token <-> integer id mapping from the preprocessed training posts.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Prune the vocabulary in place using the document-frequency limits given here.
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

# Bag-of-words encoding: one sequence of (token id, count) pairs per document.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# Inspect a single encoded document as a sanity check.
document_num = 20
bow_doc_x = bow_corpus[document_num]

for token_id, token_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], token_count))

Word 18 ("rest") appears 1 time.
Word 166 ("clear") appears 1 time.
Word 336 ("refer") appears 1 time.
Word 350 ("true") appears 1 time.
Word 391 ("technolog") appears 1 time.
Word 437 ("christian") appears 1 time.
Word 453 ("exampl") appears 1 time.
Word 476 ("jew") appears 1 time.
Word 480 ("lead") appears 1 time.
Word 482 ("littl") appears 3 time.
Word 520 ("wors") appears 2 time.
Word 721 ("keith") appears 3 time.
Word 732 ("punish") appears 1 time.
Word 803 ("california") appears 1 time.
Word 859 ("institut") appears 1 time.
Word 917 ("similar") appears 1 time.
Word 990 ("allan") appears 1 time.
Word 991 ("anti") appears 1 time.
Word 992 ("arriv") appears 1 time.
Word 993 ("austria") appears 1 time.
Word 994 ("caltech") appears 2 time.
Word 995 ("distinguish") appears 1 time.
Word 996 ("german") appears 1 time.
Word 997 ("germani") appears 3 time.
Word 998 ("hitler") appears 1 time.
Word 999 ("livesey") appears 2 time.
Word 1000 ("motto") appears 2 time.
Word 1001 ("order") appear

In [38]:
# Train an 8-topic LDA model on the bag-of-words corpus (10 passes, 2 worker processes).
#
# random_state is passed explicitly so that initialization no longer depends on how
# much of NumPy's global random stream has been consumed before this cell runs;
# without it, simply re-running the cell yields different topics. The value 400
# matches the np.random.seed(400) call in the setup cell.
# NOTE(review): with more than one worker the order in which chunk results are merged
# may still cause small run-to-run differences -- confirm if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=8,
                                       id2word=dictionary,
                                       passes=10,
                                       workers=2,
                                       random_state=400)

In [39]:
# Print each (topic id, weighted-terms string) pair returned by print_topics(-1),
# followed by blank lines as a separator.
for topic_id, topic_terms in lda_model.print_topics(-1):
    topic_summary = "Topic: {} \nWords: {}".format(topic_id, topic_terms)
    print(topic_summary)
    print("\n")

Topic: 0 
Words: 0.007*"presid" + 0.004*"clinton" + 0.004*"bike" + 0.003*"netcom" + 0.003*"run" + 0.003*"homosexu" + 0.003*"talk" + 0.003*"pitch" + 0.003*"money" + 0.003*"virginia"


Topic: 1 
Words: 0.009*"govern" + 0.007*"armenian" + 0.006*"israel" + 0.005*"kill" + 0.005*"isra" + 0.004*"american" + 0.004*"turkish" + 0.004*"weapon" + 0.004*"jew" + 0.004*"countri"


Topic: 2 
Words: 0.017*"game" + 0.015*"team" + 0.011*"play" + 0.009*"player" + 0.008*"hockey" + 0.006*"season" + 0.005*"canada" + 0.005*"leagu" + 0.005*"score" + 0.004*"divis"


Topic: 3 
Words: 0.010*"card" + 0.009*"window" + 0.007*"driver" + 0.007*"sale" + 0.006*"price" + 0.005*"speed" + 0.005*"appl" + 0.005*"video" + 0.005*"monitor" + 0.004*"engin"


Topic: 4 
Words: 0.015*"file" + 0.010*"program" + 0.009*"window" + 0.006*"encrypt" + 0.006*"chip" + 0.006*"imag" + 0.006*"data" + 0.006*"avail" + 0.005*"version" + 0.005*"code"


Topic: 5 
Words: 0.013*"space" + 0.010*"nasa" + 0.006*"scienc" + 0.005*"research" + 0.005*"orbit

In [40]:
# Pick one post from the test subset by position; the model is scored on it in a later cell.
num = 100
unseen_document = newsgroups_test.data[num]
# Show the raw post text exactly as stored in the dataset.
print(unseen_document)

Subject: help
From: C..Doelle@p26.f3333.n106.z1.fidonet.org (C. Doelle)
Lines: 13

Hello All!

    It is my understanding that all True-Type fonts in Windows are loaded in
prior to starting Windows - this makes getting into Windows quite slow if you
have hundreds of them as I do.  First off, am I correct in this thinking -
secondly, if that is the case - can you get Windows to ignore them on boot and
maybe make something like a PIF file to load them only when you enter the
applications that need fonts?  Any ideas?


Chris

 * Origin: chris.doelle.@f3333.n106.z1.fidonet.org (1:106/3333.26)



In [41]:
# Encode the unseen document with the same preprocess() and dictionary used for training
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# Rank the (topic id, score) pairs the model assigns to it, highest score first
ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.5578656196594238	 Topic: 0.010*"card" + 0.009*"window" + 0.007*"driver" + 0.007*"sale" + 0.006*"price"
Score: 0.41789543628692627	 Topic: 0.015*"file" + 0.010*"program" + 0.009*"window" + 0.006*"encrypt" + 0.006*"chip"


In [43]:
# Show the label array of the test subset (integer class ids, per the printed output).
print(newsgroups_test.target)

[ 7  5  0 ...  9  6 15]


In [17]:
# Read the JSON file at ./data/youtube_history.json into a DataFrame.
# NOTE(review): the path is relative -- presumably resolved against the notebook's
# working directory; confirm before running from elsewhere.
videos = pd.read_json('./data/youtube_history.json')

In [28]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line to the output.
videos.info()

# NOTE(review): this rebinds `processed_docs`, the same name that holds the newsgroup
# token lists in an earlier cell -- confirm that overwriting it here is intended.
processed_docs = []

print(videos.head(10))

# TODO: preprocess the video text once the source column is settled.
# Earlier draft kept for reference.
# NOTE(review): the info() listing shows no 'soup' column, and itertuples() rows
# cannot be indexed by column name -- fix both before enabling.
# for doc in videos.itertuples():
#     print(doc)
#     processed_docs.append(preprocess(doc['soup']))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2810 entries, 0 to 2809
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              2810 non-null   object
 1   title            2810 non-null   object
 2   visit_count      2810 non-null   int64 
 3   last_visit_time  2810 non-null   int64 
 4   publishedAt      2810 non-null   object
 5   description      2810 non-null   object
 6   channelTitle     2810 non-null   object
 7   channelId        2810 non-null   object
 8   tags             2162 non-null   object
dtypes: int64(2), object(7)
memory usage: 197.7+ KB
None
                                                url  \
0       https://www.youtube.com/watch?v=kw9nTK42bTw   
1       https://www.youtube.com/watch?v=D9veJLKqnpg   
2       https://www.youtube.com/watch?v=ESKDxvPeUUE   
3       https://www.youtube.com/watch?v=RxEA1WleXlM   
4       https://www.youtube.com/watch?v=PzyvTHrLmQk   
5       http