### Steps:
1. Load Data
2. Preprocess:
    1. Extract Text Column
    2. Parsing and Clean Text
    3. Lemmitiz Text
    4. Create Dictionary of word.
3. Models:
    1. LDA + TfIdf
4. Create Pipeline

### Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
from spacy.lang.en import English
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import gensim
from gensim import corpora, models




### Load Data

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/research_paper.csv")
df

Unnamed: 0,Title,Conference
0,Innovation in Database Management: Computer Sc...,VLDB
1,High performance prime field multiplication fo...,ISCAS
2,enchanted scissors: a scissor interface for su...,SIGGRAPH
3,Detection of channel degradation attack by Int...,INFOCOM
4,Pinning a Complex Network through the Betweenn...,ISCAS
5,Analysis and Design of Memoryless Interconnect...,ISCAS
6,Dynamic bluescreens.,SIGGRAPH
7,A Quantitative Assured Forwarding Service.,INFOCOM
8,Automatic sanitization of social network data ...,WWW
9,A &#916;&#931; IR-UWB radar with sub-mm rangin...,ISCAS


### Preprocess

In [3]:
nltk.download('stopwords')

en_stop = set(nltk.corpus.stopwords.words('english') )

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\IP700\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
stemmer = SnowballStemmer('english')

def tokenize(text):
    results = []
    tokens= gensim.utils.simple_preprocess(text)
    for token in tokens:
        if len(token) <3:
            continue
        elif token.startswith("http"):
            continue
        elif token.startswith("@"):
            continue
        else:
            results.append(token.lower())
    return results
    
def lemmatize(text):
    return(stemmer.stem(WordNetLemmatizer().lemmatize(text)))

def preprocess(txt_col):
    text_data= []
    for line in df[txt_col]:
        tokens = tokenize(line)
        tokens = [lemmatize(token) for token in tokens]
        tokens = [token for token in tokens if len(token)>3]
        tokens = [token for token in tokens if token not in en_stop]
        text_data.append(tokens)
    return text_data

processed_docs = preprocess('Title')
dictionary = corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=.5)

### Model

In [11]:
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

tfidf = models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]

lda_model_tfidf = gensim.models.LdaMulticore(tfidf_corpus, num_topics=10, id2word=dictionary,passes=2, workers=2)
for idx, topic in lda_model_tfidf.print_topics(num_words=6):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.049*"network" + 0.033*"wireless" + 0.022*"optim" + 0.020*"awar" + 0.019*"multi" + 0.018*"learn"
Topic: 1 Word: 0.041*"base" + 0.025*"network" + 0.023*"model" + 0.023*"time" + 0.022*"filter" + 0.021*"algorithm"
Topic: 2 Word: 0.032*"approach" + 0.030*"rout" + 0.026*"network" + 0.026*"voltag" + 0.022*"data" + 0.020*"base"
Topic: 3 Word: 0.031*"effici" + 0.026*"circuit" + 0.025*"design" + 0.023*"filter" + 0.022*"power" + 0.022*"delay"
Topic: 4 Word: 0.040*"detect" + 0.031*"video" + 0.029*"base" + 0.025*"model" + 0.024*"object" + 0.024*"interact"
Topic: 5 Word: 0.048*"system" + 0.036*"databas" + 0.032*"comput" + 0.025*"control" + 0.021*"time" + 0.019*"mobil"
Topic: 6 Word: 0.037*"imag" + 0.035*"visual" + 0.024*"sensor" + 0.020*"light" + 0.020*"engin" + 0.019*"network"
Topic: 7 Word: 0.039*"design" + 0.029*"system" + 0.025*"data" + 0.025*"manag" + 0.024*"semant" + 0.024*"languag"
Topic: 8 Word: 0.038*"queri" + 0.036*"servic" + 0.030*"generat" + 0.030*"analysi" + 0.028*"sear

In [12]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda_model_tfidf, tfidf_corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
