In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pyLDAvis.gensim
import re
import nltk
from nltk.stem.porter import *
from stop_words import get_stop_words
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import gensim
from gensim import corpora, models
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from lda import guidedlda as glda
import numpy as np
%matplotlib inline


scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
# Load the pickled DataFrame stored at ../ETL/data.pkl and preview its first rows.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code;
# only load pickles that come from a trusted source.
data = pd.read_pickle("../ETL/data.pkl")
data.head()

  and should_run_async(code)


In [5]:
def preprocess_sent(text):
    """
    Get sentence level preprocessed data from raw texts
    :param text: raw text to be processed; anything that is not a string
                 (e.g. None or NaN for a missing value) is treated as empty
    :return: sentence level pre-processed text (lower-cased, stripped)
    """
    # Missing values are treated as empty text so that the word-level step,
    # which returns [] for empty input, can consume the result directly.
    if not isinstance(text, str):
        return ""
    # Add missing delimiter: "xxxThis is a" --> "xxx. This is a".
    # Must run before lower-casing. The replacement uses a plain '.', since
    # an escaped '\.' in a replacement template inserts a literal backslash.
    s = re.sub(r'([a-z])([A-Z])', r'\1. \2', text)
    # Make words all lower case.
    s = s.lower()
    # Remove the HTML entities for > and <, including their trailing ';'
    # when present, so no stray semicolon is left behind.
    s = re.sub(r'&gt;?|&lt;?', ' ', s)
    # Collapse letters that repeat 3 or more times into a single letter.
    s = re.sub(r'([a-z])\1{2,}', r'\1', s)
    # Collapse non-word characters (whitespace included) that repeat.
    s = re.sub(r'(\W)\1+', r'\1', s)
    # Use the character * as a sentence delimiter.
    s = re.sub(r'\*|\W\*|\*\W', '. ', s)
    # Remove text in parentheses, which is assumed to be less informative.
    s = re.sub(r'\(.*?\)', '. ', s)
    # xxx[?!]. --> xxx.
    s = re.sub(r'\W+?\.', '.', s)
    # [.?!]xxx --> [.?!] xxx
    s = re.sub(r'(\.|\?|!)(\w)', r'\1 \2', s)
    return s.strip()


  and should_run_async(code)


In [6]:
# Porter stemmer instance used by the word-level preprocessing
p_stemmer = PorterStemmer()

# Stop-word list: the union of the stop words of every language code below,
# plus two corpus-specific terms.
stop_words = list(
    set(["course", "data"]).union(
        *(
            get_stop_words(lang)
            for lang in ('en', 'es', 'de', 'it', 'ca', 'pt',
                         'pl', 'da', 'ru', 'sv', 'sk', 'nl')
        )
    )
)

  and should_run_async(code)


In [7]:
def preprocess_word(s):
    """
    Turn one preprocessed sentence string into a list of stemmed nouns.

    Steps: tokenize, keep purely alphabetic tokens, POS-tag them, keep the
    tokens whose tag starts with 'NN', drop stop words, then stem.
    :param s: sentence to be processed
    :return: list of stemmed tokens (empty list for empty input)
    """
    if not s:
        return []

    # Tagging is done on the alphabetic tokens only, as a single sequence.
    alpha_tokens = [tok for tok in word_tokenize(s) if tok.isalpha()]

    stems = []
    for tok, tag in nltk.pos_tag(alpha_tokens):
        is_noun = tag.startswith('NN')
        if is_noun and tok not in stop_words:
            stems.append(p_stemmer.stem(tok))
    return stems


  and should_run_async(code)


In [8]:
import codecs
def lda_process(token_lists, output, n_topics=8, n_top_words=8, n_iter=1000,
                random_state=7):
    """
    Normal LDA without seeding; results are written to disk.

    Two files are produced, both prefixed with ``output``:
    ``<output>_topic.txt`` (one line per topic with its top words) and
    ``<output>_doc_topic.csv`` (the model's document-topic matrix).

    :param token_lists: list of token lists, one list per document
    :param output: path prefix for the two output files
    :param n_topics: number of topics passed to the model
    :param n_top_words: number of highest-weighted words written per topic
    :param n_iter: number of iterations passed to the model
    :param random_state: seed passed to the model
    :return: None
    """
    text_lists_clean = [" ".join(tl) for tl in token_lists]
    # Sort the vocabulary: a set of strings has no stable iteration order
    # across interpreter runs, so an unsorted vocabulary permutes the columns
    # of X and makes the fitted topics vary between runs even though
    # random_state is fixed.
    vocab = sorted(set(word_tokenize(" ".join(text_lists_clean))))
    vectorizer = CountVectorizer(ngram_range=(1, 1), vocabulary=vocab)
    X = vectorizer.fit_transform(text_lists_clean)

    model = glda.GuidedLDA(n_topics=n_topics, n_iter=n_iter,
                           random_state=random_state, refresh=20)
    model.fit(X)

    vocab_arr = np.array(vocab)
    # The context manager closes the file even if writing a topic fails.
    with codecs.open(output + "_topic.txt", "w", "utf-8") as f:
        for i, topic_dist in enumerate(model.topic_word_):
            # argsort is ascending; the reversed slice takes the n_top_words
            # largest weights, highest first.
            topic_words = vocab_arr[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
            f.write('Topic {}: {} \n'.format(i, ' '.join(topic_words)))

    doc_topic = pd.DataFrame(model.doc_topic_)
    doc_topic.to_csv(output + "_doc_topic.csv")

  and should_run_async(code)


In [9]:
# LDA for course description
# Missing descriptions become " ". The fill is done on a derived Series
# rather than with data["description"].fillna(..., inplace=True): that form
# is a chained in-place call on a column selection, which newer pandas
# versions warn about and which, with Copy-on-Write, leaves `data` unfilled.
docs = data["description"].fillna(" ").to_list()

text_lists = [preprocess_sent(text) for text in docs]
token_lists = [preprocess_word(text) for text in text_lists]
lda_process(token_lists, "description")

  and should_run_async(code)
INFO:lda:n_documents: 1009
INFO:lda:vocab_size: 8342
INFO:lda:n_words: 64214
INFO:lda:n_topics: 8
INFO:lda:n_iter: 1000
INFO:lda:<0> log likelihood: -679531
INFO:lda:<20> log likelihood: -491894
INFO:lda:<40> log likelihood: -480335
INFO:lda:<60> log likelihood: -474530
INFO:lda:<80> log likelihood: -472094
INFO:lda:<100> log likelihood: -470101
INFO:lda:<120> log likelihood: -467892
INFO:lda:<140> log likelihood: -466022
INFO:lda:<160> log likelihood: -464709
INFO:lda:<180> log likelihood: -464565
INFO:lda:<200> log likelihood: -464209
INFO:lda:<220> log likelihood: -463859
INFO:lda:<240> log likelihood: -463305
INFO:lda:<260> log likelihood: -463081
INFO:lda:<280> log likelihood: -463174
INFO:lda:<300> log likelihood: -463015
INFO:lda:<320> log likelihood: -462875
INFO:lda:<340> log likelihood: -462611
INFO:lda:<360> log likelihood: -462247
INFO:lda:<380> log likelihood: -461893
INFO:lda:<400> log likelihood: -462119
INFO:lda:<420> log likelihood: -461717

In [10]:
# LDA for instructor information
# Missing values become " ". The fill is done on a derived Series rather
# than with data["instructor_info"].fillna(..., inplace=True): that form is
# a chained in-place call on a column selection, which newer pandas versions
# warn about and which, with Copy-on-Write, leaves `data` unfilled.
docs = data["instructor_info"].fillna(" ").to_list()

text_lists = [preprocess_sent(text) for text in docs]
token_lists = [preprocess_word(text) for text in text_lists]
lda_process(token_lists, "instructor_info")


  and should_run_async(code)
INFO:lda:n_documents: 1009
INFO:lda:vocab_size: 3771
INFO:lda:n_words: 26981
INFO:lda:n_topics: 8
INFO:lda:n_iter: 1000
INFO:lda:<0> log likelihood: -291102
INFO:lda:<20> log likelihood: -195083
INFO:lda:<40> log likelihood: -190382
INFO:lda:<60> log likelihood: -187529
INFO:lda:<80> log likelihood: -186241
INFO:lda:<100> log likelihood: -185322
INFO:lda:<120> log likelihood: -185030
INFO:lda:<140> log likelihood: -184731
INFO:lda:<160> log likelihood: -184347
INFO:lda:<180> log likelihood: -183883
INFO:lda:<200> log likelihood: -183709
INFO:lda:<220> log likelihood: -183494
INFO:lda:<240> log likelihood: -183301
INFO:lda:<260> log likelihood: -183270
INFO:lda:<280> log likelihood: -183120
INFO:lda:<300> log likelihood: -182921
INFO:lda:<320> log likelihood: -182863
INFO:lda:<340> log likelihood: -182844
INFO:lda:<360> log likelihood: -182735
INFO:lda:<380> log likelihood: -182694
INFO:lda:<400> log likelihood: -182608
INFO:lda:<420> log likelihood: -182743

In [15]:
# Drop the identifier and the raw text columns, then export the rest.
# Rebinding `data` with errors="ignore" (instead of an in-place drop) keeps
# this cell re-runnable: a second run no longer raises KeyError because the
# columns are already gone.
data = data.drop(columns=['id', 'description', 'instructor_info'], errors="ignore")
data.to_csv("final_data.csv")

  and should_run_async(code)
