## Importing modules

In [1]:
import metapy
import csv
import requests
import re
from bs4 import BeautifulSoup,Comment
from selenium import webdriver
import time
import json
import nltk
import pandas as pd
import gensim
import spacy
import en_core_web_sm 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.model_selection import GridSearchCV
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import numpy as np
import lda.datasets
import sklearn.feature_extraction.text as text
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import gensim.corpora as corpora
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics import pairwise_distances_argmin
from tkinter import *
import webbrowser



## Reading the database

In [2]:
# Load the course catalogue and drop every row that has a missing value.
# dropna() leaves gaps in the index, so the index is rebuilt as 0..n-1:
# later cells look rows up with positions produced by np.argsort, and those
# positions only match the labels when the index is contiguous.
df = (
    pd.read_csv('final_worked_new.csv', encoding='cp1252')
    .dropna()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Year,Term,Course_ID,Course_Name,Course_Description
0,2018.0,Fall,AE 100,Intro to Aerospace Engineering,Aerospace Engineering Course.Introduction to t...
1,2018.0,Fall,AE 199,Undergraduate Open Seminar,Aerospace Engineering Course.This is a seminar...
2,2018.0,Fall,AE 202,Aerospace Flight Mechanics,Aerospace Engineering Course.Fundamental princ...
3,2018.0,Fall,AE 298,Special Topics,Aerospace Engineering Course.Lectures and disc...
4,2018.0,Fall,AE 311,Incompressible Flow,Aerospace Engineering Course.Equations of moti...


## Extracting data for training LDA

In [3]:
# Plain Python list holding one course description per row of `df`.
data = df["Course_Description"].values.tolist()

## Cleaning the list

In [4]:
# Remove every apostrophe from each description.
data = list(map(lambda description: re.sub("\'", "", description), data))

## Functions required for bag of words and lemmatization

In [5]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is coerced with ``str`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
    yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        for raw_sentence in sentences
    )

#### More information on `gensim.utils.simple_preprocess`, used above, can be found here: https://tedboy.github.io/nlps/generated/generated/gensim.utils.simple_preprocess.html. It converts a document into a list of lowercase tokens. Punctuation is discarded by the tokenizer, and if the `deacc` argument is set to `True`, accent marks are also removed from the tokens.

In [6]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document is joined with single spaces and run through the
    module-level spaCy pipeline ``nlp`` (which must exist by the time this
    function is called). Tokens whose ``pos_`` is in ``allowed_postags`` are
    replaced by their ``lemma_``; tokens lemmatized to the ``'-PRON-'``
    placeholder are dropped so they leave no stray whitespace behind.
    POS tag names are listed at https://spacy.io/api/annotation.

    Parameters
    ----------
    texts : iterable of iterable of str
        Tokenized documents, e.g. the output of ``sent_to_words``.
    allowed_postags : collection of str
        Coarse POS tags to keep. The default is an immutable tuple so the
        default value can never be mutated between calls.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ != '-PRON-'
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out

#### Lemmatization is an advanced form of stemming that converts words into their parent word using inputs like noun, adjective, verb and adverb. The default tag is noun. Using this we can find the root word by their characteristics. For example running will become run using lemmatization.

## Applying tokenizer and lemmatizer to the list of documents

In [7]:
# Load the small English spaCy model without its parser and NER components.
# `nlp` has to exist before lemmatization() is called, since that function
# reads it as a global.
nlp = en_core_web_sm.load(disable=['parser', 'ner'])

# Tokenize every description, then reduce each token list to a string of lemmas.
data_words = [tokens for tokens in sent_to_words(data)]
data_lemmatized = lemmatization(
    data_words,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=

  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])


## Stopwords definition

In [8]:
# Extra stop words are read from numbers.txt, one entry per line. The file is
# opened in a `with` block so the handle is closed once the list is built.
list_no = []
with open("numbers.txt") as stop_no:
    for line in stop_no:
        # re.UNICODE must be passed as `flags=`; the fourth positional
        # argument of re.sub is `count`, not `flags`.
        list_no.append(re.sub(r'[-,?!":;()|]\n', "", line, flags=re.UNICODE).replace("'", "").strip())

# Union of NLTK's English stop words, the entries from numbers.txt, a
# hand-picked list of catalogue boilerplate terms and scikit-learn's English
# stop words; set() removes duplicates before converting back to a list.
STOPLIST = list(set(stopwords.words('english') + list_no + ["new","experience","edu","illinois","advise","advice","topic","area","approval","use","consent","letter","grading","term","repeat","provide","require","need","include","study","approve","vary","exist","consider","make","prerequisite","credit", "course","hour","hours","courses","student","students","instructor","instructors"]+list(ENGLISH_STOP_WORDS)))
# Punctuation marks are appended as additional stop tokens.
STOPLIST.extend(('.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}','/','-'))

#### Very common words add noise to machine learning and text mining analyses, so we define a list of stopwords. This stopword list is then passed to a count vectorizer, which removes these words from the documents.

## Count Vectorizer definition

In [9]:
# Bag-of-words vectorizer used to build the document-term matrix for LDA.
vectorizer = CountVectorizer(
    analyzer='word',                  # features are word tokens
    lowercase=True,                   # lowercase text before tokenizing
    token_pattern='[a-zA-Z0-9]{3,}',  # a token is 3+ alphanumeric characters
    stop_words=STOPLIST,              # custom stop-word list
    min_df=10,                        # minimum document frequency for a term
)

#### CountVectorizer function converts a collection of documents into token counts. The documentation for the same can be found here : http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [10]:
# Learn the vocabulary from the lemmatized descriptions and build the
# document-term count matrix in one step.
data_vectorized = vectorizer.fit_transform(data_lemmatized)

## Observing feature names in the vectorizer

In [11]:
# Display the terms the fitted vectorizer kept as features.
vectorizer.get_feature_names()

['able',
 'academic',
 'access',
 'accountancy',
 'accounting',
 'accy',
 'acquisition',
 'activity',
 'additional',
 'address',
 'administration',
 'admission',
 'advanced',
 'aerospace',
 'affect',
 'algorithm',
 'alternative',
 'analysis',
 'analytic',
 'analytical',
 'analyze',
 'application',
 'apply',
 'approach',
 'appropriate',
 'art',
 'aspect',
 'asset',
 'assignment',
 'associate',
 'augment',
 'background',
 'base',
 'basic',
 'beam',
 'behavior',
 'bioengineer',
 'bioengineering',
 'biological',
 'biology',
 'book',
 'boundary',
 'build',
 'building',
 'business',
 'calculus',
 'campus',
 'capital',
 'career',
 'case',
 'cell',
 'challenge',
 'change',
 'characteristic',
 'chem',
 'chemical',
 'chemistry',
 'circuit',
 'civil',
 'class',
 'classical',
 'classification',
 'cluster',
 'collection',
 'college',
 'communication',
 'community',
 'company',
 'complete',
 'completion',
 'complex',
 'component',
 'composition',
 'computation',
 'computational',
 'compute',
 'compu

In [12]:
# Keep the vectorizer's feature names as a NumPy array.
vocab = np.array(vectorizer.get_feature_names())

## LDA model!

#### To choose the best LDA model we can use GridSearchCV. It selects the best model among the candidate numbers of topics (`n_components`) and learning decay values.

In [13]:
# Hyper-parameter grid explored by the search.
search_params = {
    'n_components': [10, 15, 20],
    'learning_decay': [.5, .7, .9],
}

# NOTE(review): `n_topics` is the deprecated spelling of `n_components`. On the
# scikit-learn release that produced the recorded output it appears to take
# precedence over `n_components`, which would make the `n_components` axis of
# the grid a no-op - TODO confirm, and migrate together with any code that
# reads `best_lda_model.n_topics`.
lda_model = LatentDirichletAllocation(n_topics=10, random_state=100)

# Fit one LDA model per parameter combination on the document-term matrix.
model = GridSearchCV(estimator=lda_model, param_grid=search_params)
model.fit(data_vectorized)





GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=10,
             perp_tol=0.1, random_state=100, topic_word_prior=None,
             total_samples=1000000.0, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_components': [10, 15, 20], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

## Selecting the Best LDA model

In [14]:
# Report the winning parameter combination from the grid search.
print("Chosen LDA Model is: ", model.best_params_)

# Keep the best estimator and compute the document-topic matrix
# (one row per course description) that later cells compare queries against.
best_lda_model = model.best_estimator_
lda_output = best_lda_model.transform(data_vectorized)

Chosen LDA Model is:  {'learning_decay': 0.9, 'n_components': 10}


#### This model has already been fitted to the vectorized data (the document-term matrix), as can be seen above. Hence our LDA model with the best parameters is ready!

## Visualization

#### To analyze how well our topics are divided we use the famous pyLDAvis package.This helps to give us intricate details about our model and its classifications. This documentation can be found here: http://pyldavis.readthedocs.io/en/latest/

In [15]:
# Switch pyLDAvis to notebook mode so the panel renders inline.
pyLDAvis.enable_notebook()
# Build the visualisation data from the fitted model, the document-term matrix
# and the vectorizer. mds='tsne' presumably selects t-SNE for the 2-D topic
# layout - confirm against the pyLDAvis documentation.
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
# Bare last expression so the notebook displays the panel.
panel

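#### To investigate further we can print the word weights of the LDA model for each topic (the model's `components_` matrix; these are unnormalized weights rather than probabilities). This can be seen below.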

In [16]:
# Topic-word weight table: one row per fitted topic, one column per vocabulary
# term. The number of topics is read from the fitted `components_` matrix
# rather than from the deprecated `n_topics` constructor parameter, so the row
# labels always match the model that was actually selected.
topic_names = ["Topic" + str(i) for i in range(best_lda_model.components_.shape[0])]
topic_keywords = pd.DataFrame(
    best_lda_model.components_,
    index=topic_names,
    columns=vectorizer.get_feature_names(),
)

topic_keywords

Unnamed: 0,able,academic,access,accountancy,accounting,accy,acquisition,activity,additional,address,...,visualization,water,wave,way,web,work,world,write,writing,year
Topic0,0.138781,0.183117,0.152868,0.214326,0.173065,0.180842,0.145121,0.15428,0.151145,0.146955,...,0.191011,4.096251,1.018265,0.151208,0.85196,2.140356,0.199293,0.159479,0.452996,0.165714
Topic1,0.151885,18.275117,14.563248,1.269699,0.612096,0.734374,5.858323,11.64239,5.180503,1.105621,...,0.166526,3.180184,0.150405,18.45594,17.749721,24.317451,16.589375,13.430963,29.795139,0.494686
Topic2,3.745402,0.73492,3.70978,0.222807,0.180338,0.182239,0.13742,2.168063,1.966314,1.681309,...,0.193265,0.930266,0.258504,0.223478,3.340019,17.161929,4.727434,3.54435,8.368015,0.313594
Topic3,0.16721,0.149265,0.186799,0.167053,0.139834,0.173595,0.182069,0.152196,0.688202,0.180482,...,1.139777,10.824552,6.307171,0.150821,0.274161,3.229569,1.643807,1.615067,0.453466,0.189281
Topic4,0.144508,0.139705,0.14428,0.217326,0.141739,0.177302,0.149053,0.158874,0.151313,0.157721,...,0.150595,0.152102,0.149765,0.145783,0.144998,0.145155,0.142388,0.146247,0.148422,0.14311
Topic5,0.146006,0.148806,0.148112,31.723563,15.995476,26.84532,0.17452,0.164542,0.662121,0.480991,...,0.15442,0.150885,0.151218,0.158114,0.139941,0.898313,0.142131,0.155457,0.152959,0.144067
Topic6,1.681101,3.11171,0.140372,4.557555,4.095597,9.004562,1.535132,4.250848,2.313645,3.6493,...,0.161126,0.144203,0.141073,3.572123,0.149886,21.309,11.364097,0.176969,0.182725,4.950635
Topic7,5.979701,1.056762,0.153671,1.479576,0.736161,0.637822,0.421859,0.574087,0.171175,2.261466,...,36.850993,0.160552,0.142051,3.763918,24.148075,16.296166,0.238222,0.247762,0.199329,0.154259
Topic8,0.18018,0.146139,1.286264,0.196953,0.157487,0.19288,4.737991,0.412475,0.213388,0.170158,...,0.565546,0.14665,19.346747,0.15593,0.195107,0.304455,0.333085,3.239195,0.497259,1.960816
Topic9,0.147413,6.824665,0.149782,4.862481,0.570949,3.385597,2.354425,0.209433,0.159512,0.166012,...,0.173454,0.318313,0.150159,2.279638,0.169716,0.255578,0.347993,0.378368,0.233239,4.841503


## Predicting Courses that Match a specific Query

#### Let's dive into predicting the topics for the text entered by the user. For this we need to define the predict function.

In [17]:
def predict_topic(text, nlp=nlp):
    """Infer the topic distribution of a query.

    The query is run through the same pipeline as the training documents
    (``sent_to_words`` -> ``lemmatization`` -> ``vectorizer.transform``)
    and then through ``best_lda_model.transform``.

    Parameters
    ----------
    text : str or list of str
        The query. A bare string is wrapped in a one-element list; otherwise
        ``sent_to_words`` would iterate over its individual characters.
    nlp : object
        Unused; kept so existing calls keep working. ``lemmatization`` reads
        the module-level ``nlp`` itself.

    Returns
    -------
    topic : list
        The row of ``topic_keywords`` (per-term weights) for the dominant
        topic of the first query document.
    topic_probability_scores : array
        Output of ``best_lda_model.transform`` for the query, one row per
        query document.

    Raises
    ------
    IndexError
        If ``text`` is an empty list (there is no first document).
    """
    if isinstance(text, str):
        text = [text]
    tokens = list(sent_to_words(text))
    lemm = lemmatization(tokens, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    vector = vectorizer.transform(lemm)
    topic_probability_scores = best_lda_model.transform(vector)
    # Take the argmax over the first document's row only: an argmax over the
    # flattened matrix would give an index that is not a topic number as soon
    # as more than one document is passed.
    dominant_topic = int(np.argmax(topic_probability_scores[0]))
    topic = topic_keywords.iloc[dominant_topic, :].values.tolist()
    return topic, topic_probability_scores


#### Defining a similarity function for getting the top 20 course names

In [18]:
def _rank_similar_courses(query, doc_topic_probs, documents, top_n):
    """Return (Course_ID, Course_Name) pairs for the top_n courses closest to query."""
    _, query_topic_probs = predict_topic([query])
    dists = euclidean_distances(query_topic_probs.reshape(1, -1), doc_topic_probs)[0]
    nearest_positions = np.argsort(dists)[:top_n]
    # argsort yields row positions, so the lookup has to be positional (.iloc);
    # label-based indexing breaks whenever the frame's index is not 0..n-1.
    matches = documents.iloc[nearest_positions]
    return list(zip(matches["Course_ID"], matches["Course_Name"]))


def _show_search_results(query, ranked_courses):
    """Open a Tk window that lists ranked_courses under a heading quoting query."""
    searchf = Tk()
    searchf.configure(background="sienna1")
    searchf.wm_title("Course Search")
    searchf.wm_iconbitmap("images/uiuc_logo.ico")

    blank = '           '
    label1 = Label(searchf, text="Search Results for \"" + query + "\"", font=("ComicSansMS", 20), bg="sienna1")
    label1.pack()
    # Wide blank labels act as vertical spacers and set a minimum window width.
    Label(searchf, text=blank*40, font=("ComicSansMS", 10), bg="sienna1").pack()

    def callbacks(event):
        webbrowser.open_new(r"https://courses.illinois.edu/schedule/2018/fall")

    b = Button(searchf, text="Go to Course Page!", anchor=W, justify=LEFT, padx=20, font=("ComicSansMS", 10))
    b.bind("<Button-1>", callbacks)
    b.pack()
    Label(searchf, text=blank*40, font=("ComicSansMS", 10), bg="sienna1").pack()

    for course_id, course_name in ranked_courses:
        # Format the pair explicitly: passing the tuple itself as `text` makes
        # Tk render it as a brace-quoted Tcl list.
        labl = Label(searchf, text="%s: %s" % (course_id, course_name), font=("ComicSansMS", 12), justify=LEFT, fg="blue", cursor="hand2", bg="sienna1")
        labl.pack()


def similar_documents(text, doc_topic_probs, documents = df, nlp=nlp, top_n=20):
    """Show the courses whose topic mix is closest to a free-text query.

    The query's topic distribution (from ``predict_topic``) is compared with
    every row of ``doc_topic_probs`` by Euclidean distance, and the ``top_n``
    nearest courses are listed in a new Tk window.

    Parameters
    ----------
    text : str or iterable of str
        The query. The pieces are concatenated with ``"".join``, so both a
        plain string and a list of characters are accepted.
    doc_topic_probs : array
        Document-topic matrix. Row ``i`` is assumed to describe row ``i`` of
        ``documents`` (positional alignment).
    documents : pandas.DataFrame
        Frame with ``Course_ID`` and ``Course_Name`` columns; this argument is
        what the results are read from.
    nlp : object
        Unused; kept so existing calls keep working.
    top_n : int
        Maximum number of courses to list.

    Returns
    -------
    None
    """
    query = "".join(text)
    ranked_courses = _rank_similar_courses(query, doc_topic_probs, documents, top_n)
    _show_search_results(query, ranked_courses)
        

## Using Tkinter to take value from USER

In [None]:
# Main search window. The previous try/bare-except fallback (Toplevel() and
# then Tk() again) could not recover from a failing Tk(), so a failure is now
# allowed to surface directly.
top = Tk()
top.wm_title("Course Search")
top.wm_iconbitmap("images/uiuc_logo.ico")

# Header and footer images. The PhotoImage objects stay bound to module-level
# names for as long as the window is shown.
photo = PhotoImage(file="images/uiuc_heading.gif")
w = Label(top, image=photo)
w.pack(side = "top")
photoimage = PhotoImage(file="images/Alma_mater.gif")
v = Label(top, image=photoimage)
v.pack(side = "bottom", fill = "both")

# Coloured band between the two images; the entry and button are placed over it.
f = Frame(top, width=600,height=350,bg="royal blue")
f.pack(fill=X, expand=True)

e1=Entry(top,bd=10,width=40,font=("Helvetica", 15))
e1.insert(END, 'search here')
e1.place(relx=0.5, rely=0.35, anchor=CENTER)

# The entry is read inside the lambda, i.e. at click time, so the search uses
# whatever the user has typed rather than the initial placeholder.
b1=Button(top,font=("Helvetica", 15),text="Click me!",command= lambda:similar_documents(text=e1.get(), doc_topic_probs=lda_output, documents = df, top_n=20))
b1.place(relx=0.5, rely=0.5, anchor=CENTER)
top.mainloop()