In [1]:
import pandas as pd

from pandas.core.frame import DataFrame
from pandas.core.series import Series

data_frame: DataFrame = pd.read_csv(r'./Courses_Berkeley_2018-01-15.csv', encoding='latin1')

#Unique Courses
data_frame = data_frame.drop_duplicates(subset=['Name'], keep='last')

#Description is null
data_frame = data_frame.dropna(subset=["Description"])

#Description too short
data_frame = data_frame[data_frame['Description'].map(len) > 20]


print(data_frame)

        Year              Field  \
8       2011      Public Health   
14      1977  Aerospace Studies   
15      1977  Aerospace Studies   
17      1978  Aerospace Studies   
18      1978  Aerospace Studies   
...      ...                ...   
305662  1970            Biology   
305663  1968            Zoology   
305665  1969            Zoology   
305667  1918        Agriculture   
305668  1917        Agriculture   

                                                     Name       Number  \
8         Statistical Analysis of Continuous Outcom- Data          145   
14                          United Slates Air Force Today     1A-1B-1C   
15                 The Developmental Qrowtti of Air Power  21A-21B-21C   
17                     Introduction to Aerospace Studies             1   
18                               The military Force Today            2   
...                                                   ...          ...   
305662    Introduction to the Science of Living Organisms    

In [30]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


lemmatizer: WordNetLemmatizer = WordNetLemmatizer()
stemmer : PorterStemmer = PorterStemmer()
sw_nltk = set(stopwords.words("english"))

descriptions: Series = data_frame['Description']


custom_stop_words: set = ("part", "class", "course", "one", "two", "three", "four", "discussion", "lecture", "hour", "day","month","semester","week","sophomore","junior",
"senior","fresh","seminar","exam","required","summer","winter","student","pre","requisite","prerequisite","lecture","introduction","introduces","essay","notes","textbook",
"etc","covering","sp","credit","pr","fsp","info","session","read","basic","hard","emphasis","form","primary","understand","learn","discus","learning","general","concept","study",
"overciew","focus","emphasize","presented","learning","seminar","proseminar","topic","major","year","distinguished", "presentation",
"hour", "concept","hours", "per", "week", "weeks","year", "years", "month", "day", "days", "definition", "define", "elaborate", "fwsp")

stopwords_removed_description: list = []

i = 0
    
for index, value in descriptions.iteritems():
    text = str(value).lower()
    text = re.sub('(\\d|\\W)+',' ',text)
    text = re.sub('[\(\)]', ' ', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = word_tokenize(text)
    text = [word for word in text if not word in sw_nltk]
    text = [word for word in text if not word in custom_stop_words]
    
    nouns_text:list = []
    for word, pos in nltk.pos_tag(text):
        if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
            nouns_text.append(word)
    
    text = nouns_text
    text = [lemmatizer.lemmatize(word) for word in text]
    #text = [stemmer.stem(word) for word in text]
    
    stopwords_removed_description.append((' ').join(text))

In [31]:
#Verifying the length of output
print(len(stopwords_removed_description))

52275


In [54]:
# BoW -> Bag Of Words

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(stopwords_removed_description)

vectorized_descriptions = vectorizer.transform(stopwords_removed_description)

print(vectorized_descriptions.shape)

['laboratory regression model outcome data square property coefficient prediction model model assumption transformation outlier point variable interaction si covariance correlation correlation method com analysis variance facto model test assumption comparison effect model health component measure model', 'defines role structure category force air force dynamic instrumentofnationalpower camacho', 'trace air power concept application identifies change evolution seek impact development air superiority concept', 'air force rotc program survey state air preview study course cadet commission force comacho f', 'investigation structure state air force structure air force organization malor command']
(52275, 55049)


In [176]:
# Label Encoding

from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

labelencoder = LabelEncoder()

labels = data_frame['Area'].values

labelencoder.fit(labels)    

encoded_labels = labelencoder.transform(labels)

categorical_labels = tf.keras.utils.to_categorical(encoded_labels)

print(len(data_frame['Area'].unique()))
print(encoded_labels.shape)
print(encoded_labels[120])
print(len(categorical_labels))
print(categorical_labels.shape)

90
(52275,)
34
52275
(52275, 90)


In [206]:
# Training model

from keras.models import Sequential
from keras import layers

input_dim = vectorized_descriptions.shape[1]  

model = Sequential()
model.add(layers.Dense(90, input_dim = input_dim, activation='relu'))
model.add(layers.Dense(120, activation='relu'))
model.add(layers.Dense(120, activation='relu'))
model.add(layers.Dense(90, activation = 'softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 90)                4954500   
                                                                 
 dense_1 (Dense)             (None, 120)               10920     
                                                                 
 dense_2 (Dense)             (None, 120)               14520     
                                                                 
 dense_3 (Dense)             (None, 90)                10890     
                                                                 
Total params: 4,990,830
Trainable params: 4,990,830
Non-trainable params: 0
_________________________________________________________________


In [207]:
from keras.backend import clear_session
clear_session()

# import scikeras
# from scikeras.wrappers import KerasClassifier

# estimator = KerasClassifier(build_fn=model, epochs=100, batch_size=5000, verbose=0)

history = model.fit(vectorized_descriptions,
                    categorical_labels, 
                    epochs=200, 
                    verbose=False, 
                    batch_size=2000)

In [208]:
accuracy = model.evaluate(vectorized_descriptions, categorical_labels, verbose=False)

print(f"Training Accuracy: {accuracy}")

Training Accuracy: 0.1486382782459259


In [248]:
test_case_one = 'In this introductory course of Psychology for Computer Science students,there are two main challenges to be met.The first is to acquaint the students with the rich diversity of the subject while maintaining coherence in the course.The second is to establish the relevance of Psychology for students of an ostensibly unrelated discipline. To address both these issues,we will focus,in the first few sessions,on the concept of Personality,as differently understood by various approaches in Psychology,as well as the emergence and evolution of those sometimes competing,at other times complementary,schools of thought.All the students will be vaguely familiar with the notion of Personality and ought to find discussion on this topic quite relevant to their own lives.After all, anything that deals with human experience and behaviour is germane to all of us.Once we have viewed Personality from a plurality of perspectives,there will be a natural lead-up to the intellectual history and contemporary.'
text = str(test_case_one).lower()
text = re.sub('(\\d|\\W)+',' ',text)
text = re.sub('[\(\)]', ' ', text)
text = re.sub('[^a-zA-Z]', ' ', text)
text = word_tokenize(text)
text = [word for word in text if not word in sw_nltk]
text = [word for word in text if not word in custom_stop_words]
    
nouns_text:list = []
for word, pos in nltk.pos_tag(text):
    if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
        nouns_text.append(word)
    
text = nouns_text
text = [lemmatizer.lemmatize(word) for word in text]
text = (' ').join(text)

print(text)

introductory psychology computer science student challenge acquaint student diversity coherence relevance psychology student discipline address issue session personality approach evolution time school student notion personality life anything deal experience behaviour germane personality plurality history


In [251]:
text_array = []
text_array.append(text)

vectorized_test_case = vectorizer.transform(text_array)
print(vectorized_test_case.shape)

(1, 55049)


In [252]:
import numpy as np

predicted_class = model.predict(vectorized_test_case) 
classes_x = np.argmax(predicted_class,axis=1)

print(predicted_class)
print(classes_x)
print((model.predict(vectorized_test_case) > 0.5).astype("int32"))

# print(labels[39])
# print(data_frame['Description'].values[39])

for i in range(len(encoded_labels)):
    if encoded_labels[i] == classes_x[0]:
        print(labels[i])

[[1.34336397e-15 4.10313189e-22 3.38987660e-09 6.60466806e-19
  2.01153320e-31 1.87463774e-19 1.84893593e-11 1.24455737e-15
  1.90658954e-31 2.48429916e-32 1.22489318e-15 7.64066586e-04
  1.60377313e-04 1.89119667e-16 3.93779163e-04 6.84369644e-11
  1.31415998e-28 6.24599046e-14 4.52428242e-13 6.71453563e-07
  3.64806669e-06 6.85365194e-07 1.10848594e-10 9.35169183e-22
  1.17975237e-17 7.66600022e-17 1.59309637e-02 5.16041694e-15
  8.86490668e-08 3.49013067e-21 1.25809381e-24 1.10685205e-15
  1.37384348e-09 6.53156688e-16 1.41801154e-12 3.52988754e-19
  3.64867404e-31 1.48524196e-04 2.87907854e-14 8.79100829e-13
  1.80786078e-21 9.07706446e-04 7.68659187e-23 2.76409404e-35
  3.90210537e-19 2.46105884e-07 9.11841132e-16 3.16129777e-35
  2.03843950e-37 2.57772972e-25 6.49133796e-13 1.25299135e-19
  5.44340949e-27 8.37603411e-20 1.25278168e-18 1.28135809e-11
  2.29118442e-07 2.75910494e-17 1.69622183e-12 1.90653526e-12
  9.19865671e-17 2.87683842e-18 2.35591457e-08 1.30623556e-17
  1.0254

Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Planning
Urban Plan

In [82]:
for i in range(len(encoded_labels)):
    if(encoded_labels[i] == predicted_class[0][0]):
        print(f"{i}")

10142


In [92]:
print(data_frame["Name"].values[10142])

 22 units from Group II (aslistedherein) including: German 100 (3), 101 (2), 102 (2), 103 (3), 104 (3), 112 (3) and six units taken from literature courses;
