In [1]:
import pandas as pd

from pandas.core.frame import DataFrame
from pandas.core.series import Series

data_frame: DataFrame = pd.read_csv(r'./Courses_Berkeley_2018-01-15.csv', encoding='latin1')

#Unique Courses
data_frame = data_frame.drop_duplicates(subset=['Name'], keep='last')

#Description is null
data_frame = data_frame.dropna(subset=["Description"])

#Description too short
data_frame = data_frame[data_frame['Description'].map(len) > 20]


print(data_frame)

        Year              Field  \
8       2011      Public Health   
14      1977  Aerospace Studies   
15      1977  Aerospace Studies   
17      1978  Aerospace Studies   
18      1978  Aerospace Studies   
...      ...                ...   
305662  1970            Biology   
305663  1968            Zoology   
305665  1969            Zoology   
305667  1918        Agriculture   
305668  1917        Agriculture   

                                                     Name       Number  \
8         Statistical Analysis of Continuous Outcom- Data          145   
14                          United Slates Air Force Today     1A-1B-1C   
15                 The Developmental Qrowtti of Air Power  21A-21B-21C   
17                     Introduction to Aerospace Studies             1   
18                               The military Force Today            2   
...                                                   ...          ...   
305662    Introduction to the Science of Living Organisms    

In [30]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


lemmatizer: WordNetLemmatizer = WordNetLemmatizer()
stemmer : PorterStemmer = PorterStemmer()
sw_nltk = set(stopwords.words("english"))

descriptions: Series = data_frame['Description']


custom_stop_words: set = ("part", "class", "course", "one", "two", "three", "four", "discussion", "lecture", "hour", "day","month","semester","week","sophomore","junior",
"senior","fresh","seminar","exam","required","summer","winter","student","pre","requisite","prerequisite","lecture","introduction","introduces","essay","notes","textbook",
"etc","covering","sp","credit","pr","fsp","info","session","read","basic","hard","emphasis","form","primary","understand","learn","discus","learning","general","concept","study",
"overciew","focus","emphasize","presented","learning","seminar","proseminar","topic","major","year","distinguished", "presentation",
"hour", "concept","hours", "per", "week", "weeks","year", "years", "month", "day", "days", "definition", "define", "elaborate", "fwsp")

stopwords_removed_description: list = []

i = 0
    
for index, value in descriptions.iteritems():
    text = str(value).lower()
    text = re.sub('(\\d|\\W)+',' ',text)
    text = re.sub('[\(\)]', ' ', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = word_tokenize(text)
    text = [word for word in text if not word in sw_nltk]
    text = [word for word in text if not word in custom_stop_words]
    
    nouns_text:list = []
    for word, pos in nltk.pos_tag(text):
        if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
            nouns_text.append(word)
    
    text = nouns_text
    text = [lemmatizer.lemmatize(word) for word in text]
    #text = [stemmer.stem(word) for word in text]
    
    stopwords_removed_description.append((' ').join(text))

In [31]:
#Verifying the length of output
print(len(stopwords_removed_description))

52275


In [54]:
# BoW -> Bag Of Words

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(stopwords_removed_description)

vectorized_descriptions = vectorizer.transform(stopwords_removed_description)

print(vectorized_descriptions.shape)

['laboratory regression model outcome data square property coefficient prediction model model assumption transformation outlier point variable interaction si covariance correlation correlation method com analysis variance facto model test assumption comparison effect model health component measure model', 'defines role structure category force air force dynamic instrumentofnationalpower camacho', 'trace air power concept application identifies change evolution seek impact development air superiority concept', 'air force rotc program survey state air preview study course cadet commission force comacho f', 'investigation structure state air force structure air force organization malor command']
(52275, 55049)


In [176]:
# Label Encoding

from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

labelencoder = LabelEncoder()

labels = data_frame['Area'].values

labelencoder.fit(labels)    

encoded_labels = labelencoder.transform(labels)

categorical_labels = tf.keras.utils.to_categorical(encoded_labels)

print(len(data_frame['Area'].unique()))
print(encoded_labels.shape)
print(encoded_labels[120])
print(len(categorical_labels))
print(categorical_labels.shape)

90
(52275,)
34
52275
(52275, 90)


In [206]:
# Training model

from keras.models import Sequential
from keras import layers

input_dim = vectorized_descriptions.shape[1]  

model = Sequential()
model.add(layers.Dense(90, input_dim = input_dim, activation='relu'))
model.add(layers.Dense(120, activation='relu'))
model.add(layers.Dense(120, activation='relu'))
model.add(layers.Dense(90, activation = 'softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 90)                4954500   
                                                                 
 dense_1 (Dense)             (None, 120)               10920     
                                                                 
 dense_2 (Dense)             (None, 120)               14520     
                                                                 
 dense_3 (Dense)             (None, 90)                10890     
                                                                 
Total params: 4,990,830
Trainable params: 4,990,830
Non-trainable params: 0
_________________________________________________________________


In [207]:
from keras.backend import clear_session
clear_session()

# import scikeras
# from scikeras.wrappers import KerasClassifier

# estimator = KerasClassifier(build_fn=model, epochs=100, batch_size=5000, verbose=0)

history = model.fit(vectorized_descriptions,
                    categorical_labels, 
                    epochs=200, 
                    verbose=False, 
                    batch_size=2000)

In [208]:
accuracy = model.evaluate(vectorized_descriptions, categorical_labels, verbose=False)

print(f"Training Accuracy: {accuracy}")

Training Accuracy: 0.1486382782459259


In [229]:
test_case_one = 'This course will provide a basic understanding of computing, networking, programming concepts, and exploitation techniques, as they relate to computer security. In security testing, an ethical hacker with legal permission attempts to penetrate a system or systems to find a weak link and then analyze ways to correct the security flaws. Ethical hacking relies on a combination of creativeness, expansion of knowledge based on best practices, legal issues, and client industry regulations as well as known threats and the breath of the target. organization’s security presence or point of risk.'
text = str(test_case_one).lower()
text = re.sub('(\\d|\\W)+',' ',text)
text = re.sub('[\(\)]', ' ', text)
text = re.sub('[^a-zA-Z]', ' ', text)
text = word_tokenize(text)
text = [word for word in text if not word in sw_nltk]
text = [word for word in text if not word in custom_stop_words]
    
nouns_text:list = []
for word, pos in nltk.pos_tag(text):
    if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
        nouns_text.append(word)
    
text = nouns_text
text = [lemmatizer.lemmatize(word) for word in text]
text = (' ').join(text)

print(text)

concept exploitation technique computer security security hacker permission attempt system system link way security flaw relies combination expansion knowledge practice issue client industry regulation threat target organization security presence point risk


In [230]:
text_array = []
text_array.append(text)

vectorized_test_case = vectorizer.transform(text_array)
print(vectorized_test_case.shape)

(1, 55049)


In [232]:
import numpy as np

predicted_class = model.predict(vectorized_test_case) 
classes_x = np.argmax(predicted_class,axis=1)

print(predicted_class)
print(classes_x)
print((model.predict(vectorized_test_case) > 0.5).astype("int32"))

# print(labels[39])
# print(data_frame['Description'].values[39])

for i in range(len(encoded_labels)):
    if encoded_labels[i] == classes_x[0]:
        print(labels[i])

[[4.47212551e-17 6.10463463e-31 3.61998394e-30 1.48730368e-25
  0.00000000e+00 1.73652913e-23 0.00000000e+00 3.30881236e-27
  7.99098671e-20 7.60956299e-28 3.66748445e-27 1.52443412e-32
  4.86283056e-07 1.43012071e-37 6.00445866e-34 1.47147653e-29
  1.71510140e-29 5.61318665e-21 6.13609713e-19 9.99993443e-01
  6.83299679e-19 2.37833197e-14 3.09668847e-24 0.00000000e+00
  3.92366261e-13 3.45533877e-08 1.10550044e-13 8.85970342e-20
  3.20280320e-28 5.61198893e-28 1.59985120e-30 0.00000000e+00
  4.34533129e-30 1.81253501e-25 1.73473857e-11 1.02405655e-20
  2.47398885e-37 3.18994831e-15 0.00000000e+00 2.70917500e-30
  0.00000000e+00 1.02378619e-14 3.05450385e-06 0.00000000e+00
  2.98301064e-26 2.50728740e-16 1.47635636e-11 1.09129630e-28
  5.03199652e-12 3.84629070e-35 6.22148600e-13 0.00000000e+00
  1.43239823e-36 1.22512852e-23 3.00956958e-06 3.74041171e-08
  3.55367162e-27 3.96294952e-15 1.18833903e-27 2.19415090e-21
  4.65983442e-37 9.85964247e-29 4.36354689e-18 1.96653164e-21
  0.0000

In [82]:
for i in range(len(encoded_labels)):
    if(encoded_labels[i] == predicted_class[0][0]):
        print(f"{i}")

10142


In [92]:
print(data_frame["Name"].values[10142])

 22 units from Group II (aslistedherein) including: German 100 (3), 101 (2), 102 (2), 103 (3), 104 (3), 112 (3) and six units taken from literature courses;
