In [1]:
import pandas as pd
training_data = pd.read_csv("content/training_data.csv")
testing_data = pd.read_csv("content/testing_data.csv")

print("------------------------------------")
print("Size of training dataset: {0}".format(len(training_data)))
print("Size of testing dataset: {0}".format(len(testing_data)))
print("------------------------------------")

print("------------------------------------")
print("Sample Data")
print("LABEL: {0} / SENTENCE: {1}".format(training_data.iloc[-1,0], training_data.iloc[-1,1]))
print("------------------------------------")

------------------------------------
Size of training dataset: 7808
Size of testing dataset: 867
------------------------------------
------------------------------------
Sample Data
LABEL: F / SENTENCE: 'Half of it is going straight to charity, another quarter going straight to scientific research, an eighth to the parkour community, a sixteenth to towards spreading information about health and...|||Find a path or suffer more.|||http://personalitycafe.com/enneagram-personality-theory-forum/85323-enneagram-type-mbti-type-compared-statistics.html yep.|||I kind of anchor on Fi and Ne makes having Ni really fun. INFP for me as they tire me out less and our views tend to align more.|||The two ESTPs I have gotten the chance to know seem to experience much more than other people who have been on the planet for the same amount of time and are quite the renaissance (wo)men.  Is this...|||I don't really have a best friend ISTP(passion-amateur group co-founder), INTJ(intellectual and various sma

In [2]:
# Preview of the data in the csv file, which has two columns: 
# (1)type - label of the post (2)posts - the corresponding post content
training_data.head()

Unnamed: 0,type,posts
0,F,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,T,'I'm finding the lack of me in these posts ver...
2,T,'Good one _____ https://www.youtube.com/wat...
3,T,"'Dear INTP, I enjoyed our conversation the o..."
4,T,'You're fired.|||That's another silly misconce...


In [3]:
# Extract the labels and posts and store into List

# Get the list of training data (posts)
training_posts=training_data['posts'].tolist()
# Get the list of corresponding labels for the training data (posts)
training_labels=training_data['type'].tolist()

# Get the list of testing data (posts)
testing_posts=testing_data['posts'].tolist()
# Get the list of corresponding labels for the testing data (posts)
testing_labels=testing_data['type'].tolist()

In [4]:
# remove the URL from the post and compare, by experimental results, when you remove the URL from the post versus keeping the URL in the post.
# function to remove the URL from the post
def remove_url(post):
    import re
    # remove the URL from the post
    post=re.sub(r'http\S+', '', post)
    return post

# use the function to remove the URL from the post
training_posts_no_url=[remove_url(post) for post in training_posts]
testing_posts_no_url=[remove_url(post) for post in testing_posts]


In [5]:
#calculate the test results
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# calculate the test results
def calculate_test_results(training_posts, training_labels, testing_posts, testing_labels):
    # Create the bag of words
    count_vect = CountVectorizer()
    # Fit the bag of words on the training data
    training_data_features = count_vect.fit_transform(training_posts)
    # Transform the testing data
    testing_data_features = count_vect.transform(testing_posts)
    # Create a Multinomial Naive Bayes classifier
    clf = MultinomialNB().fit(training_data_features, training_labels)
    # Predict the labels on the testing data
    predicted_labels = clf.predict(testing_data_features)
    # Calculate the accuracy of the model
    accuracy = accuracy_score(testing_labels, predicted_labels)
    return accuracy

#before removing the URL
print("------------------------------------")
print("Before removing the URL")
print("Accuracy of the model: {0}".format(calculate_test_results(training_posts, training_labels, testing_posts, testing_labels)))
print("------------------------------------")


# after removing the URL, the accuracy of the model is higher
print("------------------------------------")
print("After removing the URL")
print("Accuracy of the model: {0}".format(calculate_test_results(training_posts_no_url, training_labels, testing_posts_no_url, testing_labels)))
print("------------------------------------")


# The result is that the accuracy of the model is lower when you remove the URL from the post.

------------------------------------
Before removing the URL
Accuracy of the model: 0.7820069204152249
------------------------------------
------------------------------------
After removing the URL
Accuracy of the model: 0.7797001153402537
------------------------------------


In [6]:
#pre-process the training set by integrating several text pre-processing techniques (e.g. tokenisation, removing numbers, converting to lowercase, removing stop words, stemming, etc.).
#  You should test and justify the reason why you apply the specific preprocessing techniques based on the test results.

#tokenisation
import re, string
# function to tokenise the post
def tokenise(post):
    # tokenise the post
    post=re.sub("[^a-zA-Z]", " ", post)
    post=post.lower()
    return post

#remove numbers from the post
def remove_numbers(post):
    # remove the numbers from the post
    post=re.sub("[0-9]", " ", post)
    return post

#convert to lowercase
def convert_to_lowercase(post):
    # convert to lowercase
    post=post.lower()
    return post

#remove stop words
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

# function to remove the stop words from the post
def remove_stop_words(post):
    # remove the stop words from the post
    post=post.split()
    post=[word for word in post if not word in stop_words]
    post=" ".join(post)
    return post

# function to remove punctuations
def remove_punctuations(post):
    # remove the punctuations from the post
    post = [word for word in post if word not in string.punctuation]
    post = ' '.join(post)
    return post

#stemming
from nltk.stem.snowball import SnowballStemmer
stemmer=SnowballStemmer("english")

# function to stem the post
def stem_post(post):
    # stem the post
    post=post.split()
    post=[stemmer.stem(word) for word in post]
    post=" ".join(post)
    return post


# apply the tokenisation, remove numbers, convert to lowercase, remove stop words, stemming
def preprocess_training_set(training_posts):
    # apply the tokenisation, remove numbers, convert to lowercase, remove stop words, stemming
    training_posts=[tokenise(post) for post in training_posts]
    training_posts=[remove_numbers(post) for post in training_posts]
    training_posts=[convert_to_lowercase(post) for post in training_posts]
    training_posts=[remove_stop_words(post) for post in training_posts]
    # training_posts = [remove_punctuations(post) for post in training_posts]
    training_posts=[stem_post(post) for post in training_posts]
    return training_posts

processed_training_posts=preprocess_training_set(training_posts)

#test the results
print("------------------------------------")
print("Before processing the training set")
print("Accuracy of the model: {0}".format(calculate_test_results(training_posts, training_labels, testing_posts, testing_labels)))
print("------------------------------------")

#after processing the training set
print("------------------------------------")
print("After processing the training set")
print("Accuracy of the model: {0}".format(calculate_test_results(processed_training_posts, training_labels, testing_posts, testing_labels)))
print("------------------------------------")

------------------------------------
Before processing the training set
Accuracy of the model: 0.7820069204152249
------------------------------------
------------------------------------
After processing the training set
Accuracy of the model: 0.657439446366782
------------------------------------


## PART B


In [7]:
""" build a word embedding model (for representing word vectors, such as word2vec-CBOW, word2vec-Skip gram, fastText, and Glove) 
for the input embedding of your sequence model  """

# Training word embeddings using processed_training_posts for the input embedding of your sequence model
#start
import gensim
from gensim.models import Word2Vec

# training word embeddings
def train_word_embeddings(training_posts):
    # training word embeddings
    model = Word2Vec(training_posts, window=5, workers=4)
    return model

#training model for word embedding
word_embedding_model=train_word_embeddings(processed_training_posts)

In [8]:
# extract and apply the pretrained word embedding to the training set
def extract_pretrained_word_embedding(training_posts, word_embedding_model):
    # extract and apply the pretrained word embedding to the training set
    training_posts_embedding = []
    for post in training_posts:
        post_embedding = []
        for word in post.split():
            try:
                post_embedding.append(word_embedding_model[word])
            except:
                continue
        if len(post_embedding) > 0:
            training_posts_embedding.append(post_embedding)
    return training_posts_embedding

# apply the pretrained word embedding to the testing set
def apply_pretrained_word_embedding(testing_posts, word_embedding_model):
    # apply the pretrained word embedding to the testing set
    testing_posts_embedding = []
    for post in testing_posts:
        post_embedding = []
        for word in post.split():
            try:
                post_embedding.append(word_embedding_model[word])
            except:
                continue
        if len(post_embedding) > 0:
            testing_posts_embedding.append(post_embedding)
    return testing_posts_embedding

# extract and apply the pretrained word embedding to the training set
processed_training_posts_embedding=extract_pretrained_word_embedding(processed_training_posts, word_embedding_model)
# apply the pretrained word embedding to the testing set
processed_testing_posts_embedding=apply_pretrained_word_embedding(testing_posts, word_embedding_model)

In [9]:
import numpy as np
#import keras


#concatenate the trained word embedding and pretrained word embedding and apply to the sequence model
def concatenate_pretrained_word_embedding(training_posts_embedding, testing_posts_embedding):
    # concatenate the trained word embedding and pretrained word embedding and apply to the sequence model
    training_posts_embedding = np.array(training_posts_embedding)
    testing_posts_embedding = np.array(testing_posts_embedding)
    training_posts_embedding = np.concatenate((training_posts_embedding, testing_posts_embedding), axis=0)
    return training_posts_embedding

#apply the pretrained word embedding to the training set
training_posts_embedding=extract_pretrained_word_embedding(processed_training_posts_embedding, word_embedding_model)


In [27]:
#bi-directional sequence model in order to classify the label (T or F)
#start
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Bidirectional
from keras.utils import np_utils

#precision and recall
from sklearn.metrics import precision_score, recall_score

# function to build the bi-directional sequence model
def build_bi_directional_sequence_model(training_posts_embedding, training_labels):
    # build the bi-directional sequence model
    model = Sequential()
    model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(units=64)))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# function to train the bi-directional sequence model
def train_bi_directional_sequence_model(model, training_posts_embedding, training_labels):
    # train the bi-directional sequence model
    model.fit(training_posts_embedding, training_labels, epochs=10, batch_size=32, verbose=1)
    return model

# function to test the bi-directional sequence model
def test_bi_directional_sequence_model(model, testing_posts_embedding, testing_labels):
    # test the bi-directional sequence model
    scores = model.evaluate(testing_posts_embedding, testing_labels, verbose=1)
    return scores

# function to predict the label (T or F)
def predict_label(model, testing_posts_embedding):
    # predict the label (T or F)
    predicted_labels = model.predict(testing_posts_embedding)
    predicted_labels = np.where(predicted_labels > 0.5, 1, 0)
    return predicted_labels

# function to calculate the accuracy of the bi-directional sequence model
def calculate_bi_directional_sequence_model_accuracy(predicted_labels, testing_labels):
    # calculate the accuracy of the bi-directional sequence model
    accuracy = np.mean(predicted_labels == testing_labels)
    return accuracy

# function to calculate the F1 score of the bi-directional sequence model
def calculate_bi_directional_sequence_model_f1_score(predicted_labels, testing_labels):
    # calculate the F1 score of the bi-directional sequence model
    f1_score = f1_score(testing_labels, predicted_labels)
    return f1_score

# function to calculate the precision of the bi-directional sequence model
def calculate_bi_directional_sequence_model_precision(predicted_labels, testing_labels):
    # calculate the precision of the bi-directional sequence model
    precision = precision_score(testing_labels, predicted_labels)
    return precision

# function to calculate the recall of the bi-directional sequence model
def calculate_bi_directional_sequence_model_recall(predicted_labels, testing_labels):
    # calculate the recall of the bi-directional sequence model
    recall = recall_score(testing_labels, predicted_labels)
    return recall

# build the bi-directional sequence model
bi_directional_sequence_model = build_bi_directional_sequence_model(training_posts_embedding, training_labels)

In [3]:


# apply tSemantic-Syntactic word relationship tests for the trained word embeddings
def apply_semantic_syntactic_word_relationship_tests(training_posts_embedding, training_labels):
    # apply tSemantic-Syntactic word relationship tests for the trained word embeddings
    training_posts_embedding_tSemantic_Syntactic_word_relationship_tests = []
    for post in training_posts_embedding:
        post_embedding_tSemantic_Syntactic_word_relationship_tests = []
        for word_embedding in post:
            word_embedding_tSemantic_Syntactic_word_relationship_tests = []
            for word_embedding_test in post:
                word_embedding_tSemantic_Syntactic_word_relationship_tests.append(np.dot(word_embedding, word_embedding_test))
            post_embedding_tSemantic_Syntactic_word_relationship_tests.append(word_embedding_tSemantic_Syntactic_word_relationship_tests)
        training_posts_embedding_tSemantic_Syntactic_word_relationship_tests.append(post_embedding_tSemantic_Syntactic_word_relationship_tests)
    return training_posts_embedding_tSemantic_Syntactic_word_relationship_tests


# display the results of applying tSemantic-Syntactic word relationship tests for the trained word embeddings
def display_semantic_syntactic_word_relationship_tests_results(training_posts_embedding_tSemantic_Syntactic_word_relationship_tests, training_labels):
    # display the results of applying tSemantic-Syntactic word relationship tests for the trained word embeddings
    print('\n\n\n\n')
    print('The results of applying tSemantic-Syntactic word relationship tests for the trained word embeddings:')
    print('\n\n')
    for i in range(len(training_posts_embedding_tSemantic_Syntactic_word_relationship_tests)):
        print('The results of applying tSemantic-Syntactic word relationship tests for the post number: ' + str(i + 1))
        print('\n')
        for j in range(len(training_posts_embedding_tSemantic_Syntactic_word_relationship_tests[i])):
            print('The results of applying tSemantic-Syntactic word relationship tests for the word number: ' + str(j + 1))
            print('\n')
            print('The tSemantic-Syntactic word relationship tests results: ' + str(training_posts_embedding_tSemantic_Syntactic_word_relationship_tests[i][j]))
            print('\n')
        print('\n\n')
    print('\n\n\n\n')

# display the results of applying tSemantic-Syntactic word relationship tests for the trained word embeddings
display_semantic_syntactic_word_relationship_tests_results(apply_semantic_syntactic_word_relationship_tests(training_posts_embedding, training_labels), training_labels)

NameError: name 'training_posts_embedding' is not defined

In [31]:
# Performance Evaluation with Data Processing Techiques
# apply tSemantic-Syntactic word relationship tests for the trained word embeddings
def apply_semantic_syntactic_word_relationship_tests_for_testing_posts_embedding(testing_posts_embedding):
    # apply tSemantic-Syntactic word relationship tests for the trained word embeddings
    testing_posts_embedding_tSemantic_Syntactic_word_relationship_tests = []
    for post in testing_posts_embedding:
        post_embedding_tSemantic_Syntactic_word_relationship_tests = []
        for word_embedding in post:
            word_embedding_tSemantic_Syntactic_word_relationship_tests = []
            for word_embedding_test in post:
                word_embedding_tSemantic_Syntactic_word_relationship_tests.append(np.dot(word_embedding, word_embedding_test))
            post_embedding_tSemantic_Syntactic_word_relationship_tests.append(word_embedding_tSemantic_Syntactic_word_relationship_tests)
        testing_posts_embedding_tSemantic_Syntactic_word_relationship_tests.append(post_embedding_tSemantic_Syntactic_word_relationship_tests)
    return testing_posts_embedding_tSemantic_Syntactic_word_relationship_tests

# print the results
def print_semantic_syntactic_word_relationship_tests_results(testing_posts_embedding_tSemantic_Syntactic_word_relationship_tests, testing_labels):
    # print the results
    print('\n\n\n\n')
    print('The results of applying tSemantic-Syntactic word relationship tests for the testing posts embeddings:')
    print('\n\n')
    for i in range(len(testing_posts_embedding_tSemantic_Syntactic_word_relationship_tests)):
        print('The results of applying tSemantic-Syntactic word relationship tests for the post number: ' + str(i + 1))
        print('\n')
        for j in range(len(testing_posts_embedding_tSemantic_Syntactic_word_relationship_tests[i])):
            print('The results of applying tSemantic-Syntactic word relationship tests for the word number: ' + str(j + 1))
            print('\n')
            print('The tSemantic-Syntactic word relationship tests results: ' + str(testing_posts_embedding_tSemantic_Syntactic_word_relationship_tests[i][j]))
            print('\n')
        print('\n\n')
    print('\n\n\n\n')

# print the results
print_semantic_syntactic_word_relationship_tests_results(apply_semantic_syntactic_word_relationship_tests_for_testing_posts_embedding(processed_testing_posts_embedding), testing_labels)


In [1]:
# Performance Evaluation with Different Sequence Models
# import ConvID, GlobalMaxPoolingID
from keras.layers import Conv1D, GlobalMaxPooling1D
def performance_evaluation_with_different_sequence_models(model, testing_posts_embedding, testing_labels):
    # performance evaluation with different sequence models
    for dimension in [100, 200, 300]:
        for window_size in [3, 5]:
            model = Sequential()
            model.add(Embedding(len(word_embedding_model.wv.vocab), dimension, input_length=testing_posts_embedding.shape[1]))
            model.add(Conv1D(250, window_size, padding='valid', activation='relu', strides=1))
            model.add(GlobalMaxPooling1D())
            model.add(Dense(250))
            model.add(Dropout(0.2))
            model.add(Activation('relu'))
            model.add(Dense(1))
            model.add(Activation('sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            score = model.evaluate(testing_posts_embedding, testing_labels, verbose=1)
            print("Dimension: {0}, Window Size: {1}, Accuracy: {2}".format(dimension, window_size, score[1]))


In [6]:
# HyperParameter Testing
# import ConvID, GlobalMaxPoolingID
from keras.layers import Conv1D, GlobalMaxPooling1D
def hyper_parameter_testing(model, testing_posts_embedding, testing_labels):
    # hyper parameter testing
    for dimension in [100, 200, 300]:
        for window_size in [3, 5]:
            model = Sequential()
            model.add(Embedding(len(word_embedding_model.wv.vocab), dimension, input_length=))
            model.add(Conv1D(250, window_size, padding='valid', activation='relu', strides=1))
            model.add(GlobalMaxPooling1D())
            model.add(Dense(250))
            model.add(Dropout(0.2))
            model.add(Activation('relu'))
            model.add(Dense(1))
            model.add(Activation('sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            score = model.evaluate(testing_posts_embedding, testing_labels, verbose=1)
            print("Dimension: {0}, Window Size: {1}, Accuracy: {2}".format(dimension, window_size, score[1]))

SyntaxError: invalid syntax (<ipython-input-6-4a5d3cbc03b3>, line 9)

In [None]:
#@title Personality Type Prediction

text = "" #@param {type:"string"}

# design a user interface so that user can input a textual sentence via the colab form fields user interface to get the personality type classification result from your trained model
def get_personality_type_prediction(text):
    # get the personality type classification result from your trained model
    return "I am a " + text

# print the result
print(get_personality_type_prediction(text))