In [2]:
import pandas as pd
training_data = pd.read_csv("content/training_data.csv")
testing_data = pd.read_csv("content/testing_data.csv")

print("------------------------------------")
print("Size of training dataset: {0}".format(len(training_data)))
print("Size of testing dataset: {0}".format(len(testing_data)))
print("------------------------------------")

print("------------------------------------")
print("Sample Data")
print("LABEL: {0} / SENTENCE: {1}".format(training_data.iloc[-1,0], training_data.iloc[-1,1]))
print("------------------------------------")

------------------------------------
Size of training dataset: 7808
Size of testing dataset: 867
------------------------------------
------------------------------------
Sample Data
LABEL: F / SENTENCE: 'Half of it is going straight to charity, another quarter going straight to scientific research, an eighth to the parkour community, a sixteenth to towards spreading information about health and...|||Find a path or suffer more.|||http://personalitycafe.com/enneagram-personality-theory-forum/85323-enneagram-type-mbti-type-compared-statistics.html yep.|||I kind of anchor on Fi and Ne makes having Ni really fun. INFP for me as they tire me out less and our views tend to align more.|||The two ESTPs I have gotten the chance to know seem to experience much more than other people who have been on the planet for the same amount of time and are quite the renaissance (wo)men.  Is this...|||I don't really have a best friend ISTP(passion-amateur group co-founder), INTJ(intellectual and various sma

In [3]:
# Preview of the data in the csv file, which has two columns: 
# (1)type - label of the post (2)posts - the corresponding post content
training_data.head()

Unnamed: 0,type,posts
0,F,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,T,'I'm finding the lack of me in these posts ver...
2,T,'Good one _____ https://www.youtube.com/wat...
3,T,"'Dear INTP, I enjoyed our conversation the o..."
4,T,'You're fired.|||That's another silly misconce...


In [4]:
# Extract the labels and posts and store into List

# Get the list of training data (posts)
training_posts=training_data['posts'].tolist()
# Get the list of corresponding labels for the training data (posts)
training_labels=training_data['type'].tolist()

# Get the list of testing data (posts)
testing_posts=testing_data['posts'].tolist()
# Get the list of corresponding labels for the testing data (posts)
testing_labels=testing_data['type'].tolist()

In [6]:
# remove the URL from the post and compare, by experimental results, when you remove the URL from the post versus keeping the URL in the post.
# function to remove the URL from the post
def remove_url(post):
    import re
    # remove the URL from the post
    post=re.sub(r'http\S+', '', post)
    return post

# use the function to remove the URL from the post
training_posts_no_url=[remove_url(post) for post in training_posts]
testing_posts_no_url=[remove_url(post) for post in testing_posts]


In [11]:
#calculate the test results
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# calculate the test results
def calculate_test_results(training_posts, training_labels, testing_posts, testing_labels):
    # Create the bag of words
    count_vect = CountVectorizer()
    # Fit the bag of words on the training data
    training_data_features = count_vect.fit_transform(training_posts)
    # Transform the testing data
    testing_data_features = count_vect.transform(testing_posts)
    # Create a Multinomial Naive Bayes classifier
    clf = MultinomialNB().fit(training_data_features, training_labels)
    # Predict the labels on the testing data
    predicted_labels = clf.predict(testing_data_features)
    # Calculate the accuracy of the model
    accuracy = accuracy_score(testing_labels, predicted_labels)
    return accuracy

#before removing the URL
print("------------------------------------")
print("Before removing the URL")
print("Accuracy of the model: {0}".format(calculate_test_results(training_posts, training_labels, testing_posts, testing_labels)))
print("------------------------------------")


# after removing the URL, the accuracy of the model is higher
print("------------------------------------")
print("After removing the URL")
print("Accuracy of the model: {0}".format(calculate_test_results(training_posts_no_url, training_labels, testing_posts_no_url, testing_labels)))
print("------------------------------------")


# The result is that the accuracy of the model is lower when you remove the URL from the post.

------------------------------------
Before removing the URL
Accuracy of the model: 0.7820069204152249
------------------------------------
------------------------------------
After removing the URL
Accuracy of the model: 0.7797001153402537
------------------------------------


In [12]:
#pre-process the training set by integrating several text pre-processing techniques (e.g. tokenisation, removing numbers, converting to lowercase, removing stop words, stemming, etc.).
#  You should test and justify the reason why you apply the specific preprocessing techniques based on the test results.

#tokenisation
import re
# function to tokenise the post
def tokenise(post):
    # tokenise the post
    post=re.sub("[^a-zA-Z]", " ", post)
    post=post.lower()
    return post

#remove numbers from the post
def remove_numbers(post):
    # remove the numbers from the post
    post=re.sub("[0-9]", " ", post)
    return post

#convert to lowercase
def convert_to_lowercase(post):
    # convert to lowercase
    post=post.lower()
    return post

#remove stop words
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

# function to remove the stop words from the post
def remove_stop_words(post):
    # remove the stop words from the post
    post=post.split()
    post=[word for word in post if not word in stop_words]
    post=" ".join(post)
    return post

#stemming
from nltk.stem.snowball import SnowballStemmer
stemmer=SnowballStemmer("english")

# function to stem the post
def stem_post(post):
    # stem the post
    post=post.split()
    post=[stemmer.stem(word) for word in post]
    post=" ".join(post)
    return post


# apply the tokenisation, remove numbers, convert to lowercase, remove stop words, stemming
def preprocess_training_set(training_posts):
    # apply the tokenisation, remove numbers, convert to lowercase, remove stop words, stemming
    training_posts=[tokenise(post) for post in training_posts]
    training_posts=[remove_numbers(post) for post in training_posts]
    training_posts=[convert_to_lowercase(post) for post in training_posts]
    training_posts=[remove_stop_words(post) for post in training_posts]
    training_posts=[stem_post(post) for post in training_posts]
    return training_posts

processed_training_posts=preprocess_training_set(training_posts)

#test the results
print("------------------------------------")
print("Before processing the training set")
print("Accuracy of the model: {0}".format(calculate_test_results(training_posts, training_labels, testing_posts, testing_labels)))
print("------------------------------------")

#after processing the training set
print("------------------------------------")
print("After processing the training set")
print("Accuracy of the model: {0}".format(calculate_test_results(processed_training_posts, training_labels, testing_posts, testing_labels)))
print("------------------------------------")

------------------------------------
Before processing the training set
Accuracy of the model: 0.7820069204152249
------------------------------------
------------------------------------
After processing the training set
Accuracy of the model: 0.657439446366782
------------------------------------


## PART B

In [18]:
""" build a word embedding model (for representing word vectors, such as word2vec-CBOW, word2vec-Skip gram, fastText, and Glove) 
for the input embedding of your sequence model  """

# Training word embeddings using processed_training_posts for the input embedding of your sequence model
#start
import gensim
from gensim.models import Word2Vec

# training word embeddings
def train_word_embeddings(training_posts):
    # training word embeddings
    model = Word2Vec(training_posts, window=5, workers=4)
    return model

#train
word_embeddings=train_word_embeddings(processed_training_posts)


In [None]:
# Build training model for word embeddings
#start
