In [1]:
import codecs
import re
from nltk import word_tokenize
from nltk import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Category name -> class label, stored as a string.
# Labels are assigned 1..14 following the order of the tuple below.
label_lookup = {
    category: str(class_id)
    for class_id, category in enumerate(
        (
            'Autos & Vehicles',
            'Comedy',
            'Education',
            'Entertainment',
            'Film & Animation',
            'Gaming',
            'Howto & Style',
            'Music',
            'News & Politics',
            'Nonprofits & Activism',
            'Pets & Animals',
            'Science & Technology',
            'Sports',
            'Travel & Events',
        ),
        start=1,
    )
}

In [3]:
# Single shared Porter stemmer instance, reused by the preprocessing cells below.
ps = PorterStemmer()

In [4]:
def remove_link(input_string):
    """Return ``input_string`` with every link removed.

    A "link" is any occurrence of ``http`` followed by at least one
    non-whitespace character; the match runs up to the next whitespace.
    """
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", input_string)

In [5]:
def remove_spec_chars(input_string):
    """Return ``input_string`` stripped of special characters.

    Only word characters (``\\w``, which already covers ``_``), ``#``, ``@``
    and the plain space are kept. Everything else is dropped, including
    punctuation, tabs and newlines.
    """
    unwanted_chars = re.compile(r"[^\w# @_]")
    return unwanted_chars.sub("", input_string)

In [6]:
def read_windows_file(filepath, encoding='cp1252'):
    """Read a text file and return its non-blank lines.

    Args:
        filepath: Path of the file to read.
        encoding: Codec used to decode the file (Windows-1252 by default).

    Returns:
        A list of the decoded lines, line endings included, omitting every
        line that is empty or whitespace-only.
    """
    with codecs.open(filepath, 'r', encoding) as handle:
        return [line for line in handle.readlines() if line.strip()]

In [7]:
def get_tweet_features_and_preprocess(data_set, stem_tokens=False):
    """Split raw tweet lines into fields and preprocess the text field.

    Each line is split on tabs. Field 1 (the tweet text) is lower-cased,
    stripped of links and special characters, and then stemmed. All other
    fields are returned untouched (the last one keeps its line ending).

    Args:
        data_set: Iterable of tab-separated lines; each must have at least
            two fields.
        stem_tokens: Controls how the stemmer is applied.
            ``False`` (default, the historical behaviour) passes the whole
            cleaned tweet to ``ps.stem`` in one call. NLTK's Porter stemmer
            works on a single word, so in this mode only the tail of the
            string is actually stemmed.
            ``True`` tokenises the cleaned text, stems every token and joins
            the stems with single spaces.
            NOTE: the feature vocabulary must be built from text produced
            with the same setting, otherwise stemmed tokens will not be
            found in it.

    Returns:
        A list with one list of fields per input line.

    Raises:
        IndexError: if a line contains no tab, i.e. has no text field.
    """
    individual_features = []
    for raw_line in data_set:
        fields = raw_line.split("\t")
        cleaned_text = remove_spec_chars(remove_link(fields[1].lower()))
        if stem_tokens:
            # Stem word by word, which is what the Porter algorithm expects.
            fields[1] = ' '.join(ps.stem(token) for token in word_tokenize(cleaned_text))
        else:
            # Backward-compatible path: a single stem() call on the whole tweet.
            fields[1] = ps.stem(cleaned_text)
        individual_features.append(fields)
    return individual_features

In [8]:
# Load the raw train/test splits as lists of non-blank lines (decoded as cp1252 by default).
# Each line is later split on tabs; judging by how the fields are used further down,
# they are presumably (id, tweet text, category) - confirm against the data files.
training_tweets = read_windows_file('../Tweets.14cat.train')
testing_tweets = read_windows_file('../Tweets.14cat.test')

In [9]:
# a list of tweets (string), no id, no label, links and special characters have been removed
# training_tweets_text = [remove_spec_chars(remove_link((tweet.split("\t"))[1].lower())) for tweet in training_tweets]

In [10]:
# Preprocessed text field (index 1) of every training tweet: lower-cased, links and
# special characters removed, then passed through the stemmer. This reuses
# get_tweet_features_and_preprocess so the vocabulary is built from exactly the
# same text that is later turned into features.
training_tweets_text = [
    fields[1] for fields in get_tweet_features_and_preprocess(training_tweets)
]

In [11]:
# Vocabulary: the distinct tokens found after joining all preprocessed training
# tweets into one space-separated string and tokenising it.
unique_words = set(word_tokenize(' '.join(training_tweets_text)))

In [12]:
# Drop NLTK's English stopwords from the vocabulary.
unique_words_without_stopwords = unique_words - set(stopwords.words('english'))

In [13]:
# The feature set (bag of words) maps integer ids 1..N to the tokens of the training vocabulary.
# The vocabulary is sorted before numbering: iterating a set of strings directly yields an order
# that changes between interpreter runs (string hash randomisation), which would give every token
# a different id on each "Restart & Run All" and make feats.dic incompatible with feature files
# produced by another run.
feature_set = dict(enumerate(sorted(unique_words_without_stopwords), 1))

In [14]:
# Persist the feature dictionary: one "<feature id>\t<token>" line per entry, UTF-8 encoded.
with codecs.open('./feats.dic', 'w', 'UTF-8') as file:
    file.writelines(
        "{}\t{}\n".format(attribute, value)
        for attribute, value in feature_set.items()
    )

In [15]:
# Inverted view of feature_set for lookups while writing features:
# the key is the word (token) and the value is its integer feature id.
useable_feature_set = dict(zip(feature_set.values(), feature_set.keys()))

In [16]:
# Split every raw line of both splits into its tab-separated fields and
# preprocess the text field (index 1); the other fields are kept as read.
training_features = get_tweet_features_and_preprocess(training_tweets)
testing_features = get_tweet_features_and_preprocess(testing_tweets)

In [17]:
# TODO: refactor further? e.g. separate helpers for writing the label, the feature body and the comment.
#
# Note on stopwords: we don't actually have to remove stopwords from the tweets before writing them out.
# Stopwords are not in the feats.dic vocabulary, so they are simply skipped during the feature lookup below.
# Removing them up front would not change the output; it would only save some work, which would
# probably be worth doing if we ran this at scale.

def _format_ml_line(tweet, feature_index, labels):
    """Render one record as ``<label> <id>:1 <id>:1 ... #<tweet[0]>`` (no newline)."""
    label = labels[tweet[2].strip()]
    tokens = (token.strip() for token in word_tokenize(tweet[1]))
    # Unknown tokens (stopwords, words never seen in training) carry no feature id and are skipped.
    # Duplicates collapse because every present feature is written with the value 1.
    feature_ids = sorted({feature_index[token] for token in tokens if token in feature_index})
    feature_body = ' '.join(str(feature) + ":1" for feature in feature_ids)
    return label + " " + feature_body + " " + "#" + str(tweet[0])


def write_ml_ready_file(filepath, feature_data, encoding="UTF-8",
                        feature_index=None, labels=None):
    """Write preprocessed tweets as sparse ``<label> <id>:1 ... #<tweet id>`` lines.

    Args:
        filepath: Destination file; it is overwritten.
        feature_data: Iterable of field lists where field 0 is written back as
            the trailing ``#`` comment, field 1 is the preprocessed text and
            field 2 is the category name (surrounding whitespace is ignored).
        encoding: Encoding of the output file.
        feature_index: Mapping token -> integer feature id. Defaults to the
            notebook-level ``useable_feature_set``.
        labels: Mapping category name -> label string. Defaults to the
            notebook-level ``label_lookup``.

    A record that cannot be converted (for example an unknown category or a
    missing field) is reported on stdout and an empty line is written in its
    place. The line is assembled completely before it is written, so a failing
    record never leaves a partial line behind.
    """
    if feature_index is None:
        feature_index = useable_feature_set
    if labels is None:
        labels = label_lookup

    with codecs.open(filepath, 'w', encoding) as file:
        for tweet in feature_data:
            try:
                file.write(_format_ml_line(tweet, feature_index, labels) + "\n")
            except Exception as e:
                # Best effort: report the bad record and keep going.
                print("Whoops!")
                print(e)
                file.write("\n")

In [18]:
# Write the train and test feature files into the current directory (UTF-8 by default),
# one line per tweet in the "<label> <feature id>:1 ... #<tweet id>" layout.
write_ml_ready_file('./feats.train', training_features)
write_ml_ready_file('./feats.test', testing_features)

In [19]:
# apply stop-word removal ("stopping") and stemming in preprocessing

In [20]:
# try applying lemmatization

In [21]:
# Plan Moving Forwards
# 1) improve the accuracy of the SVM as much as possible, ideally I'd like to hit 70% +
# 2) switch architecture...
# 2.1) we could try and use the SVM with word embeddings, or..
# 2.2) we could switch over to word embeddings + LSTM NN..
#    ) Reference: Pycon Ireland video thingy, Embedding Layer, LSTM, Dense + Dropout (we'd probably use a softmax)

In [22]:
# I'm going to need a function that I can pass a word vector to and that generates a word embedding matrix

In [23]:
# then I'm going to need to use Keras, LSTM