In [1]:
import codecs
import re
from nltk import word_tokenize

In [2]:
# Read the cp1252-encoded training file and keep only the lines that still
# contain something after whitespace is stripped (blank lines are dropped).
with codecs.open('../Tweets.14cat.train', 'r', 'cp1252') as source:
    training_tweets = [line for line in source.readlines() if line.strip()]

In [3]:
# Matches "http" followed by one or more non-whitespace characters.
_LINK_PATTERN = re.compile(r"http\S+")


def remove_link(input_string: str) -> str:
    """Return ``input_string`` with every link-like token deleted.

    A token is removed when it contains ``http`` followed by at least one
    non-whitespace character; the match runs up to the next whitespace.
    Matching is case-sensitive. Surrounding whitespace is left untouched.
    """
    return _LINK_PATTERN.sub("", input_string)

In [4]:
# Anything that is not a word character, '#', a space, '@' or '_'.
_SPECIAL_CHARS_PATTERN = re.compile(r"[^\w# @_]")


def remove_spec_chars(input_string: str) -> str:
    """Return ``input_string`` stripped of special characters.

    Only word characters, ``#``, ``@``, ``_`` and the plain space survive;
    everything else (punctuation, tabs, newlines, ...) is deleted.
    """
    return _SPECIAL_CHARS_PATTERN.sub("", input_string)

In [5]:
# Cleaned text of every training tweet (plain strings, no id and no label):
# take the second tab-separated field of each line, lowercase it, then strip
# links and finally special characters.
training_tweets_text = [
    remove_spec_chars(remove_link(text.lower()))
    for text in (line.split("\t")[1] for line in training_tweets)
]

In [6]:
# Tokenise the whole cleaned training corpus (tweets joined by single spaces)
# in one pass and keep each distinct token once.
unique_words = {token for token in word_tokenize(' '.join(training_tweets_text))}

In [7]:
# The feature set (BoW) maps the integers 1..N to the tokens (words) found in
# the training set.
# Tokens are numbered in sorted order on purpose: iterating a set of strings
# directly yields an order that can change from one interpreter run to the
# next (string hashing is randomised), which would give every token a
# different id on each run and make the generated feature files
# irreproducible.
feature_set = dict(enumerate(sorted(unique_words), 1))

In [8]:
# Persist the feature set to ./feats.dic (UTF-8): one line per entry holding
# the integer id, a tab, and the token.
with codecs.open('./feats.dic', 'w', 'UTF-8') as dictionary_file:
    dictionary_file.writelines(
        "{}\t{}\n".format(feature_id, token)
        for feature_id, token in feature_set.items()
    )

In [9]:
# Inverse view of feature_set: the word is the key and its integer id is the
# value, which is the direction needed when looking tokens up.
useable_feature_set = dict(zip(feature_set.values(), feature_set.keys()))

In [10]:
# Break every raw training line into its tab-separated fields (one list per tweet).
training_tweets_features = [raw_line.split("\t") for raw_line in training_tweets]

In [11]:
# Normalise the text field (index 1) of every training record in place:
# lowercase first, then drop links, then drop special characters.
for fields in training_tweets_features:
    lowered = fields[1].lower()
    without_links = remove_link(lowered)
    fields[1] = remove_spec_chars(without_links)

In [12]:
# Category name -> class label. Labels are the 1-based position of the
# category in the list below, stored as strings.
label_lookup = {
    category: str(number)
    for number, category in enumerate(
        (
            'Autos & Vehicles',
            'Comedy',
            'Education',
            'Entertainment',
            'Film & Animation',
            'Gaming',
            'Howto & Style',
            'Music',
            'News & Politics',
            'Nonprofits & Activism',
            'Pets & Animals',
            'Science & Technology',
            'Sports',
            'Travel & Events',
        ),
        start=1,
    )
}

In [13]:
# Write the training set to ./feats.train (UTF-8), one tweet per line, as
#   <label> <index>:1 <index>:1 ... #<first field of the tweet>
# where the indices are the distinct vocabulary ids of the tweet's tokens in
# ascending order.
with codecs.open('./feats.train', 'w', 'UTF-8') as file:
    for tweet in training_tweets_features:
        try:
            # Assemble the complete record before touching the file, so a
            # failure can never leave a half-written line behind.
            label = label_lookup[tweet[2].strip()]
            # Tokens missing from the vocabulary are skipped with an explicit
            # membership test rather than by swallowing every exception.
            feature_body = sorted({
                useable_feature_set[token.strip()]
                for token in word_tokenize(tweet[1])
                if token.strip() in useable_feature_set
            })
            features = ' '.join(str(feature) + ":1" for feature in feature_body)
            record = label + " " + features + " " + "#" + str(tweet[0]) + "\n"
        except Exception as e:
            # Best effort: report the problem and emit an empty line in place
            # of the record that could not be built (e.g. unknown label or a
            # line with too few fields).
            print("Whoops!")
            print(e)
            file.write("\n")
        else:
            file.write(record)

In [14]:
# Read the cp1252-encoded test file, discarding lines that are empty once
# whitespace is stripped.
with codecs.open('../Tweets.14cat.test', 'r', 'cp1252') as source:
    testing_tweets = [line for line in source.readlines() if line.strip()]

In [15]:
# Break every raw test line into its tab-separated fields (one list per tweet).
testing_tweets_features = [raw_line.split("\t") for raw_line in testing_tweets]

In [16]:
# Normalise the text field (index 1) of every test record in place:
# lowercase first, then drop links, then drop special characters.
for fields in testing_tweets_features:
    lowered = fields[1].lower()
    without_links = remove_link(lowered)
    fields[1] = remove_spec_chars(without_links)

In [17]:
# Write the test set to ./feats.test (UTF-8), one tweet per line, as
#   <label> <index>:1 <index>:1 ... #<first field of the tweet>
# where the indices are the distinct vocabulary ids of the tweet's tokens in
# ascending order. Tokens not present in the vocabulary contribute no feature.
with codecs.open('./feats.test', 'w', 'UTF-8') as file:
    for tweet in testing_tweets_features:
        try:
            # Assemble the complete record before touching the file, so a
            # failure can never leave a half-written line behind.
            label = label_lookup[tweet[2].strip()]
            # Tokens missing from the vocabulary are skipped with an explicit
            # membership test rather than by swallowing every exception.
            feature_body = sorted({
                useable_feature_set[token.strip()]
                for token in word_tokenize(tweet[1])
                if token.strip() in useable_feature_set
            })
            features = ' '.join(str(feature) + ":1" for feature in feature_body)
            record = label + " " + features + " " + "#" + str(tweet[0]) + "\n"
        except Exception as e:
            # Best effort: report the problem and emit an empty line in place
            # of the record that could not be built (e.g. unknown label or a
            # line with too few fields).
            print("Whoops!")
            print(e)
            file.write("\n")
        else:
            file.write(record)