In [57]:
# imports
import codecs
import chardet
import re
import json
from nltk import word_tokenize

In [58]:
# constants
# configuration
# write_tokens_to_file: format
TEXT = 1
JSON = 2
BINARY = 3
# CLASS IDS
class_look_up = {
    'Autos & Vehicles':1,
    'Comedy':2,
    'Education':3,
    'Entertainment':4,
    'Film & Animation':5,
    'Gaming':6,
    'Howto & Style':7,
    'Music':8,
    'News & Politics':9,
    'Nonprofits & Activism':10,
    'Pets & Animals':11,
    'Science & Technology':12,
    'Sports':13,
    'Travel & Events':14
}
# TYPES
DICT = {}
LIST = []

In [59]:
# I actually downloaded the source code for this library
# there is no way to differentiate between cp1252 and Windows-1252
# so I'll have to implement my own configurable override
# ralised Python handles this for you, ah well - that's an hour I'm never going to see again!
def detect_charset(filepath, operation="rb", win_flip_cp=False, repl_val='cp'):
    result = chardet.detect(open(filepath, operation).read())
    if 'encoding' not in result.keys():
        raise Exception("Encoding not recognised by detect_charset")
    return result['encoding'] if not win_flip_cp else re.sub(r'\AWindows-', repl_val, result['encoding'])

In [60]:
# functions
def read_file(filepath, function="r", encoding="UTF-8"):
    if not filepath:
        raise Exception("Must at least pass filepath to read_file")
    file = codecs.open(filepath, function, encoding)
    return file.readlines()

In [61]:
def remove_link(input_string):
    return re.sub(r"http\S+", "", input_string)

In [62]:
def read_tweet(input_tweet):
    return input_tweet.strip().split("\t")

In [63]:
def verify_value_as_int(value):
    return isinstance(value, int)

In [64]:
def remove_special_characters(input_string):
    return re.sub(r"[^\w# @_]", "", input_string)

In [65]:
def preprocess_string(data_string):
    return remove_special_characters(remove_link(data_string.lower()))

In [66]:
def compartmentalise_tweets(list_of_tweets):
    return [list_idx.split("\t") for list_idx in list_of_tweets]

## TEST CASES ##

In [67]:
def extract_tweet_text(compartmentalised_list, tweet_idx=1):
    tweet_text = []
    for tweet in compartmentalised_list:
        try:
            tweet_text.append(tweet[tweet_idx])
        except IndexError:
            pass
    return tweet_text

## TEST CASES ##

In [68]:
def get_unique_tokens(struct):
    if isinstance(struct, list):
        return set(word_tokenize(' '.join(struct)))
    elif isinstance(struct, str):
        return set(word_tokenize(struct))
    # else
    raise Exception("Parameter Data Structure is Unknown")

## TEST CASES ##

In [69]:
def persist_appropriate_file_ext(fp, ext):
    if not isinstance(fp, str) or not isinstance(ext, str):
        raise Exception("Both Parameters Must Be Of Type String!")
    return fp if fp.lower().endswith(ext.lower()) else (fp + "." + ext)

## TEST CASES ##
# print(persist_appropriate_file_ext("hello_world.txt", "txt"))
# print(persist_appropriate_file_ext("feats", "dict"))

In [70]:
# improve this by figuring out a nice way of detecting the last iteration in the loop and not adding a new line
def write_tokens_to_file_as_text(obj, fp, as_features, encoding="UTF-8"):
    with codecs.open(fp, "w", encoding) as file:
        if isinstance(obj, dict):
            for iden, term in obj.items():
                write_str = (str(iden) + "\t" + str(term) + "\n") if as_features else (term + "\n")
                file.write(write_str)
        elif isinstance(obj, list):
            for iden, term in enumerate(obj):
                write_str = (str(iden) + "\t" + str(term) + "\n") if as_features else (term + "\n")
                file.write(write_str)
        else:
            raise TypeError("Attempting to store unknown data structure to file.")
    return True

In [71]:
def write_tokens_to_file_as_json(obj, fp, as_features):
    raise Exception("not yet implemented")

In [72]:
def write_tokens_to_file_as_bin(obj, fp, as_features):
    raise Exception("not yet implemented")

In [73]:
# TODO: refactor
# too much repeated code, we can reduce the size of this function by 2 at least.
# not to mention, the code should probably be written using inheritance, not using if, elifs.
# consider placing try-catch block above this level of call-stack
def read_tokens_from_file_as_text(obj_return_type, fp, encoding="UTF-8", enable_stripping=True):
    #try:
        if obj_return_type == DICT:
            return_ds = {}
            with codecs.open(fp, 'r', encoding) as file:
                for line in file.readlines():
                    feature_components = line.split("\t")
                    if len(feature_components) < 1: continue # or throw an error
                    feature_key = (feature_components[1]).strip() if enable_stripping else feature_components[1]
                    return_ds[feature_key] = feature_components[0]
            return return_ds
        elif obj_return_type == LIST:
            return_list = []
            with codecs.open(fp, 'r', encoding) as file:
                for line in file.readlines():
                    feature_components = line.split("\t")
                    if len(feature_components) < 1: continue # or throw an error
                    return_feature = (feature_components[1]).strip() if enable_stripping else feature_components[1]
                    return_list.append(return_feature)
            return return_list
        # else
        raise Exception("Unknown Object Return Type Param")
    #except (FileNotFoundError, KeyError, TypeError, Exception):
    #    # do something to alert the user as to what's wrong..
    #    print("Oh noes! Something went terribly wrong!")

In [74]:
## TEST CASES ##
read_tokens_from_file_as_text(DICT, "./feats.dic")

{'officially': '1',
 'claude': '2',
 'jacket': '3',
 'thecomedypub': '4',
 'beneath': '5',
 'hotellife': '6',
 'urdu': '7',
 'charts': '8',
 'longisland': '9',
 'kinect': '10',
 'welve': '11',
 'safilo': '12',
 'far': '13',
 'indiauntasted': '14',
 'freeway': '15',
 'mx5': '16',
 'pairings': '17',
 'youngster': '18',
 'bares': '19',
 'labrascals': '20',
 'brynmusic': '21',
 'becomes': '22',
 'henriesmorse': '23',
 'phillips': '24',
 'mohamednawito': '25',
 'smithsonian': '26',
 'gg': '27',
 'period': '28',
 'beyondthesurface': '29',
 'kong': '30',
 'awww': '31',
 'mobilephones': '32',
 'love__kai': '33',
 'superstart20': '34',
 'flocka': '35',
 'exus': '36',
 'hours': '37',
 'jordanian': '38',
 'splitsider': '39',
 'pisses': '40',
 'retw': '41',
 '41': '42',
 'ga': '43',
 'status': '44',
 'moon': '45',
 'drainage': '46',
 'believes': '47',
 'give': '48',
 'readerrun': '49',
 'itimestweets': '50',
 'jets': '51',
 'embarrassment': '52',
 'syrianchildren': '53',
 'butta': '54',
 'indiedev

In [75]:
def read_tokens_from_file_as_json(obj_return_type, fp):
    if obj_return_type == DICT:
        pass
    elif obj_return_type == LIST:
        pass
    # else
    raise Exception("Unknown Object Return Type Param")

In [76]:
def read_tokens_from_file_as_bin(obj_return_type, fp):
    if obj_return_type == DICT:
        pass
    elif obj_return_type == LIST:
        pass
    # else
    raise Exception("Unknown Object Return Type Param")

In [77]:
hello = dict
def test_hello(obj):
    return type(obj)
x = test_hello(hello)
print(x.__dict__)
print(x.__repr__)
# this is insightful for those who don't python.
# probably, deffo check out this talk: https://youtu.be/cKPlPJyQrt4

{'__repr__': <slot wrapper '__repr__' of 'type' objects>, '__call__': <slot wrapper '__call__' of 'type' objects>, '__getattribute__': <slot wrapper '__getattribute__' of 'type' objects>, '__setattr__': <slot wrapper '__setattr__' of 'type' objects>, '__delattr__': <slot wrapper '__delattr__' of 'type' objects>, '__init__': <slot wrapper '__init__' of 'type' objects>, '__new__': <built-in method __new__ of type object at 0x10d516d98>, 'mro': <method 'mro' of 'type' objects>, '__subclasses__': <method '__subclasses__' of 'type' objects>, '__prepare__': <method '__prepare__' of 'type' objects>, '__instancecheck__': <method '__instancecheck__' of 'type' objects>, '__subclasscheck__': <method '__subclasscheck__' of 'type' objects>, '__dir__': <method '__dir__' of 'type' objects>, '__sizeof__': <method '__sizeof__' of 'type' objects>, '__basicsize__': <member '__basicsize__' of 'type' objects>, '__itemsize__': <member '__itemsize__' of 'type' objects>, '__flags__': <member '__flags__' of 't

In [78]:
def read_tokens_from_file_as_json(obj_return_type, fp):
    pass

In [79]:
def read_tokens_from_file_as_binary(obj_return_type, fp):
    pass

In [80]:
'''
## TEST CASE ##
my_funky_obj = ["hello", "world", "these", "are", "my", "features", "."]
write_tokens_to_file_as_text(my_funky_obj, "./test_feats.dic", True)
##
my_funky_dict = {0:'hello', 1:'world'}
write_tokens_to_file_as_text(my_funky_dict, "./test_dic_feats.dic", True)
##
'''

'\n## TEST CASE ##\nmy_funky_obj = ["hello", "world", "these", "are", "my", "features", "."]\nwrite_tokens_to_file_as_text(my_funky_obj, "./test_feats.dic", True)\n##\nmy_funky_dict = {0:\'hello\', 1:\'world\'}\nwrite_tokens_to_file_as_text(my_funky_dict, "./test_dic_feats.dic", True)\n##\n'

In [81]:
# as_features should be responsible for controlling whether the tokens are labelled or not
def write_tokens_to_file(obj, fp="./feats", as_features=True, file_format=TEXT):
    if file_format == TEXT:
        fp = persist_appropriate_file_ext(fp, "dic")
        return write_tokens_to_file_as_text(obj, fp, as_features)
    elif file_format == JSON:
        # ensure this is pretty printed
        fp = persist_appropriate_file_ext(fp, "json")
        return write_tokens_to_file_as_json(obj, fp, as_features)
    elif file_format == BINARY:
        # pickle save as obj
        fp = persist_appropriate_file_ext(fp, "obj")
        return write_tokens_to_file_as_bin(obj, fp, as_features)
    # else
    raise Exception("Unknown File Format Parameter")

In [82]:
def read_tokens_from_file(obj_return_type, fp, file_format=TEXT):
    if file_format == TEXT:
        fp = persist_appropriate_file_ext(fp, "dic")
        return read_tokens_from_file_as_text(obj_return_type, fp)
    elif file_format == JSON:
        fp = persist_appropriate_file_ext(fp, "json")
        return read_tokens_from_file_as_json(obj_return_type, fp)
    elif file_format == BINARY:
        fp = persist_appropriate_file_ext(fp, "obj")
        return read_tokens_from_file_as_bin(obj_return_type, fp)
    # else
    raise Exception("Unknown File Format Prameter")

In [83]:
# we already have a dictionary in memory for the time being
# dict_of_tokens

In [84]:
# however, we need to reverse the key-value pairs
res_dict_of_tokens = {v:k for k,v in dict_of_tokens.items()}

NameError: name 'dict_of_tokens' is not defined

In [85]:
# Read in the tweets & print features
'''
feature_ized_training_tweets = []

for tweet in training_set_compart:
    feature_ized_tweet = []
    try:
        clu_key = tweet[2].strip()
        feature_ized_tweet.append(str(class_look_up[clu_key]))
        for token in word_tokenize(tweet[1]):
            feature_ized_tweet.append(str(res_dict_of_tokens[token]) + ":1")
        feature_ized_tweet.append("#" + str(tweet[0]))
        new_str_tweet = ' '.join(feature_ized_tweet)
        feature_ized_training_tweets.append(new_str_tweet)
    except (IndexError, TypeError, KeyError) as e:
        pass

print(len(feature_ized_training_tweets))
    
for tweet in feature_ized_training_tweets:
    print(tweet)
    print()
    print()

# do the same thing for the testing tweets.. I'll get round to this..
    
# testing_set_compart = compartmentalise_tweets(testing_set)
'''

'\nfeature_ized_training_tweets = []\n\nfor tweet in training_set_compart:\n    feature_ized_tweet = []\n    try:\n        clu_key = tweet[2].strip()\n        feature_ized_tweet.append(str(class_look_up[clu_key]))\n        for token in word_tokenize(tweet[1]):\n            feature_ized_tweet.append(str(res_dict_of_tokens[token]) + ":1")\n        feature_ized_tweet.append("#" + str(tweet[0]))\n        new_str_tweet = \' \'.join(feature_ized_tweet)\n        feature_ized_training_tweets.append(new_str_tweet)\n    except (IndexError, TypeError, KeyError) as e:\n        pass\n\nprint(len(feature_ized_training_tweets))\n    \nfor tweet in feature_ized_training_tweets:\n    print(tweet)\n    print()\n    print()\n\n# do the same thing for the testing tweets.. I\'ll get round to this..\n    \n# testing_set_compart = compartmentalise_tweets(testing_set)\n'

In [86]:
'''
with codecs.open("./feats.train", "w", "UTF-8") as training_file:
    for data_point in feature_ized_training_tweets:
        training_file.write(str(data_point) + "\n")
''''''

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-86-a2c96ffdc824>, line 5)

In [87]:
new_dict_of_tokens = read_tokens_from_file(DICT, "./feats")

In [88]:
new_dict_of_tokens

{'officially': '1',
 'claude': '2',
 'jacket': '3',
 'thecomedypub': '4',
 'beneath': '5',
 'hotellife': '6',
 'urdu': '7',
 'charts': '8',
 'longisland': '9',
 'kinect': '10',
 'welve': '11',
 'safilo': '12',
 'far': '13',
 'indiauntasted': '14',
 'freeway': '15',
 'mx5': '16',
 'pairings': '17',
 'youngster': '18',
 'bares': '19',
 'labrascals': '20',
 'brynmusic': '21',
 'becomes': '22',
 'henriesmorse': '23',
 'phillips': '24',
 'mohamednawito': '25',
 'smithsonian': '26',
 'gg': '27',
 'period': '28',
 'beyondthesurface': '29',
 'kong': '30',
 'awww': '31',
 'mobilephones': '32',
 'love__kai': '33',
 'superstart20': '34',
 'flocka': '35',
 'exus': '36',
 'hours': '37',
 'jordanian': '38',
 'splitsider': '39',
 'pisses': '40',
 'retw': '41',
 '41': '42',
 'ga': '43',
 'status': '44',
 'moon': '45',
 'drainage': '46',
 'believes': '47',
 'give': '48',
 'readerrun': '49',
 'itimestweets': '50',
 'jets': '51',
 'embarrassment': '52',
 'syrianchildren': '53',
 'butta': '54',
 'indiedev

In [89]:
res_dict_of_tokens

NameError: name 'res_dict_of_tokens' is not defined

In [90]:
# library function
def compare_dicts(dict_one, dict_two):
    return { k : dict_two[k] for k in set(dict_two) - set(dict_one) }

In [91]:
# sanity check 
compare_dicts(new_dict_of_tokens, res_dict_of_tokens)

NameError: name 'res_dict_of_tokens' is not defined

In [92]:
# compartmentalised tweet

# TODO: implement a method of counting the number of exceptions that occur and log to error reporting

def return_data_point(compart_tweet, feature_set):
    feature_ized_tweet = []
    try:
        # compartmentalised lookup key
        clu_key = compart_tweet[2].strip()
        # class_look_up is a global variable containing int keys for labels
        feature_ized_tweet.append(str(class_look_up[clu_key]))
        # obtaining all tokens within the tweet text itself
        for token in word_tokenize(compart_tweet[1]):
            # look up each token in and find its co-responding feature ID, :1 for binary 'YES'
            try:
                feature_ized_tweet.append(str(feature_set[token]) + ":1")
            except (IndexError, KeyError):
                #print("Looks like we're running into some kind of indexing issue Euston..")
                continue
        feature_ized_tweet.append("#" + str(compart_tweet[0]))
        return ' '.join(feature_ized_tweet)
    except (IndexError, TypeError, KeyError) as e:
        #print("############################## EXCEPTION ##########################")
        #print(e)
        #print("############################## EXCEPTION ##########################")
        return ''

In [93]:
for tweet in training_set_compart:
    print(return_data_point(tweet, new_dict_of_tokens))
    print()
    print()

11 10109:1 1913:1 1256:1 3895:1 7689:1 9738:1 9083:1 6777:1 5597:1 10323:1 2231:1 9666:1 1462:1 8715:1 #45029314109075046


11 6777:1 4259:1 #45033090867215155


11 6777:1 6699:1 6777:1 2217:1 6777:1 10056:1 6777:1 5540:1 6777:1 5579:1 6777:1 7831:1 6777:1 914:1 6777:1 10054:1 6777:1 7399:1 6777:1 7008:1 6777:1 7992:1 #45036625162627481


11 8279:1 3633:1 6777:1 3003:1 8279:1 8279:1 6874:1 8279:1 9661:1 8279:1 8279:1 #45086603513077350


11 8279:1 1664:1 899:1 5540:1 3633:1 3903:1 3177:1 8279:1 146:1 8279:1 #45138968053405286


11 5579:1 6777:1 10148:1 6777:1 7464:1 6777:1 6318:1 6777:1 624:1 6777:1 7831:1 6777:1 6074:1 6777:1 6318:1 #45171179411842662


11 8279:1 829:1 3988:1 7451:1 6837:1 9713:1 1462:1 1304:1 5918:1 6777:1 3903:1 6777:1 5918:1 6777:1 5560:1 #45214142046457446


11 8279:1 6463:1 #45274909095128268


2 6691:1 6777:1 8823:1 4207:1 7451:1 9058:1 1199:1 2231:1 8370:1 6777:1 3798:1 #45035120831876710


1 6777:1 1990:1 3429:1 8596:1 398:1 3429:1 2548:1 2548:1 9178:1 841:1 3

8 8279:1 6777:1 7153:1 #45207762888362803


8 3497:1 3633:1 9433:1 8302:1 48:1 398:1 #45210570914950758


5 5183:1 9178:1 9474:1 6777:1 6054:1 6777:1 7161:1 3177:1 8279:1 6154:1 #45198609181049241


4 8279:1 8279:1 3339:1 8279:1 1135:1 6777:1 7153:1 #45201182890670489


12 8279:1 4605:1 10379:1 2916:1 6777:1 4966:1 3923:1 3429:1 6489:1 510:1 5163:1 6692:1 #45201691006732288


5 2489:1 6726:1 7479:1 398:1 6728:1 #45202139616905216


5 6777:1 964:1 6777:1 9900:1 6777:1 1684:1 6777:1 2186:1 6777:1 6777:1 4789:1 6777:1 8785:1 6777:1 9050:1 6777:1 9514:1 #45204983602021990


5 3638:1 9083:1 7145:1 8665:1 7451:1 1985:1 6728:1 6777:1 9250:1 6777:1 6104:1 6777:1 8665:1 #45212260542922342


3 3384:1 8556:1 1199:1 6777:1 4207:1 10109:1 6777:1 5135:1 6777:1 10278:1 6777:1 9839:1 #45204729158802636


4 8279:1 9681:1 9788:1 10109:1 6340:1 6777:1 7153:1 #45204974052011212


12 5183:1 1199:1 6777:1 4287:1 6777:1 1428:1 #45206319513011404


2 8279:1 3793:1 8771:1 10109:1 7377:1 4224:1 6728:1 6777:1 67

10 8279:1 8749:1 7548:1 6146:1 2756:1 3458:1 4224:1 3429:1 8765:1 3198:1 3429:1 3923:1 510:1 6777:1 6146:1 3429:1 8630:1 510:1 2685:1 9932:1 6777:1 #45612956731364147


5 2525:1 3475:1 9076:1 1780:1 8263:1 6967:1 6777:1 8263:1 6777:1 236:1 6777:1 8665:1 6777:1 1997:1 #45616987642855424


4 8279:1 2231:1 7548:1 6777:1 7153:1 #45620755704906547


12 8279:1 9736:1 581:1 1500:1 6777:1 5201:1 4207:1 4828:1 7548:1 8923:1 5492:1 6777:1 8654:1 #45621314663863091


2 3834:1 2236:1 6869:1 4168:1 4493:1 3235:1 4207:1 1295:1 3720:1 6728:1 2231:1 10307:1 6753:1 #45626008306044928


5 8537:1 5925:1 7908:1 957:1 2231:1 7385:1 5008:1 1199:1 581:1 8927:1 273:1 2236:1 6777:1 5335:1 #45626898448962355


5 8279:1 5262:1 6175:1 6728:1 8123:1 1437:1 6777:1 8993:1 5625:1 5370:1 2401:1 8279:1 9137:1 6777:1 6777:1 6777:1 #45630544729355059


4 8279:1 6777:1 7153:1 #45631470843423539


7 8279:1 6777:1 755:1 6777:1 1007:1 6777:1 5837:1 4207:1 2151:1 8663:1 9158:1 6777:1 1697:1 6777:1 150:1 #45634649400701747


3


12 1493:1 6728:1 4207:1 10307:1 6554:1 #46085075755494604


12 3429:1 #46095626979731046


9 6777:1 8371:1 6736:1 327:1 9108:1 4887:1 3698:1 1199:1 2231:1 8371:1 7611:1 2767:1 3429:1 4650:1 5280:1 4573:1 #46086499391570739


6 6452:1 6452:1 6777:1 1684:1 6777:1 6807:1 #46086906992395878


12 1199:1 9813:1 5087:1 674:1 9317:1 2231:1 8279:1 1199:1 9289:1 6777:1 6777:1 #46087882168454348


13 6777:1 5685:1 4923:1 9906:1 3429:1 8867:1 8216:1 8358:1 8452:1 2510:1 6382:1 #46088586925613465


2 8279:1 2684:1 6777:1 6777:1 #46089419471087616


5 6777:1 3569:1 6777:1 7567:1 6777:1 2581:1 6777:1 5540:1 6777:1 9486:1 6777:1 3938:1 6777:1 9910:1 6777:1 1715:1 6777:1 1506:1 6777:1 #46089918934682828


13 3236:1 8507:1 6156:1 2955:1 6340:1 909:1 6777:1 6777:1 6777:1 6777:1 #46091284554346905


3 8279:1 9143:1 3384:1 8556:1 1199:1 6777:1 4207:1 10109:1 6777:1 5135:1 6777:1 10278:1 6777:1 9839:1 #46091737459905741


13 9471:1 8468:1 9083:1 9738:1 8279:1 2768:1 3390:1 5878:1 2231:1 3834:1 7241:1 8580:


11 6029:1 6777:1 8547:1 6777:1 2577:1 10337:1 3301:1 6777:1 9887:1 #549944790838943744


11 6452:1 6777:1 220:1 6777:1 8262:1 #549993017869082624


11 8279:1 688:1 398:1 10109:1 3832:1 7355:1 47:1 1199:1 657:1 1312:1 3832:1 6777:1 7292:1 6777:1 5764:1 #550105739587751936


11 8279:1 3193:1 6777:1 6074:1 #550373786626043904


11 3429:1 1199:1 6777:1 6777:1 6777:1 7532:1 6777:1 1697:1 #550375523235295232


11 2231:1 8997:1 6777:1 8997:1 6777:1 2037:1 6777:1 7833:1 6777:1 8997:1 6777:1 856:1 6777:1 3091:1 6777:1 #550503993622159360


11 6777:1 2039:1 #550550791791329280


11 8279:1 7982:1 4089:1 7831:1 2236:1 7711:1 7124:1 8820:1 6777:1 9663:1 2736:1 6452:1 6080:1 5142:1 4794:1 6853:1 #550654847427821570


11 9178:1 8733:1 6777:1 3091:1 #550691508719218688


11 8279:1 3749:1 2626:1 899:1 10109:1 3633:1 6777:1 6777:1 6777:1 882:1 6777:1 6777:1 6777:1 #550709989846032385


11 8279:1 1502:1 8279:1 8878:1 8279:1 8760:1 581:1 5332:1 9698:1 9770:1 1500:1 957:1 10104:1 3198:1 3563:1 6777:1 #550

7 6777:1 6777:1 #550577064630177793


7 6777:1 9413:1 6777:1 4959:1 6777:1 3632:1 6777:1 2037:1 6777:1 10398:1 6777:1 4738:1 6777:1 214:1 6777:1 2564:1 6777:1 10067:1 6777:1 9756:1 #550671601155309569


7 6777:1 1631:1 #550939441087676416


7 6777:1 4692:1 #551002006064988160


7 6777:1 6777:1 #551443777097715712


7 7548:1 2231:1 1304:1 6777:1 825:1 6777:1 8411:1 6777:1 7653:1 6777:1 8840:1 6777:1 3983:1 6777:1 6487:1 6777:1 5749:1 6777:1 1697:1 6777:1 827:1 #551459013498785792


7 10138:1 6423:1 908:1 6452:1 8703:1 925:1 4207:1 979:1 6777:1 6777:1 1697:1 6777:1 4917:1 6777:1 6423:1 9767:1 #551724397267279872


7 6777:1 7778:1 1697:1 2035:1 7778:1 1697:1 2035:1 #551828268895068160


7 3429:1 #552043293056712704


12 485:1 9178:1 6575:1 5342:1 3834:1 3183:1 5349:1 7479:1 2918:1 7451:1 626:1 7548:1 2529:1 #539414157502980096


12 9598:1 1243:1 2510:1 5334:1 2805:1 7451:1 8985:1 3834:1 7982:1 798:1 7632:1 3923:1 2987:1 6777:1 1684:1 #539433723356127232


12 6452:1 6777:1 7117:1 #53947234

In [94]:
# training data must be compartmentalised (when implementing this in OO, we can enforce this constraint)
def write_feature_file(data_set_compart, feature_set, fp="./feats.train", encoding="UTF-8"):
    with codecs.open(fp, "w", encoding) as file:
        for compart_tweet in data_set_compart:
            try:
                line_to_write = return_increasting_data_point(compart_tweet, feature_set)
                if len(line_to_write) > 0:
                    file.write(return_increasting_data_point(compart_tweet, feature_set))
                    file.write("\n")
            except (KeyError, TypeError, IndexError, Exception) as e:
                continue
    return True

In [95]:
def preprocess_tweet_text(tweet_text):
    preprocessed = []
    for tweet in tweet_text:
        if tweet.startswith('RT') or tweet.startswith('"RT'): continue
        preprocessed.append(preprocess_string(tweet))
    return preprocessed

In [96]:
def preprocess_compart_tweets(data_set):
    preprocessed_set = []
    for compart_tweet in data_set:
        if len(compart_tweet) != 3: 
            print("WHATS GOING ON? : ")
            print(compart_tweet)
            continue
        #if compart_tweet[1].startswith('RT') or compart_tweet[1].startswith('RT'): continue
        preprocessed_set.append([compart_tweet[0], preprocess_string(compart_tweet[1]), compart_tweet[2]])
    return preprocessed_set

In [97]:
# get training and testing file encoding
charset_train = detect_charset("./Tweets.14cat.train")
charset_test = detect_charset("./Tweets.14cat.test")

# load respective files
training_set = read_file("./Tweets.14cat.train", function="r", encoding=charset_train)
testing_set = read_file("./Tweets.14cat.test", function="r", encoding=charset_test)

# extract BoW from training file, initially break up the tweets: ID / TWEET / LABEL
training_set_compart = compartmentalise_tweets(training_set)
testing_set_compart = compartmentalise_tweets(testing_set)

# preprocess
training_set_compart_pre = preprocess_compart_tweets(training_set_compart)
testing_set_compart_pre = preprocess_compart_tweets(testing_set_compart)

# then obtain only the tweet text (no id, no label)
training_tweet_text_pre = extract_tweet_text(training_set_compart_pre)
testing_tweet_text_pre = extract_tweet_text(testing_set_compart_pre)

# reduce tweet text to unique tokens
training_unique_tokens = get_unique_tokens(training_tweet_text_pre)
testing_unique_tokens = get_unique_tokens(testing_tweet_text_pre)

# transform tokens into features
training_dict_of_tokens = dict(enumerate(training_unique_tokens, 1))
testing_dict_of_tokens = dict(enumerate(testing_unique_tokens, 1))

# write features to file
write_tokens_to_file(training_dict_of_tokens)

# we need to reverse the feature set before we generate any training / testing sets, load them from disk
training_feature_set = read_tokens_from_file(DICT, "./feats")

# I appear to be missing ALOT of features in my testing feature set, I think this is because the entire tweet
# is being ignored, not just the single words.. let me try and generate a testing_feature_set
# testing_feature_set = {v:k for k,v in testing_dict_of_tokens.items()}
# yes, this works fine, but I need to modify my code to only ignore single terms
# generate training set using training data (tweets) and feature set (dict_of_tokens)
write_feature_file(training_set_compart_pre, training_feature_set)

# generate testing set using testing data (tweets) and feature set from the training data
write_feature_file(testing_set_compart_pre, training_feature_set, "./feats.test")

WHATS GOING ON? : 
['\r\n']


True

In [98]:
# so, I'm getting an error with the SVM classifier, apparently my features need to be in increasing order.. 
# so let's make that happen

In [99]:
def return_increasting_data_point(compart_tweet, feature_set):
    feature_ized_tweet = []
    try:
        # compartmentalised lookup key
        clu_key = compart_tweet[2].strip()
        # class_look_up is a global variable containing int keys for labels
        #feature_ized_tweet.append(str(class_look_up[clu_key]))
        # obtaining all tokens within the tweet text itself
        for token in word_tokenize(compart_tweet[1]):
            # look up each token in and find its co-responding feature ID, :1 for binary 'YES'
            try:
                feature_ized_tweet.append(int(feature_set[token]))
            except (IndexError, KeyError):
                #print("Looks like we're running into some kind of indexing issue Euston..")
                continue
        
        # order by increasing..
        
        feature_ized_tweet = set(feature_ized_tweet)
        
        feature_ized_tweet = list(feature_ized_tweet)
        
        feature_ized_tweet.sort()
        
        feature_ized_tweet = [str(feature) + ":1" for feature in feature_ized_tweet]
        
        feature_ized_tweet.insert(0, str(class_look_up[clu_key]))

        feature_ized_tweet.append("#" + str(compart_tweet[0]))
        
        return ' '.join(feature_ized_tweet)
    
    except (IndexError, TypeError, KeyError) as e:
        #print("############################## EXCEPTION ##########################")
        #print(e)
        #print("############################## EXCEPTION ##########################")
        return ''

In [100]:
return_increasting_data_point(training_set_compart[255], training_feature_set)

'7 403:1 899:1 1664:1 1697:1 3458:1 3633:1 4038:1 5900:1 6777:1 7552:1 #45304627000613683'

In [101]:
# right, that seems to work.. let's see if our SVM kicks up a fuss..

In [102]:
# okay, so we're still kicking up a fuss! What's the problem exactly, let's dive into the source code..

In [103]:
# Okay, so this is where we're falling down..

In [104]:
# The position is the pointer on the current line for parseing the file..
# and wpos is the word position..
# so the conditional we're failing @ is: if the word position is > 0 and the value (struct, I assume) at some 
# memory location (representing the final word)s wnum value is >= wnum, then we exit..
# now.. what's wnum..?
# I'm half tempted to just remove this conditional and recompile, lol

# it looks like wnum is the (temporarily) binary property that we're using to say that this feature is present
# within this data point, it's weird because each word has a wnum value and there appears to be some kind of
# global wnum..

# I believe.. We can just remove this 'naughtily' remove this conditional form the source code and recompile..
# because I really don't see what I'm doing wrong.

In [105]:
# jesus, now features have to start from 1, not zero, kill me lord

In [106]:
training_dict_of_tokens

{1: 'officially',
 2: 'claude',
 3: 'jacket',
 4: 'thecomedypub',
 5: 'beneath',
 6: 'hotellife',
 7: 'urdu',
 8: 'charts',
 9: 'longisland',
 10: 'kinect',
 11: 'welve',
 12: 'safilo',
 13: 'far',
 14: 'indiauntasted',
 15: 'freeway',
 16: 'mx5',
 17: 'pairings',
 18: 'youngster',
 19: 'bares',
 20: 'labrascals',
 21: 'brynmusic',
 22: 'becomes',
 23: 'henriesmorse',
 24: 'phillips',
 25: 'mohamednawito',
 26: 'smithsonian',
 27: 'gg',
 28: 'period',
 29: 'beyondthesurface',
 30: 'kong',
 31: 'awww',
 32: 'mobilephones',
 33: 'love__kai',
 34: 'superstart20',
 35: 'flocka',
 36: 'exus',
 37: 'hours',
 38: 'jordanian',
 39: 'splitsider',
 40: 'pisses',
 41: 'retw',
 42: '41',
 43: 'ga',
 44: 'status',
 45: 'moon',
 46: 'drainage',
 47: 'believes',
 48: 'give',
 49: 'readerrun',
 50: 'itimestweets',
 51: 'jets',
 52: 'embarrassment',
 53: 'syrianchildren',
 54: 'butta',
 55: 'indiedev',
 56: 'kind',
 57: 'suspend',
 58: '2395',
 59: 'hitlers',
 60: 'jasten_artist',
 61: 'grim',
 62: 'we

In [107]:
# okay, so if I re-introduce the conditional into the source code for the SVM, it doesn't run
# I believe it's because the feature values (on the left hand side) are not increaseing from 1 to 14
# so... let's make some changes and see if modifying that increases the accuracy from ~61%

In [108]:
training_set_compart

[['45029314109075046',
  'Furniture for - so cute! gotta show my #granddog mama the last one especially :) http://t.co/F69aT71TVQ http://t.co/YQVK09pZzB',
  'Pets & Animals\r\n'],
 ['45033090867215155', '"#Sunday aww"": Mr Peebles', 'Pets & Animals\r\n'],
 ['45036625162627481',
  'CATS ART http://t.co/cJre1jn2Bl #creative #feline #art #love #cat #cats #kittens #housecat #domestic #alley #tomcat',
  'Pets & Animals\r\n'],
 ['45086603513077350',
  'RT @Masala_chaai: Keep Calm & Hug your Dog ! #PetLovers cc @MyICETag @pooja330 @huftindia @PranitaBalar @BarknBond http://t.co/JJHSvf…',
  'Pets & Animals\r\n'],
 ['45138968053405286',
  'RT @TheSoulfulEMU: RETWEET if you love your dog!! http://t.co/QWvjFFnfiP via @earthposts @LUKIKA ',
  'Pets & Animals\r\n'],
 ['45171179411842662',
  'Missing cat Atlantic Gardens http://t.co/e2mu2yiv6H #southjersey #petnews #petadoption #pettips #cats #dogs #petadoption',
  'Pets & Animals\r\n'],
 ['45214142046457446',
  'RT @Doggy_Stylin: First-time custome

In [109]:
with codecs.open("./feats.train", "r", "UTF-8") as file:
    with codecs.open("./feats.train.ordered", "w", "UTF-8") as write_file:
        for line in sorted(file.readlines(), key=lambda line: int(line.split()[0])):
            if len(line) > 1:
                write_file.write(line)

In [110]:
'newest' in training_dict_of_tokens.values()

True

In [111]:
training_set_compart_pre

[['45029314109075046',
  'furniture for  so cute gotta show my #granddog mama the last one especially   ',
  'Pets & Animals\r\n'],
 ['45033090867215155', '#sunday aww mr peebles', 'Pets & Animals\r\n'],
 ['45036625162627481',
  'cats art  #creative #feline #art #love #cat #cats #kittens #housecat #domestic #alley #tomcat',
  'Pets & Animals\r\n'],
 ['45086603513077350',
  'rt @masala_chaai keep calm  hug your dog  #petlovers cc @myicetag @pooja330 @huftindia @pranitabalar @barknbond ',
  'Pets & Animals\r\n'],
 ['45138968053405286',
  'rt @thesoulfulemu retweet if you love your dog  via @earthposts @lukika ',
  'Pets & Animals\r\n'],
 ['45171179411842662',
  'missing cat atlantic gardens  #southjersey #petnews #petadoption #pettips #cats #dogs #petadoption',
  'Pets & Animals\r\n'],
 ['45214142046457446',
  'rt @doggy_stylin firsttime customers receive a 10 off one fullservice day grooming  #dog #grooming #puppies',
  'Pets & Animals\r\n'],
 ['45274909095128268', 'rt @petsweekly so', 