In [4]:
import pickle
import numpy as np
import collections
from collections import Counter
import string

In [5]:
def remove_hash_tokenize(tweet):
    new = tweet.replace('#', '')
    return new.split()

In [6]:
BASE = '../data/'
pos = np.asarray([line.rstrip('\n').lower() for line in open(BASE+'train_pos_full.txt', encoding='utf8')])
neg = np.asarray([line.rstrip('\n').lower() for line in open(BASE+'train_neg_full.txt', encoding='utf8')])
test = np.asarray([line.rstrip('\n').lower() for line in open(BASE+'test_data.txt', encoding='utf8')])

In [7]:
pos_tkn = [remove_hash_tokenize(tweet) for tweet in pos] 
neg_tkn = [remove_hash_tokenize(tweet) for tweet in neg] 
test_tkn = [remove_hash_tokenize(tweet) for tweet in test] 

In [8]:
all_tkn = [w for tokens in pos_tkn+neg_tkn+test_tkn for w in tokens]

In [9]:
len(all_tkn)

39499657

In [10]:
tkn_counter = Counter(all_tkn)
len(tkn_counter)

571468

### Upload Diictionaries

In [11]:
def upload_dict(file, BASE = './dict/', hltud=False):
    path = BASE + file
    words = np.asarray([line.rstrip('\n').lower() for line in open(path)])
    if hltud:
        keys = [w.split()[1] for w in words]
        values = [w.split()[3] for w in words]
    else:
        keys = [w.split()[0] for w in words]
        values = [w.split()[1] for w in words]
    return dict(zip(keys, values))

In [12]:
emnlp = upload_dict('emnlp_dict.txt')
luulu = upload_dict('luulu_typo-corpus-r1.txt')
hltd = upload_dict('hltutdallas.txt', hltud=True)

In [13]:
spell_dict = set(list(luulu)+list(emnlp)+list(hltd))

In [15]:
len(spell_dict)

49349

#### upload english dictionary

In [16]:
BASE = '../data/dictionaries/'
english_words = np.asarray([line.rstrip('\n').lower() for line in open(BASE+'english_words.txt')])
idx = np.arange(int(len(english_words)/3))
english_dictionary = dict(zip(english_words[3*idx+1], english_words[3*idx+1]))
freq =  dict(zip(english_words[3*idx+1], english_words[3*idx+2]))

In [17]:
len(english_dictionary), len(set(english_dictionary))

(36662, 36662)

#### upload acronyms / smileys

In [18]:
def correct(sentence, contrac_dict={}):
	'replace contractions in sentence and remove punctuation'
	tokens = sentence.split()
	new_tokens = []
	for token in tokens:
		if token in contrac_dict:
			new_tokens.append(contrac_dict[token])
		if len(token)>1:
			new_tokens.append(''.join(c for c in token if c not in string.punctuation))
	return ' '.join(new_tokens)

In [23]:
## Acronyms
acronyms = np.asarray([line.rstrip('\n').lower() for line in open(BASE+'netlingo_acronyms.txt')])
idx = np.arange(int(len(acronyms)/2))
acronyms_dict = dict(zip(acronyms[2*idx], acronyms[2*idx+1]))
#Remove multi explications
for key in acronyms_dict:
	acronyms_dict[key] = acronyms_dict[key].split('/ ')[0]
  #correct descriptions
for key in acronyms_dict:
	acronyms_dict[key] = correct(acronyms_dict[key])

## Smileys
smileys = np.asarray([line.rstrip('\n').lower() for line in open(BASE+'netlingo_smileys.txt')])
idx = np.arange(int(len(smileys)/2))
smileys_dict = dict(zip(smileys[2*idx], smileys[2*idx+1]))
#Remove multi explications
for key in smileys_dict:
	smileys_dict[key] = smileys_dict[key].split('- ')[0]
#remove '-' from smiley and add it
keys = list(smileys_dict.keys())
for s in keys:
    if '-' in s and len(s)>2:
        smileys_dict[s.replace('-', '')] = smileys_dict[s]

In [24]:
len(acronyms), len(smileys_dict)

(5032, 412)

### Count words

In [66]:
def count_presence(tkn_counter, word_dict):
    count = 0
    for w in tkn_counter:
        if w in word_dict:
            count += tkn_counter[w]
    print('{}% found'.format(round(count*100/sum(tkn_counter.values()), 2)))

def get_unknown_words(tkn_counter, word_dict, Threshold=100):
    unknown_words = []
    for w in tkn_counter:
        if w not in word_dict and len(w)<=Threshold:
            unknown_words.append(w)
    return unknown_words

In [63]:
count_presence(tkn_counter, spell_dict)

6.14% found


In [27]:
count_presence(tkn_counter, english_dictionary)

68.16% found


In [28]:
tmp = list(english_dictionary.keys())+list(spell_dict)+list(smileys_dict.keys())+list(acronyms_dict.keys())
tmp = set(tmp)

In [35]:
count_presence(tkn_counter, tmp)

74.73% found


In [56]:
unknown = get_unknown_words(tkn_counter, tmp, 2)
len(unknown)

803

In [57]:
sum([tkn_counter[w] for w in unknown])*100/len(all_tkn)

9.817214362139904

In [60]:
for w in list(tkn_counter.keys()):
    if w in unknown:
        del tkn_counter[w]

In [64]:
count_presence(tkn_counter, tmp)

82.87% found


In [67]:
unknown = get_unknown_words(tkn_counter, tmp)

In [68]:
unknown

['<user>',
 'tsk',
 '<url>',
 'casper',
 'crakkbitch',
 "don't",
 '...',
 'lifecompleted',
 'facebook',
 '1dnextalbumtitle',
 'rollercoaster',
 'cocept',
 'followmeplz',
 'x15',
 'coworker',
 'replying',
 'nationals2013',
 'finna',
 'monet',
 'katemelo',
 'arrived',
 'yeslord',
 'barca',
 'grains',
 'jaeyay',
 'werna',
 'khatam',
 'hojaeygi',
 'baaqi',
 'buttering',
 'karaygay',
 'sab',
 'mid-term',
 "i'm",
 '10:30-',
 '2:30',
 '16millionbritneyfan',
 "it's",
 ":')",
 'pros',
 'mikes',
 "q's",
 '4/20',
 'reasons2dothebirdmanhandrub',
 'lols',
 'tomlinson',
 'bigger',
 'yougetmajorpointsif',
 'southall',
 "let's",
 'followers',
 '500',
 'teamjessie',
 'thevoiceuk',
 'schmidt',
 'charmer',
 'gazillion',
 'kills',
 'buddy',
 'zuers',
 'midterms',
 'twfanmily',
 'figured',
 'starts',
 'tweets',
 'xoxoxo',
 'aww',
 'lifts',
 'toes',
 '4.20',
 'sparking',
 'jaomikay',
 'kathrynbernardoasmikay',
 'thankyou',
 '40s',
 'gunun',
 'sorusu',
 'via',
 'bieber',
 'assumed',
 'chikaroo',
 'nitrous',


In [71]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


cat
cactus
goose
rock
python
good
best
run
run


In [79]:
unknown_lem = [lemmatizer.lemmatize(w) for w in unknown]

In [82]:
in_dict = [w for w in unknown_lem if w in tmp]

In [84]:
len(in_dict), len(unknown)

(8961, 512816)

In [88]:
sum([tkn_counter[w] for w in in_dict])*100/sum(tkn_counter.values())

26.58061302809556

In [91]:
sum([tkn_counter[w] for w in set(unknown_lem)- set(in_dict)])*100/sum(tkn_counter.values())

16.160163984556576

In [94]:
not_in_dict = list(set(unknown_lem)- set(in_dict))
not_in_dict

['horanywhores',
 'nicolaus',
 'deactived',
 'rufford',
 'substantialerror',
 'tiny.lol',
 'hamat',
 'sluttiness',
 'files-y',
 'crnp',
 '33154',
 '-6728-090',
 'teheee',
 'dontwanttogobacktojail',
 'twill',
 'kaleidotile',
 't-bevel',
 'crazybfprobz',
 'afram',
 'w8t',
 'mind--how',
 'nxgga',
 'kyodai',
 'wanttocry',
 'luvya',
 'bnch',
 "merrell's",
 'g33k',
 'time.good',
 'anydoctorshere',
 "eyebrow's",
 'yesweregay',
 'machu',
 'for.work',
 '72.5',
 'griffs',
 'mydaywillcome',
 'pre-fo',
 'tewmorrow',
 'lilliput',
 'hammermill',
 'utero',
 'sescelduled',
 'gotcho',
 'moltke',
 'clendenen',
 "kob's",
 'fa1772',
 'festo',
 'responsi',
 'lazed',
 'hair-do',
 'bagong',
 'p-diddy',
 'cameronresign',
 'harry_jeffery',
 'teamplapla',
 'madryd',
 'everysingletime',
 'maimed',
 'ineeeht',
 'ms7580',
 'carterton',
 'jewoisdktr',
 'gunne',
 "reusch's",
 'ts300',
 'ilovewhen',
 'screenthis',
 'rarerthanhensteeth',
 'allownig',
 'tarnecky',
 'struggglin',
 'eyebolt',
 'missyouprettyhead',
 'harr

In [97]:
from wordsegment import load, segment

In [98]:
load()

In [113]:
segment('cd_78')

['cd78']

In [99]:
segments =[[w, segment(w)] for w in not_in_dict]

KeyboardInterrupt: 

In [None]:
segments