# Collect English loanword candidates
In which we collect the loanword candidates that are specific to the social media platforms of interest, using previously collected word counts ("fetch" study) and POS tags (to isolate noun/verb pairs).

In [1]:
import numpy as np
import pandas as pd

## Get POS tags (reddit)
Let's get the POS tags for reddit using the "fetch" study tags as a starting point. We'll expand to a bigger vocabulary if needed.

In [2]:
# Load the reddit tag-percentage table; the first TSV column becomes the row index.
reddit_tag_pcts = pd.read_csv(
    '../../data/mined_reddit_comments/reddit_2013_2016_tag_pcts.tsv',
    sep='\t',
    index_col=0,
)
display(reddit_tag_pcts.head())

Unnamed: 0,!,#,$,&,",",A,D,E,G,L,...,R,S,T,U,V,X,Y,Z,^,~
nunnery,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003542,0.0
sowell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003641,0.0,0.0,0.0,0.991653,0.0
woods,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.6e-05,0.0,0.0,0.0,1.8e-05,0.0
clotted,0.0,0.0,0.0,0.0,0.0,0.99088,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002703,0.0,0.0,0.0,0.004951,0.0
spiders,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000112,0.0,0.0,0.0,0.000682,0.0


In [5]:
# let's try some candidate noun/verb pairs
test_words = ['post', 'upvote', 'downvote', 'block', 'troll', 'trolled']
for test_word in test_words:
    # rank this word's tags from largest to smallest share...
    tag_pct_i = reddit_tag_pcts.loc[test_word, :].sort_values(ascending=False)
    # ...and hide the tags that never occur for it
    tag_pct_i = tag_pct_i.loc[tag_pct_i.gt(0.)]
    print('test word %s has tag pcts\n%s'%(test_word, tag_pct_i))

test word post has tag pcts
N    0.714383
V    0.284010
A    0.001562
P    0.000024
R    0.000021
Name: post, dtype: float64
test word upvote has tag pcts
V    5.824138e-01
N    3.742320e-01
A    2.004457e-02
G    1.407947e-02
!    4.537287e-03
^    3.788560e-03
R    5.133337e-04
O    2.714444e-04
P    1.045992e-04
E    8.919910e-06
D    5.498812e-06
T    5.552668e-07
Name: upvote, dtype: float64
test word downvote has tag pcts
V    0.666391
N    0.280947
A    0.035475
^    0.008346
R    0.005686
!    0.002126
G    0.000623
P    0.000153
D    0.000152
O    0.000095
T    0.000007
Name: downvote, dtype: float64
test word block has tag pcts
N    0.583007
V    0.415149
A    0.001771
^    0.000068
R    0.000005
Name: block, dtype: float64
test word troll has tag pcts
N    0.950627
V    0.043040
^    0.005972
A    0.000336
R    0.000025
Name: troll, dtype: float64
test word trolled has tag pcts
V    0.989717
N    0.005804
A    0.004478
Name: trolled, dtype: float64


Most of these make sense, except for `troll`, which I would expect to have a higher verb share.

Maybe a better way to find verb candidates is to look in the `EN` vocab for consistent groups of verbs based on suffixes (`trolling`, `trolled`).

Basic test for finding verb/noun pairs:

- find words with a reasonable likelihood of being a noun (noun tag share >= `noun_cutoff`, set to 0.25 in the cell below)
- look for verb matches in vocab (`troll` => `trolled`, etc.)
- if # verb matches > 0, then pair

In [44]:
# minimum share of `N` tags a word needs in order to be kept as a noun candidate
noun_cutoff = 0.25
# keep only the `N` column for words at or above the cutoff, highest share first
noun_candidate_tag_pcts = (
    reddit_tag_pcts
    .loc[reddit_tag_pcts['N'].ge(noun_cutoff), 'N']
    .sort_values(ascending=False)
)
display(noun_candidate_tag_pcts.tail(50))

addi                0.250883
dosing              0.250882
wikibot             0.250859
cruces              0.250857
unicums             0.250849
tomoko              0.250827
pubstomping         0.250752
bristled            0.250731
trichotillomania    0.250693
zl                  0.250677
journaled           0.250676
tanaan              0.250614
oxidizes            0.250594
cannonballs         0.250593
waluigi             0.250561
clippers            0.250542
relabeled           0.250533
faints              0.250524
stater              0.250523
whirls              0.250455
daggerfall          0.250450
booom               0.250448
corsair             0.250437
aldmeri             0.250432
neuvy               0.250401
chronomancer        0.250389
fcp                 0.250384
vodlocker           0.250383
urs                 0.250381
romanticizing       0.250337
andro               0.250323
srd                 0.250304
frodan              0.250298
radeon              0.250259
karmas        

Sure! Let's look for some noun/verb pairs.

In [45]:
import re
# Pair each noun candidate with the inflected verb forms ("-ed", "-ing") that
# actually occur in the vocabulary. The result, `noun_verb_candidates`, is a list
# of `[noun, [verb, ...]]` entries, in the same order as `noun_candidate_tag_pcts`.
verb_inflections = ['ed', 'ing']
# handle words with word-final <e>
noun_ending_matcher = re.compile('e$')
# TODO: handle short-vowel words? "bat" => "batting"
noun_candidates = noun_candidate_tag_pcts.index
en_vocab = reddit_tag_pcts.index.tolist()
# Membership is tested once per generated verb; a set makes each test O(1)
# instead of a linear scan over the vocabulary list. `en_vocab` stays a list
# because later cells iterate over it.
en_vocab_lookup = set(en_vocab)
noun_verb_candidates = []
for noun_candidate in noun_candidates:
    # drop a word-final <e> before inflecting ("upvote" => "upvot" + "ed");
    # words without a final <e> pass through `sub` unchanged
    noun_candidate_stem = noun_ending_matcher.sub('', noun_candidate)
    generated_verbs = ['%s%s'%(noun_candidate_stem, verb_inflection) for verb_inflection in verb_inflections]
    valid_generated_verbs = [verb for verb in generated_verbs if verb in en_vocab_lookup]
    # only keep nouns that have at least one attested verb form
    if valid_generated_verbs:
        noun_verb_candidate_pair = [noun_candidate, valid_generated_verbs]
        noun_verb_candidates.append(noun_verb_candidate_pair)

In [51]:
# preview the first 20 [noun, [verb forms]] entries
print(noun_verb_candidates[:20])

[['damme', ['dammed', 'damming']], ['parte', ['parted', 'parting']], ['people', ['peopled']], ['teeth', ['teething']], ['fad', ['faded', 'fading']], ['superpower', ['superpowered']], ['mode', ['moded', 'moding']], ['football', ['footballing']], ['sugar', ['sugared', 'sugaring']], ['tooth', ['toothed']], ['mouth', ['mouthed', 'mouthing']], ['proportion', ['proportioned']], ['conference', ['conferencing']], ['fate', ['fated']], ['hair', ['haired']], ['panel', ['paneling']], ['intention', ['intentioned']], ['event', ['evented', 'eventing']], ['course', ['coursing']], ['neighbor', ['neighboring']]]


Convert to dict for easy lookup.

In [47]:
# each entry is a two-item [noun, verbs] list, so it maps directly onto key/value pairs
noun_verb_candidates_lookup = dict(noun_verb_candidates)

In [49]:
# list every vocabulary entry containing "downvote"; entries are stringified before the substring check
list(filter(lambda vocab_word: 'downvote' in str(vocab_word), en_vocab))

['downvote', 'downvoters', 'downvotes', 'downvoter', 'downvoted']

In [52]:
# check for manual pairs
test_words = ['post', 'upvote', 'downvote', 'block', 'troll']
for test_word in test_words:
    if(test_word) in noun_verb_candidates_lookup:
        print('noun %s verbs %s'%(test_word, noun_verb_candidates_lookup[test_word]))

noun post verbs ['posted', 'posting']
noun upvote verbs ['upvoted', 'upvoting']
noun downvote verbs ['downvoted', 'downvoting']
noun block verbs ['blocked', 'blocking']
noun troll verbs ['trolled', 'trolling']


OK! We have a valid filtering strategy, as long as we're OK with a relatively low `N`/`V` threshold.

In [139]:
import scipy.sparse

def _read_stripped_lines(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace.

    The file is opened in a `with` block so the handle is closed as soon as the
    lines have been read, rather than being left open.
    """
    with open(path, 'r') as line_file:
        return [line.strip() for line in line_file]

# combined
# POS_counts_data = scipy.sparse.load_npz('../../data/mined_tweets/POS_tag_stats/tweets_POS_tags_LANG=en.npz')
# POS_count_rows = _read_stripped_lines('../../data/mined_tweets/POS_tag_stats/tweets_POS_tags_LANG=en_rows.txt')
# POS_count_cols = _read_stripped_lines('../../data/mined_tweets/POS_tag_stats/tweets_POS_tags_LANG=en_cols.txt')
# sample
POS_counts_data = scipy.sparse.load_npz('../../data/mined_tweets/POS_tag_stats/tweets-Aug-01-17-03-51_LANG=en_tagged_tag_pct.npz')
# row/column labels for the sparse matrix are stored one per line in sidecar text files
POS_count_rows = _read_stripped_lines('../../data/mined_tweets/POS_tag_stats/tweets-Aug-01-17-03-51_LANG=en_tagged_tag_pct_rows.txt')
POS_count_cols = _read_stripped_lines('../../data/mined_tweets/POS_tag_stats/tweets-Aug-01-17-03-51_LANG=en_tagged_tag_pct_cols.txt')
# convert to dataframe
POS_counts = pd.DataFrame(POS_counts_data.todense(), index=POS_count_rows, columns=POS_count_cols)
# for each row, the column label holding the largest value
POS_max_count_tags = POS_counts.idxmax(axis=1)
print(POS_max_count_tags.head(50))

@user     @
>         E
<         ~
:         ~
url       N
rt        ~
.         ,
num       ^
#hash     N
,         ,
…         ,
!         ,
"         ,
?         ,
-         ,
just      R
...       ,
like      P
&         &
will      V
(         ,
one       $
get       V
can       V
people    N
)         ,
love      V
now       R
“         ,
new       A
time      N
'         ,
know      V
”         ,
good      A
day       N
need      V
see       V
want      V
u         O
i’m       L
go        V
today     N
us        O
best      A
don’t     V
really    R
>.<       E
back      R
make      V
dtype: object


In [140]:
# a = POS_counts.head(20) / POS_counts.head(20).sum(axis=1)
# work on a copy and print each row's total across all tag columns
a = POS_counts.copy()
print(a.sum(axis='columns'))

@user        1.0
>            1.0
<            1.0
:            1.0
url          1.0
            ... 
ypu          1.0
bargains     1.0
barker       1.0
you✅🐶        1.0
daydreams    1.0
Length: 32278, dtype: float64


In [131]:
# show the first rows of the tag table
display(POS_counts.head())

Unnamed: 0,X,#,S,P,A,T,^,E,V,$,...,O,Z,&,~,@,D,G,U,N,","
<,0.0,0.0,0.0,0.0,2e-06,0.0,8e-05,2.6e-05,0.0,0.0,...,0.0,0.0,0.0,0.481052,0.0,0.0,0.000209,0.0,0.0,0.0
>,0.0,0.0,0.0,0.0,3e-06,0.0,0.041749,0.728516,1.174961e-06,0.0,...,0.0,0.0,0.0,1.6e-05,0.0,0.0,0.672169,0.0,2.773987e-07,0.055594
@user,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.999744,0.0,0.0,0.0,0.0,0.0
:,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000862,0.0,0.0,...,0.0,0.0,0.0,0.256671,0.0,0.0,3e-06,0.0,0.0,0.070761
url,0.0,0.0,0.0,0.0,0.0,0.0,0.000199,0.0,5.874806e-07,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8e-05,0.0,0.202295,0.0


In [66]:
## fixing EN tag file format
import re
import os
from ast import literal_eval

en_tag_file_matcher = re.compile('LANG=en')
en_tag_data_dir = '/hg190/corpora/twitter-crawl/new-archive/POS_tags/'
# full paths of every file in the tag directory whose name contains "LANG=en"
en_tag_files = [
    os.path.join(en_tag_data_dir, file_name)
    for file_name in os.listdir(en_tag_data_dir)
    if en_tag_file_matcher.search(file_name) is not None
]
# only the first matching file is loaded and previewed
for f in en_tag_files[:1]:
    # column '1' is parsed from its string form into a Python object via literal_eval
    df = pd.read_csv(f, sep='\t', index_col=False, compression='gzip', converters={'1' : literal_eval})
    display(df.head())
#     df = df.assign(**{
#         '1' : df.loc[:, '1'].apply(lambda x: list(map(lambda y: y.split('/'))))
#     })
# print(en_tag_files)

Unnamed: 0,0,1
0,8.477182e+17,"[@MinorSmile09/@, Whenst/O, will/V, NibbAS/^, ..."
1,8.477182e+17,"[RT/~, @BYDOYOUNG/@, :/~, 도영/N, 음색/N, ->/G, ht..."
2,8.477182e+17,"[도영/^, 공명/^, ->/G, https://t.co/CLHHOaV1TY/U]"
3,8.477182e+17,"[도영/^, 움짤/^, ->/G, https://t.co/stPDn1b5ta/U]"
4,8.477182e+17,"[RT/~, @wordstionary/@, :/~, https://t.co/UNwI..."


In [72]:
# load one specific gzipped, tab-separated tag file; column '1' is parsed with literal_eval
f = '/hg190/corpora/twitter-crawl/new-archive/POS_tags/tweets-Sep-01-17-03-43_LANG=en_tagged.tsv.gz'
df = pd.read_csv(
    f,
    sep='\t',
    index_col=False,
    compression='gzip',
    converters={'1' : literal_eval},
)
display(df.head())

Unnamed: 0,0,1
0,903172109370843136,"[(rt, ~), (@USER, @), (:, ~), (q, N), (:, ,), ..."
1,903172109396037632,"[(@USER, @), (u, O), (r, V), (a, D), (good, A)..."
2,903172109387665408,"[(i'm, L), (the, D), (biggest, A), (baby, N), ..."
3,903172109404225537,"[(rt, ~), (@USER, @), (:, ~), (if, P), (two, $..."
4,903172109404446721,"[(and, &), (i, O), (ain't, V), (going, V), (to..."


In [79]:
# join each entry's parts with "//", join the entries of a row with spaces,
# then keep the first 30 resulting strings
x = df.loc[:, '1'].apply(
    lambda tagged_tokens: ' '.join('//'.join(token_tag) for token_tag in tagged_tokens)
).values[:30]

In [82]:
# preview the first 10 joined strings
x[:10]

array(['rt//~ @USER//@ ://~ q//N ://, photo//N vs//P video//N ?//, nj//^ ://, photo//N jin//^ ://, photo//N yg//N ://, photo//N hs//N ://, video//N jm//G ://~ video//N v://G video//N jk//G ://~ video//N',
       '@USER//@ u//O r//V a//D good//A soul//N .//, 😁👍//E',
       "i'm//L the//D biggest//A baby//N when//R i'm//L sick//A and//& i//O annoy//V the//D shit//N out//P of//P everyone//N so//P please//V bare//A with//P me//O .//,",
       "rt//~ @USER//@ ://~ if//P two//$ people//N are//V meant//V to//P be//V together//R ,//, they//O will//V eventually//R find//V their//D way//N back//R into//P each//D other's//S arms//N ,//, no//D matter//N what//O .//,",
       "and//& i//O ain't//V going//V to//P war//V about//P no//D nigga//N unless//P it's//L my//D brothers//N nigga//N",
       'girls//N naked//A and//& nude//A having//V a//D sex//N naked//A man//N festival//N japan//^ <//~ URL//N >//,',
       'block//N hash//N <//~ NUM//^ >//G adab//G <//~ NUM//^ >//G d//G <//~ NUM//^ >//G ff//G

In [81]:
from sklearn.feature_extraction.text import CountVectorizer
# tokenize on single spaces only, so each space-delimited unit of `x` is one feature
cv = CountVectorizer(tokenizer=lambda tagged_text: tagged_text.split(' '))
dtm = cv.fit_transform(x)
print(dtm)

  (0, 215)	1
  (0, 22)	1
  (0, 14)	3
  (0, 206)	1
  (0, 13)	5
  (0, 200)	4
  (0, 277)	1
  (0, 274)	5
  (0, 21)	1
  (0, 181)	1
  (0, 146)	1
  (0, 300)	1
  (0, 129)	1
  (0, 148)	1
  (0, 273)	1
  (0, 147)	1
  (1, 22)	1
  (1, 263)	1
  (1, 207)	1
  (1, 25)	1
  (1, 115)	1
  (1, 234)	1
  (1, 10)	1
  (1, 309)	1
  (2, 10)	1
  :	:
  (28, 13)	1
  (28, 249)	1
  (28, 16)	1
  (28, 187)	1
  (28, 20)	1
  (28, 140)	1
  (28, 79)	1
  (28, 266)	1
  (28, 253)	1
  (29, 215)	1
  (29, 22)	6
  (29, 14)	1
  (29, 25)	1
  (29, 10)	1
  (29, 249)	1
  (29, 254)	1
  (29, 51)	1
  (29, 305)	1
  (29, 291)	1
  (29, 124)	1
  (29, 209)	1
  (29, 40)	1
  (29, 271)	1
  (29, 225)	1
  (29, 74)	1


In [90]:
# pattern for "<text>//<single character>"; compiled here but not applied in this cell
slash_matcher = re.compile('(.+)//(.)')
# first the space-delimited units of one sample string, then each unit split on "//"
print(x[10].split(' '))
print(list(map(lambda unit: unit.split('//'), x[10].split(' '))))

['github//^', '-//,', 'rhysd/unite-ruby-require//V', './/,', 'vim//N', '://,', 'a//D', 'unite//N', './/,', 'vim//N', 'source//N', 'for//P', 'searching//V', 'gems//N', 'to//P', 'require//V', '<//~', 'URL//N', '>//E', '<//~', 'URL//N', '>//E']
[['github', '^'], ['-', ','], ['rhysd/unite-ruby-require', 'V'], ['.', ','], ['vim', 'N'], [':', ','], ['a', 'D'], ['unite', 'N'], ['.', ','], ['vim', 'N'], ['source', 'N'], ['for', 'P'], ['searching', 'V'], ['gems', 'N'], ['to', 'P'], ['require', 'V'], ['<', '~'], ['URL', 'N'], ['>', 'E'], ['<', '~'], ['URL', 'N'], ['>', 'E']]
