In [1]:
import pandas as pd
from nltk.tag import StanfordNERTagger
import os
from tqdm import tqdm

# Alternatively to setting the CLASSPATH add the jar via their path:
os.environ["CLASSPATH"] = "../stanford-ner.jar"

# Set Path
path = '../'
output_path = '../output_files/'

# Set StanfordNERTagger
st = StanfordNERTagger(path + 'classifiers/english.muc.7class.distsim.crf.ser.gz')

In [2]:
# Set data name list
data_list = ['b_comment_full', 'c_comment_full', 'full_comment_full', 'full_text_full', 'story_full', 'update_full']
#data_list = ['c_comment_full', 'update_full', 'story_full']

In [51]:
data = 'full_text_full'

In [52]:
# Load Dataset
df = pd.read_csv(path + 'data/' + data + '.tsv', sep='\t')
    
print('Start counting...', data)

# Make dataframe to save the NER counts
df_ner_count = pd.DataFrame(columns={'ID','LOCATION', 'PERSON', 'ORGANIZATION', 
                                         'MONEY', 'PERCENT', 'DATE', 'TIME'})

Start counting... full_text_full


In [53]:
df

Unnamed: 0,ID,success_or_fail,content
0,ambiolight-a-one-touch-room-makeover,0,This is why we created the AmbioLight. Room ma...
1,arc-island-a-brave-new-civilization,0,Once in a generation comes a project so audaci...
2,asylum-playing-cards,1,Asylum is a fully customized Bicycle® Playing ...
3,code-hero-a-game-that-teaches-you-to-make-game...,1,Code Hero is a game that teaches you how to ma...
4,dark-skyes-an-epic-brony-dating-sim,0,Dark Skyes is a pony dating sim with deep RPG ...
...,...,...,...
203,junkyard-glass,1,8 INCH JUNKYARD GLASS BOWL VESSEL SINK MADE FR...
204,handspun-single-sheep-breed-yarns,1,Bring super luxury hand spun made from single ...
205,tgt-tight-a-new-kind-of-wallet,1,"After the Kickstarter, please come visit at ti..."
206,picobrew-zymatic-the-automatic-beer-brewing-ap...,1,"THANK YOU BACKERS, FOR AN AMAZING KICKSTARTER ..."


In [54]:
print('Start counting NER...')

Start counting NER...


In [55]:
df_ner_count = pd.DataFrame()

for idx, text in enumerate(tqdm(df['content'])):
    ner_tag_list = st.tag(text.split())
        
    index_id = df['ID'][idx]
    #print(index_id)
    
    tag_count_dict = {'ID':index_id, 'LOCATION':0, 'PERSON':0, 'ORGANIZATION':0, 'MONEY':0, 
                          'PERCENT':0, 'DATE':0, 'TIME':0}
        
    for word, tag in ner_tag_list:
        if tag in tag_count_dict.keys():
            tag_count_dict[tag] += 1
    
    df_ner_count = df_ner_count.append(tag_count_dict, ignore_index=True)
    

100%|██████████| 208/208 [18:37<00:00,  5.37s/it]


In [56]:
df_new = pd.merge(df, df_ner_count)

In [57]:
df_ner_count

Unnamed: 0,DATE,ID,LOCATION,MONEY,ORGANIZATION,PERCENT,PERSON,TIME
0,0.0,ambiolight-a-one-touch-room-makeover,4.0,0.0,11.0,0.0,6.0,0.0
1,2.0,arc-island-a-brave-new-civilization,11.0,0.0,8.0,0.0,9.0,0.0
2,35.0,asylum-playing-cards,48.0,0.0,129.0,0.0,81.0,1.0
3,96.0,code-hero-a-game-that-teaches-you-to-make-game...,49.0,9.0,272.0,2.0,395.0,5.0
4,3.0,dark-skyes-an-epic-brony-dating-sim,11.0,0.0,9.0,0.0,28.0,0.0
...,...,...,...,...,...,...,...,...
203,1.0,junkyard-glass,1.0,0.0,1.0,0.0,2.0,0.0
204,2.0,handspun-single-sheep-breed-yarns,3.0,2.0,29.0,0.0,7.0,1.0
205,58.0,tgt-tight-a-new-kind-of-wallet,67.0,0.0,92.0,0.0,99.0,3.0
206,51.0,picobrew-zymatic-the-automatic-beer-brewing-ap...,28.0,2.0,109.0,0.0,62.0,1.0


In [58]:
df_new

Unnamed: 0,ID,success_or_fail,content,DATE,LOCATION,MONEY,ORGANIZATION,PERCENT,PERSON,TIME
0,ambiolight-a-one-touch-room-makeover,0,This is why we created the AmbioLight. Room ma...,0.0,4.0,0.0,11.0,0.0,6.0,0.0
1,arc-island-a-brave-new-civilization,0,Once in a generation comes a project so audaci...,2.0,11.0,0.0,8.0,0.0,9.0,0.0
2,asylum-playing-cards,1,Asylum is a fully customized Bicycle® Playing ...,35.0,48.0,0.0,129.0,0.0,81.0,1.0
3,code-hero-a-game-that-teaches-you-to-make-game...,1,Code Hero is a game that teaches you how to ma...,96.0,49.0,9.0,272.0,2.0,395.0,5.0
4,dark-skyes-an-epic-brony-dating-sim,0,Dark Skyes is a pony dating sim with deep RPG ...,3.0,11.0,0.0,9.0,0.0,28.0,0.0
...,...,...,...,...,...,...,...,...,...,...
203,junkyard-glass,1,8 INCH JUNKYARD GLASS BOWL VESSEL SINK MADE FR...,1.0,1.0,0.0,1.0,0.0,2.0,0.0
204,handspun-single-sheep-breed-yarns,1,Bring super luxury hand spun made from single ...,2.0,3.0,2.0,29.0,0.0,7.0,1.0
205,tgt-tight-a-new-kind-of-wallet,1,"After the Kickstarter, please come visit at ti...",58.0,67.0,0.0,92.0,0.0,99.0,3.0
206,picobrew-zymatic-the-automatic-beer-brewing-ap...,1,"THANK YOU BACKERS, FOR AN AMAZING KICKSTARTER ...",51.0,28.0,2.0,109.0,0.0,62.0,1.0


In [59]:
df_new.to_csv('./full_text_full_ner.tsv', sep='\t')