In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize

import numpy as np
import pandas as pd

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

from matplotlib import pyplot as plt

import gensim



In [2]:
# Load the `en_core_web_sm` model package imported above; the resulting `nlp`
# callable is applied to each row's text further down to extract named entities.
nlp = en_core_web_sm.load()

In [3]:
# Read the extracted-content workbook into `df` (path is relative to the notebook's
# working directory). Later cells read its `content` column and export `Id` / `url`.
df = pd.read_excel('../data/output/SS_Extracted_content.xlsx')

In [4]:
# Create the placeholder columns that the following cells fill in row by row.
for new_column in ("sent_list", "refined_content", "NER_list", "NER_most_common"):
    df[new_column] = None

In [6]:
def _is_mostly_non_latin1(sentence):
    """Return True when `sentence` should be dropped by the non-English filter.

    A sentence is dropped when it is longer than two characters and more than
    half of its characters have a code point above 255 (i.e. outside Latin-1).
    """
    non_latin1_count = sum(1 for ch in sentence if ord(ch) > 255)
    return len(sentence) > 2 and non_latin1_count > len(sentence) / 2


# Running total of sentences dropped by the non-English filter, across all rows.
ignored_sent_count = 0

for index, row in df.iterrows():
    # A missing cell arrives as NaN and str(NaN) is the literal text "nan", which
    # would otherwise be kept as a sentence; treat missing content as empty.
    raw_text = "" if pd.isna(row.content) else str(row.content)

    # Turn every line break into a sentence boundary for sent_tokenize. Breaks that
    # already follow a full stop are handled first so they do not get a second one.
    for line_break, replacement in ((".\n", ". "), (".\r", ". "), ("\n", ". "), ("\r", ". ")):
        raw_text = raw_text.replace(line_break, replacement)

    sent_list = []
    for each_sent in sent_tokenize(raw_text):
        # Sentences consisting of a single dot are leftovers of the replacement above.
        if each_sent == ".":
            continue

        # Presumably an embedded player-configuration JSON blob rather than prose;
        # skipped without counting it as ignored.
        if each_sent.startswith("{\"player\":{\"description\":"):
            continue

        if _is_mostly_non_latin1(each_sent):
            ignored_sent_count += 1
        else:
            sent_list.append(each_sent)

    df.at[index, 'sent_list'] = sent_list
    df.at[index, 'refined_content'] = " ".join(sent_list)

    # Lightweight progress indicator: print every 100th row label.
    if index % 100 == 0:
        print(str(index) + ", ", end='')

print(".\n sentences extraction complete")

0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, .
 sentences extraction complete


In [7]:
# Every entity label seen in any row; used afterwards to zero-fill the count columns.
NER_labels = set()

# nlp.pipe processes the texts in batches and yields one Doc per input, in order,
# so each Doc can be paired back with its row label. The texts are snapshotted
# into a list because `df` gains columns while the loop runs.
texts = df['refined_content'].tolist()
for index, doc in zip(df.index, nlp.pipe(texts)):
    # Per-row count of each entity label (e.g. one "NER_<label>" column per label).
    label_counts = Counter(ent.label_ for ent in doc.ents)
    NER_labels.update(label_counts)

    for label, count in label_counts.items():
        df.at[index, "NER_" + label] = count

    # Distinct entity texts in first-seen order, plus the ten most frequent ones.
    text_counts = Counter(ent.text for ent in doc.ents)
    NER_list = list(text_counts)
    NER_most_common = [text for text, _ in text_counts.most_common(10)]

    # NOTE: entity texts may themselves contain commas, so these joined strings
    # cannot always be split back into the original entities.
    df.at[index, "NER_list"] = ",".join(NER_list)
    df.at[index, "NER_most_common"] = ",".join(NER_most_common)

    # Lightweight progress indicator: print every 100th row label.
    if index % 100 == 0:
        print(str(index) + ", ", end='')


# Rows that never had a given label hold NaN in its column; set those to 0.
# Assign the result back instead of calling fillna(inplace=True) on df[...]:
# the chained in-place form operates on a temporary and does not reliably
# update `df` (it warns in pandas 2.x and is a no-op under Copy-on-Write).
for label in NER_labels:
    count_column = "NER_" + label
    df[count_column] = df[count_column].fillna(0)

print(".\n NER count extraction complete")

0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, .
 NER count extraction complete


In [8]:
# Show the number of non-null values per column, as a check that the NER_* count
# columns were fully populated by the zero-fill in the previous cell.
# NOTE(review): the saved output lists a `txt` column and no `content` column, while
# other cells read and drop `content` - the output appears to come from a different
# run of the notebook; re-run on a fresh kernel to confirm.
df.count()

Id                 7795
url                7795
title              7795
html               7795
txt                7795
sent_list          7795
refined_content    7795
NER_list           7795
NER_most_common    7795
NER_GPE            7795
NER_DATE           7795
NER_CARDINAL       7795
NER_NORP           7795
NER_PERSON         7795
NER_TIME           7795
NER_ORG            7795
NER_WORK_OF_ART    7795
NER_QUANTITY       7795
NER_EVENT          7795
NER_ORDINAL        7795
NER_MONEY          7795
NER_FAC            7795
NER_PRODUCT        7795
NER_LAW            7795
NER_PERCENT        7795
NER_LOC            7795
NER_LANGUAGE       7795
dtype: int64

In [9]:
# Write all values to one file, leaving out the per-row sentence lists
df_new = df.drop(columns=['sent_list'])
df_new.to_excel('../data/output/SS_Extracted_content_NER_all.xlsx', index=False)

In [10]:
# Keep only the identifying columns and the two entity-text columns
NER_text_columns = ['Id', 'url', 'NER_list', 'NER_most_common']
df_NER_text = df[NER_text_columns]
df_NER_text.count()

Id                 7795
url                7795
NER_list           7795
NER_most_common    7795
dtype: int64

In [11]:
# Export the entity-text subset (Id, url, NER_list, NER_most_common) without the row index.
df_NER_text.to_excel('../data/output/SS_Extracted_content_NER_text.xlsx', index=False)

In [14]:
# Drop the text-valued columns; per the saved output this leaves Id and the NER_* counts
text_columns = ['url', 'html', 'title', 'content', 'refined_content', 'sent_list', 'NER_list', 'NER_most_common']
df_numeric = df.drop(columns=text_columns)
df_numeric.count()

Id                 7795
NER_GPE            7795
NER_DATE           7795
NER_CARDINAL       7795
NER_NORP           7795
NER_PERSON         7795
NER_TIME           7795
NER_ORG            7795
NER_WORK_OF_ART    7795
NER_QUANTITY       7795
NER_EVENT          7795
NER_ORDINAL        7795
NER_MONEY          7795
NER_FAC            7795
NER_PRODUCT        7795
NER_LAW            7795
NER_PERCENT        7795
NER_LOC            7795
NER_LANGUAGE       7795
dtype: int64

In [15]:
# Export the Id plus per-label entity-count table without the row index.
df_numeric.to_excel('../data/output/2_NER_Type_Count.xlsx', index=False)