In [5]:
import spacy
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import numpy as np

In [6]:
def readArticle(input_file_path):
    """
        file path: where the input file located 
        
        return: a dataframe
    """
    file = open(input_file_path, 'r', encoding = 'utf8')
    articles_dt = file.read().split('\n')[:-1]
    pubId, canonicalUrl,firstScrape,title,text,lang_reliability = [],[],[],[],[],[]
    for article in articles_dt:    
        row = article.split('\t')
        pubId.append(row[0])
        canonicalUrl.append(row[4])
        firstScrape.append(row[5])
        lang_reliability.append(row[7])
        title.append(row[8])
        text.append(row[9])
    articles_df = pd.DataFrame()
    articles_df['pubId'], articles_df['canonicalUrl'], articles_df['firstScrape'], articles_df['title'], articles_df['text'], articles_df['lang_reliability']= pubId, canonicalUrl,firstScrape,title,text,lang_reliability
    return articles_df

In [7]:
## text normzalization
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """
        text: a article 
        
        return: normalized the text data
    """
    text = re.sub(r"[^\w\s]", '', text) ## Remove all non-word characters (everything except numbers and letters)
    text = text.lower() # lowercase text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    
    return text

In [24]:
df = readArticle('../data/raw/articles.txt')

In [25]:
df.shape

(213605, 6)

In [16]:
# get the label data
label1 = pd.read_csv('../data/raw/labels/cave_rescue/lower_bound.txt', header=None)
label1.columns = ['canonicalUrl']
label1['label'] = 'cave_rescue'
label2 = pd.read_csv('../data/raw/labels/duckboat/lower_bound.txt', header=None)
label2.columns = ['canonicalUrl']
label2['label'] = 'duckboat'
label3 = pd.read_csv('../data/raw/labels/helsinki_summit/lower_bound.txt', header=None)
label3.columns = ['canonicalUrl']
label3['label'] = 'helsinki'
label_df = pd.concat([label1, label2, label3])
label_df.head()

Unnamed: 0,canonicalUrl,label
0,bbc.com/news/av/world-asia-44875089/thai-cave-...,cave_rescue
1,indystar.com/story/news/nation-now/2018/07/16/...,cave_rescue
2,washingtonpost.com/world/asia_pacific/these-di...,cave_rescue
3,au.news.yahoo.com/navy-seal-died-thai-cave-res...,cave_rescue
4,yahoo.com/news/m/8adca8cd-6cc3-307c-b109-9cd1d...,cave_rescue


In [12]:
#merged df and the label_df
merged_df = df.merge(label_df, on='canonicalUrl', how='left')
#drop na labels
merged_df = merged_df.dropna(subset=['label'])
merged_df.shape

(141, 7)

In [13]:
#need to generate balanced data 
merged_df.groupby(['label']).agg(['count'])

Unnamed: 0_level_0,pubId,canonicalUrl,firstScrape,title,text,lang_reliability
Unnamed: 0_level_1,count,count,count,count,count,count
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
cave_rescue,31,31,31,31,31,31
duckboat,21,21,21,21,21,21
helsinki,89,89,89,89,89,89


In [23]:
#we need 859 artiels so from each categroy 
sub = df[~df['canonicalUrl'].isin(merged_df['canonicalUrl'])]
sub.shape

(213464, 6)

In [39]:
sub = sub[sub['lang_reliability'] == '1']
sub.shape

(211436, 6)

In [40]:
rest_df = sub[['canonicalUrl', 'text']].sample(859)

In [41]:
merged_df = merged_df[['canonicalUrl', 'text']]

In [42]:
sample_df = pd.concat([merged_df, rest_df])
sample_df.shape

(1000, 2)

In [44]:
sample_df.to_csv('../data/embedding/sample_output.csv')

In [9]:
#random select samples
text_df = merged_df['text'].apply(clean_text)
sample_text = text_df.sample(n = 10000, random_state = 2, replace=True) 
sample_label = merged_df.loc[sample_text.index]

In [17]:
#label_encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(sample_label['label'])
print(list(le.classes_))
sample_y = le.transform(sample_label['label']) 


['cave_rescue', 'duckboat', 'helsinki']


In [18]:
#get the noun, verb, word entity for each articles
nlp = spacy.load("en_core_web_sm")
noun_phrases_list = []
verb_phrases_list = []
entites_list = []
for text in sample_text:
    doc = nlp(text)
    noun_phrases_list.append([chunk.text for chunk in doc.noun_chunks])
    verb_phrases_list.append([token.lemma_ for token in doc if token.pos_ == "VERB"])
    entites_list.append([entity.text for entity in doc.ents])

In [19]:
def format_list(input_lists):
    """
        input_lists: a list of string lists
        
        return: a list of string
    """
    str_list = []
    #join the verb_list in to string
    for l in input_lists:
        str_list.append(' '.join(l))
    return str_list

In [20]:
#embedding the entites BoW
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
def BoW_embedding(input_list):
    #unigram
    """        
        input_list: a list of string
        return: embedding output
    """
    cv = CountVectorizer()
    text_counts= cv.fit_transform(input_list)
    return text_counts

In [21]:
#classified with SVM
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
def SVM(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    clf = SVC(gamma='auto')
    clf.fit(X_train, y_train) 
    y_pred = clf.predict(X_test)
    print('accuracy_score', accuracy_score(y_test, y_pred))
    

### Approach 1: 
#### get the noun from the articles> embedding the noun(BoW) > classified the artciles with the baseline (SVM)

In [22]:
#noun 
string_list = format_list(noun_phrases_list)
X = BoW_embedding(string_list)
SVM(X,sample_y)

accuracy_score 0.7875757575757576


### Approach 2: 
#### get the verb from the articles> embedding the verbs(BoW) > classified the artciles with the baseline (SVM)

In [23]:
#entities: entites_list
string_list = format_list(verb_phrases_list)
X = BoW_embedding(string_list)
SVM(X,sample_y)

accuracy_score 0.7363636363636363


### Approach 3: 
#### get the entites from the articles> embedding the entites (BoW)> classified the artciles with the baseline (SVM )


In [24]:
#entities: entites_list
string_list = format_list(entites_list)
X = BoW_embedding(string_list)
SVM(X,sample_y)

accuracy_score 0.7336363636363636


# Generate the entity output

In [9]:
#merged df and the label_df
merged_df = df.merge(label_df, on='canonicalUrl', how='left')
merged_df.shape

(213610, 7)

In [10]:
text_df = merged_df['text'].apply(clean_text)
text_df.shape

(213610,)

In [11]:
# get the eneties for the whole data:text_df
#get the noun, verb, word entity for each articles
nlp = spacy.load("en_core_web_sm")
noun_phrases_list = []
verb_phrases_list = []
entites_list = []
for text in text_df:
    doc = nlp(text)
    noun_phrases_list.append([chunk.text for chunk in doc.noun_chunks])
    verb_phrases_list.append([token.lemma_ for token in doc if token.pos_ == "VERB"])
    entites_list.append([entity.text for entity in doc.ents])

In [12]:
len(noun_phrases_list)

213610

In [13]:
#save the entites
with open('../data/embedding/noun_phrases.txt', 'w') as f:
    for item in noun_phrases_list:
        f.write("%s\n" % item)

In [14]:
with open('../data/embedding/verb_phrase.txt', 'w') as f:
    for item in verb_phrases_list:
        f.write("%s\n" % item)

In [15]:
with open('../data/embedding/entites.txt', 'w') as f:
    for item in entites_list:
        f.write("%s\n" % item)

In [16]:
entites_list[0]

['phoenix capita',
 '19',
 '2018',
 '849',
 '1 trillion',
 'next year',
 '19',
 '2018',
 '849',
 'uk',
 'two',
 'uk',
 '19',
 '801 neil mitchell',
 'uk',
 'uk',
 '19',
 '801 0',
 '43bn 5 billion',
 'eu',
 'us',
 'european union',
 'five billion dollar',
 'one',
 'july 19 2018',
 'wednesday',
 'alphabet',
 'browser apps',
 'eu',
 '5',
 'netherlands',
 'chinese',
 'us',
 'european unions',
 'ftc',
 'joseph simons',
 'congress',
 'wednesday',
 'eu',
 'several years ago',
 'trumps comments',
 'european commission',
 'jeanclaude junckers',
 'washington',
 'next week',
 'russia',
 'nato',
 'eu']