In [1]:
import spacy
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import numpy as np

In [2]:
def readArticle(input_file_path):
    """
        file path: where the input file located 
        
        return: a dataframe
    """
    file = open(input_file_path, 'r', encoding = 'utf8')
    articles_dt = file.read().split('\n')[:-1]
    pubId, canonicalUrl,firstScrape,title,text,lang_reliability = [],[],[],[],[],[]
    for article in articles_dt:    
        row = article.split('\t')
        pubId.append(row[0])
        canonicalUrl.append(row[4])
        firstScrape.append(row[5])
        lang_reliability.append(row[7])
        title.append(row[8])
        text.append(row[9])
    articles_df = pd.DataFrame()
    articles_df['pubId'], articles_df['canonicalUrl'], articles_df['firstScrape'], articles_df['title'], articles_df['text'], articles_df['lang_reliability']= pubId, canonicalUrl,firstScrape,title,text,lang_reliability
    return articles_df

In [3]:
## text normzalization
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """
        text: a article 
        
        return: normalized the text data
    """
    text = re.sub(r"[^\w\s]", '', text) ## Remove all non-word characters (everything except numbers and letters)
    text = text.lower() # lowercase text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text

In [4]:
df = readArticle('../data/raw/articles.txt')

In [5]:
# get the label data
label1 = pd.read_csv('../data/raw/labels/cave_rescue/population.txt', header=None)
label1.columns = ['canonicalUrl']
label1['label'] = 'cave_rescue'
label2 = pd.read_csv('../data/raw/labels/duckboat/population.txt', header=None)
label2.columns = ['canonicalUrl']
label2['label'] = 'duckboat'
label3 = pd.read_csv('../data/raw/labels/helsinki_summit/population.txt', header=None)
label3.columns = ['canonicalUrl']
label3['label'] = 'helsinki'
label_df = pd.concat([label1, label2, label3])
label_df.head()

Unnamed: 0,canonicalUrl,label
0,denverpost.com/2018/07/20/greeley-police-lip-s...,cave_rescue
1,bbc.com/news/av/world-asia-44875089/thai-cave-...,cave_rescue
2,news.google.com/stories/caaqzggkimbdqkltuwpvsm...,cave_rescue
3,chron.com/news/crime/article/young-nubians-rev...,cave_rescue
4,cnbc.com/2018/07/20/at-least-13-die-when-duck-...,cave_rescue


In [6]:
#merged df and the label_df
merged_df = df.merge(label_df, on='canonicalUrl', how='left')
#drop na labels
merged_df = merged_df.dropna(subset=['label'])
merged_df.head()

Unnamed: 0,pubId,canonicalUrl,firstScrape,title,text,lang_reliability,label
12,290,zerohedge.com/news/2018-07-19/la-liberals-stag...,7/19/2018 7:50:19 PM -04:00,"LA Liberals Stage ""Emergency Protest"" At Koshe...","by Phoenix Capita… - Jul 19, 2018 8:49 am ### ...",1,helsinki
23,290,zerohedge.com/news/2018-07-19/wells-fargo-has-...,7/19/2018 11:53:37 AM -04:00,Wells Fargo Caught In Yet Another Scandal | Ze...,"by Phoenix Capita… - Jul 19, 2018 8:49 am ### ...",1,helsinki
30,290,zerohedge.com/news/2018-07-19/white-house-aske...,7/20/2018 1:35:15 AM -04:00,"Asked 8 Times For Trump-Rouhani Meeting, Iran...","by Phoenix Capita… - Jul 19, 2018 8:49 am ### ...",1,helsinki
34,290,zerohedge.com/news/2018-07-18/wheres-panic-why...,7/19/2018 8:26:52 AM -04:00,"""Where’s The Panic"": Why Trade War Hedges Aren...","by Knave Dave - Jul 18, 2018 1:11 pm ### This ...",1,helsinki
53,290,zerohedge.com/news/2018-07-19/maxine-waters-fe...,7/19/2018 1:03:04 PM -04:00,"Maxine Waters Fears ""Armed Protests"" As Oath K...","by Phoenix Capita… - Jul 19, 2018 8:49 am ### ...",1,cave_rescue


In [16]:
#random select samples
text_df = merged_df['text'].apply(clean_text)
sample_text = text_df.sample(n = 10000, random_state = 2, replace=True) 
sample_label = merged_df.loc[sample_text.index]

In [17]:
#label_encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(sample_label['label'])
print(list(le.classes_))
sample_y = le.transform(sample_label['label']) 


['cave_rescue', 'duckboat', 'helsinki']


In [18]:
#get the noun, verb, word entity for each articles
nlp = spacy.load("en_core_web_sm")
noun_phrases_list = []
verb_phrases_list = []
entites_list = []
for text in sample_text:
    doc = nlp(text)
    noun_phrases_list.append([chunk.text for chunk in doc.noun_chunks])
    verb_phrases_list.append([token.lemma_ for token in doc if token.pos_ == "VERB"])
    entites_list.append([entity.text for entity in doc.ents])

In [19]:
def format_list(input_lists):
    """
        input_lists: a list of string lists
        
        return: a list of string
    """
    str_list = []
    #join the verb_list in to string
    for l in input_lists:
        str_list.append(' '.join(l))
    return str_list

In [20]:
#embedding the entites BoW
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
def BoW_embedding(input_list):
    #unigram
    """        
        input_list: a list of string
        return: embedding output
    """
    cv = CountVectorizer()
    text_counts= cv.fit_transform(input_list)
    return text_counts

In [21]:
#classified with SVM
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
def SVM(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    clf = SVC(gamma='auto')
    clf.fit(X_train, y_train) 
    y_pred = clf.predict(X_test)
    print('accuracy_score', accuracy_score(y_test, y_pred))
    

### Approach 1: 
#### get the noun from the articles> embedding the noun(BoW) > classified the artciles with the baseline (SVM)

In [22]:
#noun 
string_list = format_list(noun_phrases_list)
X = BoW_embedding(string_list)
SVM(X,sample_y)

accuracy_score 0.7875757575757576


### Approach 2: 
#### get the verb from the articles> embedding the verbs(BoW) > classified the artciles with the baseline (SVM)

In [23]:
#entities: entites_list
string_list = format_list(verb_phrases_list)
X = BoW_embedding(string_list)
SVM(X,sample_y)

accuracy_score 0.7363636363636363


### Approach 3: 
#### get the entites from the articles> embedding the entites (BoW)> classified the artciles with the baseline (SVM )


In [24]:
#entities: entites_list
string_list = format_list(entites_list)
X = BoW_embedding(string_list)
SVM(X,sample_y)

accuracy_score 0.7336363636363636
