In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
import pickle
import os

In [3]:
#python -m spacy download en_core_web_sm
import spacy

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [5]:
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

In [6]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [7]:
RANDOM_STATE = 5

In [8]:
os.chdir("..")
wd = os.getcwd()
wd

'F:\\Google Drive back ups\\Sem 2\\Text analytics\\Group Project\\text_analytics_propaganda_detection'

## Load data

**Load data**

In [42]:
TRAIN_SET_PATH_TASK2 = "datasets-v5/tasks-2-3/train"

In [43]:
def parse_articles(path):
    data = {}
    for i, filename in enumerate(os.listdir(path)):
        if(i%3==2):
            article_id = filename.split(".")[0][7:]
            data[article_id] = []
            with open(TRAIN_SET_PATH_TASK2+"/"+filename, 'r', encoding="utf8") as f:
                count = 1
                for j, line in enumerate(f):
                    if(j%2==0):
                        data[article_id].append([count, line[:-1]])
                        count = count+1
    return data

In [44]:
data_dict = parse_articles(TRAIN_SET_PATH_TASK2)

In [45]:
data_dict

{'111111112': [[1, 'US bloggers banned from entering UK'],
  [2,
   'Two prominent US bloggers have been banned from entering the UK, the Home Office has said.'],
  [3,
   'Pamela Geller and Robert Spencer co-founded anti-Muslim group Stop Islamization of America.'],
  [4,
   'They were due to speak at an English Defence League march in Woolwich, where Drummer Lee Rigby was killed.'],
  [5,
   'A government spokesman said individuals whose presence "is not conducive to the public good" could be excluded by the home secretary.'],
  [6,
   'He added: "We condemn all those whose behaviours and views run counter to our shared values and will not stand for extremism in any form."'],
  [7, "'Right decision'"],
  [8,
   'On both of their blogs the pair called their bans from entering the UK "a striking blow against freedom" and said the "the nation that gave the world the Magna Carta is dead".'],
  [9,
   'They were due to attend a march planned by the far-right EDL to mark Armed Forces Day o

**Load labels**

In [33]:
with open('processed_dataset/labels.pickle', 'rb') as handle:
    # lower case tokenised text. 
    # each punctuation is its own token.
    labels = pickle.load(handle)

In [34]:
labels

{'111111112': [['1', 0],
  ['2', 0],
  ['3', 0],
  ['4', 0],
  ['5', 1],
  ['6', 0],
  ['7', 0],
  ['8', 0],
  ['9', 0],
  ['10', 0],
  ['11', 1],
  ['12', 0],
  ['13', 0],
  ['14', 1],
  ['15', 1],
  ['16', 0],
  ['17', 0],
  ['18', 0],
  ['19', 0],
  ['20', 0],
  ['21', 1],
  ['22', 0],
  ['23', 0],
  ['24', 0],
  ['25', 1],
  ['26', 0],
  ['27', 1],
  ['28', 0],
  ['29', 0],
  ['30', 0],
  ['31', 0],
  ['32', 0],
  ['33', 1],
  ['34', 0],
  ['35', 0],
  ['36', 0],
  ['37', 0],
  ['38', 0],
  ['39', 0],
  ['40', 0],
  ['41', 0],
  ['42', 0],
  ['43', 0],
  ['44', 0],
  ['45', 0],
  ['46', 0],
  ['47', 0],
  ['48', 0],
  ['49', 0],
  ['50', 0]],
 '111111113': [['1', 0],
  ['2', 0],
  ['3', 1],
  ['4', 0],
  ['5', 0],
  ['6', 0],
  ['7', 0],
  ['8', 0],
  ['9', 0],
  ['10', 0],
  ['11', 0],
  ['12', 1],
  ['13', 0],
  ['14', 0],
  ['15', 0],
  ['16', 0],
  ['17', 0],
  ['18', 0],
  ['19', 0],
  ['20', 0],
  ['21', 0],
  ['22', 0],
  ['23', 0],
  ['24', 1],
  ['25', 0],
  ['26', 0],
  [

In [13]:
def get_labels_array(labels_dict):
    labels_array = np.array([])
    for article_id in labels_dict.keys():
        for sentence_id, label in labels_dict[article_id]:
            labels_array = np.append(labels_array, label)
    return labels_array

In [14]:
get_labels_array(labels)[:10]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])

In [16]:
labels_array = get_labels_array(labels)
labels_array[:10]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])

## Spacy Tutorials

In [47]:
# Process a text
doc = nlp("She ate the pizza")

# Iterate over the tokens
for token in doc:
    # Print the text and the predicted part-of-speech tag
    print(token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [54]:
getattr(doc[0],"lower_")

'she'

In [248]:
# Process a text
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
print(doc.ents)
# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

(Apple, U.K., $1 billion)
Apple ORG
U.K. GPE
$1 billion MONEY


In [261]:
print(doc.ents[0])
print(doc[0])

print(doc.ents[2])
print(doc[-1])
print(doc[-1] in doc.ents[2])
print()
for token in doc:
    for ent in doc.ents:
        if token in ent:
            print(ent.label_)
        else:
            continue

Apple
Apple
$1 billion
billion
True

ORG
GPE
MONEY
MONEY
MONEY


In [39]:
# Process a text
doc = nlp(u"She ate the pizza")

doc.ents

()

In [41]:
# adding missing entities

text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X"
iphone_x = doc[1:3]

# Print the span text
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


#### Token attributes:
    - text
    - lower
    - lemma
    - pos

#### Matcher example
pattern = [
    {'IS_DIGIT': True},
    {'LOWER': 'fifa', 'LEMMA':fifa},
    {'LOWER': 'world'},
    {'LOWER': 'cup'},
    {'IS_PUNCT': True}
]

doc = nlp("2018 FIFA World Cup: France won!")

2018 FIFA World Cup:

## Word Level Features
- Part of Speech
- Dependencies
- Named Entity Recognition

In [17]:
def process_text(model, text, attr_name="text"):
    nlp = model
    tokens = []
    for token in nlp(text):
        if token.orth_.isspace():
            continue
        elif token.like_url:
            tokens.append('URL')
        else:
            tokens.append(getattr(token, attr_name))
            
    return tokens

In [18]:
def get_word_features(model, data):
    processed_words = {
        "article_ids": [],
        "sentence_nos": [],
        "word_lower": [],
        "pos_tag": [],
        "dep_tag": [],
        "ner_tag": []
    }
    for article_id in data.keys():
        count = 1
        for i, sentence in data[article_id]:
            sent_model = model(sentence)
            for token in sent_model:
                processed_words["article_ids"].append(article_id)
                processed_words["sentence_nos"].append(i)
                processed_words["word_lower"].append(token.lower_)
                processed_words["pos_tag"].append(token.pos_)
                processed_words["dep_tag"].append(token.dep_)
                processed_words["ner_tag"].append(np.NaN)
                for ent in sent_model.ents:
                    if token in ent:
                        processed_words["ner_tag"][-1] = ent.label_
                        break
                
            count = count + 1
        
    return processed_words

In [274]:
processed_words = get_word_features(nlp, data_dict)

**convert to DataFrame**

In [276]:
df = pd.DataFrame(processed_words)

In [278]:
df.head()

Unnamed: 0,article_ids,sentence_nos,word_lower,pos_tag,dep_tag,ner_tag
0,111111112,1,us,PROPN,compound,GPE
1,111111112,1,bloggers,NOUN,nsubj,
2,111111112,1,banned,VERB,ROOT,
3,111111112,1,from,ADP,prep,
4,111111112,1,entering,VERB,pcomp,


In [292]:
df.shape

(174766, 6)

In [289]:
df.to_csv("processed_dataset/word_level_features.csv")

**Load dataframe**

In [24]:
df = pd.read_csv("processed_dataset/word_level_features.csv")
df = df.iloc[:,1:]

In [25]:
df.head()

Unnamed: 0,article_ids,sentence_nos,word_lower,pos_tag,dep_tag,ner_tag
0,111111112,1,us,PROPN,compound,GPE
1,111111112,1,bloggers,NOUN,nsubj,
2,111111112,1,banned,VERB,ROOT,
3,111111112,1,from,ADP,prep,
4,111111112,1,entering,VERB,pcomp,


In [26]:
word_features = pd.get_dummies(df, columns=["pos_tag","dep_tag","ner_tag"]).iloc[:,3:].values

In [27]:
word_features.shape

(174766, 78)

### Training ML regressors

In [28]:
models_list = [
    LinearRegression(),
    BayesianRidge(),
    SVR(),
    KNeighborsRegressor(),
    AdaBoostRegressor(),
    RandomForestRegressor(),
    MLPRegressor(),
    GaussianProcessRegressor()
]

In [29]:
train_indices, test_indices = train_test_split(df["article_ids"], test_size=0.25, random_state=RANDOM_STATE)

In [30]:
X_train = word_features[train_indices.index,:]
X_test = word_features[test_indices.index,:]
print(X_train.shape)
print(X_test.shape)

(131074, 78)
(43692, 78)


In [31]:
train_indices.shape

(131074,)

In [36]:
y_train = np.zeros(train_indices.shape)
count = 0
for article_id in train_indices.values:
    for sent_no, label in labels[str(article_id)]:
        y_train[count] = label
        count = count+1
        break

In [37]:
y_train.shape

(131074,)

In [38]:
test_indices.shape

(43692,)

In [39]:
y_test = np.zeros(test_indices.shape)
count = 0
for article_id in test_indices.values:
    for sent_no, label in labels[str(article_id)]:
        y_test[count] = label
        count = count+1
        break

In [40]:
y_test.shape

(43692,)

In [41]:
for model in models_list:
    model.fit(X_train, y_train)
    print(model.__class__)
    print("Model score ",model.score(X_test, y_test))
    
    preds_train = model.predict(X_train) >= 0.5
    preds_test = model.predict(X_test) >= 0.5
    
    print("Train accuracy ", accuracy_score(preds_train, y_train))
    print("Test accuracy ", accuracy_score(preds_test, y_test))
    print()
    print("Train f1 score ", f1_score(preds_train, y_train))
    print("Test f1 score ", f1_score(preds_test, y_test))
    print()

<class 'sklearn.linear_model.base.LinearRegression'>
Model score  0.0004516913719682103
Train accuracy  0.697590673970429
Test accuracy  0.6991211205712716

Train f1 score  0.0
Test f1 score  0.0



  'recall', 'true', average, warn_for)


<class 'sklearn.linear_model.bayes.BayesianRidge'>
Model score  0.0008500569163213401
Train accuracy  0.697590673970429
Test accuracy  0.6991211205712716

Train f1 score  0.0


  'recall', 'true', average, warn_for)


Test f1 score  0.0





<class 'sklearn.svm.classes.SVR'>
Model score  -0.1920344543693624
Train accuracy  0.697590673970429
Test accuracy  0.6991211205712716

Train f1 score  0.0
Test f1 score  0.0



  'recall', 'true', average, warn_for)


<class 'sklearn.neighbors.regression.KNeighborsRegressor'>
Model score  -0.17004176923847258
Train accuracy  0.6778842485923982
Test accuracy  0.6782477341389728

Train f1 score  0.10200565752812812
Test f1 score  0.10069089048106449

<class 'sklearn.ensemble.weight_boosting.AdaBoostRegressor'>
Model score  0.0002467683538271981
Train accuracy  0.6976059325266644
Test accuracy  0.6990524581159022

Train f1 score  0.00040351054171290223
Test f1 score  0.0003041131300843914





<class 'sklearn.ensemble.forest.RandomForestRegressor'>
Model score  -0.0035911853573029617
Train accuracy  0.6992309687657354
Test accuracy  0.6977707589490066

Train f1 score  0.023046613634674005
Test f1 score  0.011675772771499138

<class 'sklearn.neural_network.multilayer_perceptron.MLPRegressor'>
Model score  -0.0012613988519096786
Train accuracy  0.697666966751606
Test accuracy  0.6990982330861485

Train f1 score  0.0005548549810844894
Test f1 score  0.0003041593795148658



MemoryError: Unable to allocate 128. GiB for an array with shape (131074, 131074) and data type float64

### Training ML classifiers

In [45]:
models_list = [
    LogisticRegression(),
    GaussianNB(),
    MultinomialNB(),
    SVC(),
    KNeighborsClassifier(),
    AdaBoostClassifier(),
    RandomForestClassifier(),
    MLPClassifier(),
    GaussianProcessClassifier()
]

In [46]:
for model in models_list:
    model.fit(X_train, y_train)
    print(model.__class__)
    print("Model score ",model.score(X_test, y_test))
    
    preds_train = model.predict(X_train) >= 0.5
    preds_test = model.predict(X_test) >= 0.5
    
    print("Train accuracy ", accuracy_score(preds_train, y_train))
    print("Test accuracy ", accuracy_score(preds_test, y_test))
    print()
    print("Train f1 score ", f1_score(preds_train, y_train))
    print("Test f1 score ", f1_score(preds_test, y_test))
    print()



<class 'sklearn.linear_model.logistic.LogisticRegression'>
Model score  0.6991211205712716
Train accuracy  0.697590673970429
Test accuracy  0.6991211205712716

Train f1 score  0.0


  'recall', 'true', average, warn_for)


Test f1 score  0.0

<class 'sklearn.naive_bayes.GaussianNB'>
Model score  0.34958344777075895
Train accuracy  0.35357126508689746
Test accuracy  0.34958344777075895

Train f1 score  0.4578043411487663
Test f1 score  0.4537521144087345

<class 'sklearn.naive_bayes.MultinomialNB'>
Model score  0.6991211205712716
Train accuracy  0.697590673970429
Test accuracy  0.6991211205712716

Train f1 score  0.0


  'recall', 'true', average, warn_for)


Test f1 score  0.0





<class 'sklearn.svm.classes.SVC'>
Model score  0.6991211205712716
Train accuracy  0.697590673970429
Test accuracy  0.6991211205712716



  'recall', 'true', average, warn_for)


Train f1 score  0.0
Test f1 score  0.0

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Model score  0.6782477341389728
Train accuracy  0.6778842485923982
Test accuracy  0.6782477341389728

Train f1 score  0.10200565752812812
Test f1 score  0.10069089048106449

<class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>
Model score  0.6991211205712716
Train accuracy  0.697590673970429
Test accuracy  0.6991211205712716

Train f1 score  0.0
Test f1 score  0.0



  'recall', 'true', average, warn_for)


<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Model score  0.6982056211663462
Train accuracy  0.6992004516532646
Test accuracy  0.6982056211663462

Train f1 score  0.02032550627407131
Test f1 score  0.010802700675168793

<class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>
Model score  0.6989151332051634
Train accuracy  0.6981628698292568
Test accuracy  0.6989151332051634

Train f1 score  0.007426177274894002
Test f1 score  0.005142554639643046



MemoryError: Unable to allocate 128. GiB for an array with shape (131074, 131074) and data type float64

In [48]:
with open("models/GaussianNB_stat_model.pickle","wb") as handle:
    pickle.dump(models_list[1], handle, protocol=pickle.HIGHEST_PROTOCOL)