In [1]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import nltk
import pandas as pd
import sqlite3

nlp = spacy.load('en_core_web_lg', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
con = sqlite3.connect("C:/Users/User/OneDrive - University of Glasgow/University Year 4/Individual Project/2464980P-L4-Project/src/entity_django/db.sqlite3")

cur = con.cursor()

entities={}
for row in cur.execute('SELECT * FROM entity_app_entity;'):
    entities[row[0]]=row[1]
    
instances={}
for row in cur.execute('SELECT * FROM entity_app_instance;'):
    documentID=row[3]
    entityID=row[4]
    try:
        instances[documentID]+=entities[entityID]
    except KeyError:
        instances[documentID]=entities[entityID]

documents={}
for row in cur.execute('SELECT * FROM entity_app_document;'):
    documents[row[0]]=row[2].replace(".html.gz","")



In [3]:
TRUTHS="C:/Users/User/Desktop/project_data/truths.txt"
with open (TRUTHS, "r") as f:
    lines = f.readlines()

truths={}
for line in lines:
    line=line.replace("\n","").replace(" ","/").split("/")[-2:]
    truths[line[0]]=line[1]

In [4]:
df = pd.DataFrame(columns=['filename', 'text', 'truth'])

i=0
for docid in documents.keys():
    try:
        filename=documents[docid]
        truth=int(truths[filename])
        text=instances[docid]
        df.loc[i] = [filename,text,truth]
        i+=1
    except KeyError:
        continue

In [5]:
def text_pipeline(text):
    tokens = []
    doc = nlp(text)
    for t in doc:
        if not t.is_stop and not t.is_punct and not t.is_space:
            tokens.append(t.lemma_.lower())
    return tokens

In [35]:
con = sqlite3.connect("C:/Users/User/OneDrive - University of Glasgow/University Year 4/Individual Project/2464980P-L4-Project/src/entity_django/db.sqlite3")

cur = con.cursor()

abstracts=[]
truth_vals=[]
sensitivity={}
for row in cur.execute('SELECT * FROM entity_app_instance;'):
    documentID=row[3]
    entityID=row[4]
    
    filename=documents[documentID]
    truth=int(truths[filename])
    
    abstract=entities[entityID]
    abstracts.append(abstract)
    truth_vals.append(truth)
    
    if abstract in sensitivity.keys():
        sensitivity[abstract]=max(truth,sensitivity[abstract])
    else:
        sensitivity[abstract]=truth
con.close()


In [57]:
train_df_ = pd.DataFrame(columns=['abstract','truth'])
train_df_['abstract'] = abstracts
train_df_['truth'] = truth_vals

test_df = pd.DataFrame(columns=['abstract','truth'])
test_df['abstract'] = sensitivity.keys()
test_df['truth'] = sensitivity.values()
test_df

Unnamed: 0,abstract,truth
0,"Amman (English: /əˈmɑːn/; Arabic: عَمَّان, ʻam...",1
1,"Hussein bin Talal (Arabic: الحسين بن طلال, Al-...",1
2,"Arabic (اَلْعَرَبِيَّةُ, al-ʿarabiyyah [al ʕar...",1
3,Hashemite is a very rare barium chromate miner...,0
4,"Terrorism, in its broadest sense, is the use o...",1
...,...,...
19640,"PRISA Televisión, S.A.U (PRISA TV) is a pay TV...",0
19641,"Devin Garrett Townsend (born May 5, 1972) is a...",0
19642,Meggen is a municipality in the district of Lu...,0
19643,Sarah Helen Prescott is Professor of English L...,0


In [59]:
tfidf_vectorizer = TfidfVectorizer()#tokenizer=text_pipeline,binary=True)

sens=df[df['truth']==1]
not_sens=df[df['truth']==0].sample(n=len(sens),replace=False)
merged=sens.append(not_sens, ignore_index=True)

train_labels = merged['truth']
train_features = tfidf_vectorizer.fit_transform(merged['text'].tolist())

test_labels = test_df['truth']
test_features = tfidf_vectorizer.transform(test_df['abstract'])

  merged=sens.append(not_sens, ignore_index=True)


In [60]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
classifier = SGDClassifier(loss='log')
classifier_fitted = classifier.fit(train_features,train_labels)


In [61]:
predicted = classifier_fitted._predict_proba_lr(test_features)[:,0]
predicted_df = test_df
predicted_df['predictions'] = 1-predicted
predicted_df['prediction'] = predicted_df['predictions'].apply(lambda x : 1 if x>0.5 else 0)
predicted_df

Unnamed: 0,abstract,truth,predictions,prediction
0,"Amman (English: /əˈmɑːn/; Arabic: عَمَّان, ʻam...",1,0.701975,1
1,"Hussein bin Talal (Arabic: الحسين بن طلال, Al-...",1,0.616616,1
2,"Arabic (اَلْعَرَبِيَّةُ, al-ʿarabiyyah [al ʕar...",1,0.740425,1
3,Hashemite is a very rare barium chromate miner...,0,0.638802,1
4,"Terrorism, in its broadest sense, is the use o...",1,0.614622,1
...,...,...,...,...
19640,"PRISA Televisión, S.A.U (PRISA TV) is a pay TV...",0,0.408184,0
19641,"Devin Garrett Townsend (born May 5, 1972) is a...",0,0.240537,0
19642,Meggen is a municipality in the district of Lu...,0,0.561466,1
19643,Sarah Helen Prescott is Professor of English L...,0,0.535975,1


In [62]:
y_true=predicted_df['truth']
y_pred=predicted_df['prediction']
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
print("Precision Score =",precision_score(y_true, y_pred, pos_label=1))
print("Recall Score =",recall_score(y_true, y_pred, pos_label=1))
print("F1 Score =",f1_score(y_true, y_pred, pos_label=1))
print("F2 Score =",fbeta_score(y_true, y_pred, pos_label=1, beta=2))
print("Balanced Accuracy Score =",balanced_accuracy_score(y_true, y_pred))

Precision Score = 0.37848518111964874
Recall Score = 0.5960242005185825
F1 Score = 0.46297415240013423
F2 Score = 0.5345736434108528
Balanced Accuracy Score = 0.593755246002437


In [13]:
#import pickle
#pickle.dump(SVC_fitted, open('classifier.pkl', "wb"))
#pickle.dump(tfidf_vectorizer, open('vectorizer.pkl', "wb"))

In [42]:
sens=train_df_[train_df_['truth']==1]
not_sens=train_df_[train_df_['truth']==0].sample(n=len(sens),replace=False)
merged=sens.append(not_sens, ignore_index=True)

  merged=sens.append(not_sens, ignore_index=True)


In [43]:
tfidf_vectorizer = TfidfVectorizer()

train_labels = merged['truth']
train_features = tfidf_vectorizer.fit_transform(merged['abstract'].tolist())

test_labels = test_df['truth']
test_features = tfidf_vectorizer.transform(test_df['abstract'])

In [64]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log').fit(train_features,train_labels)

In [65]:
import pickle
pickle.dump(clf, open('classifier.pkl', "wb"))
pickle.dump(tfidf_vectorizer, open('vectorizer.pkl', "wb"))

In [54]:
from sklearn.model_selection import cross_val_predict
predicted =  clf._predict_proba_lr(test_features)[:,0]
predicted_df = test_df_
predicted_df['predictions'] = 1-predicted
#predicted_df.sort_values(by='predictions', ascending=False).head(30)
predicted_df['prediction'] = predicted_df['predictions'].apply(lambda x : 1 if x>0.5 else 0)
predicted_df.sort_values(['predictions'], ascending=False).head(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['predictions'] = 1-predicted
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['prediction'] = predicted_df['predictions'].apply(lambda x : 1 if x>0.5 else 0)


Unnamed: 0,abstract,truth,predictions,prediction
14067,The Indonesian National Police (Indonesian: Ke...,0,0.935334,1
19099,The Bangka Belitung Islands (Indonesian: Kepul...,1,0.922363,1
17578,"Uruzgan (Pashto: اروزګان; Dariارزگان), also sp...",0,0.905126,1
19101,Outrigger boats are various watercraft featuri...,1,0.896281,1
2911,"Pashtuns (/ˈpʌʃˌtʊn/, /ˈpɑːʃˌtʊn/, /ˈpæʃˌtuːn/...",0,0.892,1
19100,Pangkal Pinang is the capital and largest city...,1,0.891651,1
3963,Zamfara (Hausa: Jihar Zamfara Fula: Leydi Zamf...,1,0.890549,1
14172,Indium phosphide (InP) is a binary semiconduct...,0,0.8876,1
3715,Gombe State (Fula: Leydi Gommbe 𞤤𞤫𞤴𞤣𞤭 𞤺𞤮𞤥𞥆𞤦𞤫) ...,1,0.88578,1
9302,Sumatra is one of the Sunda Islands of western...,0,0.877109,1


In [55]:
y_true=predicted_df['truth']
y_pred=predicted_df['prediction']
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
print("Precision Score =",precision_score(y_true, y_pred, pos_label=1))
print("Recall Score =",recall_score(y_true, y_pred, pos_label=1))
print("F1 Score =",f1_score(y_true, y_pred, pos_label=1))
print("F2 Score =",fbeta_score(y_true, y_pred, pos_label=1, beta=2))
print("Balanced Accuracy Score =",balanced_accuracy_score(y_true, y_pred))

Precision Score = 0.3331789408676857
Recall Score = 0.5641830065359477
F1 Score = 0.41894777713065423
F2 Score = 0.4954768792763007
Balanced Accuracy Score = 0.6455870784892128


In [51]:
predicted_df[predicted_df['prediction']==1]

Unnamed: 0,abstract,truth,predictions,prediction
0,"Amman (English: /əˈmɑːn/; Arabic: عَمَّان, ʻam...",0,0.485249,1
1,"Hussein bin Talal (Arabic: الحسين بن طلال, Al-...",0,0.559250,1
2,"Arabic (اَلْعَرَبِيَّةُ, al-ʿarabiyyah [al ʕar...",0,0.515503,1
3,Hashemite is a very rare barium chromate miner...,0,0.473538,1
4,"Terrorism, in its broadest sense, is the use o...",0,0.546301,1
...,...,...,...,...
19640,"PRISA Televisión, S.A.U (PRISA TV) is a pay TV...",0,0.440014,1
19641,"Devin Garrett Townsend (born May 5, 1972) is a...",0,0.436760,1
19642,Meggen is a municipality in the district of Lu...,0,0.443227,1
19643,Sarah Helen Prescott is Professor of English L...,0,0.466762,1
