In [1]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import nltk
import pandas as pd
import sqlite3

nlp = spacy.load('en_core_web_lg', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
con = sqlite3.connect("C:/Users/User/OneDrive - University of Glasgow/University Year 4/Individual Project/2464980P-L4-Project/src/entity_django/db.sqlite3")

cur = con.cursor()

entities={}
for row in cur.execute('SELECT * FROM entity_app_entity;'):
    entities[row[0]]=row[1]
    
instances={}
for row in cur.execute('SELECT * FROM entity_app_instance;'):
    documentID=row[3]
    entityID=row[4]
    try:
        instances[documentID]+=entities[entityID]
    except KeyError:
        instances[documentID]=entities[entityID]

documents={}
for row in cur.execute('SELECT * FROM entity_app_document;'):
    documents[row[0]]=row[2].replace(".html.gz","")



In [3]:
TRUTHS="C:/Users/User/Desktop/project_data/truths.txt"
with open (TRUTHS, "r") as f:
    lines = f.readlines()

truths={}
for line in lines:
    line=line.replace("\n","").replace(" ","/").split("/")[-2:]
    truths[line[0]]=line[1]

In [4]:
df = pd.DataFrame(columns=['filename', 'text', 'truth'])

i=0
for docid in documents.keys():
    try:
        filename=documents[docid]
        truth=int(truths[filename])
        text=instances[docid]
        df.loc[i] = [filename,text,truth]
        i+=1
    except KeyError:
        continue

In [5]:
def text_pipeline(text):
    tokens = []
    doc = nlp(text)
    for t in doc:
        if not t.is_stop and not t.is_punct and not t.is_space:
            tokens.append(t.lemma_.lower())
    return tokens

In [6]:
con = sqlite3.connect("C:/Users/User/OneDrive - University of Glasgow/University Year 4/Individual Project/2464980P-L4-Project/src/entity_django/db.sqlite3")

cur = con.cursor()

abstracts=[]
truth_vals=[]
sensitivity={}
for row in cur.execute('SELECT * FROM entity_app_instance;'):
    documentID=row[3]
    entityID=row[4]
    
    filename=documents[documentID]
    truth=int(truths[filename])
    
    abstract=entities[entityID]
    abstracts.append(abstract)
    truth_vals.append(truth)
    
    if abstract in sensitivity.keys():
        sensitivity[abstract]=max(truth,sensitivity[abstract])
    else:
        sensitivity[abstract]=truth
con.close()


In [7]:
train_df_ = pd.DataFrame(columns=['abstract','truth'])
train_df_['abstract'] = abstracts
train_df_['truth'] = truth_vals

test_df = pd.DataFrame(columns=['abstract','truth'])
test_df['abstract'] = sensitivity.keys()
test_df['truth'] = sensitivity.values()
test_df

Unnamed: 0,abstract,truth
0,"Amman (English: /əˈmɑːn/; Arabic: عَمَّان, ʻam...",1
1,"Hussein bin Talal (Arabic: الحسين بن طلال, Al-...",1
2,"Arabic (اَلْعَرَبِيَّةُ, al-ʿarabiyyah [al ʕar...",1
3,Hashemite is a very rare barium chromate miner...,0
4,"Terrorism, in its broadest sense, is the use o...",1
...,...,...
19638,"PRISA Televisión, S.A.U (PRISA TV) is a pay TV...",0
19639,"Devin Garrett Townsend (born May 5, 1972) is a...",0
19640,Meggen is a municipality in the district of Lu...,0
19641,Sarah Helen Prescott is Professor of English L...,0


In [8]:
tfidf_vectorizer = TfidfVectorizer()#tokenizer=text_pipeline,binary=True)

sens=df[df['truth']==1]
not_sens=df[df['truth']==0].sample(n=len(sens),replace=False)
merged=sens.append(not_sens, ignore_index=True)

train_labels = merged['truth']
train_features = tfidf_vectorizer.fit_transform(merged['text'].tolist())

test_labels = test_df['truth']
test_features = tfidf_vectorizer.transform(test_df['abstract'])

  merged=sens.append(not_sens, ignore_index=True)


In [9]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
classifier = SGDClassifier(loss='log')
classifier_fitted = classifier.fit(train_features,train_labels)


In [10]:
predicted = classifier_fitted._predict_proba_lr(test_features)[:,0]
predicted_df = test_df
predicted_df['predictions'] = 1-predicted
predicted_df['prediction'] = predicted_df['predictions'].apply(lambda x : 1 if x>0.5 else 0)
predicted_df

Unnamed: 0,abstract,truth,predictions,prediction
0,"Amman (English: /əˈmɑːn/; Arabic: عَمَّان, ʻam...",1,0.550393,1
1,"Hussein bin Talal (Arabic: الحسين بن طلال, Al-...",1,0.487006,0
2,"Arabic (اَلْعَرَبِيَّةُ, al-ʿarabiyyah [al ʕar...",1,0.610701,1
3,Hashemite is a very rare barium chromate miner...,0,0.431085,0
4,"Terrorism, in its broadest sense, is the use o...",1,0.313742,0
...,...,...,...,...
19638,"PRISA Televisión, S.A.U (PRISA TV) is a pay TV...",0,0.404740,0
19639,"Devin Garrett Townsend (born May 5, 1972) is a...",0,0.449496,0
19640,Meggen is a municipality in the district of Lu...,0,0.471849,0
19641,Sarah Helen Prescott is Professor of English L...,0,0.405381,0


In [11]:
y_true=predicted_df['truth']
y_pred=predicted_df['prediction']
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
print("Precision Score =",precision_score(y_true, y_pred, pos_label=1))
print("Recall Score =",recall_score(y_true, y_pred, pos_label=1))
print("F1 Score =",f1_score(y_true, y_pred, pos_label=1))
print("F2 Score =",fbeta_score(y_true, y_pred, pos_label=1, beta=2))
print("Balanced Accuracy Score =",balanced_accuracy_score(y_true, y_pred))

Precision Score = 0.37849703900347154
Recall Score = 0.6407951598962834
F1 Score = 0.47589704088837537
F2 Score = 0.5627922511690047
Balanced Accuracy Score = 0.6007771440988128


In [12]:
#import pickle
#pickle.dump(SVC_fitted, open('classifier.pkl', "wb"))
#pickle.dump(tfidf_vectorizer, open('vectorizer.pkl', "wb"))

In [13]:
sens=train_df_[train_df_['truth']==1]
not_sens=train_df_[train_df_['truth']==0].sample(n=len(sens),replace=False)
merged=sens.append(not_sens, ignore_index=True)

  merged=sens.append(not_sens, ignore_index=True)


In [43]:
tfidf_vectorizer = TfidfVectorizer()

train_labels = merged['truth']
train_features = tfidf_vectorizer.fit_transform(merged['abstract'].tolist())

test_labels = test_df['truth']
test_features = tfidf_vectorizer.transform(test_df['abstract'])

In [44]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log')#.fit(train_features,train_labels)

In [45]:
#import pickle
#pickle.dump(clf, open('classifier.pkl', "wb"))
#pickle.dump(tfidf_vectorizer, open('vectorizer.pkl', "wb"))

In [48]:
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(clf, train_features, train_labels, cv=6, method='predict_proba')[:,0]
predicted_df = merged
predicted_df['predictions'] = 1-predicted
#predicted_df.sort_values(by='predictions', ascending=False).head(30)
predicted_df['prediction'] = predicted_df['predictions'].apply(lambda x : 1 if x>0.45 else 0)
predicted_df.sort_values(['predictions'], ascending=False).head(30)

Unnamed: 0,abstract,truth,predictions,prediction
33701,"Uruzgan (Pashto: اروزګان; Dariارزگان), also sp...",0,0.914352,1
31837,Kano State (Hausa: Jihar Kano) (Fula: Leydi Ka...,0,0.911736,1
53397,"Uruzgan (Pashto: اروزګان; Dariارزگان), also sp...",0,0.909107,1
35779,"Uruzgan (Pashto: اروزګان; Dariارزگان), also sp...",0,0.908176,1
47406,"Edo State is one of the 36 states of Nigeria, ...",0,0.900557,1
9281,Kano State (Hausa: Jihar Kano) (Fula: Leydi Ka...,1,0.895979,1
9287,Kano State (Hausa: Jihar Kano) (Fula: Leydi Ka...,1,0.895979,1
9197,Kano State (Hausa: Jihar Kano) (Fula: Leydi Ka...,1,0.895979,1
9225,Kano State (Hausa: Jihar Kano) (Fula: Leydi Ka...,1,0.895979,1
9243,Kano State (Hausa: Jihar Kano) (Fula: Leydi Ka...,1,0.895979,1


In [51]:
occurs_freq = predicted_df[predicted_df.groupby('abstract').abstract.transform(len) > 1]
occurs_once = predicted_df[predicted_df.groupby('abstract').abstract.transform(len) == 1]
split_df=predicted_df[predicted_df['abstract'].isin(occurs_freq['abstract'])]

In [52]:
y_true=split_df['truth']
y_pred=split_df['prediction']
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
print("Precision Score =",precision_score(y_true, y_pred, pos_label=1))
print("Recall Score =",recall_score(y_true, y_pred, pos_label=1))
print("F1 Score =",f1_score(y_true, y_pred, pos_label=1))
print("F2 Score =",fbeta_score(y_true, y_pred, pos_label=1, beta=2))
print("Balanced Accuracy Score =",balanced_accuracy_score(y_true, y_pred))

Precision Score = 0.5634322620329079
Recall Score = 0.5203621516066039
F1 Score = 0.5410414013326689
F2 Score = 0.5284412137994693
Balanced Accuracy Score = 0.5502191632935952


Unnamed: 0,abstract,truth,predictions,prediction
0,"Amman (English: /əˈmɑːn/; Arabic: عَمَّان, ʻam...",1,0.515971,1
1,"Hussein bin Talal (Arabic: الحسين بن طلال, Al-...",1,0.540595,1
2,"Arabic (اَلْعَرَبِيَّةُ, al-ʿarabiyyah [al ʕar...",1,0.509614,1
3,Hashemite is a very rare barium chromate miner...,0,0.470721,1
4,"Terrorism, in its broadest sense, is the use o...",1,0.561938,1
...,...,...,...,...
19627,A sustainable food system is a type of food sy...,0,0.515826,1
19631,"A photovoltaic power station, also known as a ...",0,0.399164,0
19633,The Community of Madrid (English: /məˈdrɪd/; S...,0,0.457561,1
19634,"Toledo is a province of central Spain, in the ...",0,0.478348,1
