In [1]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import nltk
import pandas as pd
import sqlite3

nlp = spacy.load('en_core_web_lg', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
con = sqlite3.connect("C:/Users/User/OneDrive - University of Glasgow/University Year 4/Individual Project/2464980P-L4-Project/src/entity_django/db.sqlite3")

cur = con.cursor()

entities={}
for row in cur.execute('SELECT * FROM entity_app_entity;'):
    entities[row[0]]=row[1]
    
instances={}
for row in cur.execute('SELECT * FROM entity_app_instance;'):
    documentID=row[3]
    entityID=row[4]
    try:
        instances[documentID]+=entities[entityID]
    except KeyError:
        instances[documentID]=entities[entityID]

documents={}
for row in cur.execute('SELECT * FROM entity_app_document;'):
    documents[row[0]]=row[2].replace(".html.gz","")



In [3]:
TRUTHS="C:/Users/User/Downloads/project_data/project_data/truths.txt"
with open (TRUTHS, "r") as f:
    lines = f.readlines()

truths={}
for line in lines:
    line=line.replace("\n","").replace(" ","/").split("/")[-2:]
    truths[line[0]]=line[1]

In [4]:
df = pd.DataFrame(columns=['filename', 'text', 'truth'])

i=0
for docid in documents.keys():
    try:
        filename=documents[docid]
        truth=int(truths[filename])
        text=instances[docid]
        df.loc[i] = [filename,text,truth]
        i+=1
    except KeyError:
        continue

In [5]:
def text_pipeline(text):
    tokens = []
    doc = nlp(text)
    for t in doc:
        if not t.is_stop and not t.is_punct and not t.is_space:
            tokens.append(t.lemma_.lower())
    return tokens

In [6]:
con = sqlite3.connect("C:/Users/User/OneDrive - University of Glasgow/University Year 4/Individual Project/2464980P-L4-Project/src/entity_django/db.sqlite3")

cur = con.cursor()

sensitivity={}
for row in cur.execute('SELECT * FROM entity_app_instance;'):
    documentID=row[3]
    entityID=row[4]
    
    filename=documents[documentID]
    truth=truths[filename]
    
    abstract=entities[entityID]
    if abstract in list(sensitivity.keys()):
        if truth==1 and sensitivity[abstract]==1:
            continue
        elif truth==1 and sensitivity[abstract]==0:
            sensitivity[abstract]=1
    else:
        sensitivity[abstract]=truth
con.close()


In [7]:
test_df = pd.DataFrame(columns=['abstract','truth'])
i=0
for abstract in sensitivity.keys():
    try:
        test_df.loc[i] = [abstract,int(sensitivity[abstract])]
        i+=1
    except KeyError:
        continue

In [8]:
#one_hot_vectorizer = CountVectorizer(tokenizer=text_pipeline, binary=True)
tfidf_vectorizer = TfidfVectorizer()#tokenizer=text_pipeline,binary=True)

sens=df[df['truth']==1]
not_sens=df[df['truth']==0].iloc[:len(sens)]
merged=sens.append(not_sens, ignore_index=True)

train_labels = merged['truth']
train_features = tfidf_vectorizer.fit_transform(merged['text'].tolist())

test_labels = test_df['truth']
test_features = tfidf_vectorizer.transform(test_df['abstract'])

  merged=sens.append(not_sens, ignore_index=True)


In [9]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log')
SVC_fitted = clf.fit(train_features,train_labels)
score = SVC_fitted.score(test_features, test_labels)
print(score)

0.6475496223432285


In [20]:
predicted = SVC_fitted.predict_proba(test_features)[:,0]
predicted_df = test_df
predicted_df['predictions'] = 1-predicted
predicted_df.sort_values(by='predictions', ascending=False).head(30)

Unnamed: 0,abstract,truth,predictions
652,The Security Service of Ukraine (Ukrainian: Сл...,1,0.999216
0,"Amman (English: /əˈmɑːn/; Arabic: عَمَّان, ʻam...",0,0.990817
3270,A strategic business unit (SBU) in business st...,1,0.988661
197,"Turkey (Turkish: Türkiye [ˈtyɾcije]), official...",1,0.985326
1872,The World Food Programme (WFP) is the food-ass...,0,0.981526
3120,"A stretcher, gurney, litter, or pram is an app...",0,0.965533
7623,"Honduras, officially the Republic of Honduras,...",0,0.959065
6634,"Maoism, officially called Mao Zedong Thought b...",0,0.955561
549,The Distinguished Conduct Medal was a decorati...,1,0.95254
557,The United Arab Emirates (UAE; Arabic: الإمارا...,1,0.949643


In [18]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(predicted_df['truth'], predicted)

0.5640562325062609

In [22]:
import pickle
pickle.dump(SVC_fitted, open('classifier.pkl', "wb"))
pickle.dump(tfidf_vectorizer, open('vectorizer.pkl', "wb"))