In [1]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import nltk
import pandas as pd
import sqlite3

nlp = spacy.load('en_core_web_lg', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
con = sqlite3.connect("C:/Users/User/OneDrive - University of Glasgow/University Year 4/Individual Project/2464980P-L4-Project/src/entity_django/db.sqlite3")

cur = con.cursor()

entities={}
for row in cur.execute('SELECT * FROM entity_app_entity;'):
    entities[row[0]]=row[1]
    
instances={}
for row in cur.execute('SELECT * FROM entity_app_instance;'):
    documentID=row[3]
    entityID=row[4]
    try:
        instances[documentID]+=entities[entityID]
    except KeyError:
        instances[documentID]=entities[entityID]

documents={}
for row in cur.execute('SELECT * FROM entity_app_document;'):
    documents[row[0]]=row[2].replace(".html.gz","")



In [43]:
TRUTHS="C:/Users/User/Desktop/project_data/truths.txt"
with open (TRUTHS, "r") as f:
    lines = f.readlines()

truths={}
for line in lines:
    line=line.replace("\n","").replace(" ","/").split("/")[-2:]
    truths[line[0]]=line[1]

In [44]:
df = pd.DataFrame(columns=['filename', 'text', 'truth'])

i=0
for docid in documents.keys():
    try:
        filename=documents[docid]
        truth=int(truths[filename])
        text=instances[docid]
        df.loc[i] = [filename,text,truth]
        i+=1
    except KeyError:
        continue

In [45]:
def text_pipeline(text):
    tokens = []
    doc = nlp(text)
    for t in doc:
        if not t.is_stop and not t.is_punct and not t.is_space:
            tokens.append(t.lemma_.lower())
    return tokens

In [46]:
con = sqlite3.connect("C:/Users/User/OneDrive - University of Glasgow/University Year 4/Individual Project/2464980P-L4-Project/src/entity_django/db.sqlite3")

cur = con.cursor()

sensitivity={}
for row in cur.execute('SELECT * FROM entity_app_instance;'):
    documentID=row[3]
    entityID=row[4]
    
    filename=documents[documentID]
    truth=truths[filename]
    
    abstract=entities[entityID]
    if abstract in list(sensitivity.keys()):
        if truth==1 and sensitivity[abstract]==1:
            continue
        elif truth==1 and sensitivity[abstract]==0:
            sensitivity[abstract]=1
    else:
        sensitivity[abstract]=truth
con.close()


In [47]:
test_df = pd.DataFrame(columns=['abstract','truth'])
i=0
for abstract in sensitivity.keys():
    try:
        test_df.loc[i] = [abstract,int(sensitivity[abstract])]
        i+=1
    except KeyError:
        continue

In [49]:
#one_hot_vectorizer = CountVectorizer(tokenizer=text_pipeline, binary=True)
tfidf_vectorizer = TfidfVectorizer()#tokenizer=text_pipeline,binary=True)

sens=df[df['truth']==1]
not_sens=df[df['truth']==0].sample(n=len(sens),replace=False)
merged=sens.append(not_sens, ignore_index=True)

train_labels = merged['truth']
train_features = tfidf_vectorizer.fit_transform(merged['text'].tolist())

test_labels = test_df['truth']
test_features = tfidf_vectorizer.transform(test_df['abstract'])

  merged=sens.append(not_sens, ignore_index=True)


In [51]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log')
SVC_fitted = clf.fit(train_features,train_labels)
score = SVC_fitted.score(test_features, test_labels)
print(score)

0.718530579016994


In [83]:
predicted = SVC_fitted.predict_proba(test_features)[:,0]
predicted_df = test_df
predicted_df['predictions'] = 1-predicted
#predicted_df.sort_values(by='predictions', ascending=False).head(30)
predicted_df['prediction'] = predicted_df['predictions'].apply(lambda x : 1 if x>0.33 else 0)
predicted_df

Unnamed: 0,abstract,truth,predictions,prediction
0,"Amman (English: /əˈmɑːn/; Arabic: عَمَّان, ʻam...",0,0.312752,0
1,"Hussein bin Talal (Arabic: الحسين بن طلال, Al-...",0,0.650954,1
2,"Arabic (اَلْعَرَبِيَّةُ, al-ʿarabiyyah [al ʕar...",0,0.598394,1
3,Hashemite is a very rare barium chromate miner...,0,0.547384,1
4,"Terrorism, in its broadest sense, is the use o...",0,0.442914,1
...,...,...,...,...
19649,Sarah Helen Prescott is Professor of English L...,0,0.377566,1
19650,WMDT (channel 47) is a television station in S...,0,0.328617,0
19651,Price Drop was a British television shopping c...,0,0.317117,0
19652,SACYR S.A. (Spanish pronunciation: [saˈθiɾ]) i...,0,0.439730,1


In [84]:
predicted_df[predicted_df['prediction']==1]

Unnamed: 0,abstract,truth,predictions,prediction
1,"Hussein bin Talal (Arabic: الحسين بن طلال, Al-...",0,0.650954,1
2,"Arabic (اَلْعَرَبِيَّةُ, al-ʿarabiyyah [al ʕar...",0,0.598394,1
3,Hashemite is a very rare barium chromate miner...,0,0.547384,1
4,"Terrorism, in its broadest sense, is the use o...",0,0.442914,1
5,A village is a clustered human settlement or c...,0,0.359697,1
...,...,...,...,...
19645,Pontevedra is a province of Spain along the co...,0,0.444048,1
19646,"PRISA Televisión, S.A.U (PRISA TV) is a pay TV...",0,0.507391,1
19648,Meggen is a municipality in the district of Lu...,0,0.395973,1
19649,Sarah Helen Prescott is Professor of English L...,0,0.377566,1


In [85]:
#from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score
#balanced_accuracy_score(predicted_df['truth'], predicted_df['prediction'])
recall_score(predicted_df['truth'], predicted_df['prediction'])

0.8350247202706219

In [86]:
import pickle
pickle.dump(SVC_fitted, open('classifier.pkl', "wb"))
pickle.dump(tfidf_vectorizer, open('vectorizer.pkl', "wb"))

In [8]:
sens=test_df[test_df['truth']==1]
not_sens=test_df[test_df['truth']==0]
train_df_=pd.concat([sens.iloc[:2500],not_sens.iloc[:2500]])
test_df_=pd.concat([sens.iloc[2500:],not_sens.iloc[2500:]])
train_df_

Unnamed: 0,abstract,truth
8,Bern (Swiss Standard German: [bɛrn]; Alemannic...,1
9,Akbar Hashemi Rafsanjani (Persian: اکبر هاشمی ...,1
34,A revolutionary is a person who either partici...,1
35,The Hellenic Police (Greek: Ελληνική Αστυνομία...,1
36,"Greece (Greek: Ελλάδα, romanized: Elláda, [eˈl...",1
...,...,...
4143,"Life insurance (or life assurance, especially ...",0
4144,"Da Nang or Danang ( /(ˌ)dɑː, də ˈnæŋ, ˈnɑːŋ/ d...",0
4145,American International Pictures (AIP) is an Am...,0
4146,"American International Group, Inc. (AIG) is an...",0


In [9]:
tfidf_vectorizer = TfidfVectorizer()

train_labels = train_df_['truth']
train_features = tfidf_vectorizer.fit_transform(train_df_['abstract'].tolist())

test_labels = test_df_['truth']
test_features = tfidf_vectorizer.transform(test_df_['abstract'])

In [10]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log')
SVC_fitted = clf.fit(train_features,train_labels)
score = SVC_fitted.score(test_features, test_labels)
print(score)

0.495302223614156
