In [1]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import nltk
import pandas as pd
import sqlite3

nlp = spacy.load('en_core_web_lg', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
con = sqlite3.connect("C:/Users/User/OneDrive - University of Glasgow/University Year 4/Individual Project/2464980P-L4-Project/src/entity_django/db.sqlite3")

cur = con.cursor()

entities={}
for row in cur.execute('SELECT * FROM entity_app_entity;'):
    entities[row[0]]=row[1]
    
instances={}
for row in cur.execute('SELECT * FROM entity_app_instance;'):
    documentID=row[3]
    entityID=row[4]
    try:
        instances[documentID]+=entities[entityID]
    except KeyError:
        instances[documentID]=entities[entityID]

documents={}
for row in cur.execute('SELECT * FROM entity_app_document;'):
    documents[row[0]]=row[2].replace(".html.gz","")



In [3]:
TRUTHS="C:/Users/User/Desktop/project_data/truths.txt"
with open (TRUTHS, "r") as f:
    lines = f.readlines()

truths={}
for line in lines:
    line=line.replace("\n","").replace(" ","/").split("/")[-2:]
    truths[line[0]]=line[1]

In [4]:
df = pd.DataFrame(columns=['filename', 'text', 'truth'])

i=0
for docid in documents.keys():
    try:
        filename=documents[docid]
        truth=int(truths[filename])
        text=instances[docid]
        df.loc[i] = [filename,text,truth]
        i+=1
    except KeyError:
        continue

In [5]:
def text_pipeline(text):
    tokens = []
    doc = nlp(text)
    for t in doc:
        if not t.is_stop and not t.is_punct and not t.is_space:
            tokens.append(t.lemma_.lower())
    return tokens

In [6]:
con = sqlite3.connect("C:/Users/User/OneDrive - University of Glasgow/University Year 4/Individual Project/2464980P-L4-Project/src/entity_django/db.sqlite3")

cur = con.cursor()

sensitivity={}
for row in cur.execute('SELECT * FROM entity_app_instance;'):
    documentID=row[3]
    entityID=row[4]
    
    filename=documents[documentID]
    truth=truths[filename]
    
    abstract=entities[entityID]
    if abstract in list(sensitivity.keys()):
        if truth==1 and sensitivity[abstract]==1:
            continue
        elif truth==1 and sensitivity[abstract]==0:
            sensitivity[abstract]=1
    else:
        sensitivity[abstract]=truth
con.close()


In [7]:
test_df = pd.DataFrame(columns=['abstract','truth'])
i=0
for abstract in sensitivity.keys():
    try:
        test_df.loc[i] = [abstract,int(sensitivity[abstract])]
        i+=1
    except KeyError:
        continue

In [8]:
tfidf_vectorizer = TfidfVectorizer()#tokenizer=text_pipeline,binary=True)

sens=df[df['truth']==1]
not_sens=df[df['truth']==0].sample(n=len(sens),replace=False)
merged=sens.append(not_sens, ignore_index=True)

train_labels = merged['truth']
train_features = tfidf_vectorizer.fit_transform(merged['text'].tolist())

test_labels = test_df['truth']
test_features = tfidf_vectorizer.transform(test_df['abstract'])

  merged=sens.append(not_sens, ignore_index=True)


In [67]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
classifier = SGDClassifier(loss='log')
#classifier = MLPClassifier(max_iter=10)
#classifier = MultinomialNB()
#classifier = LinearSVC()
classifier_fitted = classifier.fit(train_features,train_labels)


In [68]:
predicted = classifier_fitted._predict_proba_lr(test_features)[:,0]
predicted_df = test_df
predicted_df['predictions'] = 1-predicted
predicted_df['prediction'] = predicted_df['predictions'].apply(lambda x : 1 if x>0.33 else 0)
predicted_df

Unnamed: 0,abstract,truth,predictions,prediction
0,"Amman (English: /əˈmɑːn/; Arabic: عَمَّان, ʻam...",0,0.358722,1
1,"Hussein bin Talal (Arabic: الحسين بن طلال, Al-...",0,0.700499,1
2,"Arabic (اَلْعَرَبِيَّةُ, al-ʿarabiyyah [al ʕar...",0,0.592673,1
3,Hashemite is a very rare barium chromate miner...,0,0.567529,1
4,"Terrorism, in its broadest sense, is the use o...",0,0.536845,1
...,...,...,...,...
19646,"PRISA Televisión, S.A.U (PRISA TV) is a pay TV...",0,0.353176,1
19647,"Devin Garrett Townsend (born May 5, 1972) is a...",0,0.358247,1
19648,Meggen is a municipality in the district of Lu...,0,0.503776,1
19649,Sarah Helen Prescott is Professor of English L...,0,0.285943,0


In [69]:
y_true=predicted_df['truth']
y_pred=predicted_df['prediction']
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
print("Precision Score =",precision_score(y_true, y_pred, pos_label=1))
print("Recall Score =",recall_score(y_true, y_pred, pos_label=1))
print("F1 Score =",f1_score(y_true, y_pred, pos_label=1))
print("F2 Score =",fbeta_score(y_true, y_pred, pos_label=1, beta=2))
print("Balanced Accuracy Score =",balanced_accuracy_score(y_true, y_pred))

Precision Score = 0.22036685641998735
Recall Score = 0.9065833983866771
F1 Score = 0.3545514679692668
F2 Score = 0.5586556346610224
Balanced Accuracy Score = 0.5634258085050794


In [None]:
from sklearn.metrics import classification_report
print(classification_report(predicted_df['truth'], predicted_df['prediction'], target_names=['unsensitive','sensitive']))

In [None]:
#import pickle
#pickle.dump(SVC_fitted, open('classifier.pkl', "wb"))
#pickle.dump(tfidf_vectorizer, open('vectorizer.pkl', "wb"))

In [None]:
sens=test_df[test_df['truth']==1]
not_sens=test_df[test_df['truth']==0]
print(len(sens))
print(len(not_sens))
train_df_=pd.concat([sens.iloc[:],not_sens.iloc[:]])
test_df_=pd.concat([sens.iloc[:],not_sens.iloc[:]])

In [None]:
tfidf_vectorizer = TfidfVectorizer()

train_labels = train_df_['truth']
train_features = tfidf_vectorizer.fit_transform(train_df_['abstract'].tolist())

test_labels = test_df_['truth']
test_features = tfidf_vectorizer.transform(test_df_['abstract'])

In [None]:
clf = SGDClassifier(loss='log')

In [None]:
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(clf, train_features, train_labels, cv=6, method='predict_proba')
predicted_df = test_df_
predicted_df['predictions'] = 1-predicted
#predicted_df.sort_values(by='predictions', ascending=False).head(30)
predicted_df['prediction'] = predicted_df['predictions'].apply(lambda x : 1 if x>0.33 else 0)
predicted_df.sort_values(['predictions'], ascending=False).head(30)

In [None]:
y_true=predicted_df['truth']
y_pred=predicted_df['prediction']
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
print("Precision Score =",precision_score(y_true, y_pred, pos_label=1))
print("Recall Score =",recall_score(y_true, y_pred, pos_label=1))
print("F1 Score =",f1_score(y_true, y_pred, pos_label=1))
print("F2 Score =",fbeta_score(y_true, y_pred, pos_label=1, beta=2))
print("Balanced Accuracy Score =",balanced_accuracy_score(y_true, y_pred))