In [285]:
import pandas as pd
import numpy as np
import scipy
import re
from scipy.stats import norm

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import mpl_toolkits.mplot3d as mplt3d
from mpl_toolkits.mplot3d import Axes3D 

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

%matplotlib notebook
%load_ext autoreload
%autoreload 2

# shouldn't be enabled when using interactive 3D plots
# %pylab inline
# pylab.rcParams['figure.figsize'] = (10, 7)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [183]:
# naming chosen for same-length, to look pretty
kicked = pd.read_csv('data/dismissed_complete.csv')
stayed = pd.read_csv('data/nodismissed_complete.csv')

In [184]:
kicked.head()

Unnamed: 0.1,Unnamed: 0,Author,Title paper,Journal,Year,Keywords,labels
0,0,AU=AYDIN AYDIN,"[Widespread involvement of hepatic, renal and ...",[TURKISH JOURNAL OF PEDIATRICS],[2007],"[mycotic aneurysm, visceral arteries, mesenter...",1.0
1,1,AU=AYDIN AYDIN,[Temporal Analysis of Finger-Tapping Test in I...,[ACTA PHYSIOLOGICA],[2015],,1.0
2,2,AU=AYDIN AYDIN,[UNDIFFERENTIATED CARCINOMA OF THE EPIDIDYMIS],[ACTA CLINICA CROATICA],[2011],"[Epididymis, Undiffrentiated carcinomas, Metas...",1.0
3,3,AU=AYDIN AYDIN,[Undifferentiated Primary Carcinoma of the Epi...,[UHOD-ULUSLARARASI HEMATOLOJI-ONKOLOJI DERGISI],[2009],"[Epididymis, Undifferentiated carcinoma, Adjuv...",1.0
4,4,AU=GİLGİL ERDAL,[Internal iliac artery pseudoaneurysm - An unu...,[AMERICAN JOURNAL OF PHYSICAL MEDICINE & REHAB...,[2008],"[lumbosacral plexopathy, sciatica, pseudoaneur...",1.0


In [185]:
stayed.head()

Unnamed: 0.1,Unnamed: 0,Author,Title paper,Journal,Year,Keywords,labels
0,0,AU=AU,[ASYMMETRIC LOADING OF AN EXTERNALLY CRACKED E...,[THEORETICAL AND APPLIED FRACTURE MECHANICS],[1990],,0.0
1,1,AU=AU,[HIGH-CAPACITY OIL-WATER SEPARATOR PERFORMANCE...,[ENVIRONMENTAL ENGINEERING : PROCEEDINGS OF TH...,[1990],,0.0
2,2,AU=AU,[NUMERICAL MODELING OF BRIDGE FLAWS IN FIBER R...,[LOCALIZED DAMAGE COMPUTER-AIDED ASSESSMENT AN...,[1990],,0.0
3,3,AU=AU,[INHIBIN - A MARKER OF SERTOLI-CELL FUNCTION],[HUMAN REPRODUCTION /],[1988],,0.0
4,4,AU=AU,[THE MEASUREMENT OF BULK AND SURFACE RECOMBINA...,[CONFERENCE RECORD OF THE TWENTIETH IEEE PHOTO...,[1988],,0.0


In [393]:
kicked.shape, stayed.shape

((12301, 7), (24859, 7))

In [434]:
kicked1 = kicked[['Author','Title paper', 'labels']]
stayed1 = stayed[['Author','Title paper', 'labels']].sample(frac=(1.0 * kicked.shape[0])/stayed.shape[0]) # random_state = 0

In [435]:
# make arrays equal in size
kicked1.shape, stayed1.shape

((12301, 3), (12301, 3))

In [436]:
# check some basic invariants on the input data, all should return True
print(kicked1['Author'].apply(lambda x: x[:3] == 'AU=').all())
print(stayed1['Author'].apply(lambda x: x[:3] == 'AU=').all())
print(kicked1['labels'].apply(lambda x: x == 1).all())
print(stayed1['labels'].apply(lambda x: x == 0).all())
print(kicked1['Title paper'].apply(lambda x: x[0] == '[' and x[-1] == ']').all())
print(stayed1['Title paper'].apply(lambda x: x[0] == '[' and x[-1] == ']').all())

True
True
True
True
True
True


In [437]:
df0 = pd.concat([kicked1, stayed1])

In [438]:
df0.head()

Unnamed: 0,Author,Title paper,labels
0,AU=AYDIN AYDIN,"[Widespread involvement of hepatic, renal and ...",1.0
1,AU=AYDIN AYDIN,[Temporal Analysis of Finger-Tapping Test in I...,1.0
2,AU=AYDIN AYDIN,[UNDIFFERENTIATED CARCINOMA OF THE EPIDIDYMIS],1.0
3,AU=AYDIN AYDIN,[Undifferentiated Primary Carcinoma of the Epi...,1.0
4,AU=GİLGİL ERDAL,[Internal iliac artery pseudoaneurysm - An unu...,1.0


In [439]:
df1 = df0.copy()
df1['Author'] = df0['Author'].apply(lambda x: x[3:])
df1['Label'] = df0['labels'].apply(lambda x: int(x))
df1['Title paper'] = df0['Title paper'].apply(lambda s: s[1:][:-1])
df1 = df1.drop(columns=['labels'])
df1.head()

Unnamed: 0,Author,Title paper,Label
0,AYDIN AYDIN,"Widespread involvement of hepatic, renal and m...",1
1,AYDIN AYDIN,Temporal Analysis of Finger-Tapping Test in In...,1
2,AYDIN AYDIN,UNDIFFERENTIATED CARCINOMA OF THE EPIDIDYMIS,1
3,AYDIN AYDIN,Undifferentiated Primary Carcinoma of the Epid...,1
4,GİLGİL ERDAL,Internal iliac artery pseudoaneurysm - An unus...,1


In [440]:
# remove trash author names (whose length < 6)
print('Removing rows:')
print(df1[df1['Author'].apply(lambda s : len(s) < 6)]['Author'])
df2 = df1[df1['Author'].apply(lambda s : len(s) >= 6)]

Removing rows:
10546    ASEE
22552    SGEM
7766      ACM
1          AU
0          AU
10550    ASEE
7763      ACM
3          AU
Name: Author, dtype: object


In [441]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [442]:
# In fact we will do classification on "Title paper", so further working on it
print('Shape of not English words: ', df2[df2['Title paper'].apply(lambda s : not isEnglish(s))].shape)
df3 = df2.copy()
df3['Title paper'] = df2['Title paper'].apply(
    lambda s : s.lower()
).apply(
    lambda s : re.sub(r"[.,/()?:'%\";\[\]!\{\}><]", "", s) # delete all not-letters
).apply(
    lambda s : re.sub(r"[- + = @ & * # |]", " ", s) # substitute defis with spaces
).apply(
    lambda s : re.sub(r"\d", " ", s) # substitute numbers with spaces
).apply(
    lambda s : re.sub(r"\W\w{1,2}\W", " ", s) # naive removal of super-short words
).apply(
    lambda s : re.sub(r"\s+", " ", s) # substitute multiple spaces with one
)
df3 = df3[df3['Title paper'].apply(
    lambda s: s != 'untitled' and s != 'editorial' # drop some common but not-interesting names
)]

Shape of not English words:  (0, 3)


In [443]:
# try to find strange symbols and print them in "Title paper" and print them 
symbols = df3['Title paper'].apply(
    lambda s: ''.join(c for c in s if not c.isalpha() and c != ' ')
)
print(symbols[symbols.apply(lambda s: s != '')])

Series([], Name: Title paper, dtype: object)


In [444]:
# okay, now in df3 in "Title paper" we have clean sentences, should work with trimmer now

In [445]:
df4 = df3.drop(columns=['Author'])

In [446]:
df4.head(), df4.tail()

(                                         Title paper  Label
 0  widespread involvement hepatic renal and mesen...      1
 1  temporal analysis finger tapping test individu...      1
 2          undifferentiated carcinoma the epididymis      1
 3  undifferentiated primary carcinoma the epididy...      1
 4  internal iliac artery pseudoaneurysm unusual c...      1,
                                              Title paper  Label
 9493   determination advanced life support knowledge ...      0
 24682   patients with vulval pruritus patch test results      0
 12847  personality change following gun shot case report      0
 6429   exploratory study the moment capacity and semi...      0
 22341  musculoskeletal system involvement adult wilso...      0)

In [458]:
data_train, data_test = train_test_split(df4, test_size=0.3) # random_state = 0

In [459]:
X_train = data_train['Title paper']
y_train = data_train['Label']

X_test = data_test['Title paper']
y_test = data_test['Label']

In [460]:
X_train.shape, X_test.shape

((17210,), (7377,))

In [461]:
# Let's do the following model
# Features are unique words
# Samples are titles

# 1) Naive : for every sample we have binary value for every word (present / absent)
# 2) sklearn.CountVectorizer : counting
# 3) sklearn.TfidfVectorizer : with usual counting more weight is given to longer sentences, that's not really
#                               fair, TF-IDF (term frequency _times_ inverse document frequency) also gives
#                               every sample a weight for each present word, but in more sophisticated way

# We are doing (3) classificator

In [462]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
print('Train shape: ', X_train_tfidf.shape)

X_test_tfidf = tfidf_vect.transform(X_test)
print('Test  shape: ', X_test_tfidf.shape)

Train shape:  (17210, 21830)
Test  shape:  (7377, 21830)


In [463]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [464]:
np.mean(clf.predict(X_test_tfidf) == y_test)

0.5652704351362342