In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Support Vector Classification

In this notebook, we explore using support vector classifiers to distinguish retracted papers from non-retracted papers.

In [21]:
# Load the training split of the OpenAlex PLOS ONE (2000-2010) data set.
# NOTE: unpickling can execute arbitrary code -- only read pickle files that
# come from a trusted source.
data = pd.read_pickle('./Data/OpenAlex/openalex-data-plosone-2000-2010-train-04.pkl')

# The original cell contained a bare `data.dropna`, which only references the
# method and removes nothing. Call it and keep the result so rows with missing
# values are actually dropped before modelling.
# NOTE(review): this drops a row if ANY column is missing; pass `subset=[...]`
# if only the model features should be considered -- confirm the intent.
data = data.dropna()

# NOTE(review): a commented-out `data.drop('https://openalex.org/W2031611770')`
# used to live here. `DataFrame.drop` matches index labels, and the preview
# shows an integer index, so that call would presumably not have matched the
# OpenAlex id anyway -- filter on the `id` column if that paper must go.
data.head()

Unnamed: 0,id,title,publication_year,countries_distinct_count,institutions_distinct_count,referenced_works_count,cited_by_count,authors_distinct_count,any_author_has_retraction,min_retracted_author_rank,...,has_10pct_retracted_author,top_percentile_retracted_author,frac_author_repeat_offenders,any_institution_has_retraction,min_retracted_institution_rank,has_1pct_retracted_institution,has_5pct_retracted_institution,has_10pct_retracted_institution,top_percentile_retracted_institution,is_retracted
0,https://openalex.org/W2031611770,FastTree 2 – Approximately Maximum-Likelihood ...,2010,1,2,30,10411,3,False,-inf,...,False,0.0,0.0,True,284.0,False,True,True,95.046385,False
1,https://openalex.org/W2041257508,progressiveMauve: Multiple Genome Alignment wi...,2010,1,2,62,3254,3,False,-inf,...,False,0.0,0.0,True,257.0,False,True,True,95.518992,False
2,https://openalex.org/W2169773990,Rapid SNP Discovery and Genetic Mapping Using ...,2008,1,1,18,2920,9,False,-inf,...,False,0.0,0.0,True,843.0,False,False,False,85.261684,False
3,https://openalex.org/W2071754162,Source Partitioning Using Stable Isotopes: Cop...,2010,2,3,24,2321,4,False,-inf,...,False,0.0,0.0,True,414.5,False,False,True,92.762121,False
4,https://openalex.org/W2135989088,Gut Microbiota in Human Adults with Type 2 Dia...,2010,1,2,36,2293,10,True,4773.5,...,True,88.490293,0.1,True,233.5,False,True,True,95.930334,False


In [30]:
# Remove the identifier/title columns and the two raw "min rank" columns,
# keeping everything else (including the `is_retracted` label).
reduced_data = data.drop(
    columns=['id', 'title', 'min_retracted_author_rank', 'min_retracted_institution_rank']
)

# Work out the target dtype of every column that needs converting:
#   * boolean flags            -> int (0/1)
#   * year / count-like fields -> float
# A boolean column is only ever turned into int, even if it is also listed
# among the count-like fields; all other columns are left untouched.
flag_columns = [name for name in reduced_data.columns if reduced_data[name].dtype == bool]
count_like = ['publication_year', 'countries_distinct_count', 'institutions_distinct_count', 'referenced_works_count', 'cited_by_count', 'authors_distinct_count']
float_columns = [
    name for name in reduced_data.columns
    if name in count_like and name not in flag_columns
]

dtype_plan = {name: int for name in flag_columns}
dtype_plan.update({name: float for name in float_columns})
reduced_data = reduced_data.astype(dtype_plan)

In [31]:
# Hold out 20% of the rows as a test set, stratified on the label so both
# splits keep the same retracted / non-retracted ratio. The label column is
# still part of X_train / X_test at this point.
X_train, X_test = train_test_split(
    reduced_data.copy(),
    test_size=0.2,
    random_state=123,
    stratify=reduced_data['is_retracted'],
)

# Label vectors as plain NumPy arrays.
y_train = X_train['is_retracted'].to_numpy()
y_test = X_test['is_retracted'].to_numpy()

In [32]:
# Standardise every float column of the training split to zero mean and unit
# (sample) standard deviation. The statistics come from the training split
# only and are then re-used for the test split.
float_feature_names = [name for name in X_train.columns if X_train[name].dtype == float]

for name in float_feature_names:
    column_mean, column_std = X_train[name].mean(), X_train[name].std()
    X_train[name] = (X_train[name] - column_mean) / column_std
    X_test[name] = (X_test[name] - column_mean) / column_std

## Model Selection

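We use stratified 10-fold cross-validation on the training set to determine the best kernel for our support vector machine, comparing models by their F1 score. We test linear, polynomial, RBF, and sigmoid kernels; for polynomial kernels, we test degrees 2, 3, 5, and 10. Each kernel is tried twice: once with `gamma='auto'` and once with scikit-learn's default `gamma`.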

In [33]:
# Quick look at the label column carried inside the training frame.
X_train.loc[:, 'is_retracted']

2856     0
5098     0
14499    0
6475     0
11873    0
        ..
12410    0
11151    0
1948     0
2110     0
15368    0
Name: is_retracted, Length: 12647, dtype: int64

In [43]:
# Stratified 10-fold cross-validation over the training split; the fixed seed
# keeps the fold assignment reproducible.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

# Every column except the label is used as a model input.
features = list(X_train.columns)
features.remove('is_retracted')

# Kernel settings to compare. Each one is evaluated with gamma='auto' first
# (rows 0-6 of `f1_scores`) and then with gamma='scale' (rows 7-13), i.e. the
# same row order as the original hand-written sequence of models.
# gamma='scale' is spelled out instead of relying on the library default (the
# default since scikit-learn 0.22), so results do not depend on the installed
# version's default.
# NOTE: the linear kernel ignores gamma, so rows 0 and 7 are expected to match.
kernel_settings = [
    {'kernel': 'linear'},
    {'kernel': 'poly', 'degree': 2},
    {'kernel': 'poly', 'degree': 3},
    {'kernel': 'poly', 'degree': 5},
    {'kernel': 'poly', 'degree': 10},
    {'kernel': 'rbf'},
    {'kernel': 'sigmoid'},
]
svc_configs = [
    dict(settings, gamma=gamma)
    for gamma in ('auto', 'scale')
    for settings in kernel_settings
]

# f1_scores[row, fold] = F1 of configuration `row` on validation fold `fold`.
f1_scores = np.zeros((len(svc_configs), kfold.get_n_splits()))

# Slice the frame once instead of on every fold.
X_features = X_train[features]
y_target = X_train['is_retracted']

for fold, (train_index, test_index) in enumerate(kfold.split(X=X_features, y=y_target)):
    X_tt = X_features.iloc[train_index]
    y_tt = y_target.iloc[train_index].values
    X_val = X_features.iloc[test_index]
    y_val = y_target.iloc[test_index].values

    for row, svc_params in enumerate(svc_configs):
        model = SVC(**svc_params)
        model.fit(X_tt, y_tt)
        f1_scores[row, fold] = f1_score(y_val, model.predict(X_val))

In [44]:
# Report the mean F1 score across folds for every kernel configuration (one
# row of `f1_scores` per configuration). The values are F1 scores, not
# accuracies, so they are labelled as such.
for i, fold_scores in enumerate(f1_scores):
    print('Kernel', i, 'Mean F1:', np.mean(fold_scores))

Kernel 0 Accuracy: 0.40952380952380957
Kernel 1 Accuracy: 0.45357142857142857
Kernel 2 Accuracy: 0.5754761904761905
Kernel 3 Accuracy: 0.505
Kernel 4 Accuracy: 0.43753968253968256
Kernel 5 Accuracy: 0.43380952380952376
Kernel 6 Accuracy: 0.0
Kernel 7 Accuracy: 0.40952380952380957
Kernel 8 Accuracy: 0.5083333333333333
Kernel 9 Accuracy: 0.5254761904761905
Kernel 10 Accuracy: 0.4719913419913421
Kernel 11 Accuracy: 0.46992063492063485
Kernel 12 Accuracy: 0.39380952380952383
Kernel 13 Accuracy: 0.0


Kernel 2 — the polynomial kernel with degree 3 and `gamma='auto'` — gives the best mean F1 score across the 10 folds (about 0.575).