In [1]:
import pandas as pd
import numpy as np

In [2]:
data  = pd.read_pickle('./Data/OpenAlex/openalex-data-plosone-2000-2010-train-04.pkl')

In [3]:
data.dropna
data.head()

Unnamed: 0,id,title,publication_year,countries_distinct_count,institutions_distinct_count,referenced_works_count,cited_by_count,authors_distinct_count,any_author_has_retraction,min_retracted_author_rank,...,has_10pct_retracted_author,top_percentile_retracted_author,frac_author_repeat_offenders,any_institution_has_retraction,min_retracted_institution_rank,has_1pct_retracted_institution,has_5pct_retracted_institution,has_10pct_retracted_institution,top_percentile_retracted_institution,is_retracted
0,https://openalex.org/W2031611770,FastTree 2 – Approximately Maximum-Likelihood ...,2010,1,2,30,10411,3,False,-inf,...,False,0.0,0.0,True,284.0,False,True,True,95.046385,False
1,https://openalex.org/W2041257508,progressiveMauve: Multiple Genome Alignment wi...,2010,1,2,62,3254,3,False,-inf,...,False,0.0,0.0,True,257.0,False,True,True,95.518992,False
2,https://openalex.org/W2169773990,Rapid SNP Discovery and Genetic Mapping Using ...,2008,1,1,18,2920,9,False,-inf,...,False,0.0,0.0,True,843.0,False,False,False,85.261684,False
3,https://openalex.org/W2071754162,Source Partitioning Using Stable Isotopes: Cop...,2010,2,3,24,2321,4,False,-inf,...,False,0.0,0.0,True,414.5,False,False,True,92.762121,False
4,https://openalex.org/W2135989088,Gut Microbiota in Human Adults with Type 2 Dia...,2010,1,2,36,2293,10,True,4773.5,...,True,88.490293,0.1,True,233.5,False,True,True,95.930334,False


In [4]:
retracted = data[data['is_retracted']==True]
retracted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, 1289 to 15302
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id                                    45 non-null     object 
 1   title                                 45 non-null     object 
 2   publication_year                      45 non-null     object 
 3   countries_distinct_count              45 non-null     object 
 4   institutions_distinct_count           45 non-null     object 
 5   referenced_works_count                45 non-null     object 
 6   cited_by_count                        45 non-null     object 
 7   authors_distinct_count                45 non-null     object 
 8   any_author_has_retraction             45 non-null     bool   
 9   min_retracted_author_rank             45 non-null     float64
 10  has_1pct_retracted_author             45 non-null     bool   
 11  has_5pct_retracted_a

In [5]:
for col in data.columns:
    if data[col].dtype == bool:
        data[col] = data[col].astype(int)
    elif col in ['countries_distinct_count', 'institutions_distinct_count', 'referenced_works_count', 'cited_by_count', 'authors_distinct_count']:
        data[col] = data[col].astype(float)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15809 entries, 0 to 15808
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id                                    15809 non-null  object 
 1   title                                 15809 non-null  object 
 2   publication_year                      15809 non-null  object 
 3   countries_distinct_count              15809 non-null  float64
 4   institutions_distinct_count           15809 non-null  float64
 5   referenced_works_count                15809 non-null  float64
 6   cited_by_count                        15809 non-null  float64
 7   authors_distinct_count                15809 non-null  float64
 8   any_author_has_retraction             15809 non-null  int64  
 9   min_retracted_author_rank             15809 non-null  float64
 10  has_1pct_retracted_author             15809 non-null  int64  
 11  has_5pct_retrac

In [6]:
data.min(axis=0)

id                                                         https://openalex.org/W11386834
title                                   1, 9-Pyrazoloanthrones Downregulate HIF-1α and...
publication_year                                                                     2002
countries_distinct_count                                                              0.0
institutions_distinct_count                                                           0.0
referenced_works_count                                                                0.0
cited_by_count                                                                        0.0
authors_distinct_count                                                                1.0
any_author_has_retraction                                                               0
min_retracted_author_rank                                                            -inf
has_1pct_retracted_author                                                               0
has_5pct_r

### Preparing Data

Splitting data into train and test set, then normalizing float valued features to have mean 0 and standard deviation 1.

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
features = ['countries_distinct_count', 'institutions_distinct_count', 
            'referenced_works_count', 'cited_by_count', 'authors_distinct_count', 
            'any_author_has_retraction',
            'has_5pct_retracted_author',
            'top_percentile_retracted_author',
            'frac_author_repeat_offenders', 'any_institution_has_retraction',
            'has_1pct_retracted_institution',
            'top_percentile_retracted_institution']
labels = ['is_retracted']

In [9]:
reduced_data = data[features]

In [10]:
train, test = train_test_split(data.copy(), test_size = 0.2, random_state=123)

In [11]:
train.columns

Index(['id', 'title', 'publication_year', 'countries_distinct_count',
       'institutions_distinct_count', 'referenced_works_count',
       'cited_by_count', 'authors_distinct_count', 'any_author_has_retraction',
       'min_retracted_author_rank', 'has_1pct_retracted_author',
       'has_5pct_retracted_author', 'has_10pct_retracted_author',
       'top_percentile_retracted_author', 'frac_author_repeat_offenders',
       'any_institution_has_retraction', 'min_retracted_institution_rank',
       'has_1pct_retracted_institution', 'has_5pct_retracted_institution',
       'has_10pct_retracted_institution',
       'top_percentile_retracted_institution', 'is_retracted'],
      dtype='object')

In [12]:
X_train = train[features]
y_train = train[labels]
X_test = test[features]
y_test = test[labels]

X_train.dtypes
X_test.head()

Unnamed: 0,countries_distinct_count,institutions_distinct_count,referenced_works_count,cited_by_count,authors_distinct_count,any_author_has_retraction,has_5pct_retracted_author,top_percentile_retracted_author,frac_author_repeat_offenders,any_institution_has_retraction,has_1pct_retracted_institution,top_percentile_retracted_institution
457,2.0,5.0,55.0,292.0,16.0,0,0,0.0,0.0,1,0,97.566953
15248,3.0,3.0,38.0,4.0,14.0,0,0,0.0,0.0,1,0,90.871696
2895,1.0,1.0,56.0,106.0,8.0,0,0,0.0,0.0,1,0,91.921932
7504,1.0,3.0,13.0,48.0,6.0,0,0,0.0,0.0,1,0,97.566953
5203,1.0,1.0,14.0,68.0,3.0,0,0,0.0,0.0,1,0,71.416069


In [13]:
for col in X_train.columns:
    if X_train[col].dtype == float:
        mean = X_train[col].mean()
        std = X_train[col].std()
        X_train[col] = (X_train[col] - mean)/std
        X_test[col] = (X_test[col] - mean)/std  

X_train.head() 
X_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = (X_train[col] - mean)/std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = (X_test[col] - mean)/std


Unnamed: 0,countries_distinct_count,institutions_distinct_count,referenced_works_count,cited_by_count,authors_distinct_count,any_author_has_retraction,has_5pct_retracted_author,top_percentile_retracted_author,frac_author_repeat_offenders,any_institution_has_retraction,has_1pct_retracted_institution,top_percentile_retracted_institution
457,0.516144,0.779455,0.647844,1.516379,2.08632,0,0,-0.427455,-0.21894,1,0,0.459733
15248,1.562916,-0.0435,-0.129664,-0.48673,1.63311,0,0,-0.427455,-0.21894,1,0,0.207676
2895,-0.530628,-0.866456,0.69358,0.222705,0.273478,0,0,-0.427455,-0.21894,1,0,0.247215
7504,-0.530628,-0.0435,-1.273058,-0.180699,-0.179733,0,0,-0.427455,-0.21894,1,0,0.459733
5203,-0.530628,-0.866456,-1.227322,-0.041595,-0.859548,0,0,-0.427455,-0.21894,1,0,-0.524771


### Model

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
model  = LogisticRegression(penalty='l2', solver='newton-cg')

In [16]:
np.ravel(y_train).shape

(12647,)

In [17]:
model.fit(X_train, np.ravel(y_train))

In [28]:
(np.sum(np.abs(np.ravel(y_test) - model.predict(X_test)))/y_test.shape[0])

0.002213788741302973

In [31]:
np.sum(model.predict(X_test)) , np.sum(np.ravel(y_test))

(4, 9)

In [26]:
(np.sum(np.abs(np.ravel(y_train) - model.predict(X_train)))/y_train.shape[0])

0.0025302443267177987

In [30]:
np.sum(model.predict(X_train)) , np.sum(np.ravel(y_train))

(14, 36)

In [34]:
model.coef_

array([[ 0.05757428,  0.15668553,  0.1500403 , -1.01602394,  0.20565687,
         1.56176742, -0.17667737,  0.07628257,  0.5619897 ,  0.48888741,
         0.14465097, -0.03962105]])

In [35]:
model.intercept_

array([-8.48898034])

Dominant Features

In [37]:
train.columns[3], train.columns[5]

('countries_distinct_count', 'referenced_works_count')

### Lasso Regularization

In [40]:
model = LogisticRegression(penalty='l1', solver='liblinear')

In [41]:
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [42]:
(np.sum(np.abs(np.ravel(y_test) - model.predict(X_test)))/y_test.shape[0])

0.002213788741302973

In [43]:
model.coef_

array([[ 0.04550681,  0.18329536,  0.12999863, -0.91758169,  0.1597105 ,
         1.71270137,  0.        ,  0.        ,  0.5450523 ,  0.        ,
         0.        ,  0.        ]])