In [201]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [202]:
df = pd.read_csv('2018_data.csv')
df = df.dropna()
df = df.drop(columns=['Title', 'Publish_date', 'Authors', 'Keywords'], axis=1)
for c in df.columns:
    print(c + ' ' + str(df[c].nunique()))

Publisher 420
Subjects 871


In [203]:
def is_engi(codes):
    code_list = codes.split(', ')
    return any(str(c).startswith('22') for c in code_list)
df['Is_Engi'] = False
df['Is_Engi'] = df['Subjects'].apply(is_engi)
df[['Subjects', 'Is_Engi']]

Unnamed: 0,Subjects,Is_Engi
2,"1600, 1500, 2209",True
3,"1600, 3104, 3100, 3110, 2508",False
4,"1602, 1303, 2304, 1607",False
5,"1403, 1408, 1407",False
6,1311,False
...,...,...
2786,"2723, 2720, 2739, 1308, 3607, 2704, 2726",False
2787,2745,False
2788,"2311, 2211",True
2789,"3202, 2739",False


In [204]:
def to_num(is_engi):
    if is_engi:
        return 1
    return 0
df['Is_Engi'] = df['Is_Engi'].apply(to_num)
df['Is_Engi'].value_counts()

Is_Engi
0    1858
1     369
Name: count, dtype: int64

In [205]:
nominal = ['Publisher', 'Subjects']
dummy = pd.get_dummies(data=df[nominal], drop_first=True)
df = pd.concat([df, dummy], axis=1)
df = df.drop(columns=nominal, axis=1)
df = df.reset_index(drop=True)
df

Unnamed: 0,Is_Engi,Publisher_AIMS PressMin.yu@aimspress.com,Publisher_AME Publishing Companyjtd@thepbpc.org,Publisher_ASEAN Neurological Association,Publisher_ASTES Publishers,Publisher_Academic Press,Publisher_Academic Press Inc.apjcs@harcourt.com,Publisher_Academy and Industry Research Collaboration Center (AIRCC),Publisher_Academy of Taiwan Information Systems Researchwangson@mail.ntpu.edu.tw,Publisher_African Field Epidemiology Network,...,Subjects_3506,Subjects_3602,"Subjects_3611, 3003","Subjects_3611, 3005, 3004, 3003, 2736",Subjects_3612,"Subjects_3612, 2742","Subjects_3612, 2742, 2909, 2717","Subjects_3614, 2502, 2204, 2741","Subjects_3614, 2741","Subjects_3614, 3500, 2741"
0,1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2222,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2223,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2224,1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2225,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [206]:
x = df.drop(columns=['Is_Engi'], axis=1)
y = df['Is_Engi']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=1000)


log_model = LogisticRegression(
    max_iter=5000,
    random_state=1000
)

log_model.fit(x_train, y_train)

predict = log_model.predict(x_test)

print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict, digits=2))


[[551   7]
 [ 54  57]]
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       558
           1       0.89      0.51      0.65       111

    accuracy                           0.91       669
   macro avg       0.90      0.75      0.80       669
weighted avg       0.91      0.91      0.90       669

