Literature screening model

文献筛选模型

In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import os
from datetime import datetime
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale
from gensim.models import Word2Vec
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MaxAbsScaler

file_path = r'D:/wyy/data record/paper search/train_dataset.csv'
df = pd.read_csv(file_path,encoding="unicode_escape")
df["txt"] = df["title"] + df["abstract"]
df["txt"] = df["txt"].str.lower()
df["txt"] = df["txt"].astype(str)
df["txt"] = df["txt"].apply(lambda x: re.sub(r'\d+|[^\w\s]', '', x))
stop_words = set(stopwords.words('english'))

df["txt"] = df["txt"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

def feature_extraction(txt, method="TF-IDF"):
    if method == "TF-IDF":
        vectorizer = TfidfVectorizer()
    elif method == "N-gram":
        vectorizer = CountVectorizer(ngram_range=(1, 2))
    elif method == "Word2Vec":
        model = Word2Vec(txt, window=5, min_count=1, workers=4)
        vectors = np.array([np.mean([model.wv[word] for word in sentence.split() if word in model.wv], axis=0) for sentence in txt])
        return vectors

    X = vectorizer.fit_transform(txt)
    print(X.shape)
    return X.toarray()

def dimension_reduction(X, method="PCA"):
    n_components = 1000
    if method == "PCA":
        dr = PCA(n_components=n_components)
    elif method == "LDA":
        dr = LatentDirichletAllocation(n_components=n_components)
    elif method == "t-SNE":
        dr = TSNE(n_components=3)
    
    X_reduced = dr.fit_transform(X)
    return X_reduced

def model_fit(X, y, model_name="SVM"):
    print(X.shape)
    if model_name == "SVM":
        clf = SVC()
    elif model_name == "RF":
        clf = RandomForestClassifier(n_estimators=100, random_state=112341)
    elif model_name == "NN":
        clf = MLPClassifier(hidden_layer_sizes=(100, ), max_iter=500, random_state=42)
    cv_scores = cross_val_score(clf, X, y, cv=10)
    print(f"{model_name} CV scores:", cv_scores)
    clf.fit(X, y)
    return clf

def save_results(clf, X, model_name, output_dir):
    preds = clf.predict(X)
    count_ones = np.sum(preds)
    print(f"{model_name} predicted 1 count:", count_ones)

    timestamp = datetime.now().strftime("%Y%m%d-%H%M")
    file_name = f"{model_name}-{timestamp}.csv"
    output_path = os.path.join(output_dir, file_name)
    pd.DataFrame({'mark': df.mark, 'title': df.title, 'Chinese': df.Chinese, 'outcome': preds}).to_csv(output_path, index=False,encoding="utf-8-sig")

if __name__ == "__main__":
    fem_name = "TF-IDF"
    X = feature_extraction(df["txt"], method=fem_name)
    drm_name = "PCA"
    X_reduced = dimension_reduction(X, method=drm_name)
    y = df["mark"]

    model_names = ["SVM", "RF", "NN"]
    for model_name in model_names:
        clf = model_fit(X_reduced, y, model_name=model_name)
        full_name = fem_name + "_" + drm_name + "_" + model_name
        save_results(clf, X_reduced, full_name, output_dir="D:\\wyy\\data record\\paper search\\output")

(10375, 67370)
(10375, 1000)
SVM CV scores: [0.97495183 0.96917148 0.97302505 0.96435453 0.97591522 0.97492768
 0.97203472 0.96817743 0.975892   0.97299904]
TF-IDF_PCA_SVM predicted 1 count: 437
(10375, 1000)
RF CV scores: [0.95375723 0.95375723 0.9566474  0.95472062 0.95375723 0.95660559
 0.95564127 0.95371263 0.95564127 0.95371263]
TF-IDF_PCA_RF predicted 1 count: 495
(10375, 1000)
NN CV scores: [0.96242775 0.96050096 0.95279383 0.96628131 0.96146435 0.97492768
 0.97878496 0.96721311 0.9710704  0.96624879]
TF-IDF_PCA_NN predicted 1 count: 496


In [None]:
df_t = pd.read_csv("D:/wyy/data record/paper search/train_dataset.csv",encoding="unicode_escape")
df_t = df_t[(df_t["title"].notna())&(df_t["abstract"].notna())]
df_f = pd.read_csv("D:/wyy/data record/paper search/forecast_dataset.csv",encoding="unicode_escape")
df_f = df_f[(df_f["title"].notna())&(df_f["abstract"].notna())]

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import os
from datetime import datetime
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale
from gensim.models import Word2Vec
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MaxAbsScaler

df_t = pd.read_csv("D:/wyy/data record/paper search/train_dataset.csv",encoding="unicode_escape")
df_t = df_t[(df_t["title"].notna())&(df_t["abstract"].notna())]

df_f = pd.read_csv("D:/wyy/data record/paper search/forecast_dataset.csv",encoding="unicode_escape")
df_f = df_f[(df_f["title"].notna())&(df_f["abstract"].notna())]


def df_merge(df):
    df["txt"] = df["title"] + df["abstract"]
    df["txt"] = df["txt"].str.lower()
    df["txt"] = df["txt"].astype(str)
    df["txt"] = df["txt"].apply(lambda x: re.sub(r'\d+|[^\w\s]', '', x))
    stop_words = set(stopwords.words('english'))
    df["txt"] = df["txt"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    return df

df_f_f = df_f[~df_f['title'].isin(df_t['title'])]
df_concat = pd.concat([df_t, df_f_f])
print(df_concat.shape)
df_concat = df_merge(df_concat)


def save_results(clf, X, model_name, output_dir):
    preds = clf.predict(X)
    count_ones = np.sum(preds)
    print(f"{model_name} predicted 1 count:", count_ones)
    timestamp = datetime.now().strftime("%Y%m%d-%H%M")
    file_name = f"{model_name}-{timestamp}.csv"
    output_path = os.path.join(output_dir, file_name)
    pd.DataFrame({'Mark': df.Mark, 'Article Title': df.title, 'outcome': preds}).to_csv(output_path, index=False,encoding="gbk")
if __name__ == "__main__":
    fem_name = "TF-IDF"
    X = feature_extraction(df_concat["txt"], method=fem_name)
    drm_name = "PCA"
    X_reduced = dimension_reduction(X, method=drm_name)

    X_reduced_test = X_reduced[:10403,:]
    X_reduced_train = X_reduced[10403:,:]

    df_concat_loc = df_concat.iloc[:10403,:]
    y = df_concat_loc["mark"]

In [14]:
model_name = "RF"
clf = model_fit(X_reduced_test, y, model_name=model_name)
preds = clf.predict(X_reduced_train)

count_ones = np.sum(preds)
print(f"{model_name} predicted need paper count:", count_ones)

timestamp = datetime.now().strftime("%Y%m%d-%H%M")
file_name = f"{model_name}-{timestamp}.csv"
output_dir="D:\\wyy\\data record\\paper search\\output"
output_path = os.path.join(output_dir, file_name)

print(df_f.columns)
df_concat = df_concat.rename(columns={'ï»¿authors':'authors'})
df_n_train = df_concat.iloc[10403:,:]
pd.DataFrame({'authors': df_n_train.authors, 'title': df_n_train.title, 
                'source': df_n_train.source,   'DOI': df_n_train.DOI, 
                'year': df_n_train.year,       'abstract': df_n_train.abstract,
                'outcome': preds,              'Chinese': df_n_train.Chinese,
                }).to_csv(output_path, index=False,encoding="utf-8-sig")

(10403, 3)
RF CV scores: [0.95869356 0.95869356 0.94908742 0.95961538 0.96538462 0.97211538
 0.96538462 0.96634615 0.96634615 0.95576923]
RF predicted need paper count: 118.0
Index(['ï»¿authors', 'title', 'Chinese', 'source', 'DOI', 'abstract', 'year',
       'mark'],
      dtype='object')
