Dataset (Kaggle, arXiv papers): https://www.kaggle.com/datasets/neelshah18/arxivdataset/data

Related repository: https://github.com/NikitaTrTr/Fake-news-detector/tree/master

In [88]:
import json
import ast
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
import swifter

In [5]:
# Load the raw arXiv records. The encoding is pinned explicitly: without it
# open() falls back to the platform's locale encoding, which is not
# guaranteed to be UTF-8 (e.g. on Windows) and can corrupt or reject
# non-ASCII characters in the file.
with open('arxivData.json', encoding='utf-8') as f:
    data = json.load(f)

In [40]:
# Number of top-level entries loaded from arxivData.json.
len(data)

41000

In [7]:
# Field names available on a record (inspected on the first one).
data[0].keys()

dict_keys(['author', 'day', 'id', 'link', 'month', 'summary', 'tag', 'title', 'year'])

In [37]:
# Spot-check a single record (index 110): its title.
data[110]['title']

'Character-based Neural Machine Translation'

In [38]:
# Same record: its abstract, stored under the 'summary' key.
data[110]['summary']

'Neural Machine Translation (MT) has reached state-of-the-art results.\nHowever, one of the main challenges that neural MT still faces is dealing with\nvery large vocabularies and morphologically rich languages. In this paper, we\npropose a neural MT system using character-based embeddings in combination with\nconvolutional and highway layers to replace the standard lookup-based word\nrepresentations. The resulting unlimited-vocabulary and affix-aware source word\nembeddings are tested in a state-of-the-art neural MT based on an\nattention-based bidirectional recurrent neural network. The proposed MT scheme\nprovides improved results even when the source language is not morphologically\nrich. Improvements up to 3 BLEU points are obtained in the German-English WMT\ntask.'

In [39]:
# Same record: its tags. Note the value is a *string* holding a Python-literal
# list of dicts, so it has to be parsed before the 'term' entries can be read.
data[110]['tag']

"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org/schemas/atom', 'label': None}, {'term': 'cs.LG', 'scheme': 'http://arxiv.org/schemas/atom', 'label': None}, {'term': 'cs.NE', 'scheme': 'http://arxiv.org/schemas/atom', 'label': None}, {'term': 'stat.ML', 'scheme': 'http://arxiv.org/schemas/atom', 'label': None}]"

In [92]:
# Fetch the NLTK resources used for text cleaning (nltk itself is imported in
# the imports cell at the top of the notebook, so it is not re-imported here).
nltk.download('wordnet')    # data for WordNetLemmatizer
# NOTE(review): 'punkt' does not appear to be used by the code visible in this
# notebook (tokenisation is a plain str.split) - confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')  # English stop-word list

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kirki\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kirki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kirki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [93]:
class Preprocessor:
    """Clean paper titles/abstracts and encode their class labels.

    Attributes:
        stop_words: set of English stop words removed from every text.
        lemmatizer: WordNet lemmatizer applied to each remaining word.
        label_encoder: LabelEncoder fitted on the 'class' column by
            ``preprocess``; its ``classes_`` give the code -> label mapping.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        # self.stop_words.update(['unused word1', 'unused word2'])  # extra stop words to add based on EDA findings
        self.lemmatizer = WordNetLemmatizer()
        self.label_encoder = LabelEncoder()

    def preprocess(self, data, random_state=None):
        """Return a cleaned, label-encoded and shuffled copy of ``data``.

        Rows with a missing or empty 'text', 'title' or 'class' are dropped,
        'text' and 'title' are normalised with ``clean_text``, a 'class_code'
        column with integer labels is added, and the rows are shuffled.
        The input frame is not modified.

        Note that the emptiness check runs *before* cleaning, so a text made
        up only of stop words / non-letters ends up as an empty string in the
        result.

        Args:
            data: DataFrame with at least 'text', 'title' and 'class' columns.
            random_state: optional seed forwarded to ``DataFrame.sample`` so
                the shuffle can be reproduced. The default (None) keeps the
                previous non-deterministic behaviour.

        Returns:
            A new DataFrame with a fresh 0..n-1 index.
        """
        required = ['text', 'title', 'class']
        result = data.dropna(subset=required)
        non_empty = (result['text'] != '') & (result['title'] != '') & (result['class'] != '')
        # Copy *after* filtering: the columns assigned below then belong to an
        # independent frame, which avoids pandas' SettingWithCopyWarning and
        # guarantees the caller's frame is left untouched.
        result = result[non_empty].copy()

        result['text'] = result['text'].swifter.apply(self.clean_text)
        result['title'] = result['title'].swifter.apply(self.clean_text)

        # Encode the class labels as integers
        result['class_code'] = self.label_encoder.fit_transform(result['class'])

        # Shuffle the rows
        result = result.sample(frac=1, random_state=random_state).reset_index(drop=True)
        return result

    def clean_text(self, phrase):
        """Normalise one text: letters only, lower case, no stop words, lemmatized.

        Args:
            phrase: the raw string to clean.

        Returns:
            The remaining lemmatized words joined by single spaces (possibly
            an empty string).
        """
        # Replace everything that is not an ASCII letter or whitespace with a space
        cleared_text = re.sub(r'[^a-zA-Z\s]', ' ', phrase)
        # Lower-case and tokenise on whitespace
        words = cleared_text.lower().split()
        # Drop stop words, then lemmatize what is left
        lemmatized_words = [
            self.lemmatizer.lemmatize(word)
            for word in words
            if word not in self.stop_words
        ]
        return ' '.join(lemmatized_words).strip()

In [94]:
# Flatten the raw records into one row per paper. The 'tag' field is a string
# holding a Python-literal list of dicts, so it is parsed with
# ast.literal_eval; the class is the archive prefix of the first tag's term
# (e.g. 'cs.CL' -> 'cs').
records = [
    {
        'title': paper['title'],
        'text': paper['summary'],
        'class': ast.literal_eval(paper['tag'])[0]['term'].split('.')[0],
    }
    for paper in data
]

dataset = pd.DataFrame(records, columns=['title', 'text', 'class'])

In [95]:
# Preview the raw (not yet cleaned) title / text / class rows.
dataset.head()

Unnamed: 0,title,text,class
0,Dual Recurrent Attention Units for Visual Ques...,We propose an architecture for VQA which utili...,cs
1,Sequential Short-Text Classification with Recu...,Recent approaches based on artificial neural n...,cs
2,Multiresolution Recurrent Neural Networks: An ...,We introduce the multiresolution recurrent neu...,cs
3,Learning what to share between loosely related...,Multi-task learning is motivated by the observ...,stat
4,A Deep Reinforcement Learning Chatbot,We present MILABOT: a deep reinforcement learn...,cs


In [96]:
# Builds the stop-word set, lemmatizer and label encoder used below.
preprocessor = Preprocessor()

In [97]:
# Clean title/text, add the integer 'class_code' column and shuffle the rows.
preprocessed_dataset = preprocessor.preprocess(dataset)

Pandas Apply:   0%|          | 0/41000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/41000 [00:00<?, ?it/s]

In [98]:
# Integer code -> original class label, as fitted by the preprocessor's LabelEncoder.
dict(enumerate(preprocessor.label_encoder.classes_))

{0: 'adap-org',
 1: 'astro-ph',
 2: 'cmp-lg',
 3: 'cond-mat',
 4: 'cs',
 5: 'econ',
 6: 'eess',
 7: 'gr-qc',
 8: 'hep-ex',
 9: 'hep-lat',
 10: 'hep-ph',
 11: 'hep-th',
 12: 'math',
 13: 'nlin',
 14: 'nucl-th',
 15: 'physics',
 16: 'q-bio',
 17: 'q-fin',
 18: 'quant-ph',
 19: 'stat'}

In [102]:
df = preprocessed_dataset.copy()

# Archives that are merged into a single 'physics' class. They are listed by
# name rather than by integer code: the codes are assigned by the LabelEncoder
# from whatever classes occur in the data, so hard-coded numbers would silently
# point at the wrong archives as soon as that set changes.
physics_archives = {
    'adap-org', 'astro-ph', 'cmp-lg', 'cond-mat', 'gr-qc', 'hep-ex', 'hep-lat',
    'hep-ph', 'hep-th', 'nlin', 'nucl-th', 'physics', 'quant-ph',
}

encoded_labels = preprocessor.label_encoder.classes_

# Codes of the physics archives under the current encoding.
physics_classes = {
    code for code, label in enumerate(encoded_labels) if label in physics_archives
}

# class_code -> merged class name ('physics' for the grouped archives,
# the original label otherwise).
class_mapping = {
    code: 'physics' if code in physics_classes else label
    for code, label in enumerate(encoded_labels)
}

df['new_class'] = df['class_code'].map(class_mapping)

print(df['new_class'].value_counts())

new_class
cs         34597
stat        4782
math         612
physics      579
q-bio        320
eess          75
q-fin         30
econ           5
Name: count, dtype: int64


In [103]:
# Preview the cleaned rows together with the merged 'new_class' column.
df.head()

Unnamed: 0,title,text,class,class_code,new_class
0,interpreting syntactic social element tweet re...,research social medium analysis experiencing r...,cs,4,cs
1,living together mind machine intelligence,paper consider nature machine intelligence cre...,cs,4,cs
2,stochastic local search pattern set mining,local search method quickly find good quality ...,cs,4,cs
3,sparse inverse covariance matrix estimation us...,l regularized gaussian maximum likelihood esti...,cs,4,cs
4,visual translation embedding network visual re...,visual relation person ride bike bike next car...,cs,4,cs


In [104]:
# Re-encode the merged class names as integer labels with a fresh encoder.
new_label_encoder = LabelEncoder()
df['encoded_class'] = new_label_encoder.fit_transform(df['new_class'])

# Lookup table for the new encoding: integer label -> merged class name.
class_mapping = {
    code: label for code, label in enumerate(new_label_encoder.classes_)
}
print(class_mapping)

{0: 'cs', 1: 'econ', 2: 'eess', 3: 'math', 4: 'physics', 5: 'q-bio', 6: 'q-fin', 7: 'stat'}


In [105]:
# Preview the frame with the new 'encoded_class' column added.
df.head()

Unnamed: 0,title,text,class,class_code,new_class,encoded_class
0,interpreting syntactic social element tweet re...,research social medium analysis experiencing r...,cs,4,cs,0
1,living together mind machine intelligence,paper consider nature machine intelligence cre...,cs,4,cs,0
2,stochastic local search pattern set mining,local search method quickly find good quality ...,cs,4,cs,0
3,sparse inverse covariance matrix estimation us...,l regularized gaussian maximum likelihood esti...,cs,4,cs,0
4,visual translation embedding network visual re...,visual relation person ride bike bike next car...,cs,4,cs,0


In [106]:
# Keep only title, text and the final integer label: the intermediate label
# columns are removed and 'encoded_class' takes over the name 'class'.
df = (
    df
    .drop(columns=['class', 'class_code', 'new_class'])
    .rename(columns={'encoded_class': 'class'})
)

df.head()

Unnamed: 0,title,text,class
0,interpreting syntactic social element tweet re...,research social medium analysis experiencing r...,0
1,living together mind machine intelligence,paper consider nature machine intelligence cre...,0
2,stochastic local search pattern set mining,local search method quickly find good quality ...,0
3,sparse inverse covariance matrix estimation us...,l regularized gaussian maximum likelihood esti...,0
4,visual translation embedding network visual re...,visual relation person ride bike bike next car...,0


In [108]:
# Write the final title / text / class table to disk, without the index column.
df.to_csv("preprocessed_data.csv", index=False)