# Setup

In [78]:
import ast
from collections import Counter
from os import listdir
from os.path import isfile, join

import pandas as pd

In [54]:
# Paths (relative to the notebook's working directory)
# DATA_PATH: folder that is listed for input files, read from, and written back to below.
DATA_PATH = '../data/'
# OUTPUT_PATH / MODEL_PATH: not referenced by the cells visible in this notebook.
OUTPUT_PATH = '../output_data/'
MODEL_PATH = '../data/models/'

In [14]:
#pd.set_option('display.max_rows', 100)
#pd.options.display.max_columns = 100

In [6]:
# Collect the names of regular files (sub-directories are skipped) directly under DATA_PATH
datafiles = [name for name in listdir(DATA_PATH) if isfile(join(DATA_PATH, name))]

# Show each file name next to its position so one can be picked by index
print('Index, Filename')
print(list(enumerate(datafiles)))

['arxiv_disinformation.csv', 'arxiv_deepfake.csv']


In [50]:
# Pick the file to load by its position in datafiles
load_file = datafiles[1]
load_file

'arxiv_deepfake.csv'

In [62]:
# Load dataframe
# The columns below appear to hold Python literal reprs (lists / dicts) in the CSV.
# They are parsed with ast.literal_eval, which accepts only literals, instead of
# eval, which would execute any expression that happens to be in the data file.
CONVERTERS = {
      'tokens': ast.literal_eval,
      'published_parsed': ast.literal_eval,
      'tags': ast.literal_eval,
      'arxiv_primary_category': ast.literal_eval,
}

df = pd.read_csv(DATA_PATH + load_file, converters=CONVERTERS)

# Examination

In [64]:
df.head(2)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,summary_detail,authors,author_detail,author,arxiv_comment,links,arxiv_primary_category,tags,arxiv_affiliation,arxiv_journal_ref,arxiv_doi,cleaning,tokens,year,month_year
0,http://arxiv.org/abs/2203.14315v1,True,http://arxiv.org/abs/2203.14315v1,2022-03-27T14:25:52Z,"[2022, 3, 27, 14, 25, 52, 6, 86, 0]",2022-03-27T14:25:52Z,"[2022, 3, 27, 14, 25, 52, 6, 86, 0]",Adaptive Frequency Learning in Two-branch Face...,"{'type': 'text/plain', 'language': None, 'base...",Face forgery has attracted increasing attentio...,"{'type': 'text/plain', 'language': None, 'base...","[{'name': 'Neng Wang'}, {'name': 'Yang Bai'}, ...",{'name': 'Yan Wang'},Yan Wang,Deepfake Detection,"[{'href': 'http://arxiv.org/abs/2203.14315v1',...","{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,face forgery has attracted increasing attentio...,"[face, forgery, attract, increase, attention, ...",2022,"[2022, 3]"
1,http://arxiv.org/abs/2203.13964v1,True,http://arxiv.org/abs/2203.13964v1,2022-03-26T01:55:37Z,"[2022, 3, 26, 1, 55, 37, 5, 85, 0]",2022-03-26T01:55:37Z,"[2022, 3, 26, 1, 55, 37, 5, 85, 0]",Fusing Global and Local Features for Generaliz...,"{'type': 'text/plain', 'language': None, 'base...",With the development of the Generative Adversa...,"{'type': 'text/plain', 'language': None, 'base...","[{'name': 'Yan Ju'}, {'name': 'Shan Jia'}, {'n...",{'name': 'Siwei Lyu'},Siwei Lyu,"6 pages, 3 figures, 2 tables","[{'href': 'http://arxiv.org/abs/2203.13964v1',...","{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,with the development of the generative adversa...,"[development, generative, adversarial, network...",2022,"[2022, 3]"


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438 entries, 0 to 437
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      438 non-null    object
 1   guidislink              438 non-null    bool  
 2   link                    438 non-null    object
 3   updated                 438 non-null    object
 4   updated_parsed          438 non-null    object
 5   published               438 non-null    object
 6   published_parsed        438 non-null    object
 7   title                   438 non-null    object
 8   title_detail            438 non-null    object
 9   summary                 438 non-null    object
 10  summary_detail          438 non-null    object
 11  authors                 438 non-null    object
 12  author_detail           438 non-null    object
 13  author                  438 non-null    object
 14  arxiv_comment           240 non-null    object
 15  links 

In [59]:
df.describe()

Unnamed: 0,year
count,438.0
mean,2020.616438
std,0.907081
min,2018.0
25%,2020.0
50%,2021.0
75%,2021.0
max,2022.0


In [32]:
print(df.published_parsed.max())
print(df.published_parsed.min())

[2022, 3, 27, 14, 25, 52, 6, 86, 0]
[2018, 6, 7, 19, 36, 9, 3, 158, 0]


In [74]:
df.arxiv_primary_category[10]

"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/schemas/atom'}"

In [63]:
df.tags[1]

[{'term': 'cs.CV', 'scheme': 'http://arxiv.org/schemas/atom', 'label': None}]

In [79]:
# Gather the 'term' of the first tag of every row
tag_list = [row_tags[0]['term'] for row_tags in df['tags']]

In [80]:
Counter(tag_list)

Counter({'cs.CV': 328,
         'eess.AS': 24,
         'cs.HC': 6,
         'cs.CR': 22,
         'cs.SD': 10,
         'cs.LG': 22,
         'cs.CY': 10,
         'cond-mat.mtrl-sci': 2,
         'cs.MM': 2,
         'cs.CL': 8,
         'cs.SI': 2,
         'cs.NI': 2})

In [81]:
def get_tag(x):
      """Return the 'term' of the first tag in ``x``.

      Parameters
      ----------
      x : sequence of dict
            Tag entries, each carrying a 'term' key
            (e.g. ``[{'term': 'cs.CV', ...}]``).

      Returns
      -------
      The 'term' value of the first entry, or ``None`` when ``x`` is empty.
      """
      # An empty tag list has no first entry; report "no category" instead of
      # raising IndexError and aborting the whole column computation.
      if len(x) == 0:
            return None
      tag = x[0]['term']
      # TODO - scrape https://arxiv.org/category_taxonomy to translate codes to plain english
      return tag

In [82]:
# Derive a category per row from its tags; rows with missing tags are skipped and end up as NaN
df['category'] = df['tags'].dropna().apply(get_tag)


In [84]:
df['category'].value_counts()

cs.CV                328
eess.AS               24
cs.CR                 22
cs.LG                 22
cs.SD                 10
cs.CY                 10
cs.CL                  8
cs.HC                  6
cond-mat.mtrl-sci      2
cs.MM                  2
cs.SI                  2
cs.NI                  2
Name: category, dtype: int64

# Adding some data
This only needs to be done once per dataset, but the code is retained so it can be re-run on new collections.

TODO: consider moving this step into preprocessing.

In [45]:
def get_period(x, period):
      """Return the leading ``period`` items of ``x``.

      Parameters
      ----------
      x : sliceable sequence
            For example a parsed timestamp such as ``[2022, 3, 27, ...]``.
      period : int
            Number of leading items to keep.

      Returns
      -------
      The single item itself when the slice has exactly one element,
      otherwise the slice ``x[:period]``.
      """
      leading = x[:period]
      # Unwrap one-element slices so that e.g. a year comes back as a scalar
      return leading[0] if len(leading) == 1 else leading

In [46]:
# Rows without a parsed publication date are skipped and end up as NaN in both columns
published = df['published_parsed'].dropna()
df['year'] = published.apply(get_period, args=(1,))        # first item only
df['month_year'] = published.apply(get_period, args=(2,))  # first two items



In [47]:
df.year.value_counts()

2021    190
2020    140
2022     66
2019     32
2018     10
Name: year, dtype: int64

# Closing out

In [51]:
# Strip only the final extension; splitting on every dot would truncate
# file names that contain more than one '.' (e.g. 'arxiv.deepfake.csv').
out_file = load_file.rsplit('.', 1)[0]

In [55]:
DATA_PATH + out_file + '.csv'

'../data/arxiv_deepfake.csv'

In [85]:
# NOTE(review): for a '.csv' input this rebuilds the path the data was loaded from,
# so the input file is overwritten with the augmented frame; OUTPUT_PATH is not used here.
df.to_csv(DATA_PATH + out_file + '.csv', index=False)