## EDA on dblp dataset

In [49]:
import pandas as pd
import numpy as np

In [55]:
# Load the dblp CSV export, discard the 'Unnamed: 0' column (presumably a
# saved index from an earlier export -- verify), and turn cells holding the
# literal string '[]' into NaN so they count as missing values.
df = (
    pd.read_csv('artifacts/dblp.csv', low_memory=False)
    .drop(columns=['Unnamed: 0'])
    .replace('[]', np.nan)
)

In [56]:
# Display the column labels of the loaded frame.
df.columns

Index(['address', 'author', 'booktitle', 'cdrom', 'chapter', 'cite',
       'crossref', 'editor', 'ee', 'isbn', 'journal', 'month', 'note',
       'number', 'pages', 'publisher', 'publnr', 'school', 'series', 'title',
       'url', 'volume', 'year', 'tag'],
      dtype='object')

These are the unique types of tags in the data. 

In [57]:
# Distinct values of the 'tag' column, in order of first appearance.
df['tag'].unique()

array(['article', 'book', 'proceedings', 'inproceedings', 'www',
       'mastersthesis', 'incollection', 'data', 'phdthesis'], dtype=object)

For each tag, we want to determine which features hold at least one value and are therefore associated with that tag. A feature that is null for every record of a given tag is not relevant to that tag.

In [58]:
# Map each tag to the list of features (columns) that are populated for it.
# A feature counts as populated when at least one record of that tag has a
# non-null value in the column; columns that are null for every record of
# the tag are left out.
tag_features = {}
for tag in df['tag'].unique():
    # Rows belonging to this tag, without the 'tag' column itself.
    df_tag = df.loc[df['tag'] == tag].drop(columns=['tag'])
    # Boolean Series indexed by column name: True where the column holds at
    # least one non-null value. Equivalent to (row count - NA count) > 0, but
    # computed without an intermediate count frame or a Python-level row loop.
    has_value = df_tag.notna().any()
    features = [column.strip() for column, present in has_value.items() if present]
    tag_features[tag] = features

Listing features present for each tag

In [59]:
# Print each tag followed by its populated features. A plain loop is used
# instead of a list comprehension so the cell does not also display a
# meaningless list of None values (the return values of print).
for tag, features in tag_features.items():
    print(tag, features)

article ['author', 'booktitle', 'cdrom', 'cite', 'crossref', 'editor', 'ee', 'journal', 'month', 'note', 'number', 'pages', 'publisher', 'publnr', 'title', 'url', 'volume', 'year']
book ['author', 'booktitle', 'cdrom', 'cite', 'crossref', 'editor', 'ee', 'isbn', 'month', 'note', 'pages', 'publisher', 'school', 'series', 'title', 'url', 'volume', 'year']
proceedings ['address', 'author', 'booktitle', 'cite', 'editor', 'ee', 'isbn', 'journal', 'note', 'number', 'pages', 'publisher', 'school', 'series', 'title', 'url', 'volume', 'year']
inproceedings ['author', 'booktitle', 'cdrom', 'cite', 'crossref', 'editor', 'ee', 'month', 'note', 'number', 'pages', 'title', 'url', 'volume', 'year']
www ['author', 'cite', 'crossref', 'editor', 'ee', 'note', 'title', 'url', 'year']
mastersthesis ['author', 'ee', 'note', 'school', 'title', 'year']
incollection ['author', 'booktitle', 'cdrom', 'chapter', 'cite', 'crossref', 'ee', 'note', 'number', 'pages', 'publisher', 'title', 'url', 'year']
data ['auth

[None, None, None, None, None, None, None, None, None]

Number of features in each document tag

In [60]:
# Print each tag followed by the number of features populated for it. A plain
# loop is used instead of a list comprehension so the cell does not also
# display a meaningless list of None values (the return values of print).
for tag, features in tag_features.items():
    print(tag, len(features))

article 18
book 18
proceedings 18
inproceedings 15
www 9
mastersthesis 6
incollection 14
data 8
phdthesis 13


[None, None, None, None, None, None, None, None, None]