In [43]:
# NOTE: Skip this cell when running the analysis; it only preprocesses the raw DBLP data.
# It is kept so that the preprocessing code is part of our submission.


#from dblplib import parse, parse_file
import dblplib
# from dblplib import parse_file
from lxml import etree
from datetime import datetime
import csv
import ujson
import codecs
import re
from github_com.kennethreitz import requests
assert requests.get('https://github.com/IsaacChanghau/DBLPParser').status_code == 200


# Record-level element types that appear in dblp.
all_elements = {
    "article",
    "book",
    "incollection",
    "inproceedings",
    "mastersthesis",
    "phdthesis",
    "proceedings",
    "www",
}
# Feature (child field) types that appear in dblp.
all_features = {
    "address", "author", "booktitle", "cdrom", "chapter", "cite",
    "crossref", "editor", "ee", "isbn", "journal", "month",
    "note", "number", "pages", "publisher", "school", "series",
    "title", "url", "volume", "year",
}


def log_msg(message):
    """Print ``message`` prefixed with the current time as ``YYYY-MM-DD HH:MM:SS``."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(timestamp, message)


def context_iter(dblp_path):
    """Build the iterator of (event, element) pairs used to walk the dblp XML file.

    DTD loading and DTD validation are both switched on, so the DTD is required.
    """
    element_stream = etree.iterparse(source=dblp_path, dtd_validation=True, load_dtd=True)
    return element_stream


def clear_element(element):
    """Release a processed element: empty it, then drop the siblings that precede it.

    The parent's first child is deleted repeatedly until ``element`` has no
    previous sibling left.
    """
    element.clear()
    previous = element.getprevious()
    while previous is not None:
        del element.getparent()[0]
        previous = element.getprevious()


def count_pages(pages):
    """Parse a dblp ``pages`` string and count the pages it describes.

    Borrowed from: https://github.com/billjh/dblp-iter-parser/blob/master/iter_parser.py

    The string may hold several comma-separated parts; their counts are summed.

    VALID FORMATS:
        51         -> Single number (counts as 1 page)
        23-43      -> Range given by two numbers (counts as 43 - 23 + 1)
    NON-DIGITS ARE ALLOWED BUT IGNORED (the last run of digits in each subpart is used):
        AG83-AG120
        90210H     -> Containing letters
        8e:1-8e:4
        11:12-21   -> Containing colons
        P1.35      -> Containing dots (P1.35 -> 35)
        S2/109     -> Containing slashes
        2-3&4      -> Containing ampersands and more...
    PARTS THAT CONTRIBUTE ZERO PAGES:
        I-XXI      -> Roman numerals are not recognized
        0-         -> Incomplete range
        91A-91A-3  -> More than one dash
        f          -> No digits
        43-23      -> Range whose end is before its start

    Args:
        pages: the raw pages text, or ``None`` / empty when the field has no text.

    Returns:
        The total page count as a string, or ``""`` when nothing could be counted.
    """
    if not pages:
        # An empty <pages/> element has no text at all.
        return ""
    total = 0
    for part in pages.split(","):
        subparts = part.split("-")
        if len(subparts) > 2:
            continue
        try:
            # Keep only the last run of digits of each subpart.
            numbers = [int(re.findall(r"[\d]+", sub)[-1]) for sub in subparts]
        except IndexError:
            # At least one subpart contains no digits.
            continue
        if len(numbers) == 1:
            total += 1
        elif numbers[1] >= numbers[0]:
            total += numbers[1] - numbers[0] + 1
        # A reversed range is skipped, so it cannot subtract pages from the total.
    return "" if total == 0 else str(total)


def extract_feature(elem, features, include_key=False):
    """Collect the values of the requested features from one dblp record element.

    Args:
        elem: the record element; its direct children are the candidate fields.
        features: iterable of child tag names to collect.
        include_key: when true, the record's ``key`` attribute is stored under ``'key'``.

    Returns:
        A dict mapping each requested feature (plus ``'key'`` if requested) to a
        list of its non-empty values, in document order. Features that do not
        occur map to an empty list.
    """
    attribs = {'key': [elem.attrib['key']]} if include_key else {}
    for feature in features:
        attribs[feature] = []
    for sub in elem:
        if sub.tag not in features:
            continue
        if sub.tag == 'title':
            # Titles may contain inline markup (<i>, <sub>, ...). Joining all text
            # nodes keeps the full title; using only sub.text would cut it off at
            # the first nested tag.
            text = ''.join(sub.itertext())
        elif sub.tag == 'pages':
            # An empty <pages/> has no text to parse.
            text = count_pages(sub.text) if sub.text else ''
        else:
            text = sub.text
        if text:
            attribs[sub.tag].append(text)
    return attribs


def parse_all(dblp_path, save_path, include_key=False):
    """Extract every feature of every dblp record and write one record per line.

    Each output line is ``str()`` of the feature dict returned by
    ``extract_feature`` (a Python dict literal, not strict JSON).

    Args:
        dblp_path: path of the dblp XML file.
        save_path: path of the text file to write (UTF-8).
        include_key: forwarded to ``extract_feature``.
    """
    log_msg("PROCESS: Start parsing...")
    with open(save_path, 'w', encoding='utf8') as f:
        for _, elem in context_iter(dblp_path):
            if elem.tag not in all_elements:
                # Child fields (author, title, ...) finish before their parent
                # record does; clearing them here would wipe the data before the
                # record is extracted.
                continue
            attrib_values = extract_feature(elem, all_features, include_key)
            f.write(str(attrib_values) + '\n')
            clear_element(elem)
    log_msg("FINISHED...")


def parse_entity(dblp_path, save_path, type_name, features=None, save_to_csv=False, include_key=False):
    """Parse the records whose tag is in ``type_name`` and save the requested features.

    Args:
        dblp_path: path of the dblp XML file.
        save_path: output file path.
        type_name: collection of record tags to keep (e.g. ``['article']``).
        features: feature names to extract; must not be ``None``.
        save_to_csv: write CSV when true, otherwise JSON (the default).
        include_key: also store each record's ``key`` attribute.

    Returns:
        ``(full_entity, part_entity, attrib_count)``: the number of records that
        have a value for every requested feature, the number that lack at least
        one, and a dict with the total number of values seen per feature.
    """
    log_msg("PROCESS: Start parsing for {}...".format(str(type_name)))
    assert features is not None, "features must be assigned before parsing the dblp dataset"
    results = []
    attrib_count, full_entity, part_entity = {}, 0, 0
    for _, elem in context_iter(dblp_path):
        if elem.tag in type_name:
            attrib_values = extract_feature(elem, features, include_key)  # extract required features
            results.append(attrib_values)  # add record to results array
            for key, value in attrib_values.items():
                attrib_count[key] = attrib_count.get(key, 0) + len(value)
            # Count only the requested features; the optional 'key' entry is always
            # present and must not affect the "all features" check.
            present = sum(1 for feature in features if attrib_values[feature])
            if present == len(features):
                full_entity += 1
            else:
                part_entity += 1
        elif elem.tag not in all_elements:
            # Child field or root element: leave it for its parent record.
            continue
        clear_element(elem)
    if save_to_csv:
        # The header lists exactly the columns written in each row, including
        # 'key' when it was requested.
        columns = (['key'] if include_key else []) + list(features)
        with open(save_path, 'w', newline='', encoding='utf8') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(columns)  # write title
            for record in results:
                # some features contain multiple values (e.g.: author), concatenate with `::`
                writer.writerow(['::'.join(record[column]) for column in columns])
    else:  # default save to json file
        with codecs.open(save_path, mode='w', encoding='utf8', errors='ignore') as f:
            ujson.dump(results, f)
    return full_entity, part_entity, attrib_count


def parse_author(dblp_path, save_path, save_to_csv=False):
    """Collect the distinct author names of all publications and save them sorted.

    Authors are taken from ``article``, ``book``, ``incollection`` and
    ``inproceedings`` records.

    Args:
        dblp_path: path of the dblp XML file.
        save_path: output file path (UTF-8).
        save_to_csv: write one name per CSV row when true; otherwise write the
            names as plain text joined by newlines.
    """
    type_name = ['article', 'book', 'incollection', 'inproceedings']
    log_msg("PROCESS: Start parsing for {}...".format(str(type_name)))
    authors = set()
    for _, elem in context_iter(dblp_path):
        if elem.tag in type_name:
            # Skip author elements without text; a None entry would make sorted() fail.
            authors.update(a.text for a in elem.findall('author') if a.text)
        elif elem.tag not in all_elements:
            continue
        clear_element(elem)
    ordered_authors = sorted(authors)
    if save_to_csv:
        with open(save_path, 'w', newline='', encoding='utf8') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerows([a] for a in ordered_authors)
    else:
        with open(save_path, 'w', encoding='utf8') as f:
            f.write('\n'.join(ordered_authors))
    log_msg("FINISHED...")


def parse_article(dblp_path, save_path, save_to_csv=False, include_key=False):
    """Parse all ``article`` records via ``parse_entity`` and log the summary counts."""
    full_count, partial_count, feature_counts = parse_entity(
        dblp_path,
        save_path,
        ['article'],
        ['title', 'author', 'year', 'journal', 'pages'],
        save_to_csv=save_to_csv,
        include_key=include_key,
    )
    summary = ('Total articles found: {}, articles contain all features: {}, '
               'articles contain part of features: {}')
    log_msg(summary.format(full_count + partial_count, full_count, partial_count))
    log_msg("Features information: {}".format(str(feature_counts)))


def parse_inproceedings(dblp_path, save_path, save_to_csv=False, include_key=False):
    """Parse all ``inproceedings`` records via ``parse_entity`` and log the summary counts."""
    full_count, partial_count, feature_counts = parse_entity(
        dblp_path,
        save_path,
        ["inproceedings"],
        ['title', 'author', 'year', 'pages', 'booktitle'],
        save_to_csv=save_to_csv,
        include_key=include_key,
    )
    summary = ('Total inproceedings found: {}, inproceedings contain all features: {}, '
               'inproceedings contain part of features: {}')
    log_msg(summary.format(full_count + partial_count, full_count, partial_count))
    log_msg("Features information: {}".format(str(feature_counts)))


def parse_proceedings(dblp_path, save_path, save_to_csv=False, include_key=False):
    """Parse all ``proceedings`` records via ``parse_entity`` and log the summary counts."""
    # Other features are 'volume','isbn' and 'url'.
    full_count, partial_count, feature_counts = parse_entity(
        dblp_path,
        save_path,
        ["proceedings"],
        ['title', 'editor', 'year', 'booktitle', 'series', 'publisher'],
        save_to_csv=save_to_csv,
        include_key=include_key,
    )
    summary = ('Total proceedings found: {}, proceedings contain all features: {}, '
               'proceedings contain part of features: {}')
    log_msg(summary.format(full_count + partial_count, full_count, partial_count))
    log_msg("Features information: {}".format(str(feature_counts)))


def parse_book(dblp_path, save_path, save_to_csv=False, include_key=False):
    """Parse all ``book`` records via ``parse_entity`` and log the summary counts."""
    full_count, partial_count, feature_counts = parse_entity(
        dblp_path,
        save_path,
        ["book"],
        ['title', 'author', 'publisher', 'isbn', 'year', 'pages'],
        save_to_csv=save_to_csv,
        include_key=include_key,
    )
    summary = ('Total books found: {}, books contain all features: {}, '
               'books contain part of features: {}')
    log_msg(summary.format(full_count + partial_count, full_count, partial_count))
    log_msg("Features information: {}".format(str(feature_counts)))


def parse_publications(dblp_path, save_path, save_to_csv=False, include_key=False):
    """Parse article, book, incollection and inproceedings records and log the summary counts."""
    full_count, partial_count, feature_counts = parse_entity(
        dblp_path,
        save_path,
        ['article', 'book', 'incollection', 'inproceedings'],
        ['title', 'year', 'pages'],
        save_to_csv=save_to_csv,
        include_key=include_key,
    )
    summary = ('Total publications found: {}, publications contain all features: {}, '
               'publications contain part of features: {}')
    log_msg(summary.format(full_count + partial_count, full_count, partial_count))
    log_msg("Features information: {}".format(str(feature_counts)))


import os

# Directory that holds the dblp XML dump and receives the parsed output.
# Set the DBLP_DATA_DIR environment variable to run on another machine; the
# default keeps the original location.
data_dir = os.environ.get('DBLP_DATA_DIR', '/Users/daniellelarson/cse416/final')
dblp_path = os.path.join(data_dir, 'dblp-2019-11-01.xml')
save_path = os.path.join(data_dir, 'article.json')


In [45]:
%%time
# Probe the input first so an unreadable XML/DTD produces a clear log line.
try:
    context_iter(dblp_path)
except IOError:
    log_msg("ERROR: Failed to load file \"{}\". Please check your XML and DTD files.".format(dblp_path))
    # Re-raise instead of calling exit(): the failure stays visible as a traceback
    # and the interpreter/kernel session is not terminated.
    raise
else:
    # Parse only when the probe succeeded.
    log_msg("LOG: Successfully loaded \"{}\".".format(dblp_path))
    parse_article(dblp_path, save_path, save_to_csv=False)  # default save as json format

2019-12-01 23:46:11 LOG: Successfully loaded "/Users/daniellelarson/cse416/final/dblp-2019-11-01.xml".
2019-12-01 23:46:11 PROCESS: Start parsing for ['article']...
2019-12-01 23:49:00 Total articles found: 2137512, articles contain all features: 1815958, articles contain part of features: 321554
2019-12-01 23:49:00 Features information: {'title': 2137512, 'author': 6264506, 'year': 2137509, 'journal': 2137285, 'pages': 1824941}
CPU times: user 2min 43s, sys: 4.53 s, total: 2min 48s
Wall time: 2min 49s


In [46]:
%%time
save_path = '/Users/daniellelarson/cse416/final/article.csv'
# Probe the input first so an unreadable XML/DTD produces a clear log line.
try:
    context_iter(dblp_path)
except IOError:
    log_msg("ERROR: Failed to load file \"{}\". Please check your XML and DTD files.".format(dblp_path))
    # Re-raise instead of calling exit(): the failure stays visible as a traceback
    # and the interpreter/kernel session is not terminated.
    raise
else:
    # Parse only when the probe succeeded.
    log_msg("LOG: Successfully loaded \"{}\".".format(dblp_path))
    parse_article(dblp_path, save_path, save_to_csv=True)  # this run saves as CSV

2019-12-02 00:04:08 LOG: Successfully loaded "/Users/daniellelarson/cse416/final/dblp-2019-11-01.xml".
2019-12-02 00:04:08 PROCESS: Start parsing for ['article']...
2019-12-02 00:07:00 Total articles found: 2137512, articles contain all features: 1815958, articles contain part of features: 321554
2019-12-02 00:07:00 Features information: {'title': 2137512, 'author': 6264506, 'year': 2137509, 'journal': 2137285, 'pages': 1824941}
CPU times: user 2min 46s, sys: 4.21 s, total: 2min 51s
Wall time: 2min 51s


In [1]:
%%time
#RUN THIS: START HERE TO CREATE GRAPH

# Load the co-author edge list from CSV (change the path for your machine).

import csv
jordan = 'edges_no_title.csv'
danielle = '/Users/daniellelarson/cse416/final/edges_no_title.csv'
# newline='' is what the csv module expects for its input files.
# NOTE(review): assumes the edge file is UTF-8, like the CSV files written by
# the preprocessing code -- confirm.
with open(jordan, 'r', newline='', encoding='utf8') as f:
    reader = csv.reader(f)
    # Skip blank lines: they come back as empty rows and would break e[0] / e[1] later.
    edges = [row for row in reader if row]

CPU times: user 12.4 s, sys: 831 ms, total: 13.3 s
Wall time: 13.3 s


In [2]:
%%time
#IGNORE THIS
#It was only used to create the original file with the gendered items


from genderize import Genderize
import gender_guesser.detector as gender
d = gender.Detector()
genders = []
count = 0
for e in edges:
    # One {"name", "gender"} record per endpoint; the gender is guessed from
    # the text before the first space of the name.
    pair = []
    for full_name in (e[0], e[1]):
        first_name = full_name.split(' ', 1)[0]
        pair.append({"name": full_name, "gender": d.get_gender(first_name)})
    genders.append(pair)

CPU times: user 1min 23s, sys: 2.22 s, total: 1min 25s
Wall time: 1min 25s


In [3]:
#IGNORE THIS
import json

# Dump the per-edge gender records as JSON (pick the path for your machine).
jordan = 'no_title_edges_with_gender'
danielle = '/Users/daniellelarson/cse416/final/no_title_edges_with_gender'
with open(jordan, mode='w') as fout:
    json.dump(obj=genders, fp=fout)

In [4]:
#IGNORE THIS
import csv

# Write one CSV row per edge. Each of the two cells holds str() of that
# endpoint's {"name", "gender"} dict, because csv stringifies non-string values.
# The encoding is set explicitly (as for the other files written in this
# notebook) so non-ASCII author names do not depend on the platform default.
with open("no_title_edges_with_gender.csv", "w", newline="", encoding="utf8") as f:
    writer = csv.writer(f)
    writer.writerows(genders)

In [5]:
%%time
#RUN THIS

# Create the base undirected graph from the loaded edge list.

import networkx as nx

G = nx.Graph()
G.add_edges_from(edges)

CPU times: user 22.4 s, sys: 5.56 s, total: 28 s
Wall time: 30.1 s


In [6]:
%%time
#RUN THIS

#label edges

import gender_guesser.detector as gender

# Create the detector in this cell so the "RUN THIS" path works on a fresh
# kernel without first running the cell marked "IGNORE THIS".
d = gender.Detector()

# "mostly_*" guesses are treated like the plain label, as is done for the
# node labels further down in this notebook.
_COLLAPSED_GENDER = {"mostly_male": "male", "mostly_female": "female"}
# First name -> collapsed guess, so each distinct first name is looked up once.
_first_name_gender = {}


def _guess_gender(full_name):
    """Return the collapsed gender guess for the text before the first space of ``full_name``."""
    first = full_name.split(' ', 1)[0]
    if first not in _first_name_gender:
        guess = str(d.get_gender(first))
        _first_name_gender[first] = _COLLAPSED_GENDER.get(guess, guess)
    return _first_name_gender[first]


# Edge colour stored under the 'gender' attribute:
#   blue   -> both authors guessed male
#   pink   -> both authors guessed female
#   purple -> anything else (mixed pair, or at least one androgynous/unknown name)
for e in edges:
    name1 = e[0]
    name2 = e[1]
    g1 = _guess_gender(name1)
    g2 = _guess_gender(name2)
    if g1 == g2 == "male":
        G[name1][name2]['gender'] = 'blue'
    elif g1 == g2 == "female":
        G[name1][name2]['gender'] = 'pink'
    else:
        G[name1][name2]['gender'] = 'purple'
        
    

CPU times: user 1min 41s, sys: 1.16 s, total: 1min 42s
Wall time: 1min 43s


In [7]:
%%time

#RUN THIS

# Label nodes: store the guessed gender of every author on its node.
# (Attribute syntax reminder: G.nodes[1]['room'] = 714)
for author in G.nodes():
    # The guess is made from the text before the first space of the name.
    G.nodes[author]['gender'] = d.get_gender(author.partition(' ')[0])

CPU times: user 8.52 s, sys: 49.3 ms, total: 8.57 s
Wall time: 8.57 s


In [38]:
%%time

#RUN THIS

# Remove every node whose guessed gender is "andy" or "unknown",
# printing the node count before and after.
nodes = G.nodes()
print(G.number_of_nodes())
delNodes = [n for n in nodes if str(G.nodes[n]['gender']) in ("andy", "unknown")]

G.remove_nodes_from(delNodes)
print(G.number_of_nodes())

942342
942342
CPU times: user 1.86 s, sys: 5.09 ms, total: 1.86 s
Wall time: 1.86 s


In [39]:
# Fold the "mostly_*" guesses into the plain male / female labels.
collapsed_labels = {"mostly_male": "male", "mostly_female": "female"}
for n in nodes:
    replacement = collapsed_labels.get(str(G.nodes[n]['gender']))
    if replacement is not None:
        G.nodes[n]['gender'] = replacement

In [40]:
## output only male and females to file (removed unknowns and androgynous names)
## ex:
## Frank, male
## Anna, female
import random
import pandas

mf_list = []
mf_list2 = []
bad_chars = ["(", ")", "'"]  # not used in this cell; kept in case other cells rely on it

# Sample half of the nodes currently in the graph. The size is derived from the
# graph instead of being hard-coded, and a dedicated seeded generator makes the
# sample reproducible without touching the global random state.
sample_size = G.number_of_nodes() // 2
sample_rng = random.Random(0)
# random.sample needs a sequence, so the node view is materialised as a list first.
random_sample = sample_rng.sample(list(G.nodes()), k=sample_size)
for n in random_sample:
    mf_list.append(n)
    mf_list2.append(G.nodes[n]['gender'])

# Build explicit name/gender columns. Passing the gender list as the second
# positional argument of DataFrame would make it the index instead of a column.
authors_df = pandas.DataFrame({"name": mf_list, "gender": mf_list2})
authors_df.to_csv("only_authors_and_genders.csv", index=False)
pd = authors_df  # original variable name, kept for any later cell that uses it
