In [4]:
import random
import numpy as np
import igraph
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cross_validation import KFold
from sklearn.metrics import f1_score
from sklearn import preprocessing
from scipy.spatial.distance import cosine
import nltk
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import re
from sklearn import decomposition
import sklearn.feature_extraction.text as text
import networkx as nx
from scipy.sparse.linalg import svds



In [5]:
# Fetch the NLTK data packages this notebook relies on; packages that are
# already installed are simply reported as up-to-date.
nltk.download('punkt') # Punkt models, for tokenization
nltk.download('stopwords') # stopword lists, read via nltk.corpus.stopwords

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Patterns are compiled once (not on every call) and written as raw strings so
# that the backslash classes are not treated as invalid string escapes.
_TH_REGEXP = re.compile(r'[a-zA-Z\d]*-th')
_DIGIT_REGEXP = re.compile(r'[\d]+')
_NON_ALPHABET_REGEXP = re.compile(r'[\W]')
_ONE_LETTER_REGEXP = re.compile(r'\s[a-zA-Z]\s')
_TWO_LETTERS_REGEXP = re.compile(r'\s[a-zA-Z][a-zA-Z]\s')
_DOUBLE_WHITESPACE_REGEXP = re.compile(r'\s[\s]+')


def preprocess_string(string):
    """Clean a raw text field before it is vectorised.

    The following substitutions are applied, in this order:

    1. drop ordinal-like fragments ending in ``-th`` (e.g. ``5-th``, ``i-th``);
    2. drop every run of digits;
    3. replace every non-word character with a space;
    4. drop one-letter words that are surrounded by whitespace;
    5. drop two-letter words that are surrounded by whitespace;
    6. collapse runs of two or more whitespace characters into one space.

    Leading/trailing whitespace is *not* stripped, and short words at the very
    start or end of the string survive because steps 4-5 require whitespace on
    both sides.

    The previous implementation also POS-tagged the text to filter out verbs,
    but then discarded that result and restarted from the raw input.  That
    dead (and very slow) step has been removed; the returned text is unchanged.

    NOTE(review): step 1 also eats the prefix of hyphenated words whose second
    part starts with "th" (``graph-theory`` -> ``eory``).  Kept as-is so the
    produced corpus does not change; confirm whether that is intended.

    :param string: raw text.
    :returns: the cleaned text.
    """
    result = _TH_REGEXP.sub('', string)
    result = _DIGIT_REGEXP.sub('', result)
    result = _NON_ALPHABET_REGEXP.sub(' ', result)
    result = _ONE_LETTER_REGEXP.sub(' ', result)
    result = _TWO_LETTERS_REGEXP.sub(' ', result)
    result = _DOUBLE_WHITESPACE_REGEXP.sub(' ', result)

    return result

In [7]:
# Pull every row of the node metadata file into memory.
with open("node_information.csv", "r") as f:
    node_info = [row for row in csv.reader(f)]

# Build the text corpus from field 5 of each row, cleaned by preprocess_string.
corpus = []
for row in node_info:
    corpus.append(preprocess_string(row[5]))

In [8]:
# Dimensionality of the truncated-SVD projection computed below.
num_topics = 150

# Unigram TF-IDF over the cleaned corpus.
vectorizer = TfidfVectorizer(ngram_range=(1,1), max_df=0.75, stop_words="english")
TFIDF_matrix = vectorizer.fit_transform(corpus)

# Only the third value returned by svds is needed; projecting the TF-IDF rows
# onto it gives one num_topics-dimensional vector per document.
V = svds(TFIDF_matrix, k=num_topics)[2]
TOPIC_matrix = TFIDF_matrix.dot(V.T)
del V

In [9]:
# English stopword set and Porter stemmer; get_features uses both to
# normalise paper titles before counting overlapping words.
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

# Data loading and feature extraction.

# Each row of node_info (read from node_information.csv) has the columns below.
# They are numbered from 1 here, but the code indexes them from 0:
# (1) paper unique ID (integer)
# (2) publication year (integer)
# (3) paper title (string)
# (4) authors (strings separated by ,)
# (5) name of journal (optional) (string)
# (6) abstract (string) - lowercased, free of punctuation except intra-word dashes


def _cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two vectors, or 0.0 if either is all-zero."""
    vec_a = np.asarray(vec_a, dtype=float).ravel()
    vec_b = np.asarray(vec_b, dtype=float).ravel()
    if np.linalg.norm(vec_a) == 0 or np.linalg.norm(vec_b) == 0:
        return 0.0
    # scipy's `cosine` is a *distance* (1 - similarity), so convert it back.
    return 1.0 - cosine(vec_a, vec_b)


def _title_tokens(title):
    """Return the set of stemmed, non-stopword, non-empty tokens of a title."""
    tokens = title.lower().split(" ")
    return set(stemmer.stem(token) for token in tokens
               if token and token not in stpwds)


def _author_set(authors_field):
    """Return the set of non-empty, whitespace-stripped author names."""
    names = (name.strip() for name in authors_field.split(","))
    return set(name for name in names if name)


def get_features(data_fname, info_fname, portion, with_labels=True, node_info = None, tfidf_mat=None,
                 topic_mat=None, graph=None):
    """Build the scaled feature matrix for a file of candidate paper pairs.

    Each line of ``data_fname`` is ``"<source id> <target id>"``, followed by
    ``" <label>"`` when ``with_labels`` is true (label ``"1"`` marks an edge).

    :param data_fname: path of the pair file.
    :param info_fname: path of the node metadata CSV; only read when
        ``node_info`` is None.
    :param portion: fraction of the pairs to keep.  ``1.0`` keeps every pair
        in file order; any other value draws a random sample with a fixed
        seed (22), which also re-seeds the global ``random`` module.
    :param with_labels: whether the pair file has a label column.  It controls
        which pairs become edges when the graph is built here and whether
        labels are returned.
    :param node_info: list of metadata rows (id, year, title, authors,
        journal, abstract).  Row ``i`` must describe the same paper as row
        ``i`` of ``tfidf_mat`` and ``topic_mat``.
    :param tfidf_mat: sparse matrix with one TF-IDF row per ``node_info`` row.
    :param topic_mat: dense matrix with one topic row per ``node_info`` row.
    :param graph: an existing networkx graph to compute graph features on.
        When None, a graph is built from the kept pairs (only the pairs
        labelled ``"1"`` if ``with_labels``, otherwise all of them).
    :returns: ``(perm, data_set, features, labels_array, graph)`` where
        ``perm`` holds the row indices (into the pair file) of the kept pairs,
        ``data_set`` the kept pairs themselves, ``features`` a standardised
        float array with one row per pair and 20 columns, ``labels_array`` the
        integer labels (``np.zeros(1)`` when ``with_labels`` is false) and
        ``graph`` the graph that was used.
    :raises ValueError: if ``tfidf_mat`` or ``topic_mat`` is missing.
    """
    if tfidf_mat is None or topic_mat is None:
        raise ValueError("tfidf_mat and topic_mat are required to compute the text features")

    with open(data_fname, "r") as f:
        data_set = [row[0].split(" ") for row in csv.reader(f)]

    if portion == 1.0:
        print("one portion %s" % portion)
        to_keep = list(range(len(data_set)))
    else:
        # to test the code we work on a reproducible random sample
        print("portion %s" % portion)
        random.seed(22)
        to_keep = random.sample(range(len(data_set)), k=int(round(len(data_set)*portion)))

    data_set = [data_set[i] for i in to_keep]
    perm = list(to_keep)

    if node_info is None:
        # Load the full metadata file.  It is deliberately not filtered down to
        # the ids seen in data_set: row positions are used below to index
        # tfidf_mat / topic_mat, so dropping rows would misalign them.
        with open(info_fname, "r") as f:
            node_info = list(csv.reader(f))

    # paper id -> row position in node_info (and in tfidf_mat / topic_mat)
    ID_pos = dict((row[0], pos) for pos, row in enumerate(node_info))

    print("build graph")
    all_edges = [(row[0], row[1]) for row in data_set]
    if graph is None:
        graph = nx.Graph()
        graph.add_nodes_from(set(node for edge in all_edges for node in edge))
        if with_labels:
            graph.add_edges_from((row[0], row[1]) for row in data_set if row[2] == "1")
        else:
            graph.add_edges_from(all_edges)

    # NOTE(review): when the graph is built from labelled pairs, the positive
    # pairs are already edges of the graph their own features are computed on.
    # Confirm this is acceptable for the evaluation protocol.
    clustering_coeff = nx.clustering(graph)
    pr = nx.pagerank(graph, alpha=0.9)
    hubs, _ = nx.hits(graph, max_iter=1000)

    # Pairwise link-prediction scores; each generator yields (u, v, score) in
    # the order of all_edges, i.e. aligned with data_set.
    graph_jaccard_coeff = [p for u, v, p in nx.jaccard_coefficient(graph, all_edges)]
    graph_adamic_index = [p for u, v, p in nx.adamic_adar_index(graph, all_edges)]
    graph_pref_attach = [p for u, v, p in nx.preferential_attachment(graph, all_edges)]
    graph_ress_alloc = [p for u, v, p in nx.resource_allocation_index(graph, all_edges)]

    # graph features
    neighbor_count_source = []
    neighbor_count_target = []
    neighbor_sim = []
    clustering_coeff_source = []
    clustering_coeff_target = []
    graph_jaccoby_authors = []
    pr_coeff_source = []
    pr_coeff_target = []
    hubs_feature_source = []
    hubs_feature_target = []

    # metadata / text features
    overlap_title = []      # number of overlapping (stemmed) title words
    temp_diff = []          # publication year of source minus that of target
    comm_auth = []          # number of common authors
    tfidf_cos = []          # cosine similarity of the TF-IDF vectors
    topic_sim = []          # cosine similarity of the topic vectors
    journal_matching = []   # 1 same journal, -1 different, 0 unknown

    for counter, row in enumerate(data_set):
        source, target = row[0], row[1]
        source_pos = ID_pos[source]
        target_pos = ID_pos[target]
        source_info = node_info[source_pos]
        target_info = node_info[target_pos]

        # --- text similarity -------------------------------------------------
        tfidf_cos.append(_cosine_similarity(tfidf_mat[source_pos, :].toarray()[0],
                                            tfidf_mat[target_pos, :].toarray()[0]))
        topic_sim.append(_cosine_similarity(topic_mat[source_pos, :],
                                            topic_mat[target_pos, :]))

        # --- journal ---------------------------------------------------------
        source_journal = source_info[4]
        target_journal = target_info[4]
        if source_journal == '' or target_journal == '':
            journal_match = 0
        elif source_journal == target_journal:
            journal_match = 1
        else:
            journal_match = -1
        journal_matching.append(journal_match)

        # --- graph neighbourhood --------------------------------------------
        neighbors_source = set(graph.neighbors(source))
        neighbors_target = set(graph.neighbors(target))
        neighbor_count_source.append(len(neighbors_source))
        neighbor_count_target.append(len(neighbors_target))
        neighbor_sim.append(len(neighbors_source & neighbors_target))
        clustering_coeff_source.append(clustering_coeff[source])
        clustering_coeff_target.append(clustering_coeff[target])
        pr_coeff_source.append(pr[source])
        pr_coeff_target.append(pr[target])
        hubs_feature_source.append(hubs[source])
        hubs_feature_target.append(hubs[target])

        # --- authors ---------------------------------------------------------
        # Empty author fields yield empty sets, so two papers without author
        # information no longer count as sharing an author.
        authors_source = _author_set(source_info[3])
        authors_target = _author_set(target_info[3])
        shared_authors = len(authors_source & authors_target)
        all_authors = len(authors_source | authors_target)
        comm_auth.append(shared_authors)
        graph_jaccoby_authors.append(float(shared_authors) / all_authors if all_authors else 0.0)

        # --- title and year --------------------------------------------------
        overlap_title.append(len(_title_tokens(source_info[2]) & _title_tokens(target_info[2])))
        temp_diff.append(int(source_info[1]) - int(target_info[1]))

        if counter % 10000 == 0:
            print("%d training examples processsed" % counter)

    features = (np.array([overlap_title, # 0
                          temp_diff, # 1
                          comm_auth, # 2
                          tfidf_cos, # 3
                          topic_sim, # 4
                          neighbor_count_source, # 5
                          neighbor_count_target, # 6
                          neighbor_sim, # 7
                          clustering_coeff_source, # 8
                          clustering_coeff_target, # 9
                          graph_jaccoby_authors, # 10
                          pr_coeff_source, # 11
                          pr_coeff_target, # 12
                          graph_jaccard_coeff, # 13
                          graph_adamic_index, # 14
                          graph_pref_attach, # 15
                          graph_ress_alloc, # 16
                          hubs_feature_source, # 17
                          hubs_feature_target, # 18
                          journal_matching # 19
                         ]).T).astype(float)

    # scale every column to zero mean / unit variance
    features = preprocessing.scale(features)

    if with_labels:
        # convert the label column into an integer array
        labels_array = np.array([int(row[2]) for row in data_set])
    else:
        # placeholder so both modes return a tuple of the same shape
        labels_array = np.zeros(1)

    return perm, data_set, features, labels_array, graph

In [10]:
# Features and labels for every labelled training pair; the graph built here
# is kept so it can be passed to later get_features calls.
(IDs, training_dataset, training_features,
 training_labels, graph) = get_features(
    "training_set.txt",
    "node_information.csv",
    1.0,
    with_labels=True,
    node_info=node_info,
    tfidf_mat=TFIDF_matrix,
    topic_mat=TOPIC_matrix,
)

one portion 1.0
build graph
0 training examples processsed
10000 training examples processsed
20000 training examples processsed
30000 training examples processsed
40000 training examples processsed
50000 training examples processsed
60000 training examples processsed
70000 training examples processsed
80000 training examples processsed
90000 training examples processsed
100000 training examples processsed
110000 training examples processsed
120000 training examples processsed
130000 training examples processsed
140000 training examples processsed
150000 training examples processsed
160000 training examples processsed
170000 training examples processsed
180000 training examples processsed
190000 training examples processsed
200000 training examples processsed
210000 training examples processsed
220000 training examples processsed
230000 training examples processsed
240000 training examples processsed
250000 training examples processsed
260000 training examples processsed
270000 trainin

In [11]:
# Features for the unlabelled test pairs, computed on the graph that was
# returned by the training call.  IDs now holds the test row indices.
IDs, test_dataset, test_features, _, _ = get_features(
    "testing_set.txt",
    "node_information.csv",
    1.0,
    with_labels=False,
    node_info=node_info,
    tfidf_mat=TFIDF_matrix,
    topic_mat=TOPIC_matrix,
    graph=graph,
)

one portion 1.0
build graph
0 training examples processsed
10000 training examples processsed
20000 training examples processsed
30000 training examples processsed


In [None]:
import xgboost as xgb

In [64]:
# Booster configuration.  The canonical parameter names are used here: the
# old 'bst:'-prefixed spellings are not part of the documented XGBoost
# parameter names, so a release that does not recognise them would leave
# max_depth / eta at their defaults.
# NOTE(review): 'rate_drop' is documented as a DART-booster parameter and no
# 'booster' key is set here -- confirm whether DART was intended.
param = {
    'max_depth': 3,
    'eta': 0.9,
    'silent': 1,
    'objective': 'binary:logistic',
    'rate_drop': 0.1,
}
dtrain = xgb.DMatrix(training_features, label=training_labels)

In [65]:
# Fit the booster for 150 boosting rounds on the training matrix.
bst = xgb.train(param, dtrain, num_boost_round=150)

In [66]:
# Wrap the test features for XGBoost and score them with the trained booster.
# With the 'binary:logistic' objective configured above, these scores are
# thresholded at 0.5 in the next cell.
dtest = xgb.DMatrix(test_features)
pred = bst.predict(dtest)

In [67]:
# Turn the scores into hard 0/1 labels: 1 where the score is above 0.5.
pred2 = np.greater(pred, 0.5).astype(int)

In [68]:
# Display the thresholded predictions as a quick sanity check.
pred2

array([0, 1, 1, ..., 0, 0, 1])

In [69]:
# Pair every test row index with its predicted label: one (id, prediction)
# row per test pair.
results = np.column_stack((IDs, pred2))

# Write the submission file: a header line followed by one row per pair.
with open("last_submission.txt", "w") as f:
    writer = csv.writer(f)
    writer.writerow(['id','prediction'])
    writer.writerows(results)