# Kartik Spark Endpoint

In [None]:
import findspark

# findspark.init is called with the Spark 2.1.0 / CDH 5.9.1 install path
# before pyspark is imported below.
findspark.init("/opt/spark-2.1.0-bin-cdh5.9.1/")

from pyspark.sql import SparkSession

# Untuned alternative: spark = SparkSession.builder.getOrCreate()

# Build (or reuse) the session with explicit memory / executor settings.
spark = (
    SparkSession.builder
    .config('spark.executor.memory', '5g')
    .config('spark.driver.memory', '5g')
    .config('spark.driver.maxResultSize', '3g')
    .config('spark.dynamicAllocation.maxExecutors', 20)
    .getOrCreate()
)


# Rahul Endpoint

In [None]:
import findspark

# findspark.init is called with the Cloudera SPARK2 parcel path before
# pyspark is imported below.
findspark.init('/opt/cloudera/parcels/SPARK2/lib/spark2/')

import pyspark
from pyspark.sql import SparkSession

# Spark 2.x exposes a single entry point; create or reuse it with defaults.
spark = SparkSession.builder.getOrCreate()

# Aliases for code written against the older SparkContext / SQLContext names.
sc, sqlContext = spark.sparkContext, spark

# Last expression of the cell: shows the YARN proxy URI configured for this
# session.
spark.conf.get('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES')

In [None]:
# Register the RAKE module and its stop-word list with the Spark context.
_spark_context = spark.sparkContext
_spark_context.addPyFile('rake.py')
_spark_context.addFile('SmartStoplist.txt')

In [None]:
import rake

# Shared RAKE extractor used by the keyword-extraction loop further down.
# The three integers are passed positionally to rake.Rake; presumably they are
# length / frequency thresholds -- confirm against rake.py.
rake_object = rake.Rake(
    "SmartStoplist.txt",
    3,
    5,
    2,
)

In [None]:
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    """Return the lowercased candidate words found in *text*.

    The text is split into sentences, tokenized and POS-tagged with nltk.
    A token is kept when its tag is in *good_tags*, its lowercase form is not
    an English stop word, and it is not made up solely of punctuation.

    Args:
        text: document text to scan.
        good_tags: POS tags to keep; the default lists the JJ* and NN* tags.

    Returns:
        A list of lowercased words in document order (duplicates retained).
    """
    import itertools, nltk, string

    punctuation_chars = set(string.punctuation)
    english_stop_words = set(nltk.corpus.stopwords.words('english'))

    # One token list per sentence, tagged in a single batch call.
    tokenized_sentences = (nltk.word_tokenize(sentence)
                           for sentence in nltk.sent_tokenize(text))
    tagged_tokens = itertools.chain.from_iterable(
        nltk.pos_tag_sents(tokenized_sentences))

    selected = []
    for token, pos_tag in tagged_tokens:
        if pos_tag not in good_tags:
            continue
        lowered = token.lower()
        if lowered in english_stop_words:
            continue
        # Keep only tokens containing at least one non-punctuation character.
        if any(char not in punctuation_chars for char in token):
            selected.append(lowered)

    return selected

In [None]:
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    """Score keyphrases of *text* using PageRank over a candidate-word graph.

    Candidate words (from ``extract_candidate_words``) become graph nodes and
    consecutive candidates are linked by unweighted edges. The top-ranked
    words are kept as keywords, and runs of adjacent keywords in the text are
    merged into keyphrases scored by the mean PageRank of their words.

    Args:
        text: document text.
        n_keywords: number of top-ranked words to keep. A value strictly
            between 0 and 1 is treated as a fraction of the candidate count.

    Returns:
        A list of ``(keyphrase, score)`` tuples sorted by descending score.
    """
    from itertools import takewhile
    import networkx, nltk

    # Every lowercased token of the document, in order.
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(token.lower() for token in nltk.word_tokenize(sentence))
    candidates = extract_candidate_words(text)

    # One node per distinct candidate; edges join consecutive candidates.
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    for first, second in zip(candidates, candidates[1:]):
        if not second:
            continue
        graph.add_edge(*sorted([first, second]))

    # Rank nodes with default PageRank and keep the best n_keywords.
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    by_score = sorted(ranks.items(), key=lambda item: item[1], reverse=True)
    word_ranks = dict(by_score[:n_keywords])
    keywords = set(word_ranks)

    # Merge runs of adjacent keywords (looking at most 10 tokens ahead) into
    # keyphrases; jumping past each run keeps the phrases non-overlapping.
    keyphrases = {}
    position = 0
    total_words = len(words)
    while position < total_words:
        if words[position] not in keywords:
            position += 1
            continue
        window = words[position:position + 10]
        run = list(takewhile(lambda token: token in keywords, window))
        mean_rank = sum(word_ranks[token] for token in run) / float(len(run))
        keyphrases[' '.join(run)] = mean_rank
        position += len(run)

    return sorted(keyphrases.items(), key=lambda item: item[1], reverse=True)

In [None]:
def extract_candidate_features(candidates, doc_text, doc_excerpt, doc_title, doc_file):
    """Compute frequency, statistical and positional features per candidate.

    Args:
        candidates: iterable of candidate keyphrase strings.
        doc_text: full document text the candidates are searched in.
        doc_excerpt: excerpt text checked for the candidate's presence.
        doc_title: title text checked for the candidate's presence.
        doc_file: identifier stored under the ``'document'`` key of each result.

    Returns:
        An ``OrderedDict`` mapping each candidate that occurs in *doc_text* to
        a dict with the keys ``document``, ``term_count``, ``term_length``,
        ``max_word_length``, ``spread``, ``lexical_cohesion``, ``in_excerpt``,
        ``in_title``, ``abs_first_occurrence`` and ``abs_last_occurrence``.
        Candidates that are blank or never matched are skipped with a printed
        warning.
    """
    import collections, math, nltk, re

    candidate_scores = collections.OrderedDict()

    # Lowercased word counts for the whole document.
    doc_word_counts = collections.Counter(word.lower()
                                          for sent in nltk.sent_tokenize(doc_text)
                                          for word in nltk.word_tokenize(sent))
    doc_text_length = float(len(doc_text))

    for candidate in candidates:
        candidate_words = candidate.split()
        # A blank candidate can still match at word boundaries and would then
        # crash max() below, so reject it up front.
        if not candidate_words:
            print('**WARNING:', repr(candidate), 'has no words; skipped')
            continue

        pattern = re.compile(r'\b'+re.escape(candidate)+r'(\b|[,;.!?]|\s)', re.IGNORECASE)

        # frequency-based
        # A single scan yields the count as well as the first and last match.
        matches = list(pattern.finditer(doc_text))
        cand_doc_count = len(matches)
        # The count can be 0 for several reasons in this simplified pipeline.
        if not cand_doc_count:
            print('**WARNING:', candidate, 'not found!')
            continue

        # statistical
        max_word_length = max(len(w) for w in candidate_words)
        term_length = len(candidate_words)
        # Counter keys are lowercased, so look the constituent words up
        # lowercased too (the regex above is case-insensitive as well).
        sum_doc_word_counts = float(sum(doc_word_counts[w.lower()] for w in candidate_words))
        # Lexical cohesion is not meaningful for 1-word terms, and is
        # undefined when none of the constituent words were counted.
        if term_length == 1 or not sum_doc_word_counts:
            lexical_cohesion = 0.0
        else:
            lexical_cohesion = (term_length * (1 + math.log(cand_doc_count, 10))
                                * cand_doc_count / sum_doc_word_counts)

        # positional
        # found in title, key excerpt
        in_title = 1 if pattern.search(doc_title) else 0
        in_excerpt = 1 if pattern.search(doc_excerpt) else 0
        # Relative offsets of the first and last match; with a single match
        # both are equal and the spread is 0.0.
        abs_first_occurrence = matches[0].start() / doc_text_length
        abs_last_occurrence = matches[-1].start() / doc_text_length
        spread = abs_last_occurrence - abs_first_occurrence

        candidate_scores[candidate] = {'document': doc_file,
                                       'term_count': cand_doc_count,
                                       'term_length': term_length,
                                       'max_word_length': max_word_length,
                                       'spread': spread,
                                       'lexical_cohesion': lexical_cohesion,
                                       'in_excerpt': in_excerpt,
                                       'in_title': in_title,
                                       'abs_first_occurrence': abs_first_occurrence,
                                       'abs_last_occurrence': abs_last_occurrence}

    return candidate_scores

In [None]:
import os

# Root of the SemEval-2010 training documents.
directory_path_chunk = '/users/kanagre/AutomaticKeyphraseExtraction/SemEval2010/train/'

# Collect the names of all '.txt.final' files, sorted within each directory
# visited by os.walk.
filename_list = []
for _root, _subdirs, files in os.walk(directory_path_chunk):
    filename_list.extend(
        name for name in sorted(files) if name.endswith('.txt.final')
    )

In [None]:
# Document titles, positionally paired with the sorted file names when the
# title list is built. Four titles were previously split across physical
# lines inside their string literals (a syntax error); they are rejoined here.
a = [
    'Evaluating Adaptive Resource Management for Distributed Real-Time Embedded Systems',
    'Demonstration of Grid-Enabled Ensemble Kalman Filter Data Assimilation Methodology for Reservoir Characterization',
    'MSP: Multi-Sequence Positioning of Wireless Sensor Nodes',
    'StarDust: A Flexible Architecture for Passive Localization in Wireless Sensor Networks',
    'TSAR: A Two Tier Sensor Storage Architecture Using Interval Skip Graphs',
    'Multi-dimensional Range Queries in Sensor Networks',
    'Evaluating Opportunistic Routing Protocols with Large Realistic Contact Traces',
    'CenWits: A Sensor-Based Loosely Coupled Search and Rescue System Using Witnesses',
    'Fairness in Dead-Reckoning based Distributed Multi-Player Games',
    'Globally Synchronized Dead-Reckoning with Local Lag for Continuous Distributed Multiplayer Games',
    'Remote Access to Large Spatial Databases',
    'Context Awareness for Group Interaction Support',
    'A Hierarchical Process Execution Support for Grid Computing',
    'Congestion Games with Load-Dependent Failures: Identical Resources',
    'A Scalable Distributed Information Management System',
    'Authority Assignment in Distributed Multi-Player Proxy-based Games',
    'Network Monitors and Contracting Systems: Competition and Innovation',
    'Shooter Localization and Weapon Classification with Soldier-Wearable Networked Sensors',
    'Heuristics-Based Scheduling of Composite Web Service Workloads',
    'A Holistic Approach to High-Performance Computing: Xgrid Experience',
    'An Evaluation of Availability Latency in Carrier-based Vehicular ad-hoc Networks',
    'pTHINC: A Thin-Client Architecture for Mobile Wireless Web',
    'A Point-Distribution Index and Its Application to Sensor-Grouping in Wireless Sensor Networks',
    'GUESS: Gossiping Updates for Efficient Spectrum Sensing',
    'Adapting Asynchronous Messaging Middleware to ad-hoc Networking',
    'Composition of a DIDS by Integrating Heterogeneous IDSs on Grids',
    'Assured Service Quality by Improved Fault Management Service-Oriented Event Correlation',
    'Tracking Immediate Predecessors in Distributed Computations',
    'An Architectural Framework and a Middleware for Cooperating Smart Components',
    'A Cross-Layer Approach to Resource Discovery and Distribution in Mobile ad-hoc Networks',
    'Consistency-preserving Caching of Dynamic Database Content',
    'Adaptive Duty Cycling for Energy Harvesting Systems',
    'Concept and Architecture of a Pervasive Document Editing and Managing System',
    'Selfish Caching in Distributed Systems: A Game-Theoretic Analysis',
    'AdaRank: A Boosting Algorithm for Information Retrieval',
    'Relaxed Online SVMs for Spam Filtering',
    'DiffusionRank: A Possible Penicillin for Web Spamming',
    'Cross-Lingual Query Suggestion Using Query Logs of Different Languages',
    'HITS on the Web: How does it Compare?',
    # NOTE(review): the next title looks garbled (punctuation apparently lost
    # between "TREC" and "Exploring") -- confirm against the paper before
    # changing the string.
    'HITS Hits  TRECExploring IR Evaluation Results with Network Analysis',
    'Combining Content and Link for Classification using Matrix Factorization',
    'A Time Machine for Text Search',
    'Query Performance Prediction in Web Search Environments',
    'Broad Expertise Retrieval in Sparse Data Environments',
    'A Semantic Approach to Contextual Advertising',
    'A New Approach for Evaluating Query Expansion: Query-Document Term Mismatch',
    'Performance Prediction Using Spatial Autocorrelation',
    'An Outranking Approach for Rank Aggregation in Information Retrieval',
    'Vocabulary Independent Spoken Term Detection',
    'Context Sensitive Stemming for Web Search',
    'Knowledge-intensive Conceptual Retrieval and Passage Extraction of Biomedical Literature',
    'A Frequency-based and a Poisson-based Definition of the Probability of Being Informative',
    'Impedance Coupling in Content-targeted Advertising',
    'Implicit User Modeling for Personalized Search',
    'Location based Indexing Scheme for DAYS',
    'Machine Learning for Information Architecture in a Large Governmental Website',
    'Ranking Web Objects from Multiple Communities',
    'Unified Utility Maximization Framework for Resource Selection',
    'Automatic Extraction of Titles from General Documents using Machine Learning',
    'Beyond PageRank: Machine Learning for Static Ranking',
    'Distance Measures for MPEG-7-based Retrieval',
    'Downloading Textual Hidden Web Content Through Keyword Queries',
    'Estimating the Global PageRank of Web Communities',
    'Event Threading within News Topics',
    'Learning User Interaction Models for Predicting Web Search Result Preferences',
    'Robustness of Adaptive Filtering Methods In a Cross-benchmark Evaluation',
    'Controlling Overlap in Content-Oriented XML Retrieval',
    'Context-Sensitive Information Retrieval Using Implicit Feedback',
    'Improving Web Search Ranking by Incorporating User Behavior Information',
    'Handling Locations in Search Engine Queries',
    'A Study of Factors Affecting the Utility of Implicit Relevance Feedback',
    'Feature Representation for Effective Action-Item Detection',
    'Using Asymmetric Distributions to Improve Text Classifier Probability Estimates',
    'A Framework for Agent-Based Distributed Machine Learning and Data Mining',
    'Bidding Algorithms for a Distributed Combinatorial Auction',
    'A Complete Distributed Constraint Optimization Method For Non-Traditional Pseudotree Arrangements',
    'Dynamics Based Control with an Application to Area-Sweeping Problems',
    'Implementing Commitment-Based Interactions',
    'Modular Interpreted Systems',
    'Operational Semantics of Multiagent Interactions',
    'Normative System Games',
    'A Multilateral Multi-issue Negotiation Protocol',
    'Agents, Beliefs, and Plausible Behavior in a Temporal Setting',
    'Learning and Joint Deliberation through Argumentation in Multi-Agent Systems',
    'A Unified and General Framework for Argumentation-based Negotiation',
    'A Randomized Method for the Shapley Value for the Voting Game',
    'Approximate and Online Multi-Issue Negotiation',
    'Searching for Joint Gains in Automated Negotiations Based on Multi-criteria Decision Making Theory',
    'Unifying Distributed Constraint Algorithms in a BDI Negotiation Framework',
    'Rumours and Reputation: Evaluating Multi-Dimensional Trust within a Decentralised Reputation System',
    'An Efficient Heuristic Approach for Security Against Multiple Adversaries',
    'An Agent-Based Approach for Privacy-Preserving Recommender Systems',
    'On the Benefits of Cheating by Self-Interested Agents in Vehicular Networks',
    'Distributed Agent-Based Air Traffic Flow Management',
    'A Q-decomposition and Bounded RTDP Approach to Resource Allocation',
    'Combinatorial Resource Scheduling for Multiagent MDPs',
    'Organizational Self-Design in Semi-dynamic Environments',
    'Graphical Models for Online Solutions to Interactive POMDPs',
    'Letting loose a SPIDER on a network of POMDPs: Generating quality guaranteed policies',
    'On Opportunistic Techniques for Solving Decentralized Markov Decision Processes with Temporal Constraints',
    'A Multi-Agent System for Building Dynamic Ontologies',
    'A Formal Model for Situated Semantic Alignment',
    'Learning Consumer Preferences Using Semantic Similarity',
    'Exchanging Reputation Values among Heterogeneous Agent Reputation Models: An Experience on ART Testbed',
    'On the relevance of utterances in formal inter-agent dialogues',
    'Hypotheses Refinement under Topological Communication Constraints',
    'Negotiation by Abduction and Relaxation',
    'The LOGIC Negotiation Model',
    'Bid Expressiveness and Clearing Algorithms in Multiattribute Double Auctions',
    '(In)Stability Properties of Limit Order Dynamics',
    'Efficiency and Nash Equilibria in a Scrip System for P2P Networks',
    'Playing Games in Many Possible Worlds',
    'Finding Equilibria in Large Sequential Games of Imperfect Information',
    'Multi-Attribute Coalitional Games',
    'The Sequential Auction Problem on eBay: An Empirical Analysis and a Solution',
    'Networks Preserving Evolutionary Equilibria and the Power of Randomization',
    'An Analysis of Alternative Slot Auction Designs for Sponsored Search',
    'The Dynamics of Viral Marketing',
    'Scouts, Promoters, and Connectors: The Roles of Ratings in Nearest Neighbor Collaborative Filtering',
    'Empirical Mechanism Design: Methods, with Application to a Supply-Chain Scenario',
    'On the Computational Power of Iterative Auctions',
    'Information Markets vs. Opinion Pools: An Empirical Comparison',
    'Communication Complexity of Common Voting Rules',
    'Complexity of (Iterated) Dominance',
    'Hidden-Action in Multi-Hop Routing',
    'A Price-Anticipating Resource Allocation Mechanism for Distributed Shared Clusters',
    'From Optimal Limited To Unlimited Supply Auctions',
    'Robust Solutions for Combinatorial Auctions',
    'Marginal Contribution Nets: A Compact Representation Scheme for Coalitional Games',
    'Towards Truthful Mechanisms for Binary Demand Games: A General Framework',
    'Cost Sharing in a Job Scheduling Problem Using the Shapley Value',
    'On Decentralized Incentive Compatible Mechanisms for Partially Informed Environments',
    'ICE: An Iterative Combinatorial Exchange',
    'Weak Monotonicity Suffices for Truthfulness on Convex Domains',
    'Negotiation-Range Mechanisms: Exploring the Limits of Truthful Efficient Markets',
    'Privacy in Electronic Commerce and the Economics of Immediate Gratification',
    'Expressive Negotiation over Donations to Charities',
    'Mechanism Design for Online Real-Time Scheduling',
    'Robust Incentive Techniques for Peer-to-Peer Networks',
    'Self-interested Automated Mechanism Design and Implications for Optimal Combinatorial Auctions',
    'A Dynamic Pari-Mutuel Market for Hedging, Wagering, and Information Aggregation',
    'Applying Learning Algorithms to Preference Elicitation',
    'Competitive Algorithms for VWAP and Limit Order Trading',
    'On Cheating in Sealed-Bid Auctions',
]

In [None]:
# Pair each collected file name with the title at the same position in `a`;
# zip stops at the shorter of the two sequences.
title_list = [
    {'file': file_name, 'title': title}
    for file_name, title in zip(filename_list, a)
]

In [None]:
import os


def text_between_markers(lines, start_markers, end_marker):
    """Return the text strictly between a start-marker line and an end-marker line.

    Args:
        lines: list of text lines (as returned by ``readlines``).
        start_markers: substrings tried in order; the first one that occurs in
            any line selects the start line (its first occurrence).
        end_marker: substring whose first occurrence selects the end line.

    Returns:
        The lines between the two marker lines joined together with newlines
        replaced by spaces, or ``""`` when either marker is missing or the end
        marker does not come after the start marker.
    """
    def first_index(marker):
        for index, line in enumerate(lines):
            if marker in line:
                return index
        return None

    start = None
    for marker in start_markers:
        start = first_index(marker)
        if start is not None:
            break
    end = first_index(end_marker)
    if start is None or end is None:
        return ""
    return "".join(lines[start + 1:end]).replace('\n', ' ')


directory_path_chunk = '/users/kanagre/AutomaticKeyphraseExtraction/SemEval2010/train/'

# One record per '.txt.final' document with its abstract and body text.
# A section whose markers cannot be found is stored as an empty string so that
# every document still gets a record with all three keys.
abstract_list = []
for subdir, dirs, files in os.walk(directory_path_chunk):
    for file in sorted(files):
        if not file.endswith('.txt.final'):
            continue
        # Join with the directory being walked so nested files open correctly.
        with open(os.path.join(subdir, file), 'r') as text_file:
            file_list = text_file.readlines()
        abstract_list.append({
            'file': file,
            'abstract': text_between_markers(
                file_list, ('ABSTRACT', 'Abstract'),
                'Categories and Subject Descriptors'),
            'body': text_between_markers(
                file_list, ('INTRODUCTION',), 'REFERENCES'),
        })
                
#                 b = []
#                 b = [file_list.index(i) for i in file_list if '1. INTRODUCTION' in i]
#                 b1 = [file_list.index(i) for i in file_list if '7. REFERENCES' in i]
#                 body_string = ""
#                 if len(b) == 0 or len(b1) == 0:
#                     abstract_string = ""
#                 else:
#                     for i in range(b[0]+1, b1[0]):
#                         body_line = file_list[i]
#                         body_string = body_string + body_line 
#                     abstract_dict['file'] = file
#                     abstract_dict['abstract'] = abstract_string.replace('\n', ' ')
#                     abstract_dict['body'] = body_string.replace('\n', ' ')
#                     abstract_list.append(abstract_dict)
                        

In [None]:
from collections import defaultdict

# Merge the abstract/body records and the title records that share the same
# 'file' value into one dict per file (abstract records are applied first).
merged_by_file = defaultdict(dict)
for record in abstract_list + title_list:
    merged_by_file[record['file']].update(record)
title_abstract = list(merged_by_file.values())

In [None]:
# Run RAKE and TextRank over every merged document record and collect the
# resulting keywords alongside the source text.
title_abstract_kw_list = []
for record in title_abstract:
    # A merged record may lack a title (fewer titles than files) or an
    # abstract/body (section markers not found); treat missing parts as empty
    # text instead of raising KeyError.
    title = record.get('title', '')
    abstract = record.get('abstract', '')
    body = record.get('body', '')
    doc_text = title + '. ' + abstract + '. ' + body

    # Both extractors return scored items; only the phrase (item 0) is kept.
    rake_keywords = [scored[0] for scored in rake_object.run(doc_text)]
    textrank_keywords = [scored[0] for scored in score_keyphrases_by_textrank(doc_text)]
    print(record['file'])

    # De-duplicate while preserving first-seen order so the combined keyword
    # list is the same on every run.
    seen = set()
    combined_keywords = []
    for keyword in rake_keywords + textrank_keywords:
        if keyword not in seen:
            seen.add(keyword)
            combined_keywords.append(keyword)

    title_abstract_kw_list.append({
        'file': record['file'],
        'title': title,
        'abstract': abstract,
        'body': body,
        'title_abstract': doc_text,
        'keywords': combined_keywords,
        'rake_keywords': rake_keywords,
        'textrank_keywords': textrank_keywords,
    })