# Extraction of Relevant Key Phrases from Research Papers

<br /> 
Initialize Spark Cluster and RAKE library. In the next step, a RAKE object is created with stopwords filtered using 'SmartStoplist.txt' <br />

Dataset link: https://github.com/kartikn27/NLP_KeyPhrase_Extract/tree/master/dataset/SemEval2010
<br />

In [1]:
import findspark
findspark.init("/opt/spark-2.1.0-bin-cdh5.9.1/")
from pyspark.sql import SparkSession

#spark = SparkSession.builder.getOrCreate()

spark = SparkSession.builder.\
    config('spark.executor.memory', '5g').\
    config('spark.driver.memory', '5g').\
    config('spark.driver.maxResultSize', '3g').\
    config('spark.dynamicAllocation.maxExecutors', 20).\
getOrCreate()


In [2]:
spark.sparkContext.addPyFile('rake.py')
spark.sparkContext.addFile('SmartStoplist.txt')

In [3]:
import rake
rake_object = rake.Rake("SmartStoplist.txt", 3, 4, 4)

[('used', 1.0), ('types', 1.0), ('system', 1.0), ('supporting', 1.0), ('natural', 1.0), ('algorithms', 1.0), ('solutions', 1.0), ('sets', 1.0), ('construction', 1.0), ('linear', 1.0), ('set', 1.0), ('minimal', 1.0), ('components', 1.0), ('numbers', 1.0)]
[('minimal generating sets', 8.666666666666666), ('linear diophantine equations', 8.5), ('minimal supporting set', 7.666666666666666), ('minimal set', 4.666666666666666), ('linear constraints', 4.5), ('nonstrict inequations', 4.0), ('strict inequations', 4.0), ('upper bounds', 4.0), ('natural numbers', 4.0), ('mixed types', 3.666666666666667), ('considered types', 3.166666666666667), ('set', 2.0), ('types', 1.6666666666666667), ('considered', 1.5), ('system', 1.0), ('systems', 1.0), ('algorithms', 1.0), ('construction', 1.0), ('constructing', 1.0), ('solutions', 1.0), ('components', 1.0), ('criteria', 1.0), ('solving', 1.0), ('compatibility', 1.0)]


<br />
<br />
### Candidate extraction to create Train data using Textrank

For textrank algorithm, a set of candidate words is created in the following python method. The words are tokenized and seven POS Tags are considered good tags while selecting the candidates.<br />
<br />

In [4]:
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]

    return candidates

Below is a standard Textrank algorithm method which filters the candidates generated using the above technique.

In [5]:
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    from itertools import takewhile, tee
    import networkx, nltk
    
    # tokenize for all words, and extract *candidate* words
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sent)]
    candidates = extract_candidate_words(text)
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    # iterate over word-pairs, add unweighted edges into graph
    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)
    for w1, w2 in pairwise(candidates):
        if w2:
            graph.add_edge(*sorted([w1, w2]))
    # score nodes using default pagerank algorithm, sort by score, keep top n_keywords
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    word_ranks = {word_rank[0]: word_rank[1]
                  for word_rank in sorted(ranks.items(), key=lambda x: x[1], reverse=True)[:n_keywords]}
    keywords = set(word_ranks.keys())
    # merge keywords into keyphrases
    keyphrases = {}
    j = 0
    for i, word in enumerate(words):
        if i < j:
            continue
        if word in keywords:
            kp_words = list(takewhile(lambda x: x in keywords, words[i:i+10]))
            avg_pagerank = sum(word_ranks[w] for w in kp_words) / float(len(kp_words))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            # counter as hackish way to ensure merged keyphrases are non-overlapping
            j = i + len(kp_words)
    
    return sorted(keyphrases.items(), key=lambda x: x[1], reverse=True)

<br />
<br />
###  Candidate Features Extraction

The below method extract_candidate_features computes the features for every keyphrase in the paper. These features are later used for classification tasks. 

The features for each keyphrase are: <br />
Term count <br />
Term length <br />
Max word length <br />
Spread of the keyphrase <br />
Lexical cohesion <br />
In_abstract (Boolean) <br />
In_title (Boolean) <br />
Absolute first occurrence <br />
Absolute last occurrence <br />
<br />

In [6]:
def extract_candidate_features(candidates, doc_text, doc_excerpt, doc_title, doc_file):
    import collections, math, nltk, re
    
    candidate_scores = collections.OrderedDict()
    
    # get word counts for document
    doc_word_counts = collections.Counter(word.lower()
                                          for sent in nltk.sent_tokenize(doc_text)
                                          for word in nltk.word_tokenize(sent))
    
    for candidate in candidates:
        
        pattern = re.compile(r'\b'+re.escape(candidate)+r'(\b|[,;.!?]|\s)', re.IGNORECASE)
        
        # frequency-based
        # number of times candidate appears in document
        cand_doc_count = len(pattern.findall(doc_text))
        # count could be 0 for multiple reasons; shit happens in a simplified example
        if not cand_doc_count:
            print('**WARNING:', candidate, 'not found!')
            continue
    
        # statistical
        candidate_words = candidate.split()
        max_word_length = max(len(w) for w in candidate_words)
        term_length = len(candidate_words)
        # get frequencies for term and constituent words
        sum_doc_word_counts = float(sum(doc_word_counts[w] for w in candidate_words))
        try:
            # lexical cohesion doesn't make sense for 1-word terms
            if term_length == 1:
                lexical_cohesion = 0.0
            else:
                lexical_cohesion = term_length * (1 + math.log(cand_doc_count, 10)) * cand_doc_count / sum_doc_word_counts
        except (ValueError, ZeroDivisionError) as e:
            lexical_cohesion = 0.0
        
        # positional
        # found in title, key excerpt
        in_title = 1 if pattern.search(doc_title) else 0
        in_excerpt = 1 if pattern.search(doc_excerpt) else 0
        # first/last position, difference between them (spread)
        doc_text_length = float(len(doc_text))
        first_match = pattern.search(doc_text)
        abs_first_occurrence = first_match.start() / doc_text_length
        if cand_doc_count == 1:
            spread = 0.0
            abs_last_occurrence = abs_first_occurrence
        else:
            for last_match in pattern.finditer(doc_text):
                pass
            abs_last_occurrence = last_match.start() / doc_text_length
            spread = abs_last_occurrence - abs_first_occurrence

        candidate_scores[candidate] = {'document': doc_file,
                                       'term_count': cand_doc_count,
                                       'term_length': term_length, 
                                       'max_word_length': max_word_length,
                                       'spread': spread, 
                                       'lexical_cohesion': lexical_cohesion,
                                       'in_excerpt': in_excerpt, 
                                       'in_title': in_title,
                                       'abs_first_occurrence': abs_first_occurrence,
                                       'abs_last_occurrence': abs_last_occurrence}

    return candidate_scores

<br />
<br />
##  Data Processing tasks

The below code extracts all the file names from the 144 files in the train data directory and appends it to the python list. The variable a is the list of all the titles of the papers.
<br />
<br />

In [7]:
directory_path_chunk = '/users/kanagre/AutomaticKeyphraseExtraction/SemEval2010/train/'
import os
filename_list = []
for subdir, dirs, files in os.walk(directory_path_chunk):
    for file in sorted(files):
        if file.endswith('.txt.final'):            
            filename_list.append(file)

In [8]:
a = ['Evaluating Adaptive Resource Management for Distributed Real-Time Embedded Systems', 'Demonstration of Grid-Enabled Ensemble Kalman Filter Data Assimilation Methodology for Reservoir Characterization', 'MSP: Multi-Sequence Positioning of Wireless Sensor Nodes', 'StarDust: A Flexible Architecture for Passive Localization in Wireless Sensor Networks', 'TSAR: A Two Tier Sensor Storage Architecture Using Interval Skip Graphs', 'Multi-dimensional Range Queries in Sensor Networks', 'Evaluating Opportunistic Routing Protocols with Large Realistic Contact Traces', 'CenWits: A Sensor-Based Loosely Coupled Search and Rescue System Using Witnesses', 'Fairness in Dead-Reckoning based Distributed Multi-Player Games', 'Globally Synchronized Dead-Reckoning with Local Lag for Continuous Distributed Multiplayer Games', 'Remote Access to Large Spatial Databases', 'Context Awareness for Group Interaction Support', 'A Hierarchical Process Execution Support for Grid Computing', 'Congestion Games with Load-Dependent Failures: Identical Resources','A Scalable Distributed Information Management System','Authority Assignment in Distributed Multi-Player Proxy-based Games','Network Monitors and Contracting Systems: Competition and Innovation','Shooter Localization and Weapon Classification with Soldier-Wearable Networked Sensors', 'Heuristics-Based Scheduling of Composite Web Service Workloads', 'A Holistic Approach to High-Performance Computing: Xgrid Experience', 'An Evaluation of Availability Latency in Carrier-based Vehicular ad-hoc Networks','pTHINC: A Thin-Client Architecture for Mobile Wireless Web','A Point-Distribution Index and Its Application to Sensor-Grouping in Wireless Sensor Networks','GUESS: Gossiping Updates for Efficient Spectrum Sensing','Adapting Asynchronous Messaging Middleware to ad-hoc Networking','Composition of a DIDS by Integrating Heterogeneous IDSs on Grids','Assured Service Quality by Improved Fault Management Service-Oriented Event Correlation','Tracking Immediate Predecessors in Distributed Computations','An Architectural Framework and a Middleware for Cooperating Smart Components','A Cross-Layer Approach to Resource Discovery and Distribution in Mobile ad-hoc Networks','Consistency-preserving Caching of Dynamic Database Content','Adaptive Duty Cycling for Energy Harvesting Systems','Concept and Architecture of a Pervasive Document Editing and Managing System','Selfish Caching in Distributed Systems: A Game-Theoretic Analysis','AdaRank: A Boosting Algorithm for Information Retrieval','Relaxed Online SVMs for Spam Filtering','DiffusionRank: A Possible Penicillin for Web Spamming','Cross-Lingual Query Suggestion Using Query Logs of Different Languages','HITS on the Web: How does it Compare?','HITS Hits  TRECExploring IR Evaluation Results with Network Analysis','Combining Content and Link for Classification using Matrix Factorization','A Time Machine for Text Search','Query Performance Prediction in Web Search Environments','Broad Expertise Retrieval in Sparse Data Environments','A Semantic Approach to Contextual Advertising','A New Approach for Evaluating Query Expansion: Query-Document Term Mismatch','Performance Prediction Using Spatial Autocorrelation','An Outranking Approach for Rank Aggregation in Information Retrieval','Vocabulary Independent Spoken Term Detection','Context Sensitive Stemming for Web Search','Knowledge-intensive Conceptual Retrieval and Passage Extraction of Biomedical Literature', 'A Frequency-based and a Poisson-based Definition of the Probability of Being Informative', 'Impedance Coupling in Content-targeted Advertising', 'Implicit User Modeling for Personalized Search','Location based Indexing Scheme for DAYS','Machine Learning for Information Architecture in a Large Governmental Website','Ranking Web Objects from Multiple Communities','Unified Utility Maximization Framework for Resource Selection','Automatic Extraction of Titles from General Documents using Machine Learning','Beyond PageRank: Machine Learning for Static Ranking','Distance Measures for MPEG-7-based Retrieval','Downloading Textual Hidden Web Content Through Keyword Queries','Estimating the Global PageRank of Web Communities','Event Threading within News Topics','Learning User Interaction Models for Predicting Web Search Result Preferences','Robustness of Adaptive Filtering Methods In a Cross-benchmark Evaluation','Controlling Overlap in Content-Oriented XML Retrieval','Context-Sensitive Information Retrieval Using Implicit Feedback','Improving Web Search Ranking by Incorporating User Behavior Information','Handling Locations in Search Engine Queries','A Study of Factors Affecting the Utility of Implicit Relevance Feedback','Feature Representation for Effective Action-Item Detection','Using Asymmetric Distributions to Improve Text Classifier Probability Estimates','A Framework for Agent-Based Distributed Machine Learning and Data Mining','Bidding Algorithms for a Distributed Combinatorial Auction', 'A Complete Distributed Constraint Optimization Method For Non-Traditional Pseudotree Arrangements','Dynamics Based Control with an Application to Area-Sweeping Problems','Implementing Commitment-Based Interactions','Modular Interpreted Systems','Operational Semantics of Multiagent Interactions','Normative System Games','A Multilateral Multi-issue Negotiation Protocol','Agents, Beliefs, and Plausible Behavior in a Temporal Setting','Learning and Joint Deliberation through Argumentation in Multi-Agent Systems','A Unified and General Framework for Argumentation-based Negotiation','A Randomized Method for the Shapley Value for the Voting Game','Approximate and Online Multi-Issue Negotiation','Searching for Joint Gains in Automated Negotiations Based on Multi-criteria Decision Making Theory','Unifying Distributed Constraint Algorithms in a BDI Negotiation Framework','Rumours and Reputation: Evaluating Multi-Dimensional Trust within a Decentralised Reputation System','An Efficient Heuristic Approach for Security Against Multiple Adversaries','An Agent-Based Approach for Privacy-Preserving Recommender Systems','On the Benefits of Cheating by Self-Interested Agents in Vehicular Networks','Distributed Agent-Based Air Traffic Flow Management','A Q-decomposition and Bounded RTDP Approach to Resource Allocation','Combinatorial Resource Scheduling for Multiagent MDPs','Organizational Self-Design in Semi-dynamic Environments','Graphical Models for Online Solutions to Interactive POMDPs','Letting loose a SPIDER on a network of POMDPs: Generating quality guaranteed policies','On Opportunistic Techniques for Solving Decentralized Markov Decision Processes with Temporal Constraints','A Multi-Agent System for Building Dynamic Ontologies','A Formal Model for Situated Semantic Alignment','Learning Consumer Preferences Using Semantic Similarity','Exchanging Reputation Values among Heterogeneous Agent Reputation Models: An Experience on ART Testbed','On the relevance of utterances in formal inter-agent dialogues','Hypotheses Refinement under Topological Communication Constraints','Negotiation by Abduction and Relaxation','The LOGIC Negotiation Model','Bid Expressiveness and Clearing Algorithms in Multiattribute Double Auctions','(In)Stability Properties of Limit Order Dynamics','Efficiency and Nash Equilibria in a Scrip System for P2P Networks','Playing Games in Many Possible Worlds','Finding Equilibria in Large Sequential Games of Imperfect Information','Multi-Attribute Coalitional Games','The Sequential Auction Problem on eBay: An Empirical Analysis and a Solution','Networks Preserving Evolutionary Equilibria and the Power of Randomization','An Analysis of Alternative Slot Auction Designs for Sponsored Search','The Dynamics of Viral Marketing','Scouts, Promoters, and Connectors: The Roles of Ratings in Nearest Neighbor Collaborative Filtering','Empirical Mechanism Design: Methods, with Application to a Supply-Chain Scenario','On the Computational Power of Iterative Auctions','Information Markets vs. Opinion Pools: An Empirical Comparison','Communication Complexity of Common Voting Rules','Complexity of (Iterated) Dominance','Hidden-Action in Multi-Hop Routing','A Price-Anticipating Resource Allocation Mechanism for Distributed Shared Clusters','From Optimal Limited To Unlimited Supply Auctions','Robust Solutions for Combinatorial Auctions','Marginal Contribution Nets: A Compact Representation Scheme for Coalitional Games','Towards Truthful Mechanisms for Binary Demand Games: A General Framework','Cost Sharing in a Job Scheduling Problem Using the Shapley Value','On Decentralized Incentive Compatible Mechanisms for Partially Informed Environments','ICE: An Iterative Combinatorial Exchange','Weak Monotonicity Suffices for Truthfulness on Convex Domains','Negotiation-Range Mechanisms: Exploring the Limits of Truthful Efficient Markets','Privacy in Electronic Commerce and the Economics of Immediate Gratification','Expressive Negotiation over Donations to Charities','Mechanism Design for Online Real-Time Scheduling','Robust Incentive Techniques for Peer-to-Peer Networks','Self-interested Automated Mechanism Design and Implications for Optimal Combinatorial Auctions','A Dynamic Pari-Mutuel Market for Hedging, Wagering, and Information Aggregation','Applying Learning Algorithms to Preference Elicitation','Competitive Algorithms for VWAP and Limit Order Trading','On Cheating in Sealed-Bid Auctions']

In [9]:
title_list = []
for file_name,title in zip(filename_list,a):
    title_dict = {}
    title_dict['file'] = file_name
    title_dict['title'] = title
    title_list.append(title_dict)

Below code extracts the abstract and the body of a research paper and stores in a dictionary structure shown below: <br />
<br />
{‘file’: , ‘title’: , ‘abstract’: , ‘body’: }<br />
<br />


In [10]:
directory_path_chunk = '/users/kanagre/AutomaticKeyphraseExtraction/SemEval2010/train/'
import os
abstract_list = []
for subdir, dirs, files in os.walk(directory_path_chunk):
    for file in sorted(files):
        if file.endswith('.txt.final'):
            with open(directory_path_chunk+file, 'r') as text_file:
                file_list = text_file.readlines()
                l = []
                l = [file_list.index(i) for i in file_list if 'ABSTRACT' in i]
                if not l:
                    l = [file_list.index(i) for i in file_list if 'Abstract' in i]
                l1 = [file_list.index(i) for i in file_list if 'Categories and Subject Descriptors' in i]
                abstract_string = ""
                b = []
                b = [file_list.index(i) for i in file_list if 'INTRODUCTION' in i]
                b1 = [file_list.index(i) for i in file_list if 'REFERENCES' in i]
                body_string = ""
                
                abstract_dict = {}
                if len(l) == 0 or len(l1) == 0:
                    abstract_string = ""
                elif len(b) == 0 or len(b1) == 0:
                    body_string = ""                    
                else:
                    for i in range(l[0]+1, l1[0]):
                        abstract_line = file_list[i]
                        abstract_string = abstract_string + abstract_line
                    for i in range(b[0]+1, b1[0]):
                        body_line = file_list[i]
                        body_string = body_string + body_line 
                    abstract_dict['file'] = file
                    abstract_dict['abstract'] = abstract_string.replace('\n', ' ')
                    abstract_dict['body'] = body_string.replace('\n', ' ')
                    abstract_list.append(abstract_dict)                        

In [11]:
from collections import defaultdict
d = defaultdict(dict)
for l in (abstract_list, title_list):
    for elem in l:
        d[elem['file']].update(elem)
title_abstract = list(d.values())

In [12]:
title_abstract[1].keys()

dict_keys(['abstract', 'file', 'body', 'title'])

In [13]:
title_abstract[:] = [d for d in title_abstract if d.get('file') != 'J-45.txt.final' and title_abstract if d.get('file') != 'J-65.txt.final'
                    and title_abstract if d.get('file') != 'C-44.txt.final' and title_abstract if d.get('file') != 'C-45.txt.final'
                    and title_abstract if d.get('file') != 'C-46.txt.final' and title_abstract if d.get('file') != 'J-33.txt.final']

<br />
This step combines the keyphrases from RAKE and Textrank algorithm and stores in a dictionary with key, 'keywords' in the 'title_abstract_kw_dict'.<br />
<br />

In [None]:
import re
title_abstract_kw_list = []
for d in title_abstract:
    rake_keywords = []
    textrank_keywords = []        
    title_abstract_kw_dict = {}
    
    doc_text = d['title'] + '. ' + d['abstract'] + '. ' + d['body']
    rake_keywords_tuple = rake_object.run(doc_text)
    for keyword in rake_keywords_tuple:
        rake_keywords.append(keyword[0])                                

    text_rank_tuple = score_keyphrases_by_textrank(doc_text)
    for keyword in text_rank_tuple:
        textrank_keywords.append(keyword[0])
    print(d['file'])
    title_abstract_kw_dict['file'] = d['file']
    title_abstract_kw_dict['title'] = d['title']
    title_abstract_kw_dict['abstract'] = d['abstract']
    title_abstract_kw_dict['body'] = d['body']
    title_abstract_kw_dict['title_abstract'] = doc_text
    title_abstract_kw_dict['keywords'] = rake_keywords + textrank_keywords
    title_abstract_kw_dict['keywords'] = list(set(title_abstract_kw_dict['keywords']))
    title_abstract_kw_list.append(title_abstract_kw_dict)

In [None]:
len(title_abstract_kw_list)

In [None]:
kw_feature_list = []
for element in title_abstract_kw_list:
    kw_feature_list.append(extract_candidate_features(element['keywords'], element['body'], element['abstract'], element['title'], element['file']))

In [None]:
kw_feature_list[1]

#### Create Label column
<br />
Below step imports a file, 'train.combined.final' which consists of human-curated keywords which are also present in the text of the paper.
<br />
<br />

In [14]:
with open('/users/kanagre/AutomaticKeyphraseExtraction/SemEval2010/train/train.combined.final', 'r') as text_file:
    content = text_file.readlines()
    content = [x.strip() for x in content]

In [15]:
content[:] = [string for string in content if not string.startswith('J-45 :') and not string.startswith('J-65 :')
             and not string.startswith('C-44 :') and not string.startswith('C-45 :') and not string.startswith('C-46 :')
             and not string.startswith('J-33 :')]

In [16]:
semeval_keywords_list = []
for line in content:
    semval_keywords = []
    semeval_dict = {}
    semeval_dict['document'] = line.split(':')[0].rstrip() + '.txt.final'
    semeval_dict['keywords'] = line.split(':')[1].lstrip().split(',')
    semeval_keywords_list.append(semeval_dict)

In [17]:
semeval_keywords_list[0]

{'document': 'C-41.txt.final',
 'keywords': ['adaptive resource management',
  'distributed real-time embedded system',
  'end-to-end quality of service+service end-to-end quality',
  'hybrid adaptive resourcemanagement middleware',
  'hybrid control technique',
  'real-time video distribution system',
  'real-time corba specification',
  'video encoding/decoding',
  'resource reservation mechanism',
  'dynamic environment',
  'streaming service',
  'distribute real-time embed system',
  'hybrid system',
  'quality of service+service quality']}

<br />
### Combine Label column with Candidate keywords to create dataset for classification

The below steps combines Label column with Candidate keywords extracted from RAKE and Textrank and creates a features-labelled dataset. <br />
<br />

In [18]:
arr = []
for record in semeval_keywords_list:
    arr.append(record['keywords'])

In [19]:
flat_list = [item for sublist in arr for item in sublist]

In [20]:
len(flat_list)

2139

In [None]:
import pandas as pd
import collections
title_abstract_kw_features = pd.DataFrame([])
for element in kw_feature_list:
    title_abstract_kw_features = title_abstract_kw_features.append(pd.DataFrame.from_dict(element).T)

In [None]:
len(title_abstract_kw_features)

In [None]:
title_abstract_kw_features

In [None]:
type(title_abstract_kw_features)

In [24]:
title_abstract_kw_features.to_csv('/users/kanagre/nlp_data_noY2.csv')

In [26]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
sqlContext = SQLContext(spark)
title_abstract_kw_df = sqlContext.read.load('nlp_data_noY2.csv', 
                      header='true', 
                      inferSchema='true',
                        format='com.databricks.spark.csv')

In [27]:
title_abstract_kw_df = title_abstract_kw_df.withColumnRenamed("_c0", "keyword")

In [28]:
from pyspark.sql import functions as fn

In [29]:
from pyspark.sql import types
def add_output_to_keyword(document, keyword):
    document = document.strip()
    keyword = keyword.strip()
    keyword_list = next((item for item in semeval_keywords_list if item['document'] == document))['keywords']
    keyword_list = [x.strip() for x in keyword_list]
    if keyword in keyword_list:
        output = 1
    else:
        output = 0
    return output

In [30]:
add_output_to_keyword_udf = fn.udf(add_output_to_keyword, types.IntegerType())

In [31]:
df_keywords_title_abstract = title_abstract_kw_df.select('keyword','abs_first_occurrence','abs_last_occurrence','document','in_excerpt','in_title','lexical_cohesion','max_word_length','spread','term_count','term_length', add_output_to_keyword_udf('document','keyword').alias('y'))

In [98]:
df_keywords_title_abstract.where(fn.col('y') == 1).count()

<br />
### Further text Procesing Experiments

In the below step, candidate keywords are further cleaned to remove outliers from the data. This is done using a regular expression which will remove any of the non-alphanumeric, special characters and unnecessary keyphrases to create a clean dataset,'selected_true_keywords_clean'
<br />
<br />

In [40]:
import re
def check_keyword_regex(keyword):
    pattern = re.compile("^[A-Za-z0-9/&-]")
    keyword_arr = keyword.split()
    for word in keyword_arr:
        if not pattern.match(word):
            return False
    return True

In [41]:
from pyspark.sql import types
check_keyword_regex_udf = fn.udf(check_keyword_regex, types.BooleanType())

In [42]:
selected_true_keywords = df_keywords_title_abstract.select('*',check_keyword_regex_udf('keyword').alias('keyword_tf'))

In [43]:
selected_true_keywords_clean = selected_true_keywords.where(fn.col('keyword_tf') == True)

In [44]:
selected_true_keywords_clean.count()

47392

In [45]:
selected_true_keywords_clean = selected_true_keywords_clean.withColumnRenamed('y', 'label') 

In [47]:
!hdfs dfs -get selected_true_keywords_clean2.csv /users/kanagre/selected_true_keywords_clean2.csv

In [48]:
!cat /users/kanagre/selected_true_keywords_clean2.csv/*.csv > /users/kanagre/selected_true2.csv

cat: /users/kanagre/selected_true_keywords_clean2.csv/selected_true_keywords_clean2.csv: Is a directory


In [None]:
selected_true_keywords_clean.write.csv('selected_true_keywords_clean2.csv', header= True)

<br />
## Classification Experiments  (Model evaluation and comparison)

The cleaned and pre-processed dataset can now be used to apply classifiers. <br />
The below step performs a three way random split on the data to create training_df, validation_df, testing_df subsetss of data. <br />
The training_df is then fitted on Random Forest and Logistic Regression Classification algorithms in Spark. <br />
The input features to the model are passed using a Vector Assembler. There are 2 Vector assemblers va and va2 which serve as input features to several models. <br />
These models are then compared using Binary Evaluation parameter using the function 'BinaryClassificationEvaluator' in pyspark module. <br />

<br />

In [49]:
training_df, validation_df, testing_df = selected_true_keywords_clean.randomSplit([0.6, 0.3, 0.1])

In [51]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler

In [52]:
va = VectorAssembler().setInputCols(training_df.columns[1:3] + training_df.columns[4:11]).setOutputCol('features')

In [53]:
training_df.columns[1:3] + training_df.columns[4:11]

['abs_first_occurrence',
 'abs_last_occurrence',
 'in_excerpt',
 'in_title',
 'lexical_cohesion',
 'max_word_length',
 'spread',
 'term_count',
 'term_length']

In [71]:
rf = RandomForestClassifier()
rf200 = RandomForestClassifier(numTrees=200)
rfSc = RandomForestClassifier(featuresCol='features', numTrees=200)

In [55]:
training_df.count()

28437

In [60]:
rf_pipeline = Pipeline(stages=[va, rf]).fit(training_df)

In [79]:
rf_pipeline200 = Pipeline(stages=[va, rf200]).fit(training_df)

In [58]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bce = BinaryClassificationEvaluator()

In [68]:
bce.evaluate(rf_pipeline.transform(validation_df))

0.838938931173031

In [80]:
bce.evaluate(rf_pipeline200.transform(validation_df))

0.8450503054257902

In [81]:
rf_pipeline200all = Pipeline(stages=[va, rf200]).fit(selected_true_keywords_clean)

In [82]:
bce.evaluate(rf_pipeline200all.transform(selected_true_keywords_clean))

0.8602612024008018

In [88]:
rf_pipeline200all.transform(selected_true_keywords_clean).where(fn.col('label')==1)\
.select('keyword', 'label', 'prediction', 'probability').show(20000, True)

+--------------------+-----+----------+--------------------+
|             keyword|label|prediction|         probability|
+--------------------+-----+----------+--------------------+
|                 web|    1|       0.0|[0.93822336293054...|
|content-targeted ...|    1|       0.0|[0.61274275923062...|
|   matching strategy|    1|       0.0|[0.96469844111965...|
|           collusion|    1|       0.0|[0.97350973371707...|
|   asymmetric payoff|    1|       0.0|[0.92817858795700...|
|           incentive|    1|       0.0|[0.93775918026126...|
|          reputation|    1|       0.0|[0.96513527113894...|
|   stranger adaptive|    1|       0.0|[0.97532419024750...|
|        peer-to-peer|    1|       0.0|[0.90740478376528...|
|adaptive stranger...|    1|       0.0|[0.96438793080626...|
|reciprocative dec...|    1|       0.0|[0.74762240934435...|
|     stranger defect|    1|       0.0|[0.96613716497151...|
|          generosity|    1|       0.0|[0.99186053455724...|
|         negotiation|  

In [64]:
from pyspark.ml.classification import LogisticRegression

In [65]:
lr = LogisticRegression()
lr_pipeline = Pipeline(stages=[va, lr]).fit(training_df)

In [67]:
bce.evaluate(lr_pipeline.transform(validation_df))

0.8286210106817244

In [85]:
lr_pipelineAll = Pipeline(stages=[va, lr]).fit(selected_true_keywords_clean)

In [86]:
bce.evaluate(lr_pipelineAll.transform(selected_true_keywords_clean))

0.830623895203096

In [89]:
lr_pipeline.transform(selected_true_keywords_clean).where(fn.col('label')==1).\
select('keyword', 'label', 'prediction', 'probability').show(2000, False)

+-------------------------------------+-----+----------+------------------------------------------+
|keyword                              |label|prediction|probability                               |
+-------------------------------------+-----+----------+------------------------------------------+
|web                                  |1    |0.0       |[0.9763164545237345,0.023683545476265413] |
|content-targeted advertising         |1    |1.0       |[0.4685581502772011,0.531441849722799]    |
|matching strategy                    |1    |0.0       |[0.984808095579221,0.015191904420778956]  |
|collusion                            |1    |0.0       |[0.9589880919965099,0.04101190800349009]  |
|asymmetric payoff                    |1    |0.0       |[0.974217269990562,0.025782730009438135]  |
|incentive                            |1    |0.0       |[0.9480485408962956,0.05195145910370439]  |
|reputation                           |1    |0.0       |[0.9498555245361167,0.050144475463883294] |


Below, we change the input parameters 

In [92]:
va2 = VectorAssembler(
  inputCols=[ 'in_excerpt',
 'in_title',
 'lexical_cohesion',
 'max_word_length',
 'term_count',
 'term_length',
 'keyword_tf'], outputCol="features")

In [96]:
rf_pipeline5 = Pipeline(stages=[va2, rf200]).fit(training_df)

In [97]:
bce.evaluate(rf_pipeline5.transform(validation_df))

0.8420189135334649

In [93]:
lr2 = LogisticRegression().setElasticNetParam(0.1)
lr2_pipeline = Pipeline(stages=[va2, lr2]).fit(training_df)

In [94]:
bce.evaluate(lr2_pipeline.transform(validation_df))

0.8160437014775815

In [95]:
lr2_pipeline.transform(selected_true_keywords_clean).where(fn.col('label')==1).\
select('keyword', 'label', 'prediction', 'probability').show(200, False)

+-------------------------------------+-----+----------+------------------------------------------+
|keyword                              |label|prediction|probability                               |
+-------------------------------------+-----+----------+------------------------------------------+
|web                                  |1    |0.0       |[0.9811114414507598,0.018888558549240247] |
|content-targeted advertising         |1    |0.0       |[0.5009541984145209,0.4990458015854791]   |
|matching strategy                    |1    |0.0       |[0.9703931221448147,0.029606877855185263] |
|collusion                            |1    |0.0       |[0.9635315118767669,0.03646848812323313]  |
|asymmetric payoff                    |1    |0.0       |[0.9824828801098356,0.017517119890164422] |
|incentive                            |1    |0.0       |[0.9697163488904234,0.030283651109576606] |
|reputation                           |1    |0.0       |[0.9502397789624032,0.04976022103759679]  |
