The purpose of this notebook is to read the SimLex-999 dataset, and for each concept find its semantic features by requesting information from the ConceptNet knowledge graph. 

## Libraries

In [8]:
import requests
import csv
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats.stats import spearmanr
import pickle
import time

# for stem
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jobqu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## 1. Reading the SimLex-999 dataset

Only extracting the **noun subset**. 

In [18]:
# Load the SimLex-999 word pairs from the Excel sheet.
df = pd.read_excel('../Data/SimLex.xlsx')

# Selecting only nouns
df = df[df['POS'] == 'N']
# %-formatting prints the same text under both the Python 2 print statement
# semantics and the Python 3 print function.
print("Number of pais in datasaet: %d" % len(df))

# Unique concepts over both word columns, in the order produced by
# ravel('K') followed by pd.unique. Built as a real list (not a lazy `map`
# object) so that len() and repeated iteration work on Python 3 as well.
Concepts = [str(concept)
            for concept in pd.unique(df[['word1','word2']].values.ravel('K'))]
print("Number of total concepts %d" % len(Concepts))

Number of pais in datasaet: 666
Number of total concepts 751


## 2. Finding features from ConceptNet

For each concept (751) a set of semantic features should be obtained from ConceptNet

### 2.1 Relevant Relations
ConceptNet has more than 30 types of relations. We will focus only on the most relevant ones (in a semantic sense), leaving out purely linguistic relations (such as the roots of words).

In [10]:
# ConceptNet relation names that are queried for every concept.
Relations = [
    'RelatedTo', 'IsA', 'PartOf', 'HasA',
    'UsedFor', 'CapableOf', 'AtLocation', 'Causes',
    'HasSubevent', 'HasFirstSubevent', 'HasLastSubevent', 'HasPrerequisite',
    'HasProperty', 'MotivatedByGoal', 'ObstructedBy', 'Desires',
    'CreatedBy', 'SymbolOf', 'DefinedAs', 'Entails',
    'MannerOf', 'LocatedNear', 'SimilarTo', 'CausesDesire',
    'MadeOf', 'ReceivesAction',
]


# Maps a relation name to [English phrase, integer].
# In every entry the integer equals the number of words in the phrase;
# NOTE(review): presumably consumers use it to skip/strip the phrase -- this
# notebook does not read Dict_relations itself, so confirm against the caller.
# Every phrase starts with 'it ' ('SymbolOf' previously read 'is symbolf of').
Dict_relations = {
    'AtLocation': ['it is located at', 4],
    'AttachedTo': ['it is attached to', 4],
    'AttractedTo': ['it is attracted to', 4],
    'BoughtAt': ['it is bought at', 4],
    'BuiltFor': ['it is built for', 4],
    'CapableOf': ['it is capable of', 4],
    'CausedBy': ['it is caused by', 4],
    'Causes': ['it causes', 2],
    'CausesDesire': ['it causes desire', 3],
    'ComposedOf': ['it is composed of', 4],
    'ComprisedOf': ['it is comprised of', 4],
    'ConnectedTo': ['it is connected to', 4],
    'Considered': ['it is considered', 3],
    'CoveredBy': ['it is covered by', 4],
    'CoveredWith': ['it is covered with', 4],
    'CreatedBy': ['it is created by', 4],
    'DefinedAs': ['it is defined as', 4],
    'Desires': ['it desires', 2],
    'DividedInto': ['it is divided into', 4],
    'DrawnTo': ['it is drawn to', 4],
    'Entails': ['it entails', 2],
    'FilledWith': ['it is filled with', 4],
    'FoundOn': ['it is found on', 4],
    'HasA': ['it has a', 3],
    'HasFirstSubevent': ['it has as first subevent', 5],
    'HasLastSubevent': ['it has as last subevent', 5],
    'HasPrerequisite': ['it has as prerequisite', 4],
    'HasProperty': ['it has the property', 4],
    'HasSubevent': ['it has as subevent', 4],
    'IsA': ['it is a', 3],
    'KeptIn': ['it is kept in', 4],
    'LocatedNear': ['it is located near', 4],
    'MadeFrom': ['it is made from', 4],
    'MadeIn': ['it is made in', 4],
    'MadeInto': ['it is made into', 4],
    'MadeOf': ['it is made of', 4],
    'MadeOn': ['it is made on', 4],
    'MadeTo': ['it is made to', 4],
    'MadeUp': ['it is made up', 4],
    'MadeUpOf': ['it is made up of', 5],
    'MannerOf': ['it is a manner of', 5],
    'MotivatedByGoal': ['it is motivated by', 4],
    'NeededTo': ['it is needed to', 4],
    'ObstructedBy': ['it is obstructed by', 4],
    'PartOf': ['it is a part of', 5],
    'PlantedIn': ['it is planted in', 4],
    'PlayedBy': ['it is played by', 4],
    'PlayedIn': ['it is played in', 4],
    'PlayedOn': ['it is played on', 4],
    'PlayedWith': ['it is played with', 4],
    'ProducedBy': ['it is produced by', 4],
    'ReceivesAction': ['it receives action', 3],
    'RelatedTo': ['it is related to', 4],
    'SimilarTo': ['it is similar to', 4],
    'ServedAt': ['it is served at', 4],
    'ServedIn': ['it is served in', 4],
    'SoldAt': ['it is sold at', 4],
    'SoldBy': ['it is sold by', 4],
    'SoldIn': ['it is sold in', 4],
    'StoredIn': ['it is stored in', 4],
    'StoredOn': ['it is stored on', 4],
    'UsedAs': ['it is used as', 4],
    'UsedBy': ['it is used by', 4],
    'UsedFor': ['it is used for', 4],
    'UsedIn': ['it is used in', 4],
    # Fixed typo ('symbolf') and missing 'it'; worded like 'PartOf'.
    'SymbolOf': ['it is a symbol of', 5],
}

### Auxiliary Function

In [11]:
def clean_term(string):
    """Return the last '/'-separated segment of a ConceptNet URI.

    Example: '/c/en/loyal_friend' -> 'loyal_friend'. A string that contains
    no '/' is returned unchanged; a string ending in '/' yields ''.

    Returns
    -------
    str
        The last segment converted with ``str``.
    list
        An empty list if that conversion fails with a Unicode error (on
        Python 2 this happens for unicode terms with non-ASCII characters).
        This fallback value is kept so existing callers behave as before.
    """
    term = string.rsplit('/', 1)[-1]
    try:
        return str(term)
    except UnicodeError:
        # Only encoding problems mean "no usable term". A bare `except:`
        # would also swallow KeyboardInterrupt and genuine programming errors.
        return []

## 3. Main loop to request semantic features from ConceptNet

ConceptNet requires a pause of about 2 seconds between consecutive requests (a ConceptNet restriction), so the loop below sleeps before every query. With 751 concepts and 26 relations per concept (about 19,500 requests), this loop can take up to 12 hours.

In [19]:
max_num_feat = 30 #Max features
max_feats_per_relation = 10 # Max features per relations

All_sem_features = []  # One list of feature rows per concept, in `Concepts` order


def _features_from_edges(edges, limit):
    """Turn ConceptNet edges into feature rows, strongest first.

    Parameters
    ----------
    edges : list of dict
        Edge objects taken from the 'edges' key of a query response. Each
        one is read through edge['start']['term'], edge['rel']['label'],
        edge['end']['term'] and edge['weight'].
    limit : int
        Maximum number of rows to collect.

    Returns
    -------
    list of list
        Rows [concept, (concept, 'N'), relation, object, weight], sorted by
        descending weight.
    """
    rows = []
    for edge in edges:
        # Stop once exactly `limit` rows are collected (the previous counter
        # check `i > limit` let one extra row through).
        if len(rows) >= limit:
            break
        o_name = edge['end']['term']
        # Skip edges whose end node is not English. This used to `break`,
        # which also threw away every English edge listed after it.
        if not o_name.startswith('/c/en/'):
            continue
        concept = clean_term(edge['start']['term'])
        rows.append([concept, (concept, 'N'), str(edge['rel']['label']),
                     clean_term(o_name), edge['weight']])
    rows.sort(key=lambda row: row[4], reverse=True)
    return rows


# Loop for requesting edges from ConceptNet graph
for Concept in Concepts:
    Sem_features = []
    for relation in Relations:
        URL = ('http://api.conceptnet.io/query?start=' + '/c/en/' + Concept +
               '&rel=/r/' + relation + '&filter=/c/en')
        # Pause to avoid errors..
        time.sleep(2.1)
        # Requesting edges. The timeout keeps one stalled connection from
        # hanging this multi-hour loop; raise_for_status reports an HTTP
        # error directly instead of failing later on a missing 'edges' key.
        response = requests.get(URL, timeout=60)
        response.raise_for_status()
        edges = response.json()['edges']

        L_rel = _features_from_edges(edges, max_feats_per_relation)
        # NOTE(review): max_num_feat is applied per relation, where at most
        # max_feats_per_relation rows exist, so with the current values this
        # slice never trims anything -- confirm whether the cap was meant for
        # the concept's combined list below.
        Sem_features.append(L_rel[:max_num_feat])

    # Merge the per-relation lists and order the concept's features by weight.
    Final_feats = [feat for rel_feats in Sem_features for feat in rel_feats]
    Final_feats.sort(key = lambda x : x[4], reverse = True)
    All_sem_features.append(Final_feats)
    # Progress indicator; print(...) with one argument behaves the same on
    # Python 2 and Python 3.
    print(Concept)

KeyboardInterrupt: 

## 4. Exporting file with semantic features

In [17]:
# Write every feature row of every concept to one CSV file.
# The csv module needs a binary-mode file on Python 2 but a text-mode file
# opened with newline='' on Python 3; `str is bytes` is True only on Python 2.
if str is bytes:
    csvfile = open('../Data/ConceptNet_Semantic_Features_1.csv', 'wb')
else:
    csvfile = open('../Data/ConceptNet_Semantic_Features_1.csv', 'w', newline='')

# The with-block closes the file on exit, so no explicit close() is needed.
with csvfile:
    filewriter = csv.writer(csvfile, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
    filewriter.writerow(['Concept','Name + POS','feat_name','feat_value','weight'])
    for x in All_sem_features:
        for row in x:
            # The (name, POS) tuple is written as text with its comma removed,
            # e.g. "('dog' 'N')", so it does not need quoting in the CSV.
            filewriter.writerow([row[0]] + [str(row[1]).replace(',','')] + row[2:])