# Required Installations

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any

# Colors

In [None]:
# Color vocabulary used throughout the notebook; frequencies are reported in
# this order.
colors = [
  'red', 'blue', 'yellow', 'pink', 'white',
  'black', 'green', 'orange', 'purple', 'brown',
]

In [None]:
# Builds a fresh tally for the colors considered in this dataset: every color
# starts at 0, ready to count the common sense examples that mention it.
def color_freq_dict():
  tracked_colors = ('red', 'blue', 'yellow', 'pink', 'white',
                    'black', 'green', 'orange', 'purple', 'brown')
  # A new dict is created on every call, so tallies never share state.
  return dict.fromkeys(tracked_colors, 0)

In [None]:
# Prints one "<color>: <count>" line for each entry of `colors`, in order,
# reading the counts from `color_freq`.
def print_freq(color_freq, colors):
  for name in colors:
    count_text = str(color_freq[name])
    print(': '.join((name, count_text)))

# ConceptNet API

In the ConceptNet API, each edge's `surfaceText` field holds the natural-language sentence on which the relationship is based. Further, a sentence about a color must either begin with `[[color]]` or end with `[[color]]`. So, for each of the colors under consideration, we query the API twice: once for sentences that start with the color and once for sentences that end with it.



In [None]:
import requests

In [None]:
def find_color(query_str, color_freq, colors, timeout=60):
  """Collects cleaned ConceptNet sentences for every color in `colors`.

  For each color, `query_str` is formatted with the color name, the resulting
  URL is fetched, and every entry of the response's 'edges' list that has a
  non-null 'surfaceText' contributes one cleaned sentence.

  Args:
    query_str: URL template with one `{}` placeholder for the color name.
    color_freq: dict mapping color -> count. Updated in place: incremented
      once for every sentence kept for that color.
    colors: iterable of color names to query.
    timeout: seconds passed to `requests.get`, so that a stalled connection
      raises instead of hanging the cell indefinitely.

  Returns:
    List of lower-cased sentences with the '[' and ']' markers removed, in
    the order the API returned them, color by color.

  Raises:
    requests.HTTPError: if the API answers with a 4xx/5xx status.
    requests.Timeout: if the server does not respond within `timeout`.
  """
  examples = []  # every cleaned sentence, across all colors

  for color in colors:
    response = requests.get(query_str.format(color), timeout=timeout)

    # Fail loudly on an error status instead of running into a confusing
    # KeyError / JSON decoding error further down.
    response.raise_for_status()

    for edge in response.json()['edges']:
      surface_text = edge['surfaceText']

      # Some edges carry no natural-language sentence; skip those.
      if surface_text is None:
        continue

      # Sentences appear in the form "[[red]] is a [[color]]": drop the
      # bracket markers, then convert to lower case.
      cleaned = surface_text.replace('[', '').replace(']', '').lower()

      examples.append(cleaned)
      color_freq[color] += 1

  return examples

In [None]:
# URL template for querying the ConceptNet API; find_color fills `{}` with
# each color name. The `start=` parameter selects the sentences that start
# with the color. `limit=1000` is presumably the cap on results returned per
# request — confirm against the API documentation.
query_str = 'http://api.conceptnet.io/query?start=/c/en/{}&limit=1000'

# Fresh zeroed tally for this pass; find_color increments it in place.
start_color_freq = color_freq_dict()

# Cleaned sentences collected for every color in `colors`.
start_examples = find_color(query_str, start_color_freq, colors)

In [None]:
# Per-color number of sentences kept from the `start=` queries.
print_freq(start_color_freq, colors)

red: 80
blue: 202
yellow: 28
pink: 12
white: 41
black: 69
green: 108
orange: 60
purple: 18
brown: 10


In [None]:
# URL template for querying the ConceptNet API; find_color fills `{}` with
# each color name. The `end=` parameter selects the sentences that end with
# the color. `limit=1000` is presumably the cap on results returned per
# request — confirm against the API documentation.
query_str = 'http://api.conceptnet.io/query?end=/c/en/{}&limit=1000'

# Fresh zeroed tally for this pass; find_color increments it in place.
end_color_freq = color_freq_dict()

# Cleaned sentences collected for every color in `colors`.
end_examples = find_color(query_str, end_color_freq, colors)

In [None]:
# Per-color number of sentences kept from the `end=` queries.
print_freq(end_color_freq, colors)

red: 109
blue: 102
yellow: 105
pink: 99
white: 160
black: 133
green: 195
orange: 144
purple: 67
brown: 129


In [None]:
# Writes every ConceptNet sentence, one per line: the start-query results
# first, then the end-query results.
# Mode 'w' (rather than 'a') keeps this cell idempotent, so re-running it does
# not append a second copy of the data. The explicit encoding avoids relying
# on the platform default when a sentence contains non-ASCII characters.
with open('conceptnet.txt', 'w', encoding='utf-8') as f:
  f.writelines(ex + '\n' for ex in start_examples)
  f.writelines(ex + '\n' for ex in end_examples)

# OMCS Common Sense Dataset

In [None]:
from datasets import load_dataset

In [None]:
from tqdm import tqdm

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import string

In [None]:
# Loads the OMCS commonsense-facts dataset from the Hugging Face Hub; the
# facts are read from its "train" split further down.
dataset = load_dataset("dutta18/omcs_dataset_of_commonsense_facts")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/812 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1578238 [00:00<?, ? examples/s]

In [None]:
# Accumulates the normalized OMCS facts that mention one of the colors.
omcs_examples = []

In [None]:
# Zeroed per-color tally for the OMCS pass.
color_freq = color_freq_dict()

In [None]:
# Iterates through all facts in the OMCS dataset and checks if it contains
# a color in our list.
# If it does, the fact is lower-cased, stripped of punctuation, appended to
# our list of examples, and counted once, for the first color in `colors`
# that it mentions.

# Built once up front: deletes every character in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)

for sample in tqdm(dataset["train"]):

  # Extracts fact from the sample.
  fact = sample['fact']

  # Tokenizes once per fact rather than once per color: the tokens do not
  # depend on the color being tested.
  # NOTE(review): matching is case-sensitive and happens before lower-casing,
  # so a capitalized mention such as "Red" is not matched — confirm whether
  # that is intended.
  fact_tokens = word_tokenize(fact)

  # Iterates through all colors, in list order.
  for color in colors:

    # Checks if this color is mentioned in the fact.
    if color in fact_tokens:

      # Converts sentence to lowercase and removes all punctuation.
      fact = fact.lower().translate(punctuation_table)

      omcs_examples.append(fact)

      # Increases the number of examples for a given color by 1.
      color_freq[color] += 1

      # Stops at the first match so a fact is stored and counted only once.
      break

100%|██████████| 1578238/1578238 [30:54<00:00, 850.93it/s] 


In [None]:
# Writes the kept OMCS facts, one per line.
# Mode 'w' (rather than 'a') keeps this cell idempotent, so re-running it does
# not append a second copy of the data. The explicit encoding avoids relying
# on the platform default when a fact contains non-ASCII characters.
with open('omcs_dataset.txt', 'w', encoding='utf-8') as f:
  f.writelines(fact + '\n' for fact in omcs_examples)

In [None]:
# Per-color number of OMCS facts kept; each fact is counted only for the
# first color in `colors` that it mentions.
print_freq(color_freq, colors)

red: 2968
blue: 1880
yellow: 1382
pink: 283
white: 3040
black: 1951
green: 1992
orange: 890
purple: 320
brown: 859
