[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kubotaissei/defamation_japanese_twitter/blob/master/notebooks/get_dataset_example.ipynb)

In [None]:
!pip install datasets==2.8.0

In [1]:
# sample code from https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/main/Tweet-Lookup/get_tweets_with_bearer_token.py
import requests
import os
import json
from datasets import load_dataset

# To set your environment variables in your terminal run the following line:
# export 'BEARER_TOKEN'='<your_bearer_token>'
bearer_token = os.environ.get("BEARER_TOKEN")


def create_url(ids: list):
    """
    Build the tweet-lookup URL for a batch of tweet IDs.

    Args:
        ids: Tweet IDs to look up. Each element may be a string or an
            integer; it is converted with ``str`` before being joined.

    Returns:
        The ``https://api.twitter.com/2/tweets`` URL carrying the
        comma-separated ``ids`` query parameter followed by
        ``tweet.fields=created_at``.
    """
    tweet_fields = "tweet.fields=created_at"
    # Convert every element so integer IDs do not make str.join raise TypeError.
    ids_param = "ids=" + ",".join(str(tweet_id) for tweet_id in ids)
    return f"https://api.twitter.com/2/tweets?{ids_param}&{tweet_fields}"


def bearer_oauth(r):
    """
    Method required by bearer token authentication.

    Sets the ``Authorization`` and ``User-Agent`` headers on the outgoing
    request ``r`` and returns the same object.

    Raises:
        RuntimeError: If the module-level ``bearer_token`` is ``None`` or
            empty.
    """
    if not bearer_token:
        # Without this guard the header would literally read "Bearer None"
        # and the problem would only surface as an error from the server.
        raise RuntimeError(
            "bearer_token is empty; set the BEARER_TOKEN environment variable "
            "before running this code"
        )

    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2TweetLookupPython"
    return r


def connect_to_endpoint(url, timeout=30):
    """
    Send an authenticated GET request to ``url`` and return the parsed JSON.

    Args:
        url: Full request URL.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.request``.

    Returns:
        The response body as decoded by ``response.json()``.

    Raises:
        Exception: If the response status code is not 200. The message
            contains the status code and the response text.
    """
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the caller indefinitely.
    response = requests.request("GET", url, auth=bearer_oauth, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            f"Request returned an error: {response.status_code} {response.text}"
        )
    return response.json()


def get_text_data(examples):
    """
    Look up the text and creation time for a batch of tweet IDs.

    Args:
        examples: Batch mapping whose ``"id"`` entry holds the tweet IDs.

    Returns:
        A dict with ``"text"`` and ``"created_at"`` lists aligned with
        ``examples["id"]``. An entry is ``None`` when the corresponding ID
        is not present in the response.
    """
    tweet_ids = examples["id"]
    json_response = connect_to_endpoint(create_url(tweet_ids))

    # "data" can be missing from the response when none of the requested
    # tweets could be returned (e.g. all deleted); treat that as an empty
    # result instead of raising KeyError.
    tweets = json_response.get("data") or []
    fields_by_id = {
        tweet["id"]: (tweet["text"], tweet["created_at"]) for tweet in tweets
    }

    not_found = (None, None)
    # Response IDs are used as returned; str() is a no-op for string IDs.
    found = [fields_by_id.get(str(tweet_id), not_found) for tweet_id in tweet_ids]
    return {
        "text": [text for text, _ in found],
        "created_at": [created_at for _, created_at in found],
    }


# Load the published dataset; the tweet text is added by the map step below.
dataset = load_dataset(path="kubota/defamation-japanese-twitter")

# Each batch becomes one lookup request, so batch_size bounds the number of
# IDs per request (100 is presumably the API's per-request cap -- confirm).
dataset = dataset.map(function=get_text_data, batch_size=100, batched=True)

# Preview the first rows of the training split.
dataset["train"].to_pandas().head()

Found cached dataset defamation-japanese-twitter (/root/.cache/huggingface/datasets/kubota___defamation-japanese-twitter/plain_text/1.0.0/caa520e7d83339c4b77febb17a1fc8cdc5c0414312eb63049542eea77aff6962)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

Unnamed: 0,id,target,label,user_id_list,text,created_at
0,1494101762587836419,"[3, 1, 3]","[4, 3, 4]","[65, 90, 51]",ジェンダーとかそういうのは関係なく、男の子はこんなもん、女の子だからこういうもん、そういう言...,2022-02-17T00:09:53.000Z
1,1494127310340976648,"[1, 1, 1]","[2, 2, 2]","[65, 90, 51]",@sharenewsjapan1 河村は阿呆か？\n今韓国と仲良くお手てつないでなどできるは...,2022-02-17T01:51:24.000Z
2,1494128253279567872,"[3, 1, 1]","[1, 1, 1]","[65, 90, 51]",低能共がドヤ顔で写真撮る権利なんて要らねえよ。\n糞どもはやく死なねえかな。\n鉄道は静かに...,2022-02-17T01:55:09.000Z
3,1494131907076190209,"[1, 3, 1]","[1, 4, 1]","[65, 90, 51]",ライル：（…消えてしまえ、消えてしまえ消えろ消えろ消えろ消えろっ!!!!マフィアなんて全部、...,2022-02-17T02:09:40.000Z
4,1494150116118515715,"[1, 1, 3]","[3, 3, 4]","[65, 90, 51]",自分が住んでいる都市は街全体がおっきな老人ホームみたいになっていて、老人の住みやすさが優先さ...,2022-02-17T03:22:02.000Z
