# Kaggle Datasets Scraper

In [70]:
import time
import kaggle
import pandas as pd
import tqdm
import pickle

# Authenticate the shared Kaggle API client once, before any API call is made.
# NOTE(review): credentials are presumably picked up from a local kaggle.json or
# environment variables — confirm for your setup.
kaggle.api.authenticate()

## List of all Kaggle Datasets

Download the tables `Datasets`, `DatasetVersions`, and `Users` from the [Meta Kaggle](https://www.kaggle.com/datasets/kaggle/meta-kaggle) dataset.

In [34]:
# Load the Meta Kaggle `Datasets` table, keep only the id/ownership columns,
# and expose the dataset id as `DatasetId`.
# The two id columns that contain missing values are read as nullable Int64
# so they stay integral instead of being upcast to float.
username_slug = (
    pd.read_csv(
        "../data/Datasets.csv",
        dtype={"CurrentDatasetVersionId": "Int64", "OwnerUserId": "Int64"},
    )
    .loc[:, ["Id", "CreatorUserId", "OwnerUserId", "CurrentDatasetVersionId"]]
    .rename(columns={"Id": "DatasetId"})
)
username_slug.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312769 entries, 0 to 312768
Data columns (total 4 columns):
 #   Column                   Non-Null Count   Dtype
---  ------                   --------------   -----
 0   DatasetId                312769 non-null  int64
 1   CreatorUserId            312769 non-null  int64
 2   OwnerUserId              310449 non-null  Int64
 3   CurrentDatasetVersionId  312664 non-null  Int64
dtypes: Int64(2), int64(2)
memory usage: 10.1 MB


In [38]:
# Load the Meta Kaggle `DatasetVersions` table, keep the columns needed to
# look up a version's slug, and expose the version id as `DatasetVersionId`.
dataset_versions = (
    pd.read_csv("../data/DatasetVersions.csv")
    .loc[:, ["Id", "CreatorUserId", "VersionNumber", "Slug"]]
    .rename(columns={"Id": "DatasetVersionId"})
)
dataset_versions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054141 entries, 0 to 1054140
Data columns (total 4 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   DatasetVersionId  1054141 non-null  int64  
 1   CreatorUserId     1054141 non-null  int64  
 2   VersionNumber     980122 non-null   float64
 3   Slug              1054141 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 32.2+ MB


In [52]:
# Load the Meta Kaggle `Users` table, reduced to the id -> user name mapping,
# with the id exposed as `UserId`.
users = (
    pd.read_csv("../data/Users.csv")
    .loc[:, ["Id", "UserName"]]
    .rename(columns={"Id": "UserId"})
)
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17553389 entries, 0 to 17553388
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   UserId    int64 
 1   UserName  object
dtypes: int64(1), object(1)
memory usage: 267.8+ MB


In [55]:
# Left-join every dataset to its current version (which carries the slug) and
# to its owner's user record (which carries the user name), then drop the rows
# that ended up without a user name or without a slug.
merged = (
    username_slug
    .merge(
        dataset_versions,
        how="left",
        left_on="CurrentDatasetVersionId",
        right_on="DatasetVersionId",
    )
    .merge(users, how="left", left_on="OwnerUserId", right_on="UserId")
    .dropna(subset=["UserName", "Slug"])
)
merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 301969 entries, 4 to 312768
Data columns (total 10 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   DatasetId                301969 non-null  int64  
 1   CreatorUserId_x          301969 non-null  int64  
 2   OwnerUserId              301969 non-null  Int64  
 3   CurrentDatasetVersionId  301969 non-null  Int64  
 4   DatasetVersionId         301969 non-null  float64
 5   CreatorUserId_y          301969 non-null  float64
 6   VersionNumber            301783 non-null  float64
 7   Slug                     301969 non-null  object 
 8   UserId                   301969 non-null  float64
 9   UserName                 301969 non-null  object 
dtypes: Int64(2), float64(4), int64(2), object(2)
memory usage: 25.9+ MB


In [56]:
# Persist only the (UserName, Slug) pairs, without the index column, so the
# scraping section can start from this file.
merged[["UserName", "Slug"]].to_csv("../data/username_slug.csv", index=False)

## Gather dataset metadata

In [57]:
# Reload the (UserName, Slug) pairs written earlier in this notebook.
# Every field is read as a plain string with NA inference disabled: by default
# read_csv converts values such as "NA", "null", "None", or "nan" into missing
# values and may parse purely numeric names as numbers, either of which would
# corrupt user names or slugs that happen to look like those tokens.
username_slug = pd.read_csv(
    "../data/username_slug.csv",
    dtype=str,
    keep_default_na=False,
)

In [None]:
# Collect the dataset listing of every distinct user, page by page.
# `metadata` accumulates the objects returned by the API; `errors` records
# (user, exception) pairs for users whose listing failed, so that one bad
# user does not abort the whole run.
metadata = []
errors = []
for user in tqdm.tqdm(username_slug["UserName"].unique()):
    try:
        page = 1
        while True:
            results = kaggle.api.dataset_list(
                sort_by="hottest",
                file_type="csv",
                license_name="all",
                user=user,
                page=page,
            )
            metadata.extend(results)
            # A short page is treated as the last page for this user.
            if len(results) < 10:
                break

            page += 1
            if page > 500:
                print(
                    f"User {user} has more than 10000 datasets. The API won't show all of them."
                )
                # Stop paging for this user: without this break the loop kept
                # requesting pages beyond the cap and repeated the warning.
                break

            # Small pause between consecutive page requests.
            time.sleep(0.01)
    except Exception as e:
        errors.append((user, e))

# Surface failures instead of leaving them unnoticed in `errors`.
if errors:
    print(f"{len(errors)} users could not be fetched; inspect `errors` for details.")

with open("../data/metadata.pkl", "wb") as file:
    pickle.dump(metadata, file)

## Kaggle API

In [3]:
# Example call: download all files of one dataset into ./data and unzip them.
kaggle.api.dataset_download_files('sudalairajkumar/novel-corona-virus-2019-dataset', path='./data', unzip=True)

dataset_list
dataset_metadata
dataset_status
dataset_list_files

In [22]:
# Walk the unfiltered "hottest" CSV listing page by page until the API
# returns an empty page, accumulating every result in `metadata`.
metadata = []
page = 1
while True:
    results = kaggle.api.dataset_list(
        sort_by="hottest",
        file_type="csv",
        license_name="all",
        tag_ids="",
        search="",
        page=page,
    )
    # Advance first so `page` always names the next page to request.
    page += 1

    if len(results) > 0:
        metadata.extend(results)
        # Pause between consecutive page requests.
        time.sleep(0.1)
    else:
        break

In [24]:
# Number of datasets collected by the paginated listing loop.
len(metadata)

9976

In [38]:
# Probe page 500 of the unfiltered listing to see what the API still returns
# that deep into the pagination.
kaggle.api.dataset_list(
    tag_ids="",
    search="",
    page=500,
)

[hashidoyuto/presidio-analyzer,
 ady5215758/model80,
 thefc17/epl-results-19932018,
 yasserh/air-passengers-forecast-dataset,
 miguelrcborges/league-of-legends-patch-14-1-soloq-teamcomp-30k,
 humananalog/deepfakes-inference-demo,
 vinayakshanawad/heart-rate-prediction-to-monitor-stress-level,
 raniajaberi/custom-data,
 debanga/facial-expression-recognition-challenge,
 jonathanimmanuel/barcode-and-qr,
 mahoora00135/flights,
 esensing/sits-bundle,
 japandata509/shinkansen-stations-in-japan,
 catalystcooperative/pudl-project,
 tombutton/weather-data-edexcel-large-data-set,
 arnabchaki/tripadvisor-reviews-2023,
 asaniczka/san-francisco-police-stop-data-2018-2023,
 zusmani/pakistans-job-market,
 sunethjayawardana/google-data-analyst-case-study-cyclist,
 noordeen/employee-attrition]

In [19]:
# Search for one known dataset and dump the attributes of the first hit to
# see which metadata fields a listing result carries.
# Raises IndexError if the search returns no results.
kaggle.api.dataset_list(
    sort_by="hottest",
    file_type="csv",
    license_name="all",
    tag_ids="",
    search="novel-corona-virus-2019-dataset",
    page=1,
)[0].__dict__

{'subtitleNullable': 'Day level information on covid-19 affected cases',
 'creatorNameNullable': 'SRK',
 'creatorUrlNullable': 'sudalairajkumar',
 'totalBytesNullable': 8928752,
 'urlNullable': 'https://www.kaggle.com/datasets/sudalairajkumar/novel-corona-virus-2019-dataset',
 'licenseNameNullable': 'Data files © Original Authors',
 'descriptionNullable': None,
 'ownerNameNullable': 'SRK',
 'ownerRefNullable': 'sudalairajkumar',
 'titleNullable': 'Novel Corona Virus 2019 Dataset',
 'currentVersionNumberNullable': 151,
 'usabilityRatingNullable': 0.9705882,
 'id': 494724,
 'ref': 'sudalairajkumar/novel-corona-virus-2019-dataset',
 'subtitle': 'Day level information on covid-19 affected cases',
 'hasSubtitle': True,
 'creatorName': 'SRK',
 'hasCreatorName': True,
 'creatorUrl': 'sudalairajkumar',
 'hasCreatorUrl': True,
 'totalBytes': 8928752,
 'hasTotalBytes': True,
 'url': 'https://www.kaggle.com/datasets/sudalairajkumar/novel-corona-virus-2019-dataset',
 'hasUrl': True,
 'lastUpdated'

In [14]:
# Example call: fetch the metadata of one dataset into ./test; the displayed
# return value is the path of the written JSON file.
kaggle.api.dataset_metadata('sudalairajkumar/novel-corona-virus-2019-dataset', "./test")

'./test/dataset-metadata.json'

In [20]:
# Example call: download all files of another dataset into ./data and unzip them.
kaggle.api.dataset_download_files('marquis03/flower-classification', path='./data', unzip=True)