In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Other imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "Classifications"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

import json

In [2]:
try:
    import xmlrpclib
except ImportError:
    import xmlrpc.client as xmlrpclib

client = xmlrpclib.ServerProxy('https://pypi.python.org/pypi')
# get a list of package names
packages = client.list_packages()
print(packages[:10])
print(len(packages))
print(type(packages))

['procedural', 'hardik-distributions', 'pyside2-utils', 'commune', 'spy-probability', 'jsii-native-python', 'latent-space-viz', 'pygot', 'distributions-by-Bash', 'pyrblx']
335782
<class 'list'>


In [4]:
pip install --upgrade google-cloud-bigquery

Collecting google-cloud-bigqueryNote: you may need to restart the kernel to use updated packages.

  Downloading google_cloud_bigquery-2.29.0-py2.py3-none-any.whl (203 kB)
Collecting protobuf>=3.12.0
  Downloading protobuf-3.19.0-cp38-cp38-win_amd64.whl (895 kB)
Collecting google-api-core[grpc]<3.0.0dev,>=1.29.0
  Downloading google_api_core-2.2.1-py2.py3-none-any.whl (95 kB)
Collecting google-cloud-core<3.0.0dev,>=1.4.1
  Downloading google_cloud_core-2.1.0-py2.py3-none-any.whl (27 kB)
Collecting proto-plus>=1.10.0
  Downloading proto_plus-1.19.7-py3-none-any.whl (45 kB)
Collecting google-resumable-media<3.0dev,>=0.6.0
  Downloading google_resumable_media-2.1.0-py2.py3-none-any.whl (75 kB)
Collecting grpcio<2.0dev,>=1.38.1
  Downloading grpcio-1.41.1-cp38-cp38-win_amd64.whl (3.2 MB)
Collecting google-auth<3.0dev,>=1.25.0
  Downloading google_auth-2.3.2-py2.py3-none-any.whl (155 kB)
Collecting googleapis-common-protos<2.0dev,>=1.52.0
  Downloading googleapis_common_protos-1.53.0-py2.py

In [6]:
pip install --upgrade google-auth-oauthlib

Collecting google-auth-oauthlib
  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
Collecting requests-oauthlib>=0.7.0
  Downloading requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
Collecting oauthlib>=3.0.0
  Downloading oauthlib-3.1.1-py2.py3-none-any.whl (146 kB)
Installing collected packages: oauthlib, requests-oauthlib, google-auth-oauthlib
Successfully installed google-auth-oauthlib-0.4.6 oauthlib-3.1.1 requests-oauthlib-1.3.0
Note: you may need to restart the kernel to use updated packages.


In [13]:
from google_auth_oauthlib import flow

# TODO: Uncomment the line below to set the `launch_browser` variable.
launch_browser = False
#
# The `launch_browser` boolean variable indicates if a local server is used
# as the callback URL in the auth flow. A value of `True` is recommended,
# but a local server does not work if accessing the application remotely,
# such as over SSH or from a remote Jupyter notebook.

appflow = flow.InstalledAppFlow.from_client_secrets_file(
    "credentials.json", scopes=["https://www.googleapis.com/auth/bigquery"]
)

if launch_browser:
    appflow.run_local_server()
else:
    appflow.run_console()

credentials = appflow.credentials

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=57257480088-rpo79pe66ceafcro7itqd9q2gnsnlbdq.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=DSHtdvKpl7Ypcd2KUBmUFdNUVkYC86&prompt=consent&access_type=offline
Enter the authorization code: 4/1AX4XfWjj4Af3oTkR7M6jJcLpua0B9u7Ms1Lr5PlpqWFIwtwduNUDtHoEpu0


In [33]:
pack = packages[0:5]

In [None]:
from google.cloud import bigquery

# Note: depending on where this code is being run, you may require
# additional authentication. See:
# https://cloud.google.com/bigquery/docs/authentication/

project = 'typopypi-330421'

client = bigquery.Client(project=project, credentials=credentials)

downloads = []

for proj in packages:
    query_job = client.query("""
    SELECT COUNT(*) AS num_downloads
    FROM `bigquery-public-data.pypi.file_downloads`
    WHERE file.project = '{proj}'
      -- Only query the last 30 days of history
      AND DATE(timestamp)
        BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
        AND CURRENT_DATE()""".format(**locals()))

    results = query_job.result()  # Waits for job to complete.
    for row in results:
        downloads.append(row.num_downloads)

print(downloads[0:10])


In [42]:
release = []

for proj in pack:
    query_job = client.query("""
    SELECT timestamp
    FROM `bigquery-public-data.pypi.file_downloads`
    WHERE file.project = '{proj}'
    """.format(**locals()))

    results = query_job.result()  # Waits for job to complete.
    for row in results:
        release.append(row.timestamp)

print(release[0:10])

BadRequest: 400 Cannot query over table 'bigquery-public-data.pypi.file_downloads' without a filter over column(s) 'timestamp' that can be used for partition elimination

(job ID: ea76aa25-c089-4cb6-ab09-ef6cdfc4cf69)

            -----Query Job SQL Follows-----             

    |    .    |    .    |    .    |    .    |    .    |
   1:
   2:    SELECT timestamp
   3:    FROM `bigquery-public-data.pypi.file_downloads`
   4:    WHERE file.project = 'procedural'
   5:    
    |    .    |    .    |    .    |    .    |    .    |