In [None]:
import sys
# Install the `wayback` package with the interpreter that runs this kernel
# (sys.executable), so the package lands in the kernel's own environment.
!{sys.executable} -m pip install wayback

In [None]:
import os
import re
import csv
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from iso639 import languages
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
import scipy

import time
import requests
from wayback import WaybackClient, WaybackSession

# Set notebook mode to work in offline
pyo.init_notebook_mode()

import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from tldextract import extract
import matplotlib.cm as cm
  
import urllib.request as urlopen
from textwrap import wrap

In [None]:
def get_error(filter_num, raw_num):
    """Return the standard error of the kept fraction ``filter_num / raw_num``.

    Each of the ``raw_num`` samples is treated as a 0/1 indicator (1 = kept).
    The population standard deviation of such indicators is ``sqrt(p * (1 - p))``
    with ``p = filter_num / raw_num``; dividing by ``sqrt(raw_num)`` gives the
    standard error of the mean. The closed form is used so that no list of
    ``raw_num`` elements has to be materialised.

    Args:
        filter_num: Number of samples that were kept.
        raw_num: Total number of samples.

    Returns:
        The standard error as a float, or NaN when ``raw_num`` is not positive
        (the error of an empty sample is undefined).
    """
    if raw_num <= 0:
        return np.nan
    # Clamp so that out-of-range counts yield 0, as an all-ones / all-zeros
    # indicator list would, instead of the square root of a negative number.
    p = min(max(filter_num / raw_num, 0.0), 1.0)
    return np.sqrt(p * (1.0 - p) / raw_num)

# Load metadata

In [None]:
# Locations of the dataset on disk: metadata shards are read from
# <data_dir>/small/metadata.
data_dir = '/data/datacomp/'
scale_dir = os.path.join(data_dir, 'small')
metadata_dir = os.path.join(data_dir, 'small', 'metadata')
# Relative directory name for sample images.
sample_image_dir = 'sample_images'

In [None]:
# Read every metadata shard and concatenate them in a single pass.
# Concatenating inside the loop re-copies all previously loaded rows on every
# iteration; collecting the frames first and calling pd.concat once avoids that.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# which would make the row order (and any seeded sampling) machine-dependent.
metadata_files = sorted(os.listdir(metadata_dir))
if not metadata_files:
    # Fail here with a clear message instead of leaving df unusable for later cells.
    raise FileNotFoundError(f'No metadata files found in {metadata_dir}')

shard_frames = [
    pd.read_parquet(os.path.join(metadata_dir, filename), engine='pyarrow')
    for filename in metadata_files
]
df = pd.concat(shard_frames, ignore_index=True)

In [None]:
# Split the metadata at the 70th percentile of the CLIP L/14 similarity score:
# rows strictly above the threshold are kept, rows at or below it are excluded.
score_column = df['clip_l14_similarity_score']
l14_scores = score_column.tolist()
min_threshold = np.percentile(l14_scores, 70)
print(min_threshold)

filtered_df = df.loc[score_column > min_threshold]
excluded_df = df.loc[score_column <= min_threshold]

# Sanity check: the two partitions together account for every row.
num_total, num_kept = len(df), len(filtered_df)
assert num_kept + len(excluded_df) == num_total
print(num_total, num_kept)  # 12.8M --> 3.94M filtered
print(num_kept / num_total)  # 30%

In [None]:
# Flag rows whose CLIP L/14 score is strictly above the threshold.
# A vectorized comparison gives the same boolean column as a per-row lambda
# without calling a Python function once per row.
df['is_kept'] = df['clip_l14_similarity_score'] > min_threshold

In [None]:
# Caption text as plain Python lists for the full, kept and excluded frames.
raw_captions, filtered_captions, excluded_captions = (
    frame['text'].tolist() for frame in (df, filtered_df, excluded_df)
)

In [None]:
# URLs as plain Python lists for the full and kept frames.
raw_urls, filtered_urls = [frame['url'].tolist() for frame in (df, filtered_df)]

In [None]:
from urllib.parse import urlparse

def get_base_url(url):
    """Return the ``domain.suffix`` part of ``url`` as split by tldextract.

    Empty components are skipped, so a URL for which tldextract reports no
    suffix yields just the domain rather than a string with a trailing dot.

    Args:
        url: The URL (or host name) to reduce.

    Returns:
        The non-empty domain and suffix components joined by ``'.'``; an empty
        string when both components are empty.
    """
    extract_url = extract(url)
    return '.'.join(part for part in (extract_url.domain, extract_url.suffix) if part)

# Derive the base URL of every row from its full URL.
df['base_url'] = [get_base_url(url) for url in df['url']]

In [None]:
# Display the metadata frame; pandas truncates the rendering of large frames.
df

# Utterance date

In [None]:
sample_csv_file = '../sample_images_1M_url_df.csv'

# This file is written further down with DataFrame.to_csv on the columns
# ['uid', 'url', 'base_url'], which produces an ordinary comma-separated file:
# the index, then uid, url and base_url. It is therefore parsed with the default
# csv dialect. Reading it with a space delimiter and splitting on ',' by hand
# drops or truncates every row whose URL contains a space or a quoted comma.
rows = []
with open(sample_csv_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header row (no-op on an empty file)
    for row in reader:
        # Keep only well-formed records: index, uid, url, base_url.
        if len(row) == 4:
            rows.append(row)

# Column 1 holds the uid (column 0 is the DataFrame index).
sample_uids = {row[1] for row in rows}
print(len(rows), len(sample_uids))

In [None]:
# Restrict the metadata to the rows whose uid is part of the sample.
in_sample = df['uid'].isin(sample_uids)
df_sample = df.loc[in_sample]
df_sample

In [None]:
# Load the sample frame from its parquet file and rebuild the uid set from it.
# Both names replace whatever earlier cells assigned to them.
df_sample = pd.read_parquet('df_sample_1M.parquet')
sample_uids = {*df_sample['uid'].tolist()}
df_sample

In [None]:
# Directories holding the timestamp JSON files, one sub-folder per lookup kind.
timestamps_folder = '/home/hongrach/datafiltering/scripts/timestamps'
base_url_dir, url_dir = (
    os.path.join(timestamps_folder, subdir) for subdir in ('base_url', 'url')
)

# True -> read from base_url_dir; False -> read from url_dir.
IS_BASE = False

In [None]:
# Pick the timestamp directory selected by IS_BASE.
json_folder = url_dir
if IS_BASE:
    json_folder = base_url_dir

# Merge every JSON file in the folder into one mapping. Keys that occur in more
# than one file keep the value from whichever file is read last.
uid_to_timestamps = {}
for file in os.listdir(json_folder):
    with open(os.path.join(json_folder, file)) as f:
        uid_to_timestamps.update(json.load(f))

In [None]:
# (number of looked-up uids, number of those with a non-null timestamp)
num_resolved = sum(1 for t in uid_to_timestamps.values() if t is not None)
len(uid_to_timestamps), num_resolved

## Analysis

In [None]:
# Count samples per year over the whole sample frame (the plot below is titled
# "raw dataset"). The kept subset is only materialised here so that
# `filtered_uids` is available to later cells.

df_sample_filtered = df_sample[df_sample['is_kept'] == True]
filtered_uids = df_sample_filtered['uid'].tolist()

count_by_year = defaultdict(int)
succ_count = 0
err_count = 0

for uid in df_sample['uid'].tolist():
    t = uid_to_timestamps.get(uid)
    # The year is the first four digits of the timestamp.
    year = int(str(t)[:4]) if t is not None else None
    if year is not None and year >= 1996:
        count_by_year[year] += 1
        succ_count += 1
    else:
        # Missing uid, null timestamp, or a year before 1996. Pre-1996 years are
        # counted as failures so that succ_count + err_count covers every sample,
        # matching how the filter-ratio cell derives its error count.
        err_count += 1

total_count = succ_count + err_count
# Guard the ratio so an empty sample prints NaN instead of raising.
print(succ_count, err_count, succ_count / total_count if total_count else float('nan'))

fig, ax = plt.subplots(figsize=(6,3))
x = sorted(count_by_year)
plt.bar(x, [count_by_year[year] for year in x])
plt.title('Frequency of raw dataset samples by year')
plt.ylabel('Count')
plt.xlabel('Year')
plt.show()

In [None]:
# Fraction of samples kept by the filter, grouped by timestamp year.
uid_by_year = defaultdict(lambda: defaultdict(list)) # {year: {'kept': [], 'total': []}}
filtered_uids_set = set(filtered_uids)

succ_count = 0

for uid in sample_uids:
    is_kept = uid in filtered_uids_set
    t = uid_to_timestamps.get(uid)
    if t is None:
        # uid was never looked up, or the lookup produced no timestamp
        continue
    year = int(str(t)[:4])
    if year < 1996:
        continue
    buckets = uid_by_year[year]
    buckets['total'].append(uid)
    if is_kept:
        buckets['kept'].append(uid)
    succ_count += 1

# Everything that did not land in a year bucket counts as an error.
err_count = len(sample_uids) - succ_count
print(succ_count, err_count, succ_count / (succ_count + err_count))

years, filter_ratios, raw_count, yerrs = [], [], [], []
for y, buckets in uid_by_year.items():
    num_total = len(buckets['total'])
    if num_total < 500:
        # skip years with fewer than 500 samples
        continue
    num_kept = len(buckets['kept'])
    raw_count.append(num_total)
    filter_ratios.append(num_kept / num_total)
    years.append(y)
    yerrs.append(get_error(num_kept, num_total))

fig, ax = plt.subplots(figsize=(6, 3))
plt.bar(years, filter_ratios, yerr=yerrs)
plt.xticks(np.arange(min(years), max(years)+1, 1.0), rotation=40, ha='right')
plt.ylim(0, 0.5)
plt.title('Filter ratio by earliest-indexed year')
plt.ylabel('Filter ratio')
plt.xlabel('Year')
plt.show()

In [None]:
# `import scipy` alone does not guarantee that the `stats` submodule is loaded
# on every SciPy version, so import it explicitly before using it.
import scipy.stats

# Only years with at least MIN_COUNT raw samples enter the regression.
MIN_COUNT = 1000
x = [c for c in raw_count if c >= MIN_COUNT]
y = [filter_ratios[i] for i, c in enumerate(raw_count) if c >= MIN_COUNT]

plt.xlabel('Raw count by year')
plt.ylabel('Filter ratio (percent in kept dataset)')
plt.title('Filter ratio vs frequency in unfiltered dataset')
plt.scatter(x, y)

# Least-squares line: b is the slope, a the intercept.
b, a = np.polyfit(x, y, deg=1)
print(np.corrcoef(x, y)[0,1] ** 2)  # coefficient of determination (r squared)
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
print(slope, intercept, r_value, p_value, std_err)

# 100 evenly spaced points from 0 to the largest observed count, so the line
# spans the plotted data whatever the sample size is.
xseq = np.linspace(0, max(x), num=100)

# Plot regression line
plt.plot(xseq, a + b * xseq, color="k", lw=2.5);

# Wayback Machine

In [None]:
# Smoke test: search one known domain and show the timestamp of the first
# record returned, formatted as YYYYMMDDhhmmss.
client = WaybackClient()
results = client.search('nasa.gov')
record = next(results)
first_capture_time = record.timestamp
first_capture_time.strftime("%Y%m%d%H%M%S")

In [None]:
def peek(iterable):
    """Return the next item of an iterator, or ``None`` if it is exhausted.

    The argument must be an iterator (something ``next`` accepts); the item
    returned is consumed from it.
    """
    return next(iterable, None)

In [None]:
# by exact URL

NUM_SAMPLE = 1000000
df_sample = df.sample(NUM_SAMPLE, random_state=0)

# uid_to_timestamps is deliberately not reset here: uids that already have an
# entry are skipped, so the loop can be re-run to continue an earlier crawl.
count = 0

# No client-side delay is applied between requests.
for uid, base_url, url in zip(df_sample['uid'], df_sample['base_url'], df_sample['url']):
    count += 1
    if uid in uid_to_timestamps:
        continue

    try:
        results = client.search(url)
        record = peek(results)
        # Store the first returned record's timestamp as an int (YYYYMMDDhhmmss),
        # or None when the search yields no usable record.
        earliest_timestamp = (
            int(record.timestamp.strftime("%Y%m%d%H%M%S"))
            if record and record.timestamp
            else None
        )
        uid_to_timestamps[uid] = earliest_timestamp
        print(count, base_url, uid_to_timestamps[uid])
    except Exception as e:
        # A failed lookup stores nothing, so the uid is tried again on a re-run.
        print('ERROR', e, count, base_url)

In [None]:
# Number of uids that currently have an entry (timestamp or None).
len(uid_to_timestamps)

In [None]:
# Save the sampled ids and URLs. The DataFrame index is written as well, so
# each CSV row has four fields: index, uid, url, base_url.
sample_columns = ['uid', 'url', 'base_url']
df_sample[sample_columns].to_csv('../sample_images_1M_url_df.csv')

In [None]:
# Show the id and URL columns of the sample.
df_sample.loc[:, ['uid', 'url', 'base_url']]

In [None]:
# Query the CDX search endpoint directly for a single record.
# requests applies no timeout by default, so an unresponsive server would block
# the kernel indefinitely; bound the wait explicitly.
r = requests.get(
    "http://web.archive.org/cdx/search/cdx",
    params={'url': 'azureedge.net', 'limit': 1},
    timeout=30,
)
r.content

In [None]:
# Look up a single domain and print the timestamp of the first record, or None
# when the search returns nothing.
results = client.search('r-corona.jp')
record = peek(results)
print(None if not record else record.timestamp)