In [None]:
import sys
!{sys.executable} -m pip install IP2Location

In [None]:
import os
import re
import csv
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from langdetect import detect
from iso639 import languages
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
import scipy

import time
import socket
import requests
import CloudFlare
import IP2Location

# Set notebook mode to work in offline
pyo.init_notebook_mode()

import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from tldextract import extract
import matplotlib.cm as cm
  
import urllib.request as urlopen
from textwrap import wrap

In [None]:
def get_error(filter_num, raw_num):
    """Return the standard error of the kept fraction ``filter_num / raw_num``.

    Each of the ``raw_num`` samples is treated as a 0/1 indicator (1 = kept).
    The result is the population standard deviation of those indicators
    divided by ``sqrt(raw_num)``, computed with the closed form
    ``sqrt(p * (1 - p) / n)`` so that no per-sample list has to be allocated.

    Args:
        filter_num: Number of kept samples.
        raw_num: Total number of samples.

    Returns:
        The standard error as a float; ``nan`` when ``raw_num`` is not positive.
    """
    if raw_num <= 0:
        # No samples: the error is undefined.
        return float('nan')
    # Out-of-range counts behave like an all-kept / none-kept sample (error 0).
    kept = min(max(filter_num, 0), raw_num)
    p = kept / raw_num
    return np.sqrt(p * (1.0 - p) / raw_num)

# Load metadata

In [None]:
# Locations used by the rest of the notebook.
# data_dir is an absolute path on the machine that ran this notebook.
data_dir = '/data/datacomp/'
# 'small' subdirectory of data_dir
scale_dir = os.path.join(data_dir, 'small')
# Directory whose files are read with pd.read_parquet when loading metadata
metadata_dir = os.path.join(scale_dir, 'metadata')
# Relative directory name for sample images
sample_image_dir = 'sample_images'

In [None]:
# Read every metadata shard and concatenate them in a single call.
# Concatenating inside the loop re-copies the accumulated frame on every
# iteration (quadratic in the number of shards). Sorting the directory listing
# makes the row order independent of the filesystem's listing order.
shard_dfs = [
    pd.read_parquet(os.path.join(metadata_dir, filename), engine='pyarrow')
    for filename in sorted(os.listdir(metadata_dir))
]
# Keep the original "no files -> None" outcome instead of letting pd.concat raise.
df = pd.concat(shard_dfs, ignore_index=True) if shard_dfs else None

In [None]:
# Split the metadata at the 70th percentile of the CLIP L/14 similarity score:
# rows strictly above the threshold are "filtered" (kept), the rest "excluded".
clip_column = 'clip_l14_similarity_score'
l14_scores = df[clip_column].tolist()
min_threshold = np.percentile(l14_scores, 70)
print(min_threshold)

filtered_df = df.loc[df[clip_column] > min_threshold]
excluded_df = df.loc[df[clip_column] <= min_threshold]

# The two subsets must partition the full frame.
num_total, num_kept = len(df), len(filtered_df)
assert num_kept + len(excluded_df) == num_total
print(num_total, num_kept) # 12.8M --> 3.94M filtered
print(num_kept / num_total) # 30%

In [None]:
# Flag every row whose CLIP L/14 score is strictly above the threshold.
# A vectorized comparison yields the same boolean column as the row-wise
# apply(lambda ...) without one Python call per row.
df['is_kept'] = df['clip_l14_similarity_score'] > min_threshold

In [None]:
# Caption text ('text' column) as plain Python lists for the full frame,
# the rows above the score threshold, and the rows at or below it.
raw_captions = df['text'].tolist()
filtered_captions = filtered_df['text'].tolist()
excluded_captions = excluded_df['text'].tolist()

In [None]:
# 'url' column as plain Python lists for the full frame and the kept rows.
raw_urls = df['url'].tolist()
filtered_urls = filtered_df['url'].tolist()

In [None]:
from urllib.parse import urlparse

def get_base_url(url):
    """Return ``"<domain>.<suffix>"`` for ``url`` as split by tldextract's ``extract``.

    The two parts are always joined with a dot, so a URL for which ``extract``
    reports an empty suffix yields a string ending in ``'.'``.
    NOTE(review): that trailing dot looks unintended, but cached artefacts may
    be keyed by it — confirm before changing.
    """
    parts = extract(url)
    return '.'.join((parts.domain, parts.suffix))

In [None]:
# Add a 'base_url' column (domain + suffix of each row's 'url') and display the frame.
df['base_url'] = df['url'].apply(get_base_url)
df

In [None]:
# Load the pre-saved sample frame from the working directory and add the same
# 'base_url' column as on the full frame.
# NOTE(review): the file name suggests a 1M-row sample; how it was produced is
# not shown in this notebook.
df_sample = pd.read_parquet('df_sample_1M.parquet')
df_sample['base_url'] = df_sample['url'].apply(get_base_url)
df_sample

In [None]:
# uid -> caption lookup for the sample.
# zip over the two columns replaces the per-row iterrows() loop, which builds a
# Series object for every row; duplicate uids still resolve to the last row.
sample_uid_to_text = dict(zip(df_sample['uid'], df_sample['text']))

# Cloudflare categorization

In [1]:
# Cloudflare credentials are read from the environment so real secrets never
# need to be written into (or committed with) the notebook. The 'XXX'
# placeholders remain the defaults when the variables are unset.
account_id = os.environ.get('CLOUDFLARE_ACCOUNT_ID', 'XXX')
api_url = 'https://api.cloudflare.com/client/v4/accounts/' + account_id + '/intel/domain/bulk'
headers = {
    "X-Auth-Email": os.environ.get('CLOUDFLARE_AUTH_EMAIL', 'XXX'),
    "X-Auth-Key": os.environ.get('CLOUDFLARE_AUTH_KEY', 'XXX')
}

# Directory that receives one JSON file per categorised batch of domains.
outdir = '/data/datacomp/small/cloudflare_100k/'

In [None]:
def get_categories(url_batch, start, end, r=None, outfile_name=None):
    """Fetch categories for a batch of domains and store the JSON response.

    Args:
        url_batch: List of domains sent as the ``domain`` query parameter.
        start: Start index of the batch; only used to build the default file name.
        end: End index of the batch; used in the default file name and in the
            error message.
        r: Optional response object exposing ``status_code``, ``json()`` and
            ``content``. When given, no HTTP request is made.
        outfile_name: Path of the JSON file to write. Defaults to
            ``<outdir>/<start>_<end>.json``.

    Returns:
        True when the response had status 200 and was written to disk,
        False otherwise (the error is printed and nothing is written).
    """
    if r is None:
        r = requests.get(
            api_url,
            params={'domain': url_batch},
            headers=headers,
            stream=True,
            timeout=300
        )

    if r.status_code != 200:
        print(end, 'request error', r.content)
        return False

    r_json = r.json()

    if outfile_name is None:
        outfile_name = os.path.join(outdir, str(start) + '_' + str(end) + '.json')
    # Create the target directory on first use so a fresh checkout does not
    # fail with FileNotFoundError after the request has already been made.
    out_parent = os.path.dirname(outfile_name)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)
    with open(outfile_name, 'w') as f:
        json.dump(r_json, f)
    return True

In [None]:
# Distinct base URLs present in the sample.
base_urls = list(df_sample['base_url'].unique())
# A bare expression in the middle of a cell is never displayed; print it instead.
print(len(base_urls), base_urls[:10])

# Draw a reproducible random subset of domains to categorise.
NUM_SAMPLE = 100000
random.seed(0)
# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of available domains.
sample_base_urls = random.sample(base_urls, min(NUM_SAMPLE, len(base_urls)))

In [None]:
NUM_BATCH = 10  # number of domains sent per request (i.e. the batch size)
# First batch to process. It was 1, which silently skipped entries 0..9 on a
# fresh run; raise it only to resume an interrupted run.
start_batch_i = 0

# Ceiling division: includes a final partial batch but never an empty one.
# The previous bound of int(len / NUM_BATCH) + 1 issued an extra request with
# no domains whenever the length was a multiple of NUM_BATCH.
num_batches = (len(sample_base_urls) + NUM_BATCH - 1) // NUM_BATCH

for batch_i in range(start_batch_i, num_batches):
    start = NUM_BATCH * batch_i
    end = NUM_BATCH * (batch_i + 1)
    print('categorizing for batch', batch_i, 'with entries i=', start, end)

    url_batch = sample_base_urls[start: end]

    # Best effort: one failing batch must not abort the whole run.
    try:
        get_categories(url_batch, start, end)
    except Exception as e:
        print(end, 'ERROR', e)


## Analysis

In [None]:
# Collect domain -> raw 'content_categories' entries from every saved response.
url_to_content_categories = {}
for file in os.listdir(outdir):
    filepath = os.path.join(outdir, file)
    if not filepath.endswith('.json'):
        continue
    with open(filepath) as f:
        r_json = json.load(f)
    # A usable response reports success and carries a non-null result list.
    has_result = (
        'success' in r_json and r_json['success']
        and 'result' in r_json and r_json['result'] is not None
    )
    if not has_result:
        print(filepath, 'json error')
        continue
    for result in r_json['result']:
        if 'content_categories' in result:
            url_to_content_categories[result['domain']] = result['content_categories']

print(len(url_to_content_categories))

In [None]:
# domain -> category names, dropping entries without a 'super_category_id'
# and entries whose 'super_category_id' is 15.
url_to_categories = defaultdict(list)

for url, content_categories in url_to_content_categories.items():
    names = [
        category['name']
        for category in content_categories
        if 'super_category_id' in category and category['super_category_id'] != 15
    ]
    # Only create a key for domains that kept at least one category.
    if names:
        url_to_categories[url].extend(names)

# Inverse index: category name -> domains carrying it.
category_to_urls = defaultdict(list)
for url, names in url_to_categories.items():
    for name in names:
        category_to_urls[name].append(url)
print(len(category_to_urls))

In [None]:
# uid -> category names of the sample's base URL, for domains that were categorised.
# Zipping the two columns avoids iterrows(), which builds a Series per row;
# duplicate uids still resolve to the last row, as before.
uid_to_categories = {
    uid: url_to_categories[base_url]
    for uid, base_url in zip(df_sample['uid'], df_sample['base_url'])
    if base_url in url_to_categories
}

print(len(uid_to_categories))
# Rows whose uid is not in the mapping get NaN in the new column.
df_sample['categories'] = df_sample['uid'].map(uid_to_categories)
df_sample

In [None]:
# Rows of the sample that passed the score threshold, and their uids.
df_sample_filtered = df_sample[df_sample['is_kept'] == True]
filtered_uids = set(df_sample_filtered['uid'].unique())

# category -> {'kept': [uids above threshold], 'total': [all uids]}
categories_to_uid = defaultdict(lambda: defaultdict(list))

for uid, uid_categories in uid_to_categories.items():
    is_filtered = uid in filtered_uids
    for category in uid_categories:
        per_category = categories_to_uid[category]
        if is_filtered:
            per_category['kept'].append(uid)
        per_category['total'].append(uid)
print(len(categories_to_uid))

In [None]:
# Maps a category name (as it appears in the categorisation results) to the
# merged category name used in the analysis. Several source names may share one
# target. Commented-out entries are categories that are not remapped; the
# merging cell keeps such names unchanged.
# NOTE(review): the trailing tags (prior / new / diff / inaccurate / NSFW) are
# the author's annotations; their exact meaning is not defined in this notebook.
CATEGORY_MAP = {
    'APIs': 'Technology', # prior
    'Adult Themes': 'NSFW', # NSFW
    'Advertisements': 'Advertisements', # inaccurate
    'Alcohol': 'Drugs & Alcohol', # new
    'Artificial Intelligence': 'Technology', # new
#     'Arts': 'Arts', 
#     'Arts & Crafts': 'Arts & Crafts', 
#     'Astrology': 'Astrology', 
#     'Auctions & Marketplaces': 'Auctions & Marketplaces',
#     'Audio Streaming': 'Audio Streaming', 
#     'Body Art': 'Body Art', 
#     'Business': 'Business', 
    'CIPA Filter': 'NSFW', # NSFW
#     'Cartoons & Anime': 'Cartoons & Anime', 
    'Chat': 'Chat & Messaging', # prior
    'Clothing': 'Clothing & Fashion', # prior
#     'Comic Books': 'Comic Books', 
    'Content Servers': 'Content Servers', # diff
#     'Coupons': 'Coupons', 
    'Cryptocurrency': 'Cryptocurrency', # new
#     'Dating & Relationships': 'Dating & Relationships', 
    'Deceptive Ads': 'Deceptive Ads', # inaccurate
#     'Digital Postcards': 'Digital Postcards', 
    'Drugs': 'Drugs & Alcohol', # diff
#     'Ecommerce': 'Ecommerce', 
#     'Economy & Finance': 'Economy & Finance', 
#     'Education': 'Education', 
#     'Educational Institutions': 'Educational Institutions', 
#     'Entertainment': 'Entertainment', 
    'Fashion': 'Clothing & Fashion', # prior
    'File Sharing': 'File Sharing', # inaccurate
    'Fine Art': 'Arts', # prior
#     'Food & Drink': 'Food & Drink', 
#     'Forums': 'Forums', 
#     'Gambling': 'Gambling', 
#     'Gaming': 'Gaming', 
    'Government': 'Government & Politics', # prior
#     'Hacking': 'Hacking', 
#     'Health & Fitness': 'Health & Fitness', 
#     'Hobbies & Interests': 'Hobbies & Interests', 
#     'Home & Garden': 'Home & Garden', 
    'Home Video/DVD': 'Movies & Home Video', # prior
    'Humor': 'Humor', # inaccurate
    'Information Security': 'Technology', # prior
    'Information Technology': 'Technology', # prior 
    'Instant Messengers': 'Chat & Messaging', # prior
    'Internet Phone & VOIP': 'Internet Phone & VOIP', # inaccurate
#     'Job Search & Careers': 'Job Search & Careers', 
#     'Lifestyle': 'Lifestyle', 
    'Lingerie & Bikini': 'Clothing & Fashion', # prior
#     'Magazines': 'Magazines', 
    'Messaging': 'Chat & Messaging', # prior
    'Militancy, Hate & Extremism': 'NSFW', # inaccurate, NSFW
    'Military': 'Government & Politics', # prior
    'Movies': 'Movies & Home Video', 
#     'Music': 'Music', 
#     'News & Media': 'News & Media', 
    'News, Portal & Search': 'Stock Photos', # new
    'Nudity': 'NSFW', # NSFW
    'P2P': 'Video Streaming', # prior
#     'Paranormal': 'Paranormal', 
#     'Parenting': 'Parenting', 
    'Personal Blogs': 'Personal Blogs', # inaccurate
#     'Pets': 'Pets', 
    'Photo Sharing': 'File Sharing', # new
#     'Photography': 'Photography', 
    'Politics, Advocacy, and Government-Related': 'Government & Politics', # diff
    'Pornography': 'NSFW', # NSFW
    'Professional Networking': 'Business', # prior
#     'Questionable Activities': 'Questionable Activities', 
    'Radio': 'Audio Streaming', # prior
#     'Real Estate': 'Real Estate', 
#     'Religion': 'Religion', 
#     'Safe for Kids': 'Safe for Kids', 
    'School Cheating': 'School Cheating', # new 
#     'Science': 'Science', 
    'Search Engines': 'Search Engines', # inaccurate
#     'Sex Education': 'Sex Education', 
    'Shopping': 'Shopping', # inaccurate
    'Social Networks': 'Social Networks', # inaccurate
    'Space & Astronomy': 'Science', # prior 
#     'Sports': 'Sports', 
    'Swimsuits': 'Clothing & Fashion', # inaccurate
#     'Technology': 'Technology', 
#     'Television': 'Television', 
    'Tobacco': 'Drugs & Alcohol', # diff
    'Translator': 'Translator', # inaccurate
#     'Travel': 'Travel', 
#     'Vehicles': 'Vehicles', 
#     'Video Streaming': 'Video Streaming', 
    'Violence': 'NSFW', 
    'Weapons': 'NSFW', 
#     'Weather': 'Weather', 
#     'Webmail': 'Webmail', 
}

# Persist the mapping as JSON in the working directory.
with open('cloudfare_categories.json', 'w') as f:
    json.dump(CATEGORY_MAP, f)

# Inverse view: merged category name -> list of source names mapped onto it.
CATEGORY_MAP_REVERSE = defaultdict(list)
for c, merged_name in CATEGORY_MAP.items():
    CATEGORY_MAP_REVERSE[merged_name].append(c)

In [None]:
# Print one "merged & source1; source2 \\" row per merged category
# (the trailing token is two literal backslashes).
for v, source_names in CATEGORY_MAP_REVERSE.items():
    print(v, '&', '; '.join(source_names), '\\\\')

In [None]:
# merged category -> {'kept': [uids], 'total': [uids]}
final_categories_to_uid = defaultdict(lambda: defaultdict(list))

for c in categories_to_uid:
    # Categories absent from CATEGORY_MAP keep their own name.
    new_c = CATEGORY_MAP.get(c, c)
    # Always merge by set union. The previous code assigned unmapped categories
    # directly, which overwrote uids already merged into the same target name
    # (e.g. 'Fine Art' -> 'Arts' followed by the unmapped 'Arts') depending on
    # iteration order, and aliased the source lists.
    for split in ('kept', 'total'):
        merged_uids = set(final_categories_to_uid[new_c][split]) | set(categories_to_uid[c][split])
        final_categories_to_uid[new_c][split] = list(merged_uids)
print(len(categories_to_uid), len(final_categories_to_uid))

### Plot

In [None]:
# Number of samples per merged category.
raw_count_by_category = {
    c: len(uids['total']) for c, uids in final_categories_to_uid.items()
}
# Kept fraction, only for categories with more than 1000 samples.
ratios_by_category = {
    c: len(final_categories_to_uid[c]['kept']) / num_total
    for c, num_total in raw_count_by_category.items()
    if num_total > 1000
}
print(len(ratios_by_category))

In [None]:
# raw count
# Bar chart of the number of samples per merged category, restricted to
# categories with more than 1000 samples and ordered by descending count.

top_num = 20
# Sort by count (descending), then keep only entries above the 1000 cut-off.
sorted_count_by_category = {k: v for k, v in sorted(
    raw_count_by_category.items(), key=lambda item: item[1], reverse=True
) if v > 1000}

plot_df = pd.DataFrame({
    'category': list(sorted_count_by_category.keys()),
    'count': list(sorted_count_by_category.values())
})
plot_df.sort_values('count', ascending=False).plot(
    kind='bar',y='count',x='category', figsize=(15, 2), legend=False
)
plt.xlabel('')
plt.ylabel('Count')
plt.xticks(rotation=40, ha='right')
plt.show()

In [None]:
# Combined sample count of the five most frequent categories
# (sorted_count_by_category is ordered by descending count).
sum(list(sorted_count_by_category.values())[:5])

In [None]:
# Print "category | count" for every category in descending count order.
for c, count in sorted_count_by_category.items():
    print(c, '|', count)

In [None]:
# filter ratios
# Bar chart of the kept fraction per merged category with standard-error bars
# and a horizontal reference line at 0.3.

sorted_ratios_by_category = {k: v for k, v in sorted(
    ratios_by_category.items(), key=lambda item: item[1], reverse=True
)}
# Standard error of each ratio, in the same order as the sorted categories.
yerr = [
    get_error(
        len(final_categories_to_uid[c]['kept']),
        len(final_categories_to_uid[c]['total'])
    )
    for c in sorted_ratios_by_category
]
results_df = pd.DataFrame({
    'top_category': list(sorted_ratios_by_category.keys()),
    'Category filter ratio': list(sorted_ratios_by_category.values()),
    'top_yerr': yerr,
})
# The bars are vertical, so the uncertainty of the ratio belongs on the y axis.
# It was previously passed as xerr, which draws horizontal error bars.
results_df.sort_values('Category filter ratio', ascending=False).plot(
    kind='bar',y='Category filter ratio',x='top_category',yerr='top_yerr',
    figsize=(15, 2), legend=False
)
plt.ylabel('Filter ratio')
plt.xlabel('')
plt.xticks(rotation=40, ha='right')
plt.axhline(y=0.3, color='r', linestyle='-', label='Global filter ratio')
plt.legend()
plt.show()


In [None]:
# Print the top_num highest and then the top_num lowest filter ratios.
# With fewer than 2 * top_num categories the two slices overlap, so some
# categories are printed twice.
for c in list(sorted_ratios_by_category.keys())[:top_num]:
    print(c, ';', sorted_ratios_by_category[c])
for c in list(sorted_ratios_by_category.keys())[-top_num:]:
    print(c, ';', sorted_ratios_by_category[c])

In [None]:
# filter ratios by category

## Examples

In [None]:
# Print every category name, quoted and comma-terminated, in sorted order.
for k in sorted(category_to_urls):
    print("'%s', " % k)

In [None]:
# uids in the merged 'NSFW' category that passed the score threshold.
nsfw_uids = final_categories_to_uid['NSFW']['kept']
print(len(nsfw_uids))
# Optional CSV export of those rows, currently disabled:
# df_sample[df_sample['uid'].isin(set(nsfw_uids))][['uid', 'base_url', 'is_kept', 'text', 'categories']].to_csv('nsfw_uids.csv', escapechar='\\')

In [None]:
# Inspect one unmerged category: its domains and its kept / total sample counts.
c = 'Politics, Advocacy, and Government-Related'
category_to_urls[c], len(categories_to_uid[c]['kept']), len(categories_to_uid[c]['total'])

In [None]:
# Earlier domain list, kept for reference:
# sample_urls = ['wp.com', 'pinimg.com', 'ebayimg.com', 'cloudfront.net', 'wordpress.com', 'wixstatic.com', 'made-in-china.com', 'ssl-images-amazon.com', 'amazonaws.com', 'alicdn.com', 'gstatic.com', 'fc2.com', 'media-amazon.com', 'gravatar.com', 'ytimg.com', 'tripadvisor.com', 'ebaystatic.com', 'bing.net', 'exblog.jp', 'dreamstime.com']
sample_urls = ['s6img.com', 'shutterstock.com', 'cpcache.com', 'shopstyle-cdn.com', 'bigstockphoto.com', 'vectorstock.com', 'canstockphoto.com', 'fineartamerica.com', 'gettyimages.com', 'etsystatic.com', 'photoshelter.com', 'bing.net', 'zcache.com', 'shoplightspeed.com', '123rf.com', 'slideserve.com', 'teacherspayteachers.com', 'ftcdn.net', 'ssl-images-amazon.com', 'ztat.net'] \
    + ['canalblog.com', 'prom.st', 'userapi.com', 'servimg.com', 'ebaystatic.com', 'wklcdn.com', 'rightmove.co.uk', 'k-img.com', 'goo-net.com', 'sinaimg.cn', 'seesaa.net', 'blogimg.jp', 'st-hatena.com', 'pimg.tw', 'fc2.com', 'cocolog-nifty.com', 'wikidot.com', 'exblog.jp', 'gravatar.com', 'buuyers.com']
print(len(sample_urls))

# Categorise the first 40 domains in four batches of ten, written to
# top_domains_3.json through top_domains_6.json.
for file_index, batch_start in enumerate(range(0, 40, 10), start=3):
    get_categories(
        sample_urls[batch_start:batch_start + 10], 0, 0,
        outfile_name='cloudflare_domains/top_domains_%d.json' % file_index
    )

In [None]:
# Collect domain -> raw 'content_categories' entries from the responses
# saved under cloudflare_domains/.
sample_url_to_content_categories = {}
for file in os.listdir('cloudflare_domains/'):
    filepath = os.path.join('cloudflare_domains/', file)
    if not filepath.endswith('.json'):
        continue
    with open(filepath) as f:
        r_json = json.load(f)
    # A usable response reports success and carries a non-null result list.
    has_result = (
        'success' in r_json and r_json['success']
        and 'result' in r_json and r_json['result'] is not None
    )
    if not has_result:
        print(filepath, 'json error')
        continue
    for result in r_json['result']:
        if 'content_categories' in result:
            sample_url_to_content_categories[result['domain']] = result['content_categories']

# domain -> category names, dropping entries without a 'super_category_id'
# and entries whose 'super_category_id' is 15.
sample_url_to_categories = defaultdict(list)

for url, content_categories in sample_url_to_content_categories.items():
    names = [
        category['name']
        for category in content_categories
        if 'super_category_id' in category and category['super_category_id'] != 15
    ]
    # Only create a key for domains that kept at least one category.
    if names:
        sample_url_to_categories[url].extend(names)

In [None]:
# Show the category names found for each hand-picked domain.
# sample_url_to_categories is a defaultdict(list), so looking up a domain with
# no categories prints [] and also inserts that empty list into the mapping.
for url in sample_urls:
    print(url, sample_url_to_categories[url])

In [None]:
# Number of domains currently present as keys in url_to_categories.
len(url_to_categories)

# IP Address

In [None]:
# Number of distinct values at index 3 of each row (the next cell reads that
# position as the base URL).
# NOTE(review): `rows` is not defined anywhere in the visible notebook — it
# must come from an earlier kernel session or a deleted cell; confirm its source.
len(set([row[3] for row in rows]))

In [None]:
# Resolve every distinct base URL to an IPv4 address (None when resolution fails).
# The mapping is created only if it does not exist yet: a fresh kernel no longer
# fails with NameError, and re-running the cell still resumes from earlier
# lookups instead of starting over.
# NOTE(review): `rows` is not defined in the visible notebook — confirm its source.
if 'base_url_to_ip' not in globals():
    base_url_to_ip = {}

for row in rows:
    base_url = row[3]

    # Each domain is looked up at most once.
    if base_url in base_url_to_ip:
        continue

    try:
        base_url_to_ip[base_url] = socket.gethostbyname(base_url)
    except Exception as e:
        # Best effort: record the failure and keep going, but report the reason.
        base_url_to_ip[base_url] = None
        print('ERROR', base_url, e)

In [None]:
# Persist the base URL -> IP mapping (None for failed lookups) as JSON.
with open('base_url_200k_to_ip.json', 'w') as fp:
    json.dump(base_url_to_ip, fp)

In [None]:
# Number of base URLs whose stored IP is not None.
len([u for u in base_url_to_ip if base_url_to_ip[u] is not None])

In [None]:
# map ip address to country

In [None]:
# Open the IP2Location BIN database from a path relative to the notebook.
ip_db_path = "../scripts/ip2location_db/IP2LOCATION-LITE-DB1.IPV6.BIN"
ip_db = IP2Location.IP2Location(ip_db_path)

In [None]:
# Look up a single hard-coded address and display its short country code.
rec = ip_db.get_all("19.5.10.1")
rec.country_short

In [None]:
# Map each base URL to the short and long country fields of its IP's database
# record; domains without an IP map to None in both dictionaries.
base_url_to_country_code = {}
base_url_to_country_name = {}
for base_url, ip in base_url_to_ip.items():
    if ip is None:
        country_code = country_name = None
    else:
        rec = ip_db.get_all(ip)
        country_name, country_code = rec.country_long, rec.country_short
    base_url_to_country_code[base_url] = country_code
    base_url_to_country_name[base_url] = country_name
    

In [None]:
# Number of base URLs whose stored country code is not None.
len([u for u in base_url_to_country_code if base_url_to_country_code[u] is not None])

In [None]:
# with open('base_url_200k_to_country_code.json', 'w') as fp:
#     json.dump(base_url_to_country_code, fp)
# with open('base_url_200k_to_country_name.json', 'w') as fp:
#     json.dump(base_url_to_country_name, fp)

## analysis

In [None]:
# Load the cached base URL -> country name mapping from JSON and show
# (number of entries, number of entries with a non-null country).
# NOTE(review): in this notebook the only code that writes this file is
# commented out in the preceding cell, so the file must already exist on disk.
with open('base_url_200k_to_country_name.json') as f:
    base_url_to_country_name = json.load(f)
len(base_url_to_country_name), len([c for c in base_url_to_country_name if base_url_to_country_name[c] is not None])

In [None]:
# uid -> base URL lookup for the sample.
# zip over the two columns replaces the per-row iterrows() loop; duplicate uids
# still resolve to the last row.
uid_to_base_url = dict(zip(df_sample['uid'], df_sample['base_url']))
len(uid_to_base_url)

In [None]:
# Count samples per geolocated country over the whole sample (kept and
# excluded rows alike), and track how many samples could be geolocated.

# uids of the rows that passed the score threshold (a later cell reads this list).
df_sample_filtered = df_sample[df_sample['is_kept'] == True]
filtered_uids = df_sample_filtered['uid'].tolist()

count_by_country = defaultdict(int)
succ_count = 0
err_count = 0

for uid in df_sample['uid'].tolist():
    # Missing domains, null countries and the '-' placeholder all count as errors.
    country = base_url_to_country_name.get(uid_to_base_url[uid])
    if country is None or country == '-':
        err_count += 1
        continue
    count_by_country[country] += 1
    succ_count += 1
print(succ_count, err_count, succ_count / (succ_count + err_count))

In [None]:
# Country names that are shortened to this prefix whenever the full name starts with it.
COUNTRIES = ['Netherlands', 'United Kingdom', 'Iran', 'Korea', 'Taiwan', 'Virgin Islands']
def parse_country(name):
    """Return a short display label for a country name.

    Two names have fixed abbreviations; any other name that starts with an
    entry of ``COUNTRIES`` is shortened to that entry (the last matching entry
    wins); everything else is returned unchanged.
    """
    fixed_labels = {
        'United States of America': 'USA',
        'Russian Federation': 'Russia',
    }
    if name in fixed_labels:
        return fixed_labels[name]
    prefixes = [c for c in COUNTRIES if name.startswith(c)]
    return prefixes[-1] if prefixes else name

top_num = 20
# Several country names can collapse to the same display label (e.g. every
# name starting with 'Korea'). Sum their counts instead of letting a dict
# comprehension keep only the last value seen for that label.
merged_count_by_country = defaultdict(int)
for name, count in count_by_country.items():
    merged_count_by_country[parse_country(name)] += count

# Sort after merging so the order reflects the combined counts.
sorted_count_by_country = dict(sorted(
    merged_count_by_country.items(), key=lambda item: item[1], reverse=True
))

plot_df = pd.DataFrame({
    'country': list(sorted_count_by_country.keys())[:top_num],
    'count': list(sorted_count_by_country.values())[:top_num]
})
plot_df.sort_values('count').plot(kind='barh',y='count',x='country')
plt.xlabel('IP address geolocation breakdown by country in raw dataset')
plt.show()

In [None]:
# Print the top_num most frequent countries with their counts.
for c, count in list(sorted_count_by_country.items())[:top_num]:
    print(c, count)

In [None]:
from langdetect import detect

# Matches every run of characters that are neither ASCII letters nor spaces.
pattern = re.compile(r'[^A-Za-z ]+')

def clean_caption(caption, as_set=False):
    """Lower-case ``caption`` and replace non-letter runs with a single space.

    Args:
        caption: Caption text.
        as_set: When True, return the set of distinct words instead of the
            cleaned string.

    Returns:
        The cleaned string, or the set of its words when ``as_set`` is True.
        Splitting on any whitespace run keeps the empty string out of the set
        (``split(' ')`` added it whenever spaces were adjacent, leading or
        trailing).
    """
    cleaned_caption = pattern.sub(' ', caption.lower())
    if as_set:
        return set(cleaned_caption.split())
    return cleaned_caption

def is_english(uid):
    """Return True when the cleaned caption of ``uid`` is detected as English.

    The caption is looked up in ``sample_uid_to_text`` and cleaned with
    ``clean_caption``. Blank captions, and captions for which ``detect``
    raises, yield False.
    """
    text = clean_caption(sample_uid_to_text[uid])
    if text.strip() == '':
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Detection failures are treated as "not English".
        return False

In [None]:
# filter ratio by country
# country -> {'kept': [uids above threshold], 'total': [all uids]}
uid_by_country = defaultdict(lambda: defaultdict(list))
filtered_uids_set = set(filtered_uids)

succ_count = 0

for uid in df_sample['uid'].tolist():
    country = base_url_to_country_name.get(uid_to_base_url[uid])
    # Skip samples without a usable country ('-' is treated as unknown).
    if country is None or country == '-':
        continue
    # The English-only restriction (is_english(uid)) is currently disabled.
    per_country = uid_by_country[country]
    per_country['total'].append(uid)
    if uid in filtered_uids_set:
        per_country['kept'].append(uid)
    succ_count += 1

err_count = len(df_sample) - succ_count
print(succ_count, err_count, succ_count / (succ_count + err_count))

In [None]:
# Re-print the geolocation coverage: successes, failures, success fraction.
print(succ_count, err_count, succ_count / (succ_count + err_count))

In [None]:
# Kept fraction and sample count per country, restricted to countries with
# more than 5000 samples.
countries = uid_by_country.keys()
raw_count_by_country = {}
ratios_by_country = {}
for c, uids in uid_by_country.items():
    num_total = len(uids['total'])
    if num_total <= 5000:
        continue
    ratios_by_country[c] = len(uids['kept']) / num_total
    raw_count_by_country[c] = num_total

In [None]:
# Print "country , count" for every country above the sample-count cut-off.
for c, count in raw_count_by_country.items():
    print(c, ',', count)

In [None]:
# Bar chart of the kept fraction per country (descending) with standard-error bars.
sorted_ratios_by_country = {(k): v for k, v in sorted(
    ratios_by_country.items(), key=lambda item: item[1], reverse=True
)}
# Standard error of each ratio, in the same order as the sorted countries.
yerr = [
    get_error(
        len(uid_by_country[c]['kept']),
        len(uid_by_country[c]['total'])
    )
    for c in sorted_ratios_by_country
]

fig, ax = plt.subplots(figsize=(8, 3))
results_df = pd.DataFrame({
    'top_country': [parse_country(c) for c in sorted_ratios_by_country.keys()],
    'top_ratio': list(sorted_ratios_by_country.values()),
    'top_yerr': yerr,
})
results_df.sort_values('top_ratio', ascending=False).plot(
    kind='bar',y='top_ratio',x='top_country',yerr='top_yerr', legend=False,ax=ax
)
ax.set_ylabel('Filter ratio')
ax.set_xlabel('')
ax.set_xticks(np.arange(len(sorted_ratios_by_country)), labels=results_df['top_country'], rotation=40, ha='right')

# Tick positions to highlight in yellow.
# NOTE(review): these indices are hard-coded for one particular ordering of the
# bars and must be re-checked whenever the data changes.
nonwest_is = [0, 10, 12, 13, 14, 15, 16]
tick_labels = ax.get_xticklabels()
for i in nonwest_is:
    # Skip positions that do not exist so a smaller dataset does not raise IndexError.
    if i < len(tick_labels):
        tick_labels[i].set_bbox(dict(facecolor='yellow', edgecolor='yellow', pad=0))

plt.show()

In [None]:
# Stacked bar graph: kept vs. excluded sample counts per country,
# in descending order of filter ratio.

graph_results = defaultdict(list)

for c in sorted_ratios_by_country:
    total_num = len(uid_by_country[c]['total'])
    filter_num = len(uid_by_country[c]['kept'])
    graph_results['kept'].append(filter_num)
    graph_results['excluded'].append(total_num - filter_num)

x = np.arange(len(sorted_ratios_by_country))  # the label locations
# Tick label: short country name followed by its filter ratio.
labels = ['%s (%.2f)' % (parse_country(c), sorted_ratios_by_country[c]) for c in sorted_ratios_by_country]

width = 0.5  # the width of the bars

fig, ax = plt.subplots(figsize=(7, 4))
bottom = np.zeros(len(x))

for k, v in graph_results.items():
    # Stack each series on top of the previous one; hatch the 'kept' segment.
    ax.bar(x + width/2, v, width, label=k, bottom=bottom, hatch='///' if k == 'kept' else None)
    bottom += v

ax.set_ylabel('Count')
# NOTE(review): the title mentions English-detected samples, but the cell that
# builds uid_by_country currently has its is_english filter disabled — confirm.
ax.set_title('Frequency by geolocation for English-detected samples')
ax.legend(loc='upper right', ncols=1)
ax.set_xticks(x + width/2, labels=labels, rotation=40, ha='right')
ax.tick_params(axis='x', which='major', pad=0)

# Tick positions to highlight in yellow.
# NOTE(review): these indices are hard-coded for one particular ordering of the
# bars and must be re-checked whenever the data changes.
nonwest_is = [0, 10, 13, 14, 15, 16]
tick_labels = ax.get_xticklabels()
for i in nonwest_is:
    # Skip positions that do not exist so a smaller dataset does not raise IndexError.
    if i < len(tick_labels):
        tick_labels[i].set_bbox(dict(facecolor='yellow', edgecolor='yellow', pad=0))

fig.tight_layout()
plt.show()


In [None]:
# Print "country , ratio" in descending order of filter ratio.
for c, ratio in sorted_ratios_by_country.items():
    print(c, ',', ratio)

In [None]:
top_num = 20
# count_by_country maps country -> int, so sort on the count itself
# (len() of an int raises TypeError).
sorted_count_by_country = {k: v for k, v in sorted(
    count_by_country.items(), key=lambda item: item[1], reverse=True
)}

# Plot the top_num most frequent countries from the sorted mapping; slicing the
# unsorted dict merely took the first top_num countries encountered.
plot_df = pd.DataFrame({
    'country': list(sorted_count_by_country.keys())[:top_num],
    'count': list(sorted_count_by_country.values())[:top_num]
})
plot_df.sort_values('count').plot(kind='barh',y='count',x='country')
# NOTE(review): count_by_country is accumulated over the whole sample, not only
# the kept rows, so this label may not match the data — confirm the intent.
plt.xlabel('Country breakdown in kept dataset')
plt.show()

In [None]:
# Older SciPy releases do not load submodules on a bare `import scipy`.
import scipy.stats

# Scatter of per-country filter ratio against per-country sample count,
# with a least-squares regression line.
MIN_COUNT = 1000
selected_countries = [c for c in raw_count_by_country if raw_count_by_country[c] >= MIN_COUNT]
x = [raw_count_by_country[c] for c in selected_countries]
# The per-country ratios live in ratios_by_country; `filter_ratios` is not
# defined anywhere in this notebook and raised NameError.
y = [ratios_by_country[c] for c in selected_countries]

plt.xlabel('Raw count by country')
plt.ylabel('Filter ratio (percent in kept dataset)') 
plt.title('Filter ratio vs frequency in unfiltered dataset')
plt.scatter(x, y)

# Degree-1 fit: b is the slope, a the intercept.
b, a = np.polyfit(x, y, deg=1)
print(np.corrcoef(x, y)[0,1] ** 2)
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
print(slope, intercept, r_value, p_value, std_err)
# 100 evenly spaced x positions from 0 to the largest observed count, so the
# line spans the scatter instead of stopping at a hard-coded 30000.
xseq = np.linspace(0, max(x), num=100)

# Plot regression line
plt.plot(xseq, a + b * xseq, color="k", lw=2.5);