## ICLab Data Files Exploratory Analysis

In [12]:
import pandas as pd
import tldextract
import numpy as np
import json
import math
import pycountry

In [2]:
iclab = pd.read_csv("cleaned_data/iclab_2018-09.csv")
iclab.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
0,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://kinox.to/,,,,,403.0,False,3094.0,{},,,False
1,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://movie4k.to/,,,,,403.0,False,3098.0,{},,,False
2,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,4shared.com,False,,False,,,,,,,,False
3,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,news.bbc.co.uk,False,,False,,,,,,,,False
4,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,ngt.jinbo.net,False,,False,,,,,,,,False


For now, we only care about the 2-digit country code *country*, *url*, and *censored_updated*. Let's filter a cleaner table with just these columns.

In [64]:
iclab_clean = iclab[["country", "url", "censored_updated"]]
iclab_clean.head()

Unnamed: 0,country,url,censored_updated
0,US,http://kinox.to/,False
1,US,http://movie4k.to/,False
2,KR,4shared.com,False
3,KR,news.bbc.co.uk,False
4,KR,ngt.jinbo.net,False


We need to redo the combined_similarities.json and individual similarities files. First, let's find what countries are in the dataset.

In [5]:
iclab_clean.country.unique()

array(['US', 'KR', 'ES', 'ZA', 'CZ', 'PL', 'MY', 'RU', 'CN', 'TW', 'BG',
       'HK', 'RO', 'PE', 'HU', 'NO', 'MX', 'UA', 'NL', 'VN', 'JP', 'LT',
       'RS', 'AU', 'KE', 'SK', 'IN', 'CL', 'CA', 'LI', 'SG', 'ID', 'NZ',
       'LU', 'BZ', 'CO', 'TR', 'BR', 'SE', 'IS', 'FI', 'DZ', 'PT', 'DK',
       'IL', 'MD', 'AT', 'SC'], dtype=object)

In [57]:
cc_unique = iclab_clean.country.unique()

In [45]:
country_codes = pd.read_csv("https://raw.githubusercontent.com/daylight-lab/III/master/shared/data/country-codes/countries_codes_and_coordinates.csv").replace('"','', regex=True)
country_codes.head()

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric code,Latitude (average),Longitude (average)
0,Afghanistan,AF,AFG,4,33.0,65.0
1,Albania,AL,ALB,8,41.0,20.0
2,Algeria,DZ,DZA,12,28.0,3.0
3,American Samoa,AS,ASM,16,-14.3333,-170.0
4,Andorra,AD,AND,20,42.5,1.6


In [46]:
country_codes.iloc[0]

Country                Afghanistan
Alpha-2 code                    AF
Alpha-3 code                   AFG
Numeric code                     4
Latitude (average)              33
Longitude (average)             65
Name: 0, dtype: object

In [47]:
country_codes.columns

Index(['Country', 'Alpha-2 code', 'Alpha-3 code', 'Numeric code',
       'Latitude (average)', 'Longitude (average)'],
      dtype='object')

In [48]:
country_codes.dtypes

Country                object
Alpha-2 code           object
Alpha-3 code           object
Numeric code           object
Latitude (average)     object
Longitude (average)    object
dtype: object

In [50]:
country_codes['Alpha-2 code'] = country_codes['Alpha-2 code'].str.strip()

In [51]:
country_codes.loc[country_codes["Alpha-2 code"] == "AF"]

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric code,Latitude (average),Longitude (average)
0,Afghanistan,AF,AFG,4,33,65


In [55]:
all_countries = [country_codes.loc[country_codes["Alpha-2 code"] == u].iloc[0].Country for u in iclab_clean.country.unique()]
all_countries

['United States',
 'Korea, Republic of',
 'Spain',
 'South Africa',
 'Czech Republic',
 'Poland',
 'Malaysia',
 'Russian Federation',
 'China',
 'Taiwan, Province of China',
 'Bulgaria',
 'Hong Kong',
 'Romania',
 'Peru',
 'Hungary',
 'Norway',
 'Mexico',
 'Ukraine',
 'Netherlands',
 'Viet Nam',
 'Japan',
 'Lithuania',
 'Serbia',
 'Australia',
 'Kenya',
 'Slovakia',
 'India',
 'Chile',
 'Canada',
 'Liechtenstein',
 'Singapore',
 'Indonesia',
 'New Zealand',
 'Luxembourg',
 'Belize',
 'Colombia',
 'Turkey',
 'Brazil',
 'Sweden',
 'Iceland',
 'Finland',
 'Algeria',
 'Portugal',
 'Denmark',
 'Israel',
 'Moldova, Republic of',
 'Austria',
 'Seychelles']

In [65]:
iclab_clean["domain"] = [tldextract.extract(i).domain for i in iclab_clean["url"]]
iclab_clean["suffix"] = [tldextract.extract(i).suffix for i in iclab_clean["url"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [66]:
iclab_clean.head()

Unnamed: 0,country,url,censored_updated,domain,suffix
0,US,http://kinox.to/,False,kinox,to
1,US,http://movie4k.to/,False,movie4k,to
2,KR,4shared.com,False,4shared,com
3,KR,news.bbc.co.uk,False,bbc,co.uk
4,KR,ngt.jinbo.net,False,jinbo,net


In [None]:
# recalculate combined_similarities.json
combined_similarities = {}

def correct_country(c):
    if c == "United States":
        c = "United States of America"
    if c == 'Korea, Republic of':
        c = "South Korea"
    if c == 'Russian Federation':
        c = "Russia"
    if c == 'Taiwan, Province of China':
        c = "Taiwan"
    if c == 'Viet Nam':
        c = "Vietnam"
    if c == 'Moldova, Republic of':
        c = "Moldova"
    return c
    
    # format: "('Austria', 'BEL')": [{"similarity": 0.0496579382}]
for c1 in cc_unique:
    new_common_domains = pd.DataFrame(columns = ["Country 1", "Country 2", "Domain", "Suffix"])
    for c2 in cc_unique:
        if c1 != c2:
            country_1 = correct_country(country_codes.loc[country_codes["Alpha-2 code"] == c1].iloc[0].Country.strip())
            country_2 = correct_country(country_codes.loc[country_codes["Alpha-2 code"] == c2].iloc[0].Country.strip())
            
            c1_rows = iclab_clean.loc[iclab_clean["country"] == c1]
            c2_rows = iclab_clean.loc[iclab_clean["country"] == c2]
            
            joined_c1_c2 = pd.concat([c1_rows, c2_rows], axis = 0)
            joined_c1_c2["combined_site"] = joined_c1_c2["domain"] + "." + joined_c1_c2["suffix"]
            
            unique_sites = np.unique(joined_c1_c2["combined_site"])
            
            blocked_c1 = joined_c1_c2.loc[(joined_c1_c2["country"] == c1) & (joined_c1_c2["censored_updated"])]
            blocked_c2 = joined_c1_c2.loc[(joined_c1_c2["country"] == c2) & (joined_c1_c2["censored_updated"])]
            commonly_blocked = blocked_c1.merge(blocked_c2, how = 'inner', on = 'combined_site')
            blocked_both = commonly_blocked['combined_site'].unique()
            num_blocked_both = len(blocked_both)
            for b in blocked_both:
                suffix = b.split(".")[-1]
                domain = b[:(b.index(suffix) - 1)]
                new_common_domains = new_common_domains.append({"Country 1": country_1, "Country 2": country_2, "Domain": domain, "Suffix": suffix}, ignore_index = True)
            print(c1, c2, num_blocked_both)
            similarity = num_blocked_both / (len(unique_sites))
            
            country_2_alpha_2 = country_codes.loc[country_codes["Alpha-2 code"] == c2].iloc[0]["Alpha-3 code"].strip()
            if not country_2_alpha_2:
                country_2_alpha_2 = "MDA"
            k = "('" + country_1 + "', '" + country_2_alpha_2 + "')"
            combined_similarities[k] = [{"similarity": similarity}]
            print(k, similarity)
    new_common_domains.to_csv("new_common_domains/" + country_1 + "-common-domains.csv")

In [109]:
with open('../new-combined-similarities.json', 'w') as json_file:
    json.dump(combined_similarities, json_file)

In [111]:
# what are the unique urls being tested?
len(iclab_clean['url'].unique())

22717

In [31]:
iclab.loc[(iclab['http_status'] != 200)].dropna().head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
7748,baseline-2018-09-01T012507.257001.json.bz2,2018-09-01T01:25:07.161Z,US,198605.0,citizenlab-global,http://masrawy.com/,-2,no_control_resp,-2,no_control_resp,403.0,False,3104.0,{},False,ICMP unreachable,False
8391,baseline-2018-09-01T012507.257001.json.bz2,2018-09-01T01:25:07.161Z,US,198605.0,citizenlab-global,http://www.amateurpages.com/,false,sameip,false,sameip,-1.0,False,0.0,{},False,Handshake conflict,False
9566,baseline-2018-09-01T012507.257001.json.bz2,2018-09-01T01:25:07.161Z,US,198605.0,citizenlab-global,http://advocacy.globalvoicesonline.org/,false,sameip,false,sameip,-1.0,False,0.0,{},True,ICMP admin prohibition,True
18619,baseline-2018-09-01T031910.129586.json.bz2,2018-09-01T03:19:10.096Z,US,54455.0,citizenlab-global,http://www.pc2call.com/,false,sameip,false,sameip,403.0,False,162.0,{},False,Handshake conflict,False
19753,baseline-2018-09-01T031910.129586.json.bz2,2018-09-01T03:19:10.096Z,US,54455.0,citizenlab-global,http://www.womeninblack.org/,false,sameip,false,sameip,403.0,False,410.0,{},False,ICMP unreachable,False


In [30]:
iclab.loc[iclab['http_status'] == 200].head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
657,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://www.acquisitionx.com/,false,sameip,False,,200.0,False,8419.0,{},,,False
659,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://www.lingerieatlarge.com/,false,sameip,False,,200.0,False,29282.0,{},False,Handshake conflict,False
660,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://www.pokerstars.net/,false,sameip,False,,200.0,False,23591.0,{},,,False
661,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://slickdeals.net/,-2,no_control_resp,False,,200.0,False,679791.0,{},,,False
665,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://www.gamingday.com/,false,sameip,False,,200.0,False,26428.0,{},False,Handshake conflict,False


In [29]:
iclab.loc[iclab['censored_updated'] == True].head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
823,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://redtube.com/,false,sameip,False,,200.0,True,437.0,HTTP body contains [http://warning.or.kr],,,True
870,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://warc.jalb.de/,false,sameip,False,,-1.0,False,0.0,{},True,ICMP admin prohibition,True
985,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://backpage.com/,-2,no_control_resp,False,,200.0,True,437.0,HTTP body contains [http://warning.or.kr],,,True
997,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://pridetube.com/,false,sameip,False,,200.0,True,437.0,HTTP body contains [http://warning.or.kr],True,Payload collision,True
1030,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://xvideos.com/,false,sameip,False,,200.0,True,437.0,HTTP body contains [http://warning.or.kr],True,Payload collision,True


In [9]:
# packeted_updated : packet_reason, block : http_reason, dns : dns_reason
print("packet reasons", iclab.packet_reason.unique())
print()
print("http reasons", iclab.http_reason.unique())
print()
print("dns reasons", iclab.dns_reason.unique())

packet reasons [nan 'Handshake conflict' 'ICMP unreachable' 'ICMP admin prohibition'
 'Payload collision' 'Data after FIN' 'Data after RST']

 "HTTP body contains [\\\\u003cFRAME SRC='http://www.ifydnun.com/?dn=]"
 'HTTP body contains [\\u003ctitle\\u003eWeb Site Blocked\\u003c/title\\u003e]'
 'HTTP body contains [\\\\u003cTITLE\\\\u003eAccess Denied\\\\u003c/TITLE\\\\u003e]'
 'HTTP body contains [\\u003ctitle\\u003eAccess Denied\\u003c/title\\u003e]'
 'HTTP body contains [\\\\u003ch1\\\\u003eAccess To Website Blocked\\\\u003c/h1\\\\u003e]'
 'HTTP body contains [\\\\u003cimg src=\\""//cacheimg.casino.com/images/www/forbidden/forbidden-logo.png]'
 'HTTP body contains [\\\\u003ctitle\\\\u003eSucuri WebSite Firewall - Access Denied\\\\u003c/title\\\\u003e]'
 'HTTP body contains [The page is blocked due to the decision of the authorities in your area.]'
 'HTTP body contains [http://zapret-info.gov.ru/]' '[]'
 'HTTP body contains [http://eais.rkn.gov.ru/]'
 'HTTP body contains [The page you

In [28]:
iclab.loc[(iclab['censored_updated'] == True) & (iclab['country'] == 'US')].head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
4401,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,https://www.netflix.com/,False,,True,reserved,200.0,False,76249.0,{},,,True
4564,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://warc.jalb.de/,,,False,sameip,-1.0,False,0.0,{},True,ICMP admin prohibition,True
4884,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://bittornado.com/,,,False,sameip,200.0,False,13930.0,{},True,ICMP admin prohibition,True
5718,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://netflix.com/,False,,True,reserved,200.0,False,71719.0,{},,,True
5804,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://anonymouse.org/,,,False,sameip,200.0,False,3560.0,{},True,ICMP admin prohibition,True


In [6]:
censored_non_200_us = iclab.loc[(iclab['censored_updated'] == True) &
          (iclab['country'] == 'US') &
          (iclab['http_status'] != 200)
          & (iclab['http_status'] != -1)
         ].dropna(subset = ['http_status'])
censored_non_200_us.head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
6080,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://milanuncios.com/,,,-2,no_control_resp,456.0,True,755.0,HTTP body contains [\\u003ch1\\u003eAccess To ...,,,True
29273,baseline-2018-09-01T045208.307428.json.bz2,2018-09-01T04:52:08.236Z,US,12989.0,citizenlab-global,http://milanuncios.com/,,,-2,no_control_resp,456.0,True,755.0,HTTP body contains [\\u003ch1\\u003eAccess To ...,,,True
71991,baseline-2018-09-01T170123.077746.json.bz2,2018-09-01T17:01:23.023Z,US,12989.0,citizenlab-global,http://milanuncios.com/,,,-2,no_control_resp,456.0,True,755.0,HTTP body contains [\\u003ch1\\u003eAccess To ...,,,True
104315,baseline-2018-09-02T022742.763006.json.bz2,2018-09-02T02:27:42.683Z,US,12989.0,citizenlab-global,http://milanuncios.com/,,,-2,no_control_resp,456.0,True,755.0,HTTP body contains [\\u003ch1\\u003eAccess To ...,,,True
108607,baseline-2018-09-02T032809.990945.json.bz2,2018-09-02T03:28:09.926Z,US,198605.0,citizenlab-global,http://www.schwarzreport.org/,False,sameip,false,sameip,403.0,True,6863.0,HTTP body contains [\u003ctitle\u003eWeb Site ...,,,True


In [7]:
len(censored_non_200_us)

36

In [3]:
non_200_http = iclab.loc[(iclab['http_status'] != 200)].dropna(subset = ['http_status'])

In [4]:
non_200_us = non_200_http.loc[non_200_http['country'] == 'US'].url.unique()
non_200_us[:20]

array(['http://kinox.to/', 'http://movie4k.to/',
       'http://www.911truth.org/', 'http://adnetworkperformance.com/',
       'http://torah.org/', 'http://www.socom.mil/', 'http://ouo.io/',
       'http://www.suicidepreventionlifeline.org/',
       'http://www.democracycaucus.net/',
       'http://fatosdesconhecidos.com.br/', 'http://www.iccwomen.org/',
       'http://www.avert.org/', 'http://www.islamicity.com/',
       'http://www.cites.org/', 'http://amphetamines.com/',
       'http://www.serials.ws/', 'https://www.serials.ws/',
       'https://medpot.net/', 'http://www.hitler.org/',
       'http://www.bahai.org/'], dtype=object)

In [5]:
len(non_200_us)

2747

HTTP Status Code Info
- Informational responses (100–199)
- Successful responses (200–299)
- Redirects (300–399)
- Client errors (400–499)
- Server errors (500–599)

In [8]:
# types of error codes
iclab['http_status'].unique()

array([403.,  nan, 200.,  -1., 406., 401., 404., 500., 503., 400., 405.,
       521., 479., 429., 418., 409., 410., 504., 456., 544., 204., 451.,
       301., 523., 502., 412., 424., 530., 520., 999., 203., 408., 206.,
         0., 526., 508., 525., 477.])

In [10]:
gov_blocked = iclab.loc[(iclab['http_status'] == 451)]
gov_blocked.head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
15128,baseline-2018-09-01T012755.940830.json.bz2,2018-09-01T01:27:55.773Z,CZ,60068.0,citizenlab-global,http://lifebuzz.com/,False,,False,,451.0,False,339.0,{},,,False
26772,baseline-2018-09-01T042033.228432.json.bz2,2018-09-01T04:20:32.939Z,RU,43317.0,country-sensitive-ru,http://putinbog.wordpress.com/,False,,False,,451.0,True,1833.0,HTTP body contains [http://zapret-info.gov.ru/],,,True
26979,baseline-2018-09-01T042033.228432.json.bz2,2018-09-01T04:20:32.939Z,RU,43317.0,country-sensitive-ru,http://haamash.wordpress.com/,False,,False,,451.0,True,1833.0,HTTP body contains [http://zapret-info.gov.ru/],,,True
37206,baseline-2018-09-01T084135.726037.json.bz2,2018-09-01T08:41:35.582Z,BG,59564.0,citizenlab-global,http://lifebuzz.com/,False,,False,,451.0,False,339.0,{},,,False
56755,baseline-2018-09-01T135130.512703.json.bz2,2018-09-01T13:51:30.306Z,UA,59564.0,country-sensitive-ua,https://lotoru.com/,False,,False,,451.0,False,52559.0,{},,,False


In [14]:
unique_alpha2 = gov_blocked['country'].unique()
print([pycountry.countries.get(alpha_2=i).name for i in unique_alpha2])

['Czechia', 'Russian Federation', 'Bulgaria', 'Ukraine', 'Lithuania', 'Netherlands', 'Luxembourg', 'Spain', 'Hungary', 'Turkey', 'Sweden', 'Finland', 'Portugal', 'Denmark', 'Slovakia', 'Romania']


In [21]:
# find rows with status codes that are NOT 200, -1, nan, 0, 999
anomaly_status_codes_us = iclab.loc[#(iclab['censored_updated'] == True)
          (iclab['country'] == 'US')
          & (iclab['http_status'] != 200)
          & (iclab['http_status'] != -1)
          & (iclab['http_status'] != 0)
          & (iclab['http_status'] != 999)                          
         ].dropna(subset = ['http_status'])
anomaly_status_codes_us.head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
0,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://kinox.to/,,,,,403.0,False,3094.0,{},,,False
1,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://movie4k.to/,,,,,403.0,False,3098.0,{},,,False
4358,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://www.911truth.org/,,,false,sameip,406.0,False,300.0,{},,,False
4361,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://adnetworkperformance.com/,,,-2,no_control_resp,403.0,False,568.0,{},,,False
4370,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://torah.org/,,,false,sameip,403.0,False,3095.0,{},,,False


In [22]:
len(anomaly_status_codes_us)

37868