In [89]:
import os
import re
import pandas as pd
import tldextract
import pycountry
import numpy as np

In [2]:
import signal
from contextlib import contextmanager


@contextmanager
def timeout(time):
    # Register a function to raise a TimeoutError on the signal.
    signal.signal(signal.SIGALRM, raise_timeout)
    # Schedule the signal to be sent after ``time``.
    signal.alarm(time)

    try:
        yield
    except TimeoutError:
        pass
    finally:
        # Unregister the signal so it won't be triggered
        # if the timeout is not reached.
        signal.signal(signal.SIGALRM, signal.SIG_IGN)


def raise_timeout(signum, frame):
    raise TimeoutError

In [4]:
google_whois = os.popen('whois google.com').read()

In [5]:
google_whois

'% IANA WHOIS server\n% for more information on IANA, visit http://www.iana.org\n% This query returned 1 object\n\nrefer:        whois.verisign-grs.com\n\ndomain:       COM\n\norganisation: VeriSign Global Registry Services\naddress:      12061 Bluemont Way\naddress:      Reston Virginia 20190\naddress:      United States\n\ncontact:      administrative\nname:         Registry Customer Service\norganisation: VeriSign Global Registry Services\naddress:      12061 Bluemont Way\naddress:      Reston Virginia 20190\naddress:      United States\nphone:        +1 703 925-6999\nfax-no:       +1 703 948 3978\ne-mail:       info@verisign-grs.com\n\ncontact:      technical\nname:         Registry Customer Service\norganisation: VeriSign Global Registry Services\naddress:      12061 Bluemont Way\naddress:      Reston Virginia 20190\naddress:      United States\nphone:        +1 703 925-6999\nfax-no:       +1 703 948 3978\ne-mail:       info@verisign-grs.com\n\nnserver:      A.GTLD-SERVERS.NET 192

In [6]:
re.search('Registrant Country:.*\n', google_whois).group(0)

'Registrant Country: US\n'

### China

In [7]:
china_blocked = pd.read_csv('cleaned_data/blocked_urls_china.csv').drop(columns = ["Unnamed: 0"])

In [8]:
china_blocked.head()

Unnamed: 0,URL
0,www.a00.com
1,a0006.in2000.com
2,a1.sparklancer.com
3,a1234.uhome.net
4,a1234bc.uhome.net


In [3]:
def url_whois(url):
    try:
        with timeout(3):
            split_url = tldextract.extract(url)
            domain = split_url[1]
            suffix = split_url[2]
            recombined_url = domain + "." + suffix
            url_whois = os.popen('whois ' + recombined_url).read()
            reg_country = re.search('Registrant Country: [A-z]*\n', url_whois)
            result = None
            if reg_country:
                result = reg_country.group(0)[20:]
            print(url, result)
            return result
    except:
        return None

In [10]:
china_blocked['URL Country'] = china_blocked['URL'].apply(url_whois)

www.a00.com CN

a1.sparklancer.com KY

a1234.uhome.net Malaysia

a1234bc.uhome.net Malaysia

a2s.idv.tw None
a2z.fhl.net None
www.a4c.at None
www.a4u.at None
www.a86.com CN

a8713000.in2000.com Vietnam

a888.bizland.com US

aa999.e-chome.net US

aaaaa.itgo.com US

www.aaanet.org US

www.aaaw.org US

www.a-abcastrology.com None
www.aabcdallas.org CA

www.aacsonline.org US

www.aadz12.ukgateway.net GB

www.aaiil.org US

www.aaperry.com US

aardvark.tce.rmit.edu.au None
aaroncarter.uhome.net Malaysia

ab03.in2000.com Vietnam

www.ab69.com None
abai.i.am None
abandon.uhome.net Malaysia

www.abbeywong.com None
www.abbotlibrary.org CA

abc.com US

www.abc.com.au None
abc.go.com US

www.abc.gov.au None
abc.net.au None
www.abc.net.au None
www.abc-ca.org JP

www.abc-chengqu.com None
www.abcdefghijklmnopqrstuvwxyz.com US

www.aberdeennews.com US

www.aberdeenrfc.com None
aberoom.tripod.com US

www.able.net-shop.com CN

ablook.com US

www.ablook.com US

www.abmagazines.com None
abofx.coc.cc CN

w

www.alsb.uscourts.gov None
www.altamista.com US

www.altavisat.com CN

altavista.co.uk None
www.altavista.co.uk None
altavista.com US

www.altavista.com None
alternativespace.5u.com US

www.altreligionscientology.org US

alumni.hchs.hc.edu.tw None
alumni.ice.ntnu.edu.tw None
alumni.nctu.edu.tw None
www.alumnicrawler.com US

www.alumnisource.com US

alumweb.mit.edu None
www.am1320.com None
www.a--ma.idv.tw None
www.amafish.com US

www.amalnet.k12.il None
amanda-lee.fans-club.com None
www.amap.com.tw None
www.amarc.org DE

amarillonet.com US

www.amath.nchu.edu.tw None
www.amazingfilms.com UK

amazingforums.com None
www.amazinggracebaptist.homestead.com US

amazonb1bb.netfirms.com US

www.ambafrance-gr.org FR

ambarts.tripod.com US

ambasadager.riscom.md None
www.ambassadorsforchristchurch.org None
ambat.39m.net US

www.amb-u.com.hk None
www.amegaproxy.com US

www.amei-fanclub.com None
www.amercoll.edu None
www.americanembassy.org.cy None
www.americanfeedmagazine.com US

www.americanmax.

www.auch.com.au None
www.auctiondown.com US

www.audio888diy.com None
www.audiobasket.com None
audioprecision.hypermart.net None
www.auditing.org GB

audrey.flying.to None
www.augusthome.com US

www.aumfidelity.com None
aupo.in2000.com Vietnam

www.aurora-edu.com US

www.ausable.org US

www.auschwitz.dk None
www.ausdaily.net.au None
www.aussie.tk None
www.aussiedenver.com FR

www.austinlivinggrace.org None
www.australia.com AU

www.australian-embassy.it None
australien.mine.nu None
www.automobile-market-cn.com None
autopilot.sf.net None
autopr0n.com JP

www.autorevista.com None
www.autosportasia.com US

av.com CN

www.av.com CN

www.av888.com US

www.avatarrecords.com US

www.avatarsabode.com.au None
www.avbaby.com US

avboys.uhome.net Malaysia

www.avchinaweb.com None
www.avdrecords.com None
www.ave.net None
www.ave-maria.com None
www.ave-resource.org None
aviyonah.tripod.com US

avm.idv.st None
avon.uhome.net Malaysia

www.avyes.com HK

www.awakenersabode.org None
award.fans-club.com

www.bismarcksda.org US

www.bisrs.com None
www.biswas.com US

bitchmakemeasandwich.com FR

www.bitchmakemeasandwich.com FR

www.bizdalian.com None
www.bizjournals.com US

www.biznews.net None
www.bizoriental.biz None
www.bizwebawards.com PA

www.bj63.com.cn None
www.bjbb.com.cn None
www.bjchangping.com CN

www.bjco.com US

www.bjep.org US

bjic-travel.netfirms.com US

www.bjmac.gov.cn None
www.bjrealty.cc None
www.bjstudent.org.cn None
www.bjurholm.se None
www.bjzc.org US

www.black5th.com US

www.blackbeat.8m.com None
blackcoffe.homestead.com None
www.blackmarket1.com None
www.blacktemple.org None
www.blackzebra.com KY

www.blairridge.org US

www.bless100.com GB

www.blessthechild.com US

blh.cjb.net US

blink182.uhome.net Malaysia

www.bliss-music.com 

www.blitzdbz.tk None
www.blkbox.com US

www.bl-knitting.com CN

www.blogyou.com US

blood.e-chome.net US

www.bloorstreet.com None
www.bltg.com None
blue0598.uhome.net Malaysia

bluebone.hypermart.net None
www.blueboxz.com NL

bluebus

www.cand.uscourts.gov None
www.candiscoffee.com None
www.canglong.net US

www.cango.net.kg None
www.canoe.ca CA

canoe.macausport.org MO

www.canticanova.com None
www.canyonheightsacademy.com US

cao000.k12.net.cn None
cap.estevan.sk.ca None
www.capecod.net US

www.capecodaccess.com US

www.capitalchristian.net US

capitalnaz.click2site.com US

www.capitolsteps.com None
capl.tripod.com US

www.capricciousa.com UK

www.caprotect.com US

car.hongkong.com KY

www.caradon.gov.uk None
carbeauty.in2000.com Vietnam

carcat.myetang.com None
www.cardinalnewman.com US

www.cardinalshehanschool.org US

cardoc.8m.com None
www.cardpen.com US

cardriver.cjb.net US

www.career-connection-inc.com US

www.caresafe.com US

www.careweb.com None
www.cargal.org AT

www.cariboord.bc.ca CA

carkey.uhome.net Malaysia

www.carlisle.ac.uk None
carlisle-www.army.mil None
carllau.hk-homepage.com None
www.carolinabiblecollege.org US

carpart.bizland.com US

www.carpediemrecords.com DE

www.carshalton.ac.uk None
ww

www.chinaaowei.com CN

www.china-apsis.com CN

www.chinaarts-crafts.com None
www.china-automobile.com RU

www.chinabamboocentre.com HK

www.china-bang.com CN

www.chinabigchem.com CN

www.chinabusinesshelp.com None
www.chinabytemail.com US

www.chinaclimbing.com None
www.chinacommerce.com CN

www.chinacommerce.net None
www.chinacon.net None
www.china-construction-eq.com CN

www.chinacpw.com CN

chinacrystalandmore.com None
www.chinacuriopith.com CN

www.chinadejun.com CN

www.chinadianhuan.com None
www.china-dienste.de None
www.chinadisinfectant.com CN

www.chinadnaproduct.com CN

www.chinadoc.com CN

chinadoll.uhome.net Malaysia

www.chinadoorlock.com None
www.chinadq.com None
www.chinaeagle.com US

www.chinaeast.com.hk None
www.china-economicnews.org None
www.chinaele.net None
www.chinaelec.com US

www.chinaenzyme.org None
www.chinaestate.com US

www.chinaetrend.com None
www.chinaevedu.com US

www.chinaeweekly.com None
www.chinaexporter.net None
www.chinafabrics.biz None
www.chinafas

www.chses.tyc.edu.tw None
www.chsh.chc.edu.tw None
www.chsh.cy.edu.tw None
www.chshb.gov.tw None
www.chtpe.com CN

chu.jznu.net None
www.chuanshuo.tk None
chuanton.bizhosting.com US

chuchris.uhome.net Malaysia

www.chuckieboy.com None
www.chuguo.org US

chuishan.uhome.net Malaysia

chungchuntung.uhome.net Malaysia

chunglingjohor.tripod.com US

www.chungstimepieces.com None
www.chunpai.com None
church.39m.net US

www.churchgroup.cjb.net US

www.churchmusicians.com US

www.church-of-england.org GB

www.churchofgodcarmichael.org US

www.churchofgodindia.org None
churchofgodop.homestead.com US

ciacia.virtualave.net None
www.cia-drugs.org US

www.ciaonet.org US

ciapo.idv.st None
www.cibcs.org None
www.cic.nyu.edu None
www.cic.org.hk None
cicq.iscool.net MD

www.cidob.org ES

www.ciecc.com.cn None
www.ciefl.org US

cifp.tripod.com US

www.cihts.ac.in IN

www.cimhk.com US

www.cimonline.tk None
cindyyu.com US

www.cinfo.org.cn None
cinnamon.homeip.net None
cino.uhome.net Malaysia

cinput.

www.consulatfrance.org.tr None
www.consul-france.org JP

www.consulfrance-abidjan.org FR

www.consulfrance-barcelone.org FR

www.consulfrance-casablanca.org FR

www.consulfrance-conakry.org.gn None
www.consulfrance-hongkong.org FR

www.contech.com.tw None
www.contracostatimes.com US

conversiontojudaism.com US

www.cookegg.f4w.net PA

cookegg.uhome.net Malaysia

www.cookies.fans-club.com None
cookiestalk.hk-homepage.com None
cookiethief.uhome.net Malaysia

www.cooktownnews.com DE

www.coolantique.com US

cooldoc.dhs.org AU

coolguitar.uhome.net Malaysia

www.coolqq.com None
coolvet.in2000.com Vietnam

www.co-op-party.org.uk None
copera18.39m.net US

www.copgny.org US

copm.k12.net.cn None
www.copvcia.com PA

www.coral.com.hk None
coreberry.uhome.net Malaysia

corey.org PA

corky.net IL

www.corneliacog.homestead.com US

www.cornellprogressive.org DK

www.cornercaffe.com.tw None
www.cornerstonebc.click2site.com US

coronaclub.852.net HK

www.cos.org.sg None
www.cosplayshop.com US

www.c

www.cytrax.com US

cyvv.myrice.com CN

cywong1hk.uhome.net Malaysia

www.cyzen.org DE

www.czechpoint.cz None
www.czguangda.com None
www.c-zim.tk None
www.czlib.net None
www.czlx.com CN

www.czm2a.mypage.org KY

www.cz-maoyuan.com HK

www.czonline.net None
www.czquickway.com None
www.cz-safeinfo.net Malaysia

www.czsnrj.com None
cztxgs.com CN

www.czwx.ah163.net CN

czxx.luohuedu.net cn

www.czyz.com.cn None
www.czzq.com.cn None
www.d.kth.se None
d101.i.am None
www.d1d1d1.com CN

www.d2.com.hk None
WWW.D2NET.COM None
www.d4home.com Malaysia

www.d4kj.com None
www.dab.org.hk None
www.dabhk.com None
www.dack.com US

www.dacorum.gov.uk None
www.dadesentinel.com None
www.dadicorp.com None
www.daemon.be None
www.daemon-info.com None
www.daemyung-ind.com None
www.daetech.com.hk None
dafang.jmtour.org CN

www.daft.com IE

www.dagogtid.no None
www.dagsavisen.no None
www.dahanbank.8m.com None
www.dahang.com.cn None
www.dahe-ad.com None
www.dahe-ad.com.cn None
www.daheishi.com None
www.daheyaji.

www.delai.com 

www.delanco.org US

www.delasalle.org US

www.delawarechristian.org US

www.delawarewave.com US

www.delerium.co.uk None
www.delft.bahai.nl None
www.deliveranceoutreach.faithweb.com US

www.delmark.com US

www.delorie.com None
www.delphi.com.cn None
www.delphitea.tk None
www.delrayumc.org US

www.dels-den.co.uk None
www.delta.idv.st None
www.delta.tec.la.us None
www.deltachurch.com US

www.deltaia.com FR

www.deltaunited.com.tw None
demagicland.netfirms.com US

demeteria.tripod.com US

demo.mynetwatch.net None
democracy.org.hk None
www.democracy.org.hk None
www.democracydata.com US

www.democracysummer.org US

www.demsonline.net None
denairnaz.freeservers.com US

www.denfance.com US

www.dengsact.com CN

dengzhou.nyinfo.ha.cn None
deniseho.freeservers.com US

www.denkmalpflege-hessen.de None
www.dennisau.cjb.net US

www.dennismaldini3.50megs.com US

dennisnieh.cjb.net US

www.dentalcare.com.tw None
www.dentfirst.com None
www.denverfriends.org JP

www.denverjournal.com U

www.dvt.com.hk None
www.dvu-hessen.de None
www.dwatch.ca CA

dwb-taipei.org.tw None
www.dwmodel.com.tw None
www.d-worx.com US

dy930.uhome.net Malaysia

www.dyeschina.com CN

dyf.cjb.net US

www.dyfed-powys.police.uk None
www.dygasket.com None
www.dynaphoon.com CN

dynasty.in2000.com Vietnam

www.dynastyfc.com US

dyndns.org US

www.dyndns.org US

www.dyns.cx BE

www.dyps.tcc.edu.tw None
dzf.cjb.net US

www.dzzk.net CN

e.lichun.tripod.com US

www.e-168telbook.com US

www.e289.com None
e50.cjb.net US

www.eaa.org.hk None
www.eachealth.com None
www.eadcraft.com None
eagle.ee.ntu.edu.tw None
eagle.webpipe.net US

eagle149.uhome.net Malaysia

www.eaglecross.net None
www.eagle-king.com.tw None
eagles5.tripod.com US

www.eaglesaint.net None
www.eagles-wingsmin.com None
www.eagnas.com US

www.eagton.com None
eangel.uhome.net Malaysia

www.e-antiquer.com None
earcheer.music.webjump.com None
earth.dns2go.com None
earth.fg.tp.edu.tw None
earth.gl.ntu.edu.tw None
earth2willi.com US

earth911.gcc

www.esun-bank.com None
www.e-suntimes.com None
et291800.uhome.net Malaysia

etcetera.humberc.on.ca CA

etcfsin.uhome.net Malaysia

e-tea.hypermart.net None
www.etea.mainpage.net None
www.e-tech-solution.com CA

www.etere.com None
www.ethio-stmichael.org US

ethome-ethome.tripod.com US

www.eti-bull.net FR

www.etkorea21.com US

www.etone-tw.com None
etouch.tw.to None
www.etruth.org.hk None
www.etwn.com None
www.etzchaimsharon.org US

www.etz-chayim.org US

euhwang.in2000.com Vietnam

www.eumitcom.com US

www.eunice.cc None
eurekanet.cjb.net US

www.euro.ecb.int None
euro2000.uhome.net Malaysia

www.eurochinaweb.com FR

eurocup.uhome.net Malaysia

www.europeaninternet.com US

www.europeanmissions.org KY

www.eurosemi.co.uk None
evaboy.hypermart.net None
www.evainjection.com None
www.evangelheightsonline.org None
www.EvangelicalView.com None
evangelique.cjb.net US

www.evansdata.com None
www.evansvilleheartcenter.com US

www.evcforum.net US

www.evcpl.lib.in.us None
www.evelynroberts.com

www.firstcongfp.org US

www.first-lady.com US

firstpage.de None
www.firstpost.com IN

www.FIRSTPOST.com IN

firstpresvancouver.com PA

www.firstsphere.net US

www.firstuuaustx.org None
www.firstwa.com US

firstwar.uhome.net Malaysia

www.fis.edu.hk None
www.fish.com.hk.ro None
www.fish2002.tk None
www.fish365.com KR

fishboy.39m.net US

www.fishery.nkimt.edu.tw None
www.fisheye.com.tw None
fishing.hkpeoples.net None
fishmaster.idv.tw None
fishworld.uhome.net Malaysia

www.fi-taipei.org FR

www.fit-right.com.tw None
www.fitzroyharbour.com CA

www.fjc.gov None
www.fjcs.org US

www.fjqz.gov.cn None
www.fjsh.cy.edu.tw None
www.fjsm-spices-terpineol.com US

www.fkpg.gov.tw None
www.flamingo-everspring.com.tw None
www.flamingoroad.org JP

flaneur.ourfamily.com US

www.flash5workstation.cjb.net US

www.flashhk.com CN

flashland.uhome.net Malaysia

flasht.uhome.net Malaysia

www.flavor.com.tw None
flerp.org None
www.flevoland.nl None
flexc.lcs.mit.edu None
www.flex-compiler.lcs.mit.edu None
f

fu-tang.netfirms.com US

www.future-china.org US

futurezone.orf.at None
www.fuxiangaquarium.com CN

www.fuyinonline.com CN

www.fuyinonline.org None
www.fuzzion.org ES

fuzzyworld.virtualave.net None
fw.china-wwwinfo.com None
www.fwsda.org US

www.fwtea.com CN

www.fxvictco.com None
www.fyicalgary.com CA

www.fyitoronto.com CA

fzly.my.west163.com CN

g2000.com.hk None
gabeweb.50megs.com US

www.gaiarising.org US

gaipon.in2000.com Vietnam

www.galaweb.org US

galaxysun.uhome.net Malaysia

www.galerie-uno.com CN

www.galion.lib.oh.us None
www.galvnews.com None
www.galwayadvertiser.ie None
www.gamb.uscourts.gov None
www.gamd.uscourts.gov None
game.dqsky.com CN

game.hongkongdog.com US

game.wuhan.net.cn None
game100.uhome.net Malaysia

gameboyroland.uhome.net Malaysia

gameboystation.in2000.com Vietnam

gamecentery.virtualave.net None
game-city.virtualave.net None
gameclub.myrice.com CN

www.gamecy.com None
gamefeeling.uhome.net Malaysia

gameone.uhome.net Malaysia

www.game-revolution

gov.668.cc US

www.gov.au None
government.hongkongdog.com US

www.goyoyo.com CN

www.goyoyo.com.cn None
gp.macaustreet.com None
gpaper.gigigaga.com US

www.gpchain.com CA

gpf.ktmbc.org None
www.gpl.lib.in.us None
www.gppgle.com US

www.gppl.ab.ca CA

www.gqzx.gov.cn None
www.gr.ch None
gr.hypermart.net None
www.gracebaptist.net US

www.gracebaptistattleboro.org US

www.gracebaptistonline.com None
www.gracebibleonline.org CA

www.gracebrethren.com 

www.gracechurchhumble.org None
www.gracemahomet.org US

www.graceonlinelibrary.org US

www.graceschool.org US

www.gracetoyou.org US

www.grado.com None
www.gradon-eng.com None
www.grahambiblecollege.com None
www.grandchina-hk.com CN

www.grandforks.com us

www.grandmarkhandbag.com HK

www.grandpacific.com None
granite.korea.ac.kr None
www.granitecurling.com US

grantham.39m.net US

graphics.fortunecity.com None
www.grassroots3v3.com None
www.gratefuldread.net None
www.gravestonedesigns.com US

www.great-british-pages.co.uk None
www.greatch

hcs.cjb.net US

www.hcsoft.com US

www.hcylqx.com CN

www.hdcdvd.com CN

www.hdglass.com CA

www.hdoco.com US

www.hdxxw.net None
www.headhunt.nanjing.gov.cn None
www.healingpsalm.com US

healingwell.subportal.com None
www.health.com US

health4u.dns2go.info US

health999.yes8.com CN

www.healthbasic.com None
www.healthcarejournal.com GR

www.healthinchina.com US

healthlab.w3.to None
www.healthland.org US

www.healthoo.com CN

www.healthoo.com.cn None
www.health-salon.com JP

www.healthy.com.hk None
www.healthy-ageing.net None
www.hearministries.org None
www.heart.com.hk.ro None
www.heartandstroke.ca CA

www.heartbeat777.org None
www.heartdiseasej.com US

www.heartibet.org RU

heartofsanantonio.com US

www.hearttemple.org None
www.heavencinema.com UK

www.heaven-creation.com CN

www.heavenlygardens.org US

www.heavenlyrest.org US

heavy.metal.isCool.net MD

www.hebeiwindow.com CN

www.hebraicrenewal.com JP

hebrew.fhl.net None
www.hebrews132.homestead.com None
hehehoho.uhome.net Malay

hktcrc.uhome.net Malaysia

hktcw.uhome.net Malaysia

www.hktd.net US

hkten.i.am None
hktimmy.uhome.net Malaysia

www.hktmcc.com None
hkto.8m.com None
hktol.uhome.net Malaysia

hktop100.in2000.com Vietnam

www.hktrampers.com None
www.hktreblechoir.org None
hkttn.uhome.net Malaysia

hktyc.uhome.net Malaysia

www.HKUFO.net HK

www.hkug.com HK

www.hkvipnet.org None
www.hkwahsan.com.hk.ro None
www.hkwalker.net CN

www.hkwdesign.com None
hkweather.w3.to None
hkwrestling.freehomepage.com PA

hkwsc.uhome.net Malaysia

hkybaseball.tripod.com US

hkyys.idv.st None
hkyzworld.tripod.com US

WWW.H-L.CC None
www.hla.hlc.edu.tw None
www.hlbh.hlc.edu.tw None
www.hlbpm.com None
www.hlc.moj.gov.tw None
www.hlcc.gov.tw None
www.hld.moj.gov.tw None
www.hlepb.gov.tw None
www.hlhb.gov.tw None
www.hlic.com None
hlife.tw.to None
www.hljaudit.gov.cn None
www.hlj-ftm.com CN

hlmss.mainpage.net None
www.hlrl.hessen.de None
www.hlypaintball.cjb.net US

www.hma.gov.cn None
www.hmajesty.org US

www.hmd.yy.gov.cn 

www.hystock.com NL

hysun.39m.net US

www.hzcenter.com CN

www.hzinitial.com None
www.hz-nchotel.com None
hzpost.668.cc US

hzqw.uhome.net Malaysia

www.hzsanhe.com CN

www.hzshbj.com None
i.am None
www.i.am None
www.i926.com None
i9care2you.tripod.com US

www.iafcc.com US

www.iamchap.org US

iamfailai.uhome.net Malaysia

www.i-am-uber.com None
www.ianb.uscourts.gov None
www.iand.uscourts.gov None
www.iasb.uscourts.gov None
www.iasd.uscourts.gov None
www.iasps.org.il None
www.iba.com.hk None
www.ibaobao.com CN

www.ibcd.de None
www.ibcsf.org None
www.ibela.com US

www.ibgames.net None
ibiza.eurimage.com KR

www.iboc.org US

ibs.taipei-elife.net None
www.ibsu.edu.ge None
ibu.ourfamily.com US

ibus.virtualave.net None
icac.virtualave.net None
icbanana.com None
www.icbh.org US

www.icchurchonline.org None
www.icecity-online.com Malaysia

www.iceramic.net JP

icewing.uhome.net Malaysia

icgr.homestead.com None
ichannel.cjb.net US

ichibata.org US

ichien.bizland.com None
www.ichi-ichigo.c

www.islamiccity.tsx.org BS

www.islamicnet.com US

www.islamicparty.faithweb.com US

www.islamicsciences.com None
www.islamicsocietyofmobile.com None
www.islamicwebguide.com None
www.islamicyouthmovement.20m.com US

www.islaminteractive.net US

www.islamphil.com CN

islandmoon.com US

www.islc.net None
www.islington.gov.uk None
www.ism.rlp.de None
www.ismart.educations.net FR

isnba.com DE

iso.china-labour.org.hk None
iso.hrichina.org CA

iso.hrichina.org:8151 CA

www.iso9000online.cc None
www.isoc.org.il None
www.isoc-unsw.org.au None
www.iso-ing.com None
www.israel-gifts-flowers.com IL

www.israel-holy-land.com None
www.israelinsider.com None
www.israel-mfa.gov.il None
www.israeltour.com US

www.israports.org.il None
www.isri.unlv.edu None
isshou.netfirms.com None
issl.webprovider.com None
www.issy.com FR

istandalone.8m.com None
istock.uhome.net Malaysia

isweb40.infoseek.co.jp None
it.mainpage.net None
www.it-120.com CN

www.it139.com None
www.it30.com US

www.it99.cc None
www.ita

www.jingfa.com.tw None
www.jinggongmm.com None
www.jing-ho.com.tw None
www.jinghuachem.com None
www.jingjiaelectric.com None
www.jingjiang.com None
www.jingji-book.com None
www.jinglecookie.com None
www.jingsheng.com None
www.jingxie.com None
www.jingyuan.com.cn None
www.jinhaihotel.com None
www.jinher.com None
www.jinher.com.tw None
www.jinhsin.com.tw None
www.jinhuan.com None
www.jinhua-stone.com None
www.jinhuipaint.com None
www.jinjuan.com None
www.jinlan.com.tw None
www.jinledl.com None
jinlengleng.tripod.com None
www.jinlongbuilding.com None
www.jinlong-co.com None
www.jinlong-light.com None
www.jinlu.com.cn None
www.jinma-cn.com None
www.jinma-gearing.com None
www.jinnah.edu None
jinnung.tripod.com None
www.jinqiao.com.cn None
jins.bizland.com None
www.jinsan.tpc.gov.tw None
www.jinshengstone.com None
www.jin-taiwan.com.tw None
www.jin-tao.com None
www.jinworld.com None
www.jin-yang.com None
www.jinyuan-buyi.com None
www.jinzhengleather.com None
jiou-pin-lotus.com.tw None
jis3q.

keung4k.uhome.net Malaysia

www.keungchaihk.f4w.net PA

kevin405.hypermart.net None
kevin-hk.tripod.com US

www.keymonastery.org US

www.keynetworks.com US

keystoneonline.com DE

www.keyuan.com.cn None
kfccma.uhome.net Malaysia

kfchong.8k.com CN

kfckfc.uhome.net Malaysia

www.kflr.org US

www.kgbmedia.com US

kh.to None
kh3c.in2000.com Vietnam

khchu.virtualave.net None
www.khilafah.com None
khongwk.tripod.com US

k-h-s.hihosting.hinet.net None
khvs.adsldns.org US

www.kibbutzlotan.com None
www.kiddygram.com US

www.kidneyweb.com US

www.kids.com.hk.ro None
kids.iscute.com US

kidsbook.tripod.com US

www.kids-edu.com US

www.kidsfirstcm.org None
kidtaozi.yes8.com CN

kiev.in2000.com Vietnam

kihoo.hk.st None
kikiding.kiss.to None
www.kikigame.com None
kikiho.tripod.com US

www.kikikids.com US

kikis.uhome.net Malaysia

kilin.homestead.com None
kill.e-chome.net US

www.killchina.com None
www.killchinks.com None
www.killuglytv.com CN

www.killzine.com None
kimbasangels.com US

kimjihy

leading2001.chinatimes.com Malaysia

www.Leading-Man.com AE

www.leadingstudy.com CN

www.leadmushroom.com.cn None
leaf.idv.st None
www.learn.co.uk None
www.learn.jtsa.edu None
learn.scientology.org US

learn.to None
learnchi.tripod.com US

learn-easy.tripod.com US

www.learningaboutgod.com us

www.learningchannel.org NO

www.learningplant.com US

learnweb.tsx.org BS

leb.net US

lecom.cjb.net US

www.ledgernews.com None
lee11.uhome.net Malaysia

lee123.in2000.com Vietnam

leejiny.tripod.com US

lee-mark.tripod.com US

www.leesburg2day.com PA

www.leewa.com.hk None
leeweidehh.friendpages.com US

www.leewingtat.org.hk None
leeyoungae.org DE

www.legaldockets.com None
www.legalparty.com None
legco.gov.hk None
legco2000.net None
legend.idv.st None
www.legend-nt.com CN

www.legionofmary.com ZW

www.legis.gov.bc.ca CA

lego70.tripod.com US

www.leicazw.com None
www.lemania.com None
lemonde.tripod.com US

lemontree.i8.com CN

lemusic.cjb.net US

lend.2hu.org CN

www.lens.org AU

www.l-e-o.co

www.losroger.freeservers.com US

www.loss-obesity.com CN

www.lost-found-china.com None
www.lostfriends.com US

www.lostquotes.com US

www.lothac.homestead.com None
lottery.chinatimes.com Malaysia

www.lotus2000.net None
lotutor.uhome.net Malaysia

lou6904.uhome.net Malaysia

www.loudountimes.com None
www.louisville.com None
www.louisville.edu None
louisvillekbc.org None
www.lourdes.edu None
lovableses.uhome.net Malaysia

love.25184.com None
love.backchina.com CA

love.hongkongdog.com US

www.love1314.net CN

www.love138.com JP

love2ask.cjb.net US

love7474.39m.net US

www.lovecenter.org US

lovecookie.50megs.com US

www.LoveDJ.cjb.net US

www.lovefriendclub.com None
lovegcgoobi.homestead.com None
lovekary.iscute.com US

lovekentw.tripod.com US

loveland.uhome.net Malaysia

www.lovelandfyi.com RU

lovelandy.in2000.com Vietnam

lovemusk.com RU

lovenchat.uhome.net Malaysia

www.lovenet.co.uk None
www.loveofgod.org US

loveparty.uhome.net Malaysia

www.lovescopes.net None
lovesea.cjb.ne

www.marthomasf.org US

www.marthomawashington.org None
www.marthomayouths.org None
martial-art.uhome.net Malaysia

martin2000.in2000.com Vietnam

www.martingunnarsson.com SE

www.martinlee.org.hk None
www.martinsvillebulletin.com US

www.martintempleamez.com US

www.martlet.org US

www.marturo.com None
maru.uhome.net Malaysia

www.marushin.com.hk None
marutr.tripod.com US

www.maruwacera.co.jp None
www.marveltours.net AU

www.marxism.com None
www.marxist.org KY

www.maryjanezone.com US

www.maryjoseph.org US

www.marykaycn.com HK

www.marymountpv.edu None
www.maryward.net None
www.masada2000.org US

www.mass.edu None
massacre.uhome.net Malaysia

www.massbibmin.org None
www.massey.ac.nz None
www.masshk.com None
www.mastersays.com US

master-why.uhome.net Malaysia

www.mastina.com None
matc.uhome.net Malaysia

matchbox.w3.to None
www.materialreligion.org BY

math.boisestate.edu None
www.mathaba.net GB

mathexercise.tripod.com US

www.maths123.com AU

mathserver.sdu.edu.cn None
mathsmoney

www.nb.cei.gov.cn None
nbalive2001.in2000.com Vietnam

nbaspace.126.com CN

www.nbfdc.com CN

www.nbfe.com US

www.nbg-churchofgod.com None
www.nbhooya.com CN

www.nbjob.net US

www.nb-rh.com None
www.nbyucheng.com CN

ncaids.www.50megs.com US

www.nca-kingwood.org US

www.ncbr.com US

ncc.to None
www.nccentral.com None
www.nccn.net None
www.ncconline.org PA

NccuAd.cjb.net US

www.ncd-china.com None
www.nceb.uscourts.gov None
www.ncfc4dc.org JP

nchu-2000.tripod.com US

www.ncix.gov None
www.ncmb.uscourts.gov None
www.ncmd.uscourts.gov None
www.ncn.org US

ncpunk.8m.com None
www.ncsl.org US

www.ncswt.or.th None
www.nc-tutor.com None
www.ncwb.uscourts.gov None
www.ncwd.uscourts.gov None
www.nczl.com CN

nd.54ol.org CN

ndac.hypermart.net None
www.ndb.uscourts.gov None
www.ndd.uscourts.gov None
ndhly.coc.cc CN

ndnet.hypermart.net None
ndp.ca CA

www.ndp.ca CA

www.nea.fr None
www.neacb.com CN

www.neb.uscourts.gov None
www.ned.org US

www.ned.uscourts.gov None
www.neda.net None
nefari

nl3.freeservers.com US

www.nlbc.org US

www.nlp.no None
www.nls.no None
www.nmalliance.org US

www.nmapwin.org US

www.nmha.org US

www.nmims.edu None
www.nmmc.net None
www.nmsc.gov.au None
www.nnbc.homestead.com None
www.nnyzg.com CN

noelbio.tsx.org BS

noelrole.bizland.com None
www.noeltyl.com None
www.noettic.com US

www.nofat.netfirms.com None
nofire.in2000.com Vietnam

www.noisebox.co.uk None
noiseproject.cjb.net US

noizeworks.netfirms.com None
noizeworks.no-ip.com US

www.nols.org US

www.nomamo.com FR

www.nominet.org.uk None
www.nomssi.de None
none.jumpfun.com None
www.nonferrous-china.com CN

nonnamail.com None
www.noornet.com US

www.no-porn.com None
www.norahvincent.com US

www.norcal.org US

norembassy.tripod.com US

www.norfolkcoc.org US

www.norge.dk None
www.nori.go.kr None
noriai.uhome.net Malaysia

www.norja.net None
www.norquest.ab.ca CA

www.northchurch.4mg.com US

www.northfork.com None
northgatechina.cjb.net US

www.north-gla.ac.uk None
www.northlincs.gov.uk Non

www.osb.hu None
www.osborndrugs.com US

www.oshawalibrary.on.ca CA

www.osirusoft.com None
www.osku.com None
www.ososdevenezuela.com None
www.osphysics.com.hk.ro None
www.oss4lib.org JP

www.ossor.com None
www.osunsc.cjb.net US

osunwin.in2000.com Vietnam

ota.mainpage.net None
otakuman.hoops.ne.jp None
www.otan.org US

www.otatoo.com US

www.otherspokane.com CA

www.otherthing.com US

www.otohq.org JP

www.OTownAteMyBalls.homestead.com None
ottawamessianic.cjb.net US

ottawamessianic.freeservers.com US

ourcalss48.webjump.com None
ourchelseafc.tripod.com US

ourearth.coc.cc CN

www.oureffort2001.com JP

www.ourladyofgraceparish.org None
www.ourladyrockwall.org US

ourmem.uhome.net Malaysia

www.ourredeemer.com None
ourseeds.jeeran.com JO

www.ourvineyard.com US

ourwebhome.com CA

www.outlog.org CA

www.out-of-Zion.com IL

www.outputonline.com None
www.outreachforchristministries.org None
www.overpopulation.com CA

www.ovnet.com US

www.owatonna.com None
owen.9982.com CN

owenbeckham.

poetry.educations.net BE

poetrypicture.tripod.com US

www.pogi.com US

pokemonculb.in2000.com Vietnam

pokepig.netfirms.com None
pokevb.uhome.net Malaysia

www.polinco.com None
www.politicalaccess.com US

www.politicalindex.com None
www.politicalstability.com US

www.politiquebooks.com None
polodog.netfirms.com None
www.poltekpajajaran.ac.id None
polun.virtualave.net None
polysol.8m.com None
www.polytechhk.net None
www.polyubball.com None
pongchee.tripod.com US

www.ponk111.com None
www.ponoi.com US

pontoblog.tk None
poonmantik.bizland.com None
www.pooyesh.com None
popcity.virtualave.net None
www.popdetox.com None
www.popedope.com US

www.poplchurch.org US

www.poprocks.com None
www.popsci.com US

popstyle.freeservers.com US

porngateway.com US

www.pornplugin.com None
www.pornshaq.com None
port.in2000.com Vietnam

www.portals.portland.co.uk None
www.PortStJoeFL.com None
posedionfc.tripod.com US

positron.net US

www.possessionsonline.com None
www.possibilitiesproject.com None
www.po

www.recordnet.com US

www.recoverypath.com US

www.recweb.org ID

red_memorial.tripod.com US

www.redbirdbaptistchurch.org None
www.reddeer.net US

www.reddeeradvocate.com CA

www.redeemedchurch.org US

www.redeyerecords.com US

www.redguard.com US

RedHat.uhome.net Malaysia

www.redhorserecords.com US

www.redhotbikini.co.uk None
www.rediris.es None
www.redjuderias.org ES

www.redleague.net None
www.redlist.org GB

www.redlobster.com US

www.rednation.com None
www.redpass.com None
www.redress.org GB

www.redrival.com None
www.redstream.org US

redwhite.com None
redwine.netfirms.com None
redwingteam.tripod.com US

www.reeusda.gov None
www.refdesk.com PA

reformatory.tripod.com US

www.refuge.amnesty.org US

www.refugeatwestside.com None
www.refugewings.com None
RegalConstellation.com None
www.regalhongkong.com None
www.regexen.com None
www.reginadellamore.org IT

www.registerherald.com US

www.rehaballiance.org.hk None
www.reiki-grma.com None
www.reimerswaal.nl None
rejoice.in2000.com 

tailorpc.8m.com None
tailwind.virtualave.net None
www.tainancity.com US

tainanpeople.in2000.com Vietnam

www.taion-insurance.com.hk None
www.taipei.cc CN

www.taipei.org US

taipeicapitaleo.in2000.com Vietnam

www.taipei-elife.net None
www.taipeimosque.org None
taipeita.tw.to None
www.taipo12.f4w.net PA

taison.in2000.com Vietnam

taiwaichurch.tripod.com US

www.taiwan.com KY

taiwan.org US

taiwan_home.tripod.com US

taiwan520.virtualave.net None
www.taiwanartist.com US

www.taiwan-bank.com None
www.taiwanclassified.net None
taiwandog.cjb.net US

www.taiwanese.com US

www.taiwanfood.com US

taiwango.in2000.com Vietnam

www.taiwaninformation.org CA

taiwanjapan.hoops.ne.jp None
taiwan-kansai.hoops.ne.jp None
www.taiwanoffer.com JP

www.taiwanpostbox.bizland.com None
www.taiwanpt.net CN

www.taiwanradio.net US

www.TaiwanSexFriend.com None
www.taiwan-strait.net None
www.taiwanwindow.com US

www.taiyanet.com US

www.takako1.net.hk.ro None
takako-collection.uhome.net Malaysia

takakopark

www.tibethouse.org US

www.tibeticlt.org IN

www.tibet-info.net FR

www.tibetjustice.org CA

www.tibetnews.com US

www.tibetsearch.com US

www.tibet-society.org.uk None
www.tibettimes.tibetsearch.com US

www.tibettour.net.cn None
www.tibetworld.com CN

www.tibs.org US

www.ticohe.com None
www.tiempo.hn HN

www.tientai.com Malaysia

www.tier.net US

www.tieshaozi.com CN

www.tietgen.dk None
www.tiffanylee.fans-club.com None
tiffanylee.issweet.net None
www.tiger186.org None
www.tigerboxing.com.hk None
www.tigerden.com None
tigerphone.tripod.com US

www.tigress.com DE

www.tilburg.nl None
www.tilehill.ac.uk None
www.tillholland.com JP

www.tima.com GB

www.timacad.ru None
www.timberchina.com US

www.timbomb.net None
time.com US

www.time.com None
www.timeasia.com None
www.timebombrecordings.com None
www.timecube.com PA

www.time-data.com CN

www.times.co.nz None
www.times10.org US

times-age.co.nz None
www.timesbulletin.com US

www.timesfreepress.com US

www.timesglobe.com CA

www.timesgu

toyshunter.uhome.net Malaysia

toysmarket.uhome.net Malaysia

www.toys-wpn.com None
www.toyuanzhou.com CN

www.tpc.int None
www.tpchc.org None
www.tpg.com.au None
www.tpgz.xnapster.com US

www.tpi.com None
tpl.city.timmins.on.ca None
www.tpl.lib.wa.us None
www.tplau.org None
www.tplibrary.org US

www.t-princess.com CN

www.tpsabc.com CN

www.tpu.fi None
tpublib.fp.execpc.com US

www.tpyhotel.com None
tpzy.hk-homepage.com None
www.tqcpa.com CN

tquesay.in2000.com Vietnam

www.tra98.com None
www.trabicn.com CN

www.trackinginformation.com US

www.tracor.es None
www.tracts.com None
www.tracyfpc.org None
www.tradcathhs.freeservers.com US

www.trademarkrecords.com US

www.traderfirst.com US

www.tradesources.com KN

www.tradingit.com NL

www.tradingpost.com.au None
www.traerstarclipper.com None
tragiclad.tripod.ca US

www.trailblazeronline.net None
www.trailortrash.com None
www.trailwalker.com US

training.scientology.org US

trainpass.tw.to None
www.trampled.org US

www.trance.nu None
www.

utenti.tripod.it None
www.uticachurchofchrist.com US

www.utne.com None
www.utpulse.com US

www.uubang.com CN

uufa.org US

www.uumidland.net JP

uvoid.39m.net US

www.uwcdc.com PA

www.uwm.org US

www.uyghuramerican.org US

www.uyghurinfo.com None
www.uyghurs.org JP

www.uygur.com US

www.uygur.net US

www.uygur.org DE

www.uygurs.com US

vacations.to None
www.vaccinealliance.org US

vacuum.mi.org US

www.vaeb.uscourts.gov None
www.vaed.uscourts.gov None
www.vaikunt.org CN

valleychurch.com US

www.valleystar.com PA

valluvanad.bravepages.com None
www.vanderbilt.edu None
www.vanderwoning.com CA

van-inn.8k.com CN

www.vannet.com CA

www.vasc.org US

vase.essex.ac.uk None
www.vasturachana.com CN

www.vatsvezia.it None
www.vawb.uscourts.gov None
www.vawd.uscourts.gov None
vax1.vigo.lib.in.us None
www.vbbox.com US

www.vbvineyard.com JP

www.vc.bc.ca None
www.vcbit.com CN

vcclub.yes8.com CN

www.vce.com US

VCFBillings.homestead.com None
www.vcllc.com KY

www.veahavta.org US

www.vec.ac

www.welcomehomemag.com US

www.welcomelee.org None
www.welcomewherever.com US

weleague.uhome.net Malaysia

www.welfarelottery.net JP

well.taipei-elife.net None
www.welltechpacific.com None
www.well-trading.com None
www.wels.net None
wembley.fortunecity.com None
www.wemu.org US

www.wenbao.com.cn None
www.wenjun-china.com None
www.wenonahpresbyterian.org None
wensin.bizland.com None
wentong.ebigchina.com None
www.wenworld.com us

www.wenxucity.com KY

wenxuecity.com US

www.wenxuecity.com US

weonewon.uspace.net CN

www.we-q.net None
were-wolf.hypermart.net None
werkl.in2000.com Vietnam

wesleyday.freeyellow.com None
www.wesleyunited.org US

www.westarkchurchofchrist.org US

www.west-cheshire.ac.uk None
www.westchesterbaptistchurch.org None
www.westchina.com US

www.westernjubilee.com US

www.westernwheel.com CA

www.westerwin.org US

www.west-fly.com US

www.westibi.com US

www.westin-shanghai.com CN

www.westmin.org US

www.west-norfolk.gov.uk None
www.westside.mychurch.com None
www

www.xinlianghotel.com cn

www.xinlongfa.com None
www.xin-qiao.com CN

www.xinruigroup.com.cn None
xinsheng.net CA

www.xin-xing-group.com None
www.xinyaa.com None
www.xinyi-hk.com CN

xiongni.tripod.com US

xiongni.tripod.com. US

www.xipworld.com None
xipworld.com None
xistrat.org US

xitak.tripod.com US

xixd.members.easyspace.com GB

xizang-zhiye.com None
www.xizang-zhiye.org IN

xizang-zhiye.org IN

www.xj-ic.com CN

www.xjife.edu.cn None
www.xjmachinery.com None
www.xjyz.com.cn None
www.xj-zy.com None
xmascarolsonthenet.tripod.com US

xmbremix.xmbhosted.com None
www.xmdz.com CN

xmlnet.uhome.net Malaysia

www.xmoonies.com None
www.xmsun.com US

xngeng.k12.net.cn None
xoss.netfirms.com None
xpubhk.virtualave.net None
xresistancex.cjb.net US

www.xshowbiz.com PA

x-stream.fortunecity.com None
www.xszy.net CN

www.xtec.i.am None
xteens.uhome.net Malaysia

www.xtremeteamministry.homestead.com None
www.xuanwu.com CN

www.xuelinxing.com CN

xuesongyuan.wangzhan.com CN

www.xueyou.com No

www.zjenet.com None
www.zjfirm.com CN

www.zj-fishmarket.com CN

www.zjjtours.com US

www.zjlove.com None
www.zjmc.edu.cn None
www.zj-qingqi.com None
zjtuceng.ntsun.com CN

www.zjxinhong.com CN

zjxyz.uhome.net Malaysia

www.zjzbsc.com None
www.zknew.com US

www.zlm.com.cn None
www.z-movers.cc None
znet.org US

www.zoinlutheran.homestead.com None
www.zone.f4w.net PA

zone247.uhome.net Malaysia

zonenet.uhome.net Malaysia

www.zoomf1.com None
www.zoom-soft.com None
www.zorbagarden.net None
zpgvidz.cjb.net US

www.zqcool.com CN

www.zqnp.com CN

zring.hypermart.net None
www.zsck.net None
WWW.ZSKGM.COM None
www.zstableware.com None
www.zstvb.tk None
www.zswise.com CN

www.zumbrolutheran.org US

zumbrota.com US

www.zunhua.com CN

www.zuni.org.hk None
www.zunxin.com CN

www.zuozhongmin.com CN

zwei.852.net HK

www.zwtea.com None
www.zxmp.com CN

www.zxzqsz.com None
www.zytlsyn.com None
www.zytx.com.cn None
www.zzcw.com CN

www.zzhksy.com CN

www.zzjwl.com None
www.0011av.com TW

www.0011ca

886.5u.com US

889.39m.net US

www.8899.com.cn None
www.88dreams.com US

89200089.in2000.com Vietnam

8964.39m.net US

8964.com US

www.89-64.com None
www.8add8.com MY

8bit.at None
www.8cai.com CN

www.8d8d.com US

www.8gg.com CN

90060358866.envy.nu None
www.9-11peace.org US

www.91858.com None
www.91-9.com None
www.91office.com None
9244x.iscool.net MD

926.homestead.com None
www.927cn.com None
93.2y.net None
www.94ta.com CN

www.95885.com MO

95it1.iscool.net MD

95it2.iscool.net MD

998.net-shop.com CN

www.9999-rose.com CN

www.999adult.com None
99chat.cjb.net US

www.99ew.com CN

www.9u9u.com 



In [11]:
china_blocked.head()

Unnamed: 0,URL,URL Country
0,www.a00.com,CN\n
1,a0006.in2000.com,
2,a1.sparklancer.com,KY\n
3,a1234.uhome.net,Malaysia\n
4,a1234bc.uhome.net,Malaysia\n


In [12]:
china_blocked['URL Country'].unique()

array(['CN\n', None, 'KY\n', 'Malaysia\n', 'Vietnam\n', 'US\n', 'CA\n',
       'GB\n', 'JP\n', 'CH\n', 'NZ\n', 'us\n', 'GA\n', 'KR\n', 'PA\n',
       'ID\n', 'HK\n', 'TH\n', 'FR\n', 'MT\n', 'DE\n', 'UK\n', 'AU\n',
       'MY\n', 'BR\n', 'AE\n', 'BE\n', 'IN\n', 'KN\n', 'CZ\n', 'SE\n',
       'MD\n', 'UY\n', 'PS\n', '\n', 'IT\n', 'NO\n', 'LU\n', 'NL\n',
       'RU\n', 'BS\n', 'QA\n', 'SG\n', 'MO\n', 'AT\n', 'DK\n', 'TW\n',
       'cn\n', 'PK\n', 'Austra\n', 'ES\n', 'SI\n', 'HU\n', 'IL\n', 'Un\n',
       'uk\n', 'IE\n', 'VG\n', 'TR\n', 'MX\n', 'SA\n', 'BM\n', 'ca\n',
       'LV\n', 'AR\n', 'EG\n', 'IR\n', 'PH\n', 'PY\n', 'GR\n', 'AD\n',
       'HR\n', 'AF\n', 'AO\n', 'Cyprus\n', 'ZA\n', 'JO\n', 'VN\n',
       'AUSTRALIA\n', 'fr\n', 'BG\n', 'ZW\n', 'BY\n', 'FI\n', 'UA\n',
       'AW\n', 'KE\n', 'KH\n', 'SK\n', 'hk\n', 'HN\n', 'SC\n', 'CO\n',
       'GI\n'], dtype=object)

In [14]:
china_blocked.to_csv("cleaned_data/adversarial_data/blocked_urls_china.csv")

### All Countries

In [5]:
domain_data_map = {
    "GDPR" : "blocked_urls_gdpr.csv",
    "India" : "blocked_urls_in.csv",
    "Iraq" : "blocked_urls_iq.csv",
    "Pakistan" : "blocked_urls_pak.csv",
    "Russia" : "blocked_urls_rus.csv",
    "Saudi Arabia" : "blocked_urls_sa.csv",
    "United Arab Emirates" : "blocked_urls_uae.csv"
}

In [21]:
pd.read_csv('cleaned_data/blocked_urls_gdpr.csv').head()

Unnamed: 0,Name,Date Unblocked,Website,Archived Block Message[0],Archived Block Message[1],Block Message Screenshot,Website Archives[0],Website Archives[1]
0,5 News KFSM,,https://5newsonline.com,https://archive.fo/Lwl88,,https://data.verifiedjoseph.com/files/gdpr-scr...,https://web.archive.org/web/2/https://5newsonl...,
1,99acres,,https://www.99acres.com,https://archive.fo/24l2Q,,https://data.verifiedjoseph.com/files/gdpr-scr...,https://web.archive.org/web/2/https://www.99ac...,
2,Aberdeen News,,https://www.aberdeennews.com,,,https://data.verifiedjoseph.com/files/gdpr-scr...,,
3,Abington Mariner,,http://abington.wickedlocal.com,,,https://data.verifiedjoseph.com/files/gdpr-scr...,https://web.archive.org/web/2/http://abington....,
4,"Ad Express & Daily Iowegian (Centerville, IA)",,https://www.dailyiowegian.com,,,https://data.verifiedjoseph.com/files/gdpr-scr...,,


In [13]:
for k, v in domain_data_map.items():
    if k == 'GDPR':
        curr_blocked = pd.read_csv('cleaned_data/' + v)
        col = 'Website' if k == 'GDPR' else 'URL'
        curr_blocked['URL Country'] = curr_blocked[col].apply(url_whois)
        final_filename = "cleaned_data/adversarial_data/" + v
        curr_blocked.to_csv(final_filename)

https://5newsonline.com None
https://www.99acres.com None
https://www.aberdeennews.com US

http://abington.wickedlocal.com US

https://www.dailyiowegian.com US

http://agcanada.com CA

https://agdealer.com CA

http://www.agrinews-pubs.com None
https://www.agupdate.com None
http://www.ahwatukee.com US

https://www.aikenstandard.com US

http://www.airthinx.io US

https://democratherald.com None
http://albertafarmexpress.ca CA

http://www.aledotimesrecord.com US

https://www.alicetx.com None
http://www.alliancetimes.com None
https://www.alliednews.com US

http://allston.wickedlocal.com US

https://www.amny.com US

https://www.amarillo.com None
http://americanmilitarynews.com US

https://www.americanpress.com None
http://www.amestrib.com US

https://www.adn.com US

https://www.anchoragepress.com us

https://www.andovertownsman.com US

http://www.apg-wi.com None
https://www.apgwest.com US

https://www.appeal-democrat.com None
https://www.argus-press.com None
https://tucson.com US

https://a

http://www.egcitizen.com US

https://www.elkvalleytimes.com None
https://elkodaily.com None
https://www.ellos.us US

http://www.ellwoodcityledger.com None
http://www.emissourian.com None
https://www.empiretoday.com US

http://www.emporiagazette.com None
https://www.energy-tech.com US

https://www.enidnews.com US

http://www.enterprisepub.com None
http://www.goerie.com None
https://www.faceplusplus.com CN

http://www.newsminer.com None
http://www.fairfaxtimes.com US

https://www.farmforum.net US

https://www.farmtalknewspaper.com US

https://farmzilla.com CA

https://www.fauquier.com None
http://www.feastmagazine.com None
http://fiddleheadfocus.com US

https://www.jaxdailyrecord.com US

http://www.fltimes.com US

https://www.flexonline.com US

http://www.floydct.com None
https://www.flyafter5.com US

https://www.fontanaheraldnews.com None
http://foodincanada.com CA

http://www.fbherald.com None
http://www.forthoodsentinel.com None
http://times-journal.com None
http://www.fortworthbusine

http://malden.wickedlocal.com None
http://manitobacooperator.ca CA

http://www.manoanow.org US

http://mansfield.wickedlocal.com None
http://marblehead.wickedlocal.com None
https://www.mdjonline.com None
http://www.marinscope.com US

https://marketing.usatoday.com  None
http://marlborough.wickedlocal.com None
https://www.marshallnewsmessenger.com US

http://marshfield.wickedlocal.com None
http://rockland.wickedlocal.com None
https://www.frontiersman.com None
https://www.mcalesternews.com US

http://www.mebaneenterprise.com US

http://medfield.wickedlocal.com None
http://medford.wickedlocal.com None
http://melrose.wickedlocal.com None
https://www.mensjournal.com US

https://www.virginiamn.com None
http://www.messagemedia.co US

http://www.republic-online.com US

https://www.miamiok.com None
http://www.middletowntranscript.com None
http://www.ourmidland.com US

http://www.mrt.com US

https://www.midlothianmirror.com None
http://www.milfordbeacon.com None
https://www.militarynews.com None

https://starmagazine.com US

http://www.starcourier.com None
https://starlocalmedia.com None
https://www.starnewsonline.com None
https://www.statesville.com US

https://www.yourstephenvilletx.com None
http://www.steubencourier.com None
http://stoneham.wickedlocal.com None
http://stoughton.wickedlocal.com None
https://www.stowetoday.com None
http://www.sturgisjournal.com None
http://www.stuttgartdailyleader.com US

http://www.sunad.com US

https://www.newbernsj.com None
https://www.yoursun.com None
https://www.sun-sentinel.com None
http://www.sussexcountian.com None
https://www.sustainablecitynetwork.com US

https://www.suwanneedemocrat.com US

http://www.swiowanewssource.com US

http://swampscott.wickedlocal.com None
https://www.swnewsmedia.com US

https://www.swvatoday.com US

http://www.journaldemocrat.com None
http://www.taftmidwaydriller.com None
https://www.tahlequahdailypress.com US

https://www.tbnweekly.com US

http://www.tauntongazette.com None
http://www.tdtnews.com None
http

https://www.themonitor.com PA

http://www.monroenews.com None
https://mtstandard.com None
https://www.mooreamerican.com US

http://www.mcall.com None
https://www.scnow.com US

https://www.themorningsun.com US

https://www.moultrieobserver.com US

http://www.themountainmail.com US

https://themountainpress.com US

https://www.murfreesboropost.com US

https://www.muskogeephoenix.com US

http://dailysentinel.com None
https://www.nelighnews.com US

http://www.newportplaintalk.com None
https://www.kingstreenews.com US

https://www.newsadvance.com US

https://www.newsandtribune.com US

https://www.enewscourier.com US

https://www.thenewsguard.com US

http://www.thenewsherald.com US

https://www.morganton.com US

https://www.newsitem.com US

http://www.newsoforange.com US

http://www.newsrecord.org CA

https://www.thenewsdispatch.com US

http://www.thenewsenterprise.com None
http://www.winchesternewsgazette.com US

https://www.nrtoday.com US

http://www.newstimes.com US

http://www.hartfordci

https://www.wfaa.com None
https://www.wfmynews2.com None
https://www.wgal.com US

https://wgem.com None
https://wgnradio.com None
https://wgntv.com None
https://wgno.com None
https://www.wgrz.com None
https://www.whas11.com None
http://www.whitehalljournal.com None
https://whnt.com None
https://hoiabc.com None
https://whotv.com None
http://www.wickedlocal.com None
https://www.journalpatriot.com None
https://www.williamsondailynews.com US

https://www.willistonherald.com None
http://wilmington.wickedlocal.com None
http://winchester.wickedlocal.com None
https://www.winnipegfreepress.com None
https://www.winonadailynews.com None
https://www.journalnow.com US

http://wire.lee.net None
https://www.wiscnews.com None
https://www.wisconsingazette.com None
https://www.wisn.com US

https://www.wjcl.com US

https://wkow.com None
http://www.wkyc.com None
https://www.wlky.com US

https://www.wltx.com None
https://www.wlwt.com US

https://www.13wmaz.com None
https://www.mor-tv.com US

https://www.wm

In [19]:
all_clean_files = [
    "blocked_urls_china.csv",
    "blocked_urls_gdpr.csv",
    "blocked_urls_in.csv",
    "blocked_urls_iq.csv",
    "blocked_urls_pak.csv",
    "blocked_urls_rus.csv",
    "blocked_urls_sa.csv",
    "blocked_urls_uae.csv",
]

countries = ["China", "GDPR", "India", "Iraq", "Pakistan", "Russia", "Saudi Arabia", "United Arab Emirates"]

In [39]:
idx = 7
x = pd.read_csv("cleaned_data/adversarial_data/" + all_clean_files[idx])[["URL", "URL Country"]]
x['URL Country'] = x['URL Country'].str.replace('\n', '')
x.head()

Unnamed: 0,URL,URL Country
0,http://fditny.hi5.com/,
1,http://hi5.com/friend/p356889975-friend-html,
2,http://hi5/friend/p336746863-FoSHEYaH_%20%20%2...,
3,http://tokiva.com/,CA
4,http://1.cool0.biz/,


In [40]:
x.to_csv("cleaned_data/adversarial_data/final/" + all_clean_files[idx])

In [96]:
# calculate grouped counts
replace_url_country = {
    'Malaysia': 'MY',
    'Vietnam': 'VN',
    'Austra': 'AU',
    'Un': 'US',
    'Cyprus': 'CY',
    'AUSTRALIA': 'AU',
    'UKRAINE': 'UA',
    'Switzerland': 'CH',
    'Boston': 'US',
    'China': 'CN',
    'AN': 'AD',
    'UK': 'GB',
    'uk': 'GB',
    'REDACTED': np.nan,
}

final_counts = []

for i in range(len(countries)):
    country = countries[i]
    filename = "cleaned_data/adversarial_data/final/" + all_clean_files[i]
    curr_df = pd.read_csv(filename)
    if country == 'GDPR':
        curr_df.rename(columns = {'Website': 'Number of Domains Blocked'}, inplace = True)
    curr_df['URL Country'] = curr_df['URL Country'].replace(replace_url_country).str.upper()
    curr_count = curr_df.groupby(by = 'URL Country').count().drop(columns = ['Unnamed: 0']).rename(columns = {'URL': 'Number of Domains Blocked'})
    curr_count['Country'] = country
    curr_count.reset_index(inplace = True)
    print(curr_count.head())
    final_counts.append(curr_count)
    print(curr_df['URL Country'].unique())
    print()

  URL Country  Number of Domains Blocked Country
0          AD                          1   China
1          AE                          6   China
2          AF                          2   China
3          AO                          1   China
4          AR                          1   China
['CN' nan 'KY' 'MY' 'VN' 'US' 'CA' 'GB' 'JP' 'CH' 'NZ' 'GA' 'KR' 'PA' 'ID'
 'HK' 'TH' 'FR' 'MT' 'DE' 'AU' 'BR' 'AE' 'BE' 'IN' 'KN' 'CZ' 'SE' 'MD'
 'UY' 'PS' 'IT' 'NO' 'LU' 'NL' 'RU' 'BS' 'QA' 'SG' 'MO' 'AT' 'DK' 'TW'
 'PK' 'ES' 'SI' 'HU' 'IL' 'IE' 'VG' 'TR' 'MX' 'SA' 'BM' 'LV' 'AR' 'EG'
 'IR' 'PH' 'PY' 'GR' 'AD' 'HR' 'AF' 'AO' 'CY' 'ZA' 'JO' 'BG' 'ZW' 'BY'
 'FI' 'UA' 'AW' 'KE' 'KH' 'SK' 'HN' 'SC' 'CO' 'GI']

  URL Country  Number of Domains Blocked Country
0          CA                         19    GDPR
1          CN                          2    GDPR
2          GB                          1    GDPR
3          GU                          1    GDPR
4          IN                          1    GDPR


In [116]:
combined_df = pd.DataFrame(columns = ['URL Country', 'Number of Domains Blocked', 'Country'])

for i in range(len(countries)):
    curr_country = countries[i]
    curr_grouped_df = final_counts[i]
    combined_df = combined_df.append(curr_grouped_df)

In [117]:
combined_df.head()

Unnamed: 0,URL Country,Number of Domains Blocked,Country
0,AD,1,China
1,AE,6,China
2,AF,2,China
3,AO,1,China
4,AR,1,China


In [118]:
combined_df.shape

(322, 3)

In [119]:
combined_df = combined_df[['Country', 'URL Country', 'Number of Domains Blocked']]
combined_df.head()

Unnamed: 0,Country,URL Country,Number of Domains Blocked
0,China,AD,1
1,China,AE,6
2,China,AF,2
3,China,AO,1
4,China,AR,1


In [120]:
def get_country_name(c):
    result = pycountry.countries.get(alpha_2 = c)
    if result:
        return result.name
    print(c)
    return None

combined_df['URL Country'] = [get_country_name(cc) if cc else None for cc in combined_df['URL Country']]

In [121]:
combined_df.head()

Unnamed: 0,Country,URL Country,Number of Domains Blocked
0,China,Andorra,1
1,China,United Arab Emirates,6
2,China,Afghanistan,2
3,China,Angola,1
4,China,Argentina,1


In [103]:
combined_df.to_csv("cleaned_data/adversarial_data/combined_adversarial.csv")

In [122]:
json_combined_df = combined_df.reset_index()
json_combined_df.to_json("cleaned_data/adversarial_data/combined_adversarial.json")