# FCC Data Merging



## About the Data

FCC data was taken from: https://www.fcc.gov/form-477-broadband-deployment-data-december-2019-version-1
* Download the US - US - Fixed with Satellite - Dec 19v1(CSV)
* Note this downloads as a zip. Joanie's Mac had trouble unzipping but what worked well wa

The columns in the FCC dataset are: https://www.fcc.gov/general/explanation-broadband-deployment-data 

With more info on the tech codes: https://www.fcc.gov/general/technology-codes-used-fixed-broadband-deployment-data

In [1]:
## imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
#import os

In [2]:
#get census data
census_data = pd.read_csv("../data/relabeled_census.csv")
census_data.head()

Unnamed: 0,NAME,total_pop2,median_age_overall,median_age_male,median_age_female,state,county,tract,employment_rate,median_income,...,pct_internet_broadband_satellite,pct_internet_only_satellite,pct_internet_other,pct_internet_no_subscrp,pct_internet_none,pct_computer,pct_computer_with_dialup,pct_computer_with_broadband,pct_computer_no_internet,pct_no_computer
0,"Census Tract 11, Jefferson County, Alabama",4781,39.0,42.5,38.1,1,73,1100,51.0,37030.0,...,9.02215,0.918422,0.0,1.134522,24.851432,80.821178,0.0,74.014046,6.807131,19.178822
1,"Census Tract 14, Jefferson County, Alabama",1946,44.3,40.5,49.1,1,73,1400,45.4,36066.0,...,4.901961,0.0,0.0,2.083333,25.490196,85.661765,0.0,71.078431,14.583333,14.338235
2,"Census Tract 20, Jefferson County, Alabama",4080,34.0,31.0,36.4,1,73,2000,47.7,27159.0,...,4.651163,0.0,0.0,0.0,45.454545,71.317829,0.0,54.545455,16.772375,28.682171
3,"Census Tract 38.02, Jefferson County, Alabama",5291,35.8,31.7,37.3,1,73,3802,51.7,38721.0,...,3.959873,0.0,0.0,6.335797,33.632524,85.744456,0.0,59.450898,26.293559,14.255544
4,"Census Tract 40, Jefferson County, Alabama",2533,52.1,51.6,53.8,1,73,4000,36.9,18525.0,...,4.548635,1.959412,0.0,5.108467,47.515745,63.051085,0.0,44.786564,18.264521,36.948915


In [3]:
fcc = pd.read_csv("../data/fcc/fbd_us_with_satellite_dec2019_v1.csv", converters={'BlockCode' : lambda x: str(x)})

In [4]:
fcc.head()

Unnamed: 0,LogRecNo,Provider_Id,FRN,ProviderName,DBAName,HoldingCompanyName,HocoNum,HocoFinal,StateAbbr,BlockCode,TechCode,Consumer,MaxAdDown,MaxAdUp,Business
0,1,53763,1630201,Union Telephone Company,Union Telephone Company,Union Holding Corp.,360114,Union Holding Corp.,WY,560379705001336,70,1,10.0,1.0,1
1,2,53763,1630201,Union Telephone Company,Union Telephone Company,Union Holding Corp.,360114,Union Holding Corp.,WY,560379705001391,70,1,10.0,1.0,1
2,3,53763,1630201,Union Telephone Company,Union Telephone Company,Union Holding Corp.,360114,Union Holding Corp.,WY,560379705001398,70,1,10.0,1.0,1
3,4,53763,1630201,Union Telephone Company,Union Telephone Company,Union Holding Corp.,360114,Union Holding Corp.,WY,560379705001399,70,1,10.0,1.0,1
4,5,53763,1630201,Union Telephone Company,Union Telephone Company,Union Holding Corp.,360114,Union Holding Corp.,WY,560379705001400,70,1,10.0,1.0,1


In [5]:
fcc.tail()

Unnamed: 0,LogRecNo,Provider_Id,FRN,ProviderName,DBAName,HoldingCompanyName,HocoNum,HocoFinal,StateAbbr,BlockCode,TechCode,Consumer,MaxAdDown,MaxAdUp,Business
73215372,73215373,55262,18756155,"VSAT Systems, LLC",Skycasters,"VSAT Systems, LLC",300167,"VSAT Systems, LLC",PR,721537506022018,60,1,2.0,1.3,1
73215373,73215374,55262,18756155,"VSAT Systems, LLC",Skycasters,"VSAT Systems, LLC",300167,"VSAT Systems, LLC",PR,721537506022019,60,1,2.0,1.3,1
73215374,73215375,55262,18756155,"VSAT Systems, LLC",Skycasters,"VSAT Systems, LLC",300167,"VSAT Systems, LLC",PR,721537506022020,60,1,2.0,1.3,1
73215375,73215376,55262,18756155,"VSAT Systems, LLC",Skycasters,"VSAT Systems, LLC",300167,"VSAT Systems, LLC",PR,721537506022021,60,1,2.0,1.3,1
73215376,73215377,55262,18756155,"VSAT Systems, LLC",Skycasters,"VSAT Systems, LLC",300167,"VSAT Systems, LLC",PR,721537506022022,60,1,2.0,1.3,1


In [6]:
fcc.shape

(73215377, 15)

In [7]:
# We can see there are codes for Guam, Virgin Islands, Puerto Rico, etc which have very different policies and may 
# not display similar correlations with broadband. We will drop non-US
#np.unique(fcc.StateAbbr)

array(['AK', 'AL', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
       'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA',
       'MD', 'ME', 'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 'NC', 'ND', 'NE',
       'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI',
       'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV',
       'WY'], dtype=object)

In [8]:
# https://www.census.gov/library/reference/code-lists/ansi/ansi-codes-for-states.html
state_codes_to_drop = ["AS", "GU", "MP", "PR", "VI"]

In [9]:
df = fcc[~fcc['StateAbbr'].isin(state_codes_to_drop)]
df

Unnamed: 0,LogRecNo,Provider_Id,FRN,ProviderName,DBAName,HoldingCompanyName,HocoNum,HocoFinal,StateAbbr,BlockCode,TechCode,Consumer,MaxAdDown,MaxAdUp,Business
0,1,53763,1630201,Union Telephone Company,Union Telephone Company,Union Holding Corp.,360114,Union Holding Corp.,WY,560379705001336,70,1,10.0,1.0,1
1,2,53763,1630201,Union Telephone Company,Union Telephone Company,Union Holding Corp.,360114,Union Holding Corp.,WY,560379705001391,70,1,10.0,1.0,1
2,3,53763,1630201,Union Telephone Company,Union Telephone Company,Union Holding Corp.,360114,Union Holding Corp.,WY,560379705001398,70,1,10.0,1.0,1
3,4,53763,1630201,Union Telephone Company,Union Telephone Company,Union Holding Corp.,360114,Union Holding Corp.,WY,560379705001399,70,1,10.0,1.0,1
4,5,53763,1630201,Union Telephone Company,Union Telephone Company,Union Holding Corp.,360114,Union Holding Corp.,WY,560379705001400,70,1,10.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73067164,73067165,55262,18756155,"VSAT Systems, LLC",Skycasters,"VSAT Systems, LLC",300167,"VSAT Systems, LLC",WY,560459513003125,60,1,2.0,1.3,1
73067165,73067166,55262,18756155,"VSAT Systems, LLC",Skycasters,"VSAT Systems, LLC",300167,"VSAT Systems, LLC",WY,560459513003126,60,1,2.0,1.3,1
73067166,73067167,55262,18756155,"VSAT Systems, LLC",Skycasters,"VSAT Systems, LLC",300167,"VSAT Systems, LLC",WY,560459513003127,60,1,2.0,1.3,1
73067167,73067168,55262,18756155,"VSAT Systems, LLC",Skycasters,"VSAT Systems, LLC",300167,"VSAT Systems, LLC",WY,560459513003128,60,1,2.0,1.3,1


In [10]:
df["tract_geoid"] = df["BlockCode"].apply(lambda row: row[:-4])
#df["tract_code"] = df["BlockCode"].apply(lambda row: row[-10:-4])
#df["county_code"] = df["BlockCode"].apply(lambda row: row[-13:-10])
#df["state_code"] = df["BlockCode"].apply(lambda row: row[:-13])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tract_geoid"] = df["BlockCode"].apply(lambda row: row[:-4])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tract_code"] = df["BlockCode"].apply(lambda row: row[-10:-4])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["county_code"] = df["BlockCode"].apply(lambda row: row[-13:-10])
A value

In [11]:
df.ProviderName.value_counts()[:100]

ViaSat, Inc.                              11049786
HNS License Sub, LLC                      10554865
GCI Communication Corp.                   10548830
VSAT Systems, LLC                         10524847
Charter Communications, Inc.               2390426
                                            ...   
DigitalPath, Inc                             39242
Cal.net, Inc.                                38820
Service Electric Cablevision, Inc.           38658
Windstream Georgia Communications, LLC       38431
Inventive Wireless of Nebraska, LLC          38415
Name: ProviderName, Length: 100, dtype: int64

In [12]:
df.DBAName.value_counts()[:100]

Viasat Inc                        11049786
HughesNet                         10554865
GCI Communication Corp.           10548518
Skycasters                        10524847
Charter Communications Inc         2390426
                                    ...   
US Signal Company                    45767
MCI                                  45106
NewWave                              45054
Armstrong Utilities, Inc.            44148
Frontier North Indiana  Contel       43957
Name: DBAName, Length: 100, dtype: int64

In [13]:
#np.unique(df.Provider_Id).shape
#result: (2977,)

(2977,)

In [14]:
#np.unique(df.BlockCode).shape
#result: (11078297,)

(11078297,)

In [15]:
#np.unique(df.tract_geoid).shape
#result: (73057,)

(73057,)

In [16]:
df.isnull().sum()
#result: mostly filled in, only some holdingcompanyname missing

LogRecNo                0
Provider_Id             0
FRN                     0
ProviderName            0
DBAName                 0
HoldingCompanyName    214
HocoNum                 0
HocoFinal               0
StateAbbr               0
BlockCode               0
TechCode                0
Consumer                0
MaxAdDown               0
MaxAdUp                 0
Business                0
tract_geoid             0
tract_code              0
county_code             0
state_code              0
dtype: int64

## Deciding how to summarize the Data

There are over 11M block codes in the data if we include the non-US state codes of "AS", "GU", "MP", "PR", "VI" : 11165833

Otherwise, there are XX block codes.

But only XXX tract codes.


The ACS data is available by tract.

We want to summarize the FCC data by tract.

### Ideas

Groupby Tract_geoid

* LogRecNo : can toss
* Provider_Id list of providers in tract
* StateAbbr should be same -> i.e. choose most common
* TechCode list
* Consumer_All only 1 if all blocks 1
* Consumer_Some 1 if any blocks 1
* MaxAdDown : max in tract
* MaxAdUp : max in tract
* Business_All only 1 if all blocks 1
* Business_Some 1 if any blocks 1


Could later add unique tech code columns based on apply definitions if a code is present
* Wired if tech code is Cable, Copper, DSL, Fiber



Make a dictionary of Provider_Id to List of [ProviderName, DBAName, TechCode]

In [35]:
grouped = df[:250000].groupby(["BlockCode"])


In [None]:
#grouped_tract = df.groupby(["tract_geoid"])

In [45]:
agged = grouped.agg(
    #total_providers = ("Provider_Id", "nunique"),
    provider_ids = ("Provider_Id", lambda x: set(x)),
    num_blocks = ("BlockCode", "nunique"),
    provider_names = ("ProviderName", lambda x: set(x)),
    state = ("StateAbbr", pd.Series.mode),
    tract = ("tract_geoid", pd.Series.mode),
    tech_codes = ("TechCode", lambda x: set(x)),
    consumer_any = ("Consumer", pd.Series.any),
    max_ad_down = ("MaxAdDown", pd.Series.max),
    max_ad_up = ("MaxAdUp", pd.Series.max),
    business_any = ("Business", pd.Series.any),
    
)

In [50]:
agged["fcc_satellite"] = agged["tech_codes"].apply(lambda x: 60 in x)
agged["fcc_dsl"]= agged["tech_codes"].apply(lambda x: (10 in x) | (11 in x) | (12 in x) | (20 in x))
agged["fcc_cable_modem"] = agged["tech_codes"].apply(lambda x: (40 in x) | (41 in x) | (42 in x) | (43 in x))
agged["fcc_other_copper"] = agged["tech_codes"].apply(lambda x: 30 in x)
agged["fcc_fiber"] = agged["tech_codes"].apply(lambda x: 50 in x)
agged["fcc_terrestrial_fixed_wireless"] = agged["tech_codes"].apply(lambda x: 70 in x)
agged["fcc_other"] = agged["tech_codes"].apply(lambda x: (90 in x) | (0 in x))
#agged["fcc_all_other"] = agged["tech_codes"].apply(lambda x: 0 in x)
agged

Unnamed: 0_level_0,provider_ids,num_blocks,provider_names,state,tract,tech_codes,consumer_any,max_ad_down,max_ad_up,business_any,fcc_satellite,fcc_dsl,fcc_cable_modem,fcc_other_copper,fcc_fiber,fcc_terrestrial_fixed_wireless,fcc_electric_power_line,fcc_all_other
BlockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
010010202002029,{53788},1,"{Level 3 Communications, LLC}",AL,01001020200,{30},False,0.0,0.0,True,False,False,False,True,False,False,False,False
010010203002020,{53788},1,"{Level 3 Communications, LLC}",AL,01001020300,{30},False,0.0,0.0,True,False,False,False,True,False,False,False,False
010010205002010,{53788},1,"{Level 3 Communications, LLC}",AL,01001020500,{30},False,0.0,0.0,True,False,False,False,True,False,False,False,False
010010205002014,{53788},1,"{Level 3 Communications, LLC}",AL,01001020500,{30},False,0.0,0.0,True,False,False,False,True,False,False,False,False
010010205002021,{53788},1,"{Level 3 Communications, LLC}",AL,01001020500,{30},False,0.0,0.0,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560419752004147,{53763},1,{Union Telephone Company},WY,56041975200,{12},True,100.0,20.0,True,False,True,False,False,False,False,False,False
560419753005006,{53763},1,{Union Telephone Company},WY,56041975300,{12},True,100.0,20.0,True,False,True,False,False,False,False,False,False
560419753006361,{53763},1,{Union Telephone Company},WY,56041975300,{12},True,100.0,20.0,True,False,True,False,False,False,False,False,False
560419754003043,{53763},1,{Union Telephone Company},WY,56041975400,{12},True,100.0,20.0,True,False,True,False,False,False,False,False,False


In [66]:
df.columns

Index(['LogRecNo', 'Provider_Id', 'FRN', 'ProviderName', 'DBAName',
       'HoldingCompanyName', 'HocoNum', 'HocoFinal', 'StateAbbr', 'BlockCode',
       'TechCode', 'Consumer', 'MaxAdDown', 'MaxAdUp', 'Business',
       'tract_geoid', 'tract_code', 'county_code', 'state_code'],
      dtype='object')

In [65]:
my_set = set()
my_set.update([{1,2},{2,3}])
my_set
# agged.groupby("tract").agg(
#     providers = ("provider_ids", lambda x: my_set.update(x))
# )

TypeError: unhashable type: 'set'

## EDA

Why do tech code 30 have speed of 0?

In [39]:
tech_code_30 = df[df.TechCode==30]
tech_code_30.MaxAdDown.value_counts()

0.000       407081
20.000       53373
12.000       14947
100.000       5146
25.000        4110
1000.000      1308
10.000         705
1.500          678
5.000          330
45.000         289
50.000         224
3.000          146
2.000          119
1.000           89
500.000         55
7.000           49
15.000          41
40.000          40
150.000         17
6.000           15
30.000          13
250.000         13
200.000         10
4.000            9
400.000          5
32.000           4
8.000            4
300.000          3
18.000           2
80.000           1
70.000           1
0.256            1
16.000           1
0.768            1
60.000           1
9.000            1
75.000           1
Name: MaxAdDown, dtype: int64

In [None]:
tech_code_30.MaxAdUp.value_counts()