<a href="https://colab.research.google.com/github/ischmidl-nd/trafficking/blob/main/Human_Trafficking_EG_10118.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from os import makedirs
from os import path
# initalize folder for html
newpath = r'html'
if not path.exists(newpath): 
  makedirs(newpath)

# csv/api to lists/plots
 modified from HW 6 (Q6-8) 

1. UCR Human trafficking by state and gender

2. Census -- state name, population total, poverty total

3. NCVS -- crime reports by household income & US region

In [9]:
# UCR Human Trafficking data by state and act type
# Modified from HW6 Q6 new csv to lists


import csv
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd

# ORIGINAL source: https://ucr.fbi.gov/crime-in-the-u.s/2016/crime-in-the-u.s.-2016/additional-publications/human-trafficking/table-1
# mannually removed newlines from inside cells/lines
filename = "table-1.csv"

# read file to 2D list
fh = open(filename, 'r')
csv_reader = csv.reader(fh)
line_list = []
for row in csv_reader:
  line_list.append(row)
fh.close()

# initialize lists
col_list_dic = {} # organize trafficking by category, then type, then count
state_list = []
offense_list = [] # col lst dic collapsed


# initialize dict & list with lists of offense counts per state
for act_type in line_list[3][1:]: # 1st lvl headers skip state
  if act_type != '' and act_type not in col_list_dic:
    col_list_dic[act_type] = {}
  
  for offense in line_list[4][1:]: # 2nd lvl headers
    if act_type in col_list_dic and offense not in col_list_dic[act_type]:
      col_list_dic[act_type][offense] = []
      # store literal of list in dict  in list of collapsed dictionaires, i.e. values w/o keys
      offense_list.append(col_list_dic[act_type][offense])


# populate dict & list with lists of offense counts per state
for i in range(5, len(line_list)): # skip first 4 lines headers, for rows
  # list of states alphabetical
  state_list.append(line_list[i][0])
  for j in range(1, 10): # for col in rows
    # offense counts in dict & list corresponding to state list indexes
    offense_list[j - 1].append(int(line_list[i][j])) # append value to list literal
    # equivalent: col_list_dic[act_type][offense] = line_list[i][j]

# --------------------data frame-----------------------------
# create list of offenses
offense_act_list = []
for act_type in col_list_dic:
  for offense in col_list_dic[act_type]:
        if act_type == "Involuntary servitude & Commercial sex acts":
            offense_act_list.append("Total Acts " + offense)
        else:
            offense_act_list.append(act_type + " " + offense)

# transpose offense list
tran_offense_list = [[row[i] for row in offense_list] for i in range(len(offense_list[0]))]
df_traffick = pd.DataFrame(tran_offense_list, columns=offense_act_list, index=state_list)
# print(df_traffick)

# # plotly express
title = "Offenses and Clearances of Sex Acts and Involuntary Servitude by State 2016 "+"<br><sup>FBI: Uniform Crime Reports</sup>"

#------------------------------Removed single bar chart-------------------
# # Stacking did not make sense & comparision was unintelligibly small
# fig1 = px.bar(df_traffick, labels={"index" : "State", "value" : "Occurrences"},
#                  title=title, barmode="overlay", width=1700, height=900,
#               opacity=0.5)
# fig1.update_xaxes(tickangle=45, nticks=len(state_list))
# fig1.show()

from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig1 = make_subplots(rows=len(col_list_dic), cols=3, subplot_titles=(offense_act_list),
                     specs=[[{"type": "domain"}]*3]*3,
                    # shared_xaxes=True, shared_yaxes=True, 
                    vertical_spacing=0.05, horizontal_spacing=0.01)

## ------------------removed subplot bar chart because redundant with pies-------------
# fig2 = make_subplots(rows=len(col_list_dic), cols=3, subplot_titles=(offense_act_list),
#                     shared_xaxes=True, shared_yaxes=True, 
#                     vertical_spacing=0.05, horizontal_spacing=0.01)

#----------------------Potentially grouptogether contributors of <4% as "Other"------------
# # eliminate values too small
# df_min_traff = df_traffick.copy()
# for col in df_min_traff:
#     if max(df_min_traff[col]) < 1:
#         df_min_traff.drop(col, axis=1, inplace=True)

# df_min_traff["States <4% of Total"] = (df_min_traff.sum(axis=1) - 100) * -1

#--------------------------Plot--------------------------------------------------
# plot number of offenses per state by offense type
for i, act_type in enumerate(col_list_dic):
  for j, offense in enumerate(col_list_dic[act_type]):
    name = offense_act_list[3*i+j]

    # fig2.add_trace(
    #     go.Bar(x=state_list, y=col_list_dic[act_type][offense], name=name), 
    #     row=i+1, col=j+1
    #               )
    

    fig1.add_trace(
        go.Pie(labels=state_list, values=col_list_dic[act_type][offense],
               domain=(dict(x=[i*0.3, 0.3+i*0.3],y=[j*0.3, 0.3+j*0.3])),
               ),       
        row=i+1, col=j+1,
    )

# fig2.update_layout(height=700, width=2000, title=title)
# fig2.update_xaxes(tickangle=45, nticks=len(state_list))

fig1.update_layout(
    # height=1500, width=1500, 
    title=title)
fig1.update_traces(textposition='inside', textinfo='percent+label')

for i in fig1['layout']['annotations']:
    i['font'] = dict(size=6)

fig1.update_layout(font=dict(size=8))
fig1.show()
# fig2.show()

# WRITE HTML 
with open('html/5_ucr_type.html', 'w') as f:
    f.write(fig1.to_html(full_html=False, include_plotlyjs='cdn'))
    # f.write(fig2.to_html(full_html=False, include_plotlyjs='cdn'))


In [10]:
# Census ACS (American Community Survey) -- state name, population total, poverty total
# Modified from HW6  Q7 api requests to list


import requests
import json
import matplotlib.pyplot as plt

import plotly.express as px
import pandas as pd


url4 = "https://api.census.gov/data/2019/acs/acs1?get=NAME,B01001_001E,B17001_002E&for=state" # reference to census data in json dictionary
# state name, population total, poverty total
data_text = requests.get(url4).text

# state codes
url_states = "https://worldpopulationreview.com/static/states/name-abbr.json"
codes_text = requests.get(url_states).text
codes_dct = json.loads(codes_text)

data_list = json.loads(data_text)

state_name_list = []
poverty_list = []
population_list = []
pov_pop_list = []
codes_list = []


# sort  
for state in data_list[1:]: # skip headers
  state_name_list.append(state[0])
  population_list.append(int(state[1]) / 1000000)
  poverty_list.append(int(state[2]) / 100000)
  pov_pop_list.append(round(100 * (int(state[2]) / int(state[1])), 1)) # ratio poverty to total
  
  if state[0].title() in codes_dct:
    codes_list.append(codes_dct[state[0].title()])
  else:
    print(state[0])

print(state_name_list, "pov pop", poverty_list, "tot pop", population_list, "pov per tot", pov_pop_list, sep='\n')

# plotly express
for i, lst in enumerate(data_list[1:]):
  # convert apropriate data_list values to integers
  data_list[i + 1] = [lst[0]] + [ int(num) for num in lst[1:-1]]

df_census = pd.DataFrame(data_list[1:], columns=["State", "Total Population", "Poverty Population"])
df_census['Percent Poverty'] = pov_pop_list
df_census = df_census.sort_values(by="Percent Poverty")
print(df_census)

sub = "<br><sup>U.S. Census: American Community Survey (ACS)</sup>"

fig1 = px.bar(df_census, x="State", y=df_census.columns[1:-1], labels={"value" : "Number of People"},
            #   width=1000, height=500,
              title="Total population and poverty popuation by U.S. state"+sub, barmode="overlay", opacity=1)

# write percent because easier than adding addtional line
fig1.for_each_trace(
    lambda trace: trace.update(text=df_census["Percent Poverty"], textposition='outside') if trace.name == "Total Population" else (),
)

# fig1.update_layout(uniformtext_minsize=60)
fig1.update_layout(font=dict(size=8))
fig1.show()

# # line of percentages for each state -------- too difficult
# fig1 = px.line(df_census, x="State", y=df_census["Percent Poverty"]*1000, title="Total population and poverty popuation by U.S. state"+sub, )
# fig1.add_bar(x=df_census["State"], y=df_census.columns[1:-1],
#              text=df_census["Percent Poverty"], width=1000, opacity=1)
# 

# ---------------------- redundant with choropleth ---------------------------------------------
# fig2 = px.bar(df_census, x="State", y="Percent Poverty", labels={"value" : "Number of People"}, 
#              width=1500, color="Percent Poverty", range_color=[5,20], 
#               title="Percent of popuation in poverty by U.S. state"+sub)
# fig2.show()

# ------------------------------CHOROPLETH-------------------------------------------
pov_pop_list.remove(pov_pop_list[state_name_list.index("Puerto Rico")]) # puerto rico not in state code api
fig3 = px.choropleth(locations=codes_list, locationmode="USA-states", 
                     color=pov_pop_list, scope="usa", labels={"color" : "Percent population in poverty"},
                     title="Percent of popuation in poverty by U.S. state"+sub,
                    #  height=500, width=700
                     )
fig3.show()

# WRITE HTML
with open('html/1_acs_pov_pop_census.html', 'w') as f:
    f.write(fig1.to_html(full_html=False, include_plotlyjs='cdn'))
    # f.write(fig2.to_html(full_html=False, include_plotlyjs='cdn'))
    f.write(fig3.to_html(full_html=False, include_plotlyjs='cdn'))


Puerto Rico
['Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Vermont', 'Utah', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming', 'Puerto Rico', 'Illinois', 'Georgia', 'Idaho', 'Hawaii', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Connecticut', 'Florida', 'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Delaware', 'District of Columbia']
pov pop
[5.64192, 7.70175, 1.31882, 1.85761, 3.79564, 0.95711, 7.98262, 3.7323, 24.67006, 13.86122, 0.78112, 14.84862, 5.83029, 4.70643, 14.89333, 1.10244, 6.92744, 1.01946, 9.22176, 38.6501, 0.60897, 2.81279, 8.22775, 7.3024, 2.78734, 5.91486, 0.56945, 13.76809, 14.20542, 13.73909, 1.95984, 1.28722, 7.75823, 3.41613, 3.2318

In [11]:
# NCVS Household income and crime by region
# modified from hw6 Q8 another api requests to list

import requests
import json
import matplotlib.pyplot as plt

import plotly.express as px
import pandas as pd

# https://www.bjs.gov/developer/ncvs/ -->
# https://api.bjs.ojp.gov/bjs/ncvs/v2/personal/2019?format=json  -->
url5 = "https://ischmidls.github.io/html/NCVS_2019_PERSONAL.json"

data_text = requests.get(url5).text

json_data_dic = json.loads(data_text)
data_list = json_data_dic["personalData"]

# intialize lists of json values
hinc_list = [] # household income
region_list = [] # number of people in household
pop_list = [] # size of population around crime

for survey_dic in data_list:
  hinc_list.append(survey_dic["hincome"])
  region_list.append(survey_dic["region"])
  pop_list.append(survey_dic["popsize"])
# raw_value_list = [hinc_list, region_list, pop_list]
# # transpose
# raw_value_list = [[row[i] for row in raw_value_list] for i in range(len(raw_value_list[0]))]

# meaning of json values: https://api.bjs.ojp.gov/bjs/ncvs/v2/household/fields/
# lists of json value meanings
hinc_key_list = "0 7,500 15,000 25,000 35,000 50,000 75,000 Unknown".split(" ") # lower bound thousands, i + 1
region_key_list = "NE MW S W".split(" ") # region, i + 1
pop_key_list = "Unknown 100,000 250,000 500,000 1,000,000 >1,000,000".split(" ") # i
x_title_list = "Household Incomes ($),Regions in US,Surrounding Population".split(',')
key_lists = [hinc_key_list, region_key_list, pop_key_list]

# lists of occurances of values in json
def percent_list(lst, val):
  """given val and lst, return percent of lst that is val"""
  return round((lst.count(val) / len(lst)) * 100, 1)

# lists of percent count of occurances of values
hinc_num_list = [percent_list(hinc_list, str(i)) for i in range(1, 8)] + [percent_list(hinc_list, '88')]
region_num_list = [percent_list(region_list, str(i)) for i in range(1, 5)]
pop_num_list = [percent_list(pop_list, str(i)) for i in range(6)]

num_value_list = [hinc_num_list, region_num_list, pop_num_list]

# ---- nonsensible dataframe ----
# df = pd.DataFrame(num_value_list)
# df = df.transpose()
# df.columns = x_title_list

# plotly
fig_list = []
scale_list = [px.colors.sequential.ice, px.colors.sequential.Hot, px.colors.sequential.solar]
for i in range(3):
  labels = key_lists[i]
  x = num_value_list[i]
  title = x_title_list[i] + "<br><sup>Bureau of Justice Statistics: National Crime Victimization Survey (NCVS) 2019</sup>"
  fig = px.pie(values=x, names=labels, color_discrete_sequence=scale_list[i],
               title="Percent of Reports for Crime Occurances by " + title,
            #    width=500, height=500
               )
  fig.update_layout(font=dict(size=8))
  fig_list.append(fig)
  fig.show()


# write to HTML
with open('html/2_ncvs_crime.html', 'w') as f:

    for fig in fig_list:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))

    # cannot write on same line because overflows column in bootstrap
    # f.write('<div align="center" style=white-space: nowrap; overflow-x: auto;>')
    # for fig in fig_list:
    #     f.write('<div style="display: inline-block;margin-left: 10px;">' + fig.to_html(full_html=False, include_plotlyjs='cdn') + '</div>')
    # f.write("</div>")

# print lists
print(key_lists, '\n', num_value_list, '\n\n')


[['0', '7,500', '15,000', '25,000', '35,000', '50,000', '75,000', 'Unknown'], ['NE', 'MW', 'S', 'W'], ['Unknown', '100,000', '250,000', '500,000', '1,000,000', '>1,000,000']] 
 [[7.1, 9.1, 9.1, 8.0, 11.7, 12.3, 25.0, 17.8], [11.2, 30.4, 29.9, 28.5], [22.9, 43.8, 10.2, 8.3, 8.2, 6.6]] 




# CSV & API to lists/dictionaries --- to dataframes/plots
modified from HW 7 (Q1-5) 
plots for:
- census data on race and household income in different states
- trafficking data (focus on gender) from "Counter Trafficking Data Collaborative (CTDC)"

In [12]:
# ctdc trafficking - graphs not exported
# modified from hw7 Q1 CSV DATA TO LISTS

import pandas as pd
import plotly.express as px

# need random because original data too big
import random

# read file
filename = "USA-CTDC_synthetic_20210825.csv"
with open(filename, 'r') as fh: # each elem in lst is line
  line_list = fh.read().split('\n')

# RANDOM: 15600 total and 66221 for USA entries too many for colab, so take random 1000 with headers
# line_list = [line_list[0]] + random.sample(line_list[1:], 1000)
print(line_list)

data_dct = {}
for demographic in line_list[0].split(","): # quick list organization by column in dct
  data_dct[demographic] = []

# write lines to lists by column
for line in line_list[1:]: # skip header
  for i, demographic in enumerate(data_dct):
    data_dct[demographic].append(line.split(',')[i])

# list of years in order w/o unspecified
raw_years = data_dct["yearOfRegistration"]
year_list = sorted(list(set(raw_years)))[3:] # possible year options
print(year_list)


from plotly.subplots import make_subplots
import plotly.graph_objects as go

title_list = "Gender,Age,Minor status,Citizenship".split(",")
fig2 = make_subplots(rows=2, cols=2, subplot_titles=title_list,
                    shared_xaxes=True, shared_yaxes=True)

# cannot use count() function because interdependant dataframe columns
header_list = "gender ageBroad majorityStatusAtExploit citizenship".split(" ")

for j, header in enumerate(header_list):
  print("header", header)
  raw_demographic = data_dct[header]
  subdem_list = list(set(raw_demographic)) # ex: [Adult, Minor, Unspecified]
  subdem_count_dct = {header: [0] * len(year_list) for header in subdem_list} # ex: {Minor: [0,0,0,0,0], ...}
  for i, raw_subdem in enumerate(raw_demographic): # ex: data_dct["gender"] = [Female, Female, Female, Male, Female, ...]
    
    # create list with counts for header by year
    # get index of header, get year at index
    # increment count at index in fem/mal year lst
    
    year_col = raw_years[i] # year in same column as header
    if year_col not in ',2009,2010,2011,2012,2013,2014'.split(","): # do not count early years b/c inconsitent data
      i_year = year_list.index(year_col)
      subdem_count_dct[raw_subdem][i_year] += 1

  subdem_count_dct['Unspecified'] = subdem_count_dct.pop('') # call blank ctzn 'unspecified'

  header_count_list = [subdem_count_dct[header] for header in subdem_count_dct] # 2d_list[i][j] i (rows) are headers, j (columns) years
  header_year_rows = [[row[i] for row in header_count_list] for i in range(len(header_count_list[0]))] # transpose: # rows years, columns headers
  

  # means
  for i, year_row in enumerate(header_year_rows): # ex [0,1,2,3] (represents subdem counts for year)
    temp_list = list(range(len(year_row)))
    for k, subdem_year_count in enumerate(year_row): # ex: 2016 minor, 2016 adult, 2016 unspecified
      temp_list[k] = round((100 * subdem_year_count / sum(year_row)), 1) # percent
    header_year_rows[i] = temp_list
    print(year_row)

  # ------------DATAFRAME------------------------------------------------------
  df_header = pd.DataFrame(header_year_rows, index=year_list, columns=list(subdem_count_dct.keys()))
  with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_header)
  

  # --------------PLOT---------------------------------------------------------
  sub = "<br><sup>Counter Trafficking Data Collaborative: Global Sythetic Data Set</sup>"
  title= "Percent of victims in the U.S. trafficked by " + title_list[j] + " 2015-2019" + sub

  header_fig_list = []
  header_fig_subhtml_list = []

  if header == "citizenship":

    # eliminate values too small
    for col in df_header:
      if max(df_header[col]) < 1:
        df_header.drop(col, axis=1, inplace=True)
    
    df_header["Countries <1% of Total Victims"] = (df_header.sum(axis=1) - 100) * -1

    fig1 = px.bar(df_header, y=df_header.columns, 
                      labels={"index": "Year", "value": "Percent of victims"},
                      title=title,
                    #   width=700, height=500
                  )
    fig1.update_layout(font=dict(size=8))
    # fig1.show()
    header_fig_list.append(fig1)

    # specified = list(df_header.columns)
    # specified.remove("Unspecified")
    # title= "Percent of victims in the U.S. trafficked by (Specified) " + title_list[j] + " 2015-2019" + sub
    # fig2 = px.bar(df_header, y=specified, 
    #                 labels={"index": "Year", "value": "Percent of victims"},
    #                 title=title,
    #                 width=700, height=500)
    # fig2.show()
    # header_fig_list.append(fig2)

  else:

    # plot trafficking by gender data frame
    fig1 = px.scatter(df_header, x=df_header.index, y=df_header.columns, 
                      labels={"index": "Year", "value": "Percent of victims", "variable": title_list[j]},
                      title=title, trendline="ols",
                    #   width=500, height=500, 
                      log_y=False)
    
    title= "Percent of victims in the U.S. trafficked by (Specifed) " + title_list[j] + " 2019" + sub
    fig1.update_layout(font=dict(size=8))
    # fig1.show()
    header_fig_list.append(fig1)

    fig2 = px.pie(df_header, values=df_header.iloc[4][:-1], names=df_header.columns[:-1],
                    title=title, labels=dict(values="Percent of victims"),
                    # color_discrete_sequence=px.colors.sequential.ice,
                #   color=df_header.iloc[4][:-1],
                    # width=500, height=500
                  )
    fig2.update_layout(font=dict(size=8))
    # fig2.show()
    header_fig_list.append(fig2)
    
    #----------------REMOVE BAR CHARTS SHOWING CHANGE B/C REDUNDANT------------------------------------------------
    # d_list = [round(df_header[header].iloc[4] - df_header[header].iloc[0], 1) for header in list(df_header.columns)]
    # title= "Total change in percent of victims in the U.S. trafficked by " + title_list[j] + " 2015-2019" + sub
    # fig3 = px.bar(x=list(df_header.columns), y=d_list, 
    #                   labels={"x": "Demographic", "y": "Change in Percent", "color" : "Change in Percent", "text" : "Change in Percent"},
    #                   title=title, text=d_list, color=d_list,
    #                   width=500, height=500,)
    # fig3.update_layout(font=dict(size=8))
    # fig3.show()
    # header_fig_list.append(fig3)


  with open(f'html/3_ctdc_{header}.html', 'w') as f:
    for fig in header_fig_list:
      fig.show()
      f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))



#   # write visuals of same demographic to same html row
#   header_fig_subhtml_list.append(f"\n\t<!-- CTDC {header} --><div style=white-space: nowrap; overflow-x: auto;>")
#   for fig in header_fig_list:
#     subhtml_str = '<div style="display: inline-block;margin-left: 10px;">' + fig.to_html(full_html=False, include_plotlyjs='cdn') + "</div>"
#     header_fig_subhtml_list.append(subhtml_str)
#   header_fig_subhtml_list.append("</div>")


#     # write rows to html
#   with open(f'html/3_ctdc_{header}.html', 'w') as f:
#     for html in header_fig_subhtml_list:
#       f.write(html)



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



['2015', '2016', '2017', '2018', '2019']
header gender
[7, 297, 4030, 263]
[9, 686, 6035, 498]
[12, 1098, 7756, 846]
[45, 2407, 12973, 3305]
[57, 2644, 14285, 4334]
      Transgender/NonConforming  Male  Female  Unspecified
2015                        0.2   6.5    87.7          5.7
2016                        0.1   9.5    83.5          6.9
2017                        0.1  11.3    79.9          8.7
2018                        0.2  12.9    69.3         17.6
2019                        0.3  12.4    67.0         20.3



pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



header ageBroad
[768, 14, 79, 38, 8, 134, 244, 5, 112, 3195]
[1492, 94, 257, 129, 43, 368, 577, 45, 343, 3880]
[1740, 144, 272, 221, 57, 442, 601, 130, 387, 5718]
[2170, 231, 410, 283, 110, 481, 744, 235, 697, 13369]
[2011, 279, 358, 310, 220, 425, 676, 172, 792, 16077]
      09--17  39--47  24--26  27--29  48+  21--23  18--20  0--8  30--38  \
2015    16.7     0.3     1.7     0.8  0.2     2.9     5.3   0.1     2.4   
2016    20.6     1.3     3.6     1.8  0.6     5.1     8.0   0.6     4.7   
2017    17.9     1.5     2.8     2.3  0.6     4.6     6.2   1.3     4.0   
2018    11.6     1.2     2.2     1.5  0.6     2.6     4.0   1.3     3.7   
2019     9.4     1.3     1.7     1.5  1.0     2.0     3.2   0.8     3.7   

      Unspecified  
2015         69.5  
2016         53.7  
2017         58.9  
2018         71.4  
2019         75.4  


header majorityStatusAtExploit
[937, 458, 3202]
[2250, 1158, 3820]
[2718, 1427, 5567]
[4477, 1810, 12443]
[4981, 1042, 15297]
      Minor  Adult  Unspecified
2015   20.4   10.0         69.7
2016   31.1   16.0         52.9
2017   28.0   14.7         57.3
2018   23.9    9.7         66.4
2019   23.4    4.9         71.7


header citizenship
[0, 0, 1, 2, 0, 0, 0, 0, 1344, 0, 0, 0, 46, 2, 0, 0, 0, 0, 0, 0, 2, 0, 6, 0, 0, 0, 5, 0, 0, 41, 0, 20, 3128]
[0, 0, 2, 0, 0, 0, 0, 0, 1725, 0, 0, 8, 179, 1, 0, 1, 3, 0, 0, 0, 4, 0, 21, 0, 0, 0, 23, 0, 0, 17, 1, 58, 5185]
[0, 0, 2, 16, 0, 1, 0, 0, 1665, 5, 1, 2, 327, 20, 0, 0, 3, 0, 0, 1, 8, 0, 7, 0, 0, 0, 24, 20, 0, 20, 2, 34, 7554]
[17, 0, 12, 0, 0, 0, 54, 0, 1739, 0, 3, 5, 1111, 4, 1, 0, 31, 2, 0, 0, 8, 0, 11, 12, 0, 12, 31, 0, 0, 4, 11, 142, 15520]
[3, 2, 3, 18, 0, 0, 125, 0, 1036, 4, 2, 3, 952, 1, 1, 0, 0, 0, 108, 12, 9, 93, 46, 0, 6, 0, 30, 0, 21, 24, 16, 168, 18637]
      BRA  TWN  RUS  IND  KEN  DOM  JAM  ETH   USA  ROU  CAN  THA  MEX  SLV  \
2015  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  29.2  0.0  0.0  0.0  1.0  0.0   
2016  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  23.9  0.0  0.0  0.1  2.5  0.0   
2017  0.0  0.0  0.0  0.2  0.0  0.0  0.0  0.0  17.1  0.1  0.0  0.0  3.4  0.2   
2018  0.1  0.0  0.1  0.0  0.0  0.0  0.3  0.0   9.3  0.0  0.0  0.0  5.9  0.0   
2019  0

In [13]:
# # Data USA ---- Household Income, Race, State
# # modified from hw7 Question 02: API requests to lists


# import requests
# import json

# import pandas as pd
# import plotly.express as px

# url = "https://datausa.io/api/data?Geography=01000US:children&measure=Household%20Income%20by%20Race,Household%20Income%20by%20Race%20Moe&drilldowns=Race"
# data_text = requests.get(url).text
# json_data_dct = json.loads(data_text)
# state_list = json_data_dct["data"] # elem in lst : dict


# header_list = ['Race', 'State', 'ID Year', 'Household Income by Race']
# row_list = [[] for i in range(len(state_list))] # list of empty lists for state in state_list
# for i, state_dct in enumerate(state_list):
#   for header in header_list:
#     row_list[i].append(state_dct[header]) # populate list with header info for state

# # dataframes for Household income by state and race
# df2 = pd.DataFrame(row_list, columns=header_list)
# print(df2)


In [14]:
# Household income by race by household
# modifeid from hw7 Question 04: dictionaries to dataframes

import requests
import json
import pandas as pd

# codebeautify.org/jsonviewer
url = "https://datausa.io/api/data?Geography=01000US:children&measure=Household%20Income%20by%20Race,Household%20Income%20by%20Race%20Moe&drilldowns=Race"
data_text = requests.get(url).text
json_data_dct = json.loads(data_text)
state_list = json_data_dct["data"] # elem in lst : dict

col_list = ['Race', 'State', 'ID Year', 'Household Income by Race'] # 'Race', 'State', 'ID Year', 'Household Income by Race' columns
col_dct = {header: [state[header] for state in state_list] for header in col_list} # every row is a state, values lists representing columns
sorted_list_dct = {header: sorted(list(set(col_dct[header]))) for header in col_list} # unique elem in each col


mean_income_dct = {col : [[] for n in range(len(sorted_list_dct[col]))] for col in col_list[:-1]} # mean household
sample_income_dct = {col : [[] for n in range(len(sorted_list_dct[col]))] for col in col_list[:-1]} # sample household

for col in col_list[:-1]:
  for i, income in enumerate(col_dct['Household Income by Race']):
    mean_income_dct[col][sorted_list_dct[col].index(col_dct[col][i])].append(income) # list of incomes for each column
    sample_income_dct[col][sorted_list_dct[col].index(col_dct[col][i])].append(income) # list of incomes for each column

# --------------MEAN CALC----------------
for key in mean_income_dct:
  for i, inner_list in enumerate(mean_income_dct[key]):
    mean_income_dct[key][i] = round(sum(inner_list) / len(inner_list), -2) # mean incomes for each col

print(sorted_list_dct)
for key in mean_income_dct:
  print(key, sorted_list_dct[key])
  print(key, mean_income_dct[key])

# list of data frames for mean income by column
temp_dct = {}
income_df_list = []
for key in mean_income_dct:
  temp_dct[key] = sorted_list_dct[key]
  temp_dct["Mean Income"] = mean_income_dct[key]
  df = pd.DataFrame(temp_dct)
  print(df)
  income_df_list.append(df)
  temp_dct.clear()

# gender/trafficking plotted & mean household income/state/race
# modified from hw7 Question 05: Plotly with your dataframe

import plotly.express as px

# plot mean income data frames
sub = "<br><sub>Data USA: Geography, Household Income, Race</sub>"

# mean Household income by race
df_race = income_df_list[0]
df_race = df_race.sort_values(by="Mean Income")
key = list(sorted_list_dct.keys())[0]
fig2 = px.bar(df_race, x=key, y="Mean Income", 
            #   width=500, height=500,
                title="Mean Household Income ($) by " + key + sub, color="Mean Income",
              text=df_race["Mean Income"]/1000)
fig2.update_traces(textposition='outside')

# y axis
miny = min(df_race["Mean Income"]) - 5000
maxy = max(df_race["Mean Income"]) + 2000
fig2.update_yaxes(range=[miny, maxy])

fig2.update_layout(font=dict(size=8))
fig2.show()

# mean Household income by state
df_hincome = income_df_list[1]
df_hincome = df_hincome.sort_values(by="Mean Income")
key = list(sorted_list_dct.keys())[1]
fig3 = px.bar(df_hincome, x=key, y="Mean Income", 
            #   width=1200, height=500,
                title="Mean Household Income ($) by " + key + sub, color="Mean Income",
              text=df_hincome["Mean Income"]/1000)
fig3.update_traces(textposition='outside')

# y axis
miny = min(df_hincome["Mean Income"]) - 10000
maxy = max(df_hincome["Mean Income"]) + 5000
fig3.update_yaxes(range=[miny, maxy])

fig3.show()

with open('html/4_datausa_race_state_hinc.html', 'w') as f:
    f.write(fig2.to_html(full_html=False, include_plotlyjs='cdn'))
    f.write(fig3.to_html(full_html=False, include_plotlyjs='cdn'))


print("""These numbers come from survey samples so cannot represent the entire population.
Further, the mean does not account for the range of possible values or the distribution
of values within the range.

Mean income plotted against race suggests unequality among racial groups
without accounting for age, gender, education, and other factors.

Mean income by state suggests individuals in some states have higher earnings than others
but does not account for cost of living or for outliers of unusually low and high incomes.

Mean income by year shows increases in the money that people make but does not 
account for inflation, population growth, or other variables.""")

{'Race': ['Asian', 'Black', 'Hispanic', 'Native American', 'Other', 'Pacific Islander', 'Total', 'Two Or More', 'White', 'White Non-Hispanic'], 'State': ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'], 'ID Year': [2013, 2014, 2015, 2016, 2017, 2018, 2019], 'Household Income by Race': [2499, 8996, 10089, 14081, 14991, 15396, 15905, 16094, 16231, 16240, 16716, 16760, 16866, 17353, 1743

These numbers come from survey samples so cannot represent the entire population.
Further, the mean does not account for the range of possible values or the distribution
of values within the range.

Mean income plotted against race suggests unequality among racial groups
without accounting for age, gender, education, and other factors.

Mean income by state suggests individuals in some states have higher earnings than others
but does not account for cost of living or for outliers of unusually low and high incomes.

Mean income by year shows increases in the money that people make but does not 
account for inflation, population growth, or other variables.


# CROSS ANALYSIS

In [15]:
from numpy.ma.core import multiply
#   CROSS ANALYSIS

# from UCR state and gender offenses
offenses = df_traffick["Total Acts Offenses"]
offense_index = list(df_traffick.index)
offenses = pd.DataFrame(offenses, index=offense_index)
offenses = offenses.rename(columns={"Involuntary servitude & Commercial sex acts Offenses":"Total Trafficking"})
# print(offenses)

# from census poverty percent by state
percents = pd.DataFrame(df_census[["Percent Poverty"]])
percents.set_index(df_census["State"], inplace=True)
# print(percents)


# from Data USA: Household Income by State
hincome = pd.DataFrame(df_hincome["Mean Income"])
hincome.set_index(df_hincome["State"], inplace=True)
# print(hincome)


cross_df = percents.join(hincome)
cross_df = cross_df.join(offenses)
cross_df = cross_df.dropna()

fig = make_subplots(rows=3, cols=1, subplot_titles=(cross_df.columns),
                    shared_xaxes=True, vertical_spacing=0.1)

# plot number of offenses per state by offense type
scale_list = ["YlOrBr", "greens", "reds"]
choro_list = []
for i, col in enumerate(cross_df.columns):
    fig.add_trace(go.Bar(x=cross_df.index, y=cross_df[col], name=col + "by state", 
                marker=dict(color=cross_df[col], colorscale=scale_list[i], cmin=min(cross_df[col]))), 
                row=i+1, col=1)
    
    # state codes for chloropleth
    codes_list = []
    for state in list(cross_df.index):
        if state in codes_dct:
            codes_list.append(codes_dct[state])

    # remove value for puerto rico
    cross_df_col = list(cross_df[col])
    del cross_df_col[list(cross_df.index).index("Puerto Rico")]


    fig2 = px.choropleth(cross_df, locations=codes_list, locationmode="USA-states", 
                     color=cross_df_col, scope="usa", labels={"color" : col},
                     title="Cross Analysis " + col + " by U.S. state"+sub,
                    #  width=500, height=500
                     )
    fig2.update_layout(font=dict(size=8))
    fig2.show()
    choro_list.append(fig2)

sub = "<br><sup>Data USA 'Household Income by State'; Census 'American Community Survey'; FBI 'Uniform Crime Reports'</sup>"
title = "States Cross-analysis of Household Income, Poverty, and Trafficking" + sub

fig.update_layout(
    # height=500, width=1300, 
    title=title)
fig.update_xaxes(tickangle=45, nticks=len(state_list))
fig.update_layout(font=dict(size=8))
fig.show()

# HTML

#   DISPLAYS ON SHARED LINE OUTSIDE COLAB
with open('html/6_cross_anlys.html', 'w') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))
    for choro in choro_list:
        f.write(choro.to_html(full_html=False, include_plotlyjs='cdn'))
    # # cannot write on same line because overflows column in bootstrap
    # f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))
    # f.write("<div style=white-space: nowrap; overflow-x: auto;>")
    # for choro in choro_list:
    #     f.write('<div style="display: inline-block;margin-left: 10px;">' + choro.to_html(full_html=False, include_plotlyjs='cdn') + '</div>')
    # f.write("</div>")
cross_df

Unnamed: 0_level_0,Percent Poverty,Mean Income,Total Acts Offenses
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Maryland,8.8,76300.0,17.0
Minnesota,8.8,55500.0,235.0
Utah,8.8,56800.0,0.0
Hawaii,9.1,69400.0,2.0
Massachusetts,9.1,62700.0,3.0
Colorado,9.2,58800.0,34.0
Washington,9.6,60900.0,14.0
Connecticut,9.7,60400.0,2.0
Vermont,9.8,52300.0,2.0
Wyoming,9.8,55400.0,5.0


In [16]:
import os
# assign directory
directory = 'html'
 
# iterate over files in
# that directory
directory_list = []
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        directory_list.append(f)

# write html files to single list of lines with HTML comment of file name 
html_string_list = []
for file in directory_list:
  with open(file, 'r') as fh:
    html_string = fh.read()
    html_string_list.append(f'<!--"{file}"--> <div align="center">{html_string}</div>')
  
# write list of lines to new HTML file
with open("index.html", 'w') as fh:
    fh.truncate()
    for html_string in html_string_list:
        fh.write(html_string + '\n')

# display html in colab ipy console
from IPython.core.display import display, HTML
display(HTML('\n'.join(html_string_list)))