In [1]:
import pandas as pd

# Dataset

## Import and modify the dataset

In [2]:
# Load the merged county dataset; FIPS codes are read as strings rather than numbers.
county_df = pd.read_csv(
    "./data/merged_county_data.csv",
    dtype={"fips_code": str},
)

### Group Industries

In [3]:
# Each grouped column is the row-wise sum of the detailed industry shares listed for it.
industry_groups = {
    # Group: Resource Extraction
    "pct_employed_resource_extraction": [
        "pct_employed_agriculture_forestry_fishing_hunting_mining",
    ],
    # Group: Manufacturing and Construction
    "pct_employed_manufacturing_construction": [
        "pct_employed_construction",
        "pct_employed_manufacturing",
    ],
    # Group: Trade and Transportation
    "pct_employed_trade_transportation_utilities": [
        "pct_employed_wholesale_trade",
        "pct_employed_retail_trade",
        "pct_employed_transportation_warehousing_utilities",
    ],
    # Group: Knowledge Based
    "pct_employed_knowledge_professional": [
        "pct_employed_information",
        "pct_employed_finance_insurance_real_estate",
        "pct_employed_professional_scientific_management_administration",
    ],
    # Group: Essential (Public and Social)
    "pct_employed_public_social": [
        "pct_employed_education_healthcare_social_assistance",
        "pct_employed_public_administration",
    ],
    # Group: Recreation and Other
    "pct_employed_hospitality_other": [
        "pct_employed_arts_entertainment_recreation_food_accommodation",
        "pct_employed_other_services",
    ],
}

# Add the grouped columns in the order they are declared above
for grouped_column, source_columns in industry_groups.items():
    county_df[grouped_column] = county_df[source_columns].sum(axis=1)

In [4]:
# Round the grouped industry columns to 4 decimal places; other columns are left untouched.
grouped_industry_columns = [
    "pct_employed_resource_extraction",
    "pct_employed_manufacturing_construction",
    "pct_employed_trade_transportation_utilities",
    "pct_employed_knowledge_professional",
    "pct_employed_public_social",
    "pct_employed_hospitality_other",
]

# Same precision for every grouped column
county_rounding_dict = dict.fromkeys(grouped_industry_columns, 4)

county_df = county_df.round(county_rounding_dict)

### New Metric: Percentage of Income spent on Housing 

 23  median_monthly_housing_cost                  3144 non-null   float64

 15  median_household_income                      3144 non-null   int64 

### Drop Columns

In [5]:
# Columns removed from the working frame: two household columns plus the detailed
# industry shares that were summed into the grouped columns above.
columns_to_drop = [
    "households",
    "mean_household_size",
    "pct_employed_agriculture_forestry_fishing_hunting_mining",
    "pct_employed_construction",
    "pct_employed_manufacturing",
    "pct_employed_wholesale_trade",
    "pct_employed_retail_trade",
    "pct_employed_transportation_warehousing_utilities",
    "pct_employed_information",
    "pct_employed_finance_insurance_real_estate",
    "pct_employed_professional_scientific_management_administration",
    "pct_employed_education_healthcare_social_assistance",
    "pct_employed_public_administration",
    "pct_employed_arts_entertainment_recreation_food_accommodation",
    "pct_employed_other_services",
]

# Rebind instead of mutating in place, matching the `county_df = county_df.round(...)` style
# used earlier and avoiding `inplace=True`.
county_df = county_df.drop(columns=columns_to_drop)

## NA values

### Find NA's

####  Top 5 values for each column

To find encodings of missing values other than NA (which pandas detects automatically), it is useful to display the 5 most common values for each column.

In [None]:
# Map every column name to a list of its five most frequent values (most common first)
nan_top5_dict = {}

for column_name, column_values in county_df.items():
    # value_counts() orders by descending frequency, so the first five are the most common
    most_frequent = column_values.value_counts().head(5).index.tolist()
    # Columns with fewer than five distinct values are padded with None
    while len(most_frequent) < 5:
        most_frequent.append(None)
    nan_top5_dict[column_name] = most_frequent

# One row per column of county_df, one rank per output column
nan_top5_df = pd.DataFrame.from_dict(
    nan_top5_dict,
    orient='index',
    columns=["#1", "#2", "#3", "#4", "#5"],
)

nan_top5_df

Unnamed: 0,#1,#2,#3,#4,#5
fips_code,56045,01001,01003,01005,01007
county_name,Washington County,Jefferson County,Franklin County,Lincoln County,Jackson County
state_name,Texas,Georgia,Virginia,Kentucky,Missouri
rucc,9,8,1,2,6
area_classification,Nonmetro,Metro,,,
latitude,31.869194,33.980816,32.100516,31.752436,46.052904
longitude,-104.567657,-86.642734,-87.72256,-85.393235,-87.126467
population,25619,8810,10489,19229,14342
median_age,40.8,40.3,41.8,41.3,42.2
birth_rate,10.9,11.4,10.4,10.3,11.0


In this case no obvious anomalies could be found with this method. The only slightly suspicious finding is the 0's in the "pct_households_limited_english_proficiency" column, which should be investigated.

In [15]:
# Counties whose limited-English-proficiency share is exactly 0
zero_lep_mask = county_df["pct_households_limited_english_proficiency"] == 0

# Keep only the identifying columns plus the value under inspection
inspection_columns = [
    "fips_code",
    "county_name",
    "state_name",
    "population",
    "pct_households_limited_english_proficiency",
]

filtered_df = county_df.loc[zero_lep_mask, inspection_columns]

filtered_df

Unnamed: 0,fips_code,county_name,state_name,population,pct_households_limited_english_proficiency
11,01023,Choctaw County,Alabama,12252,0.0
18,01037,Coosa County,Alabama,10268,0.0
28,01057,Fayette County,Alabama,15967,0.0
32,01065,Hale County,Alabama,14888,0.0
37,01075,Lamar County,Alabama,13661,0.0
...,...,...,...,...,...
3088,55078,Menominee County,Wisconsin,4226,0.0
3129,56017,Hot Springs County,Wyoming,4661,0.0
3130,56019,Johnson County,Wyoming,8759,0.0
3134,56027,Niobrara County,Wyoming,2354,0.0


After reviewing several cases and the original dataset for these percentages, it appears that these counties generally have a minimal or non-existent "limited English-speaking" population. The reported zeros may be a result of rounding.

#### NaN and negative Values

In [8]:
# Print the per-column top-5 mapping; the variable built in this notebook is
# `nan_top5_dict` (the old name `top5_dict` is never defined and raises NameError on a fresh kernel).
print(nan_top5_dict)

{'fips_code': ['56045', '01001', '01003', '01005', '01007'], 'county_name': ['Washington County', 'Jefferson County', 'Franklin County', 'Lincoln County', 'Jackson County'], 'state_name': ['Texas', 'Georgia', 'Virginia', 'Kentucky', 'Missouri'], 'rucc': [9, 8, 1, 2, 6], 'area_classification': ['Nonmetro', 'Metro', None, None, None], 'latitude': [31.869194, 33.980816, 32.100516, 31.752436, 46.052904], 'longitude': [-104.567657, -86.642734, -87.72256, -85.393235, -87.126467], 'population': [25619, 8810, 10489, 19229, 14342], 'median_age': [40.8, 40.3, 41.8, 41.3, 42.2], 'birth_rate': [10.9, 11.4, 10.4, 10.3, 11.0], 'death_rate': [13.5, 13.2, 14.1, 12.7, 13.9], 'males_per_100_females': [99.0, 99.2, 97.6, 99.3, 100.1], 'poverty_rate': [0.105, 0.13, 0.135, 0.11, 0.119], 'unemployment_rate': [0.035, 0.033, 0.028, 0.034, 0.029], 'labor_force_participation_rate': [0.619, 0.572, 0.584, 0.558, 0.591], 'median_household_income': [63750, 56648, 62500, 60313, 55417], 'mean_household_income': [81976

# For each element `apply` passes in, take the first entry of its mode(), or None when mode() is empty.
# NOTE(review): `df_` is not defined anywhere in the visible notebook — presumably `county_df` was meant; confirm before running.
most_common = df_.apply(lambda x: x.mode()[0] if not x.mode().empty else None)
print(most_common)

#### Save and retrieve dataset

In [9]:
# NOTE(review): `asif` is an undefined name, so running this cell raises NameError.
# Presumably a deliberate stop marker that halts "Run All" before the cells below — confirm, then remove.
asif

NameError: name 'asif' is not defined

Variable Explanations as a table

In [None]:
# Round-trip the working frame through "county_data.csv"
county_data_path = "./data/county_data.csv"

# Write without the index column
county_df.to_csv(county_data_path, index=False)

# Read the file back, parsing fips_code as a string column
county_df = pd.read_csv(county_data_path, dtype={"fips_code": str})

county_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3144 entries, 0 to 3143
Data columns (total 34 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   fips_code                                    3144 non-null   int64  
 1   county_name                                  3144 non-null   object 
 2   state_name                                   3144 non-null   object 
 3   rucc                                         3144 non-null   int64  
 4   area_classification                          3144 non-null   object 
 5   latitude                                     3144 non-null   float64
 6   longitude                                    3144 non-null   float64
 7   population                                   3144 non-null   int64  
 8   median_age                                   3144 non-null   float64
 9   birth_rate                                   3144 non-null   float64
 10  