In [1]:
import pandas as pd 

csv_files = ['acs2015_census_tract_data.csv', 'acs2015_county_data.csv', 'acs2017_census_tract_data.csv', 'acs2017_county_data.csv']

dataframes = [pd.read_csv(file) for file in csv_files]

combined_dataframe = pd.concat(dataframes)

print(combined_dataframe.head())

    CensusTract    State   County  TotalPop   Men  Women  Hispanic  White  \
0  1.001020e+09  Alabama  Autauga      1948   940   1008       0.9   87.4   
1  1.001020e+09  Alabama  Autauga      2156  1059   1097       0.8   40.4   
2  1.001020e+09  Alabama  Autauga      2968  1364   1604       0.0   74.5   
3  1.001020e+09  Alabama  Autauga      4423  2172   2251      10.5   82.8   
4  1.001020e+09  Alabama  Autauga     10763  4922   5841       0.7   68.5   

   Black  Native  ...  Employed  PrivateWork  PublicWork  SelfEmployed  \
0    7.7     0.3  ...       943         77.1        18.3           4.6   
1   53.3     0.0  ...       753         77.0        16.9           6.1   
2   18.6     0.5  ...      1373         64.1        23.6          12.3   
3    3.7     1.6  ...      1782         75.7        21.2           3.1   
4   24.8     0.0  ...      5037         67.1        27.6           5.3   

   FamilyWork  Unemployment  CensusId  TractId  VotingAgeCitizen  CountyId  
0         0.0  

In [7]:
num_instances = combined_dataframe.shape[0]
num_attributes = combined_dataframe.shape[1]

print("Number of instances:", num_instances)
print("Number of attributes:", num_attributes)

Number of instances: 154442
Number of attributes: 41


In [9]:
variables = combined_dataframe.columns

print(variables)

Index(['CensusTract', 'State', 'County', 'TotalPop', 'Men', 'Women',
       'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific', 'Citizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment', 'CensusId', 'TractId',
       'VotingAgeCitizen', 'CountyId'],
      dtype='object')


In [13]:
variable_types = combined_dataframe.dtypes

print(combined_dataframe.dtypes)

CensusTract         float64
State                object
County               object
TotalPop              int64
Men                   int64
Women                 int64
Hispanic            float64
White               float64
Black               float64
Native              float64
Asian               float64
Pacific             float64
Citizen             float64
Income              float64
IncomeErr           float64
IncomePerCap        float64
IncomePerCapErr     float64
Poverty             float64
ChildPoverty        float64
Professional        float64
Service             float64
Office              float64
Construction        float64
Production          float64
Drive               float64
Carpool             float64
Transit             float64
Walk                float64
OtherTransp         float64
WorkAtHome          float64
MeanCommute         float64
Employed              int64
PrivateWork         float64
PublicWork          float64
SelfEmployed        float64
FamilyWork          

In [17]:
variables = ['IncomePerCap', 'Black', 'Income', 'Men', 'Women']
statistics = combined_dataframe[variables].describe()

display(statistics)

Unnamed: 0,IncomePerCap,Black,Income,Men,Women
count,152957.0,153056.0,152225.0,154442.0,154442.0
mean,29371.403963,13.086991,58665.595638,4106.666,4240.434
std,15388.515022,21.442707,29257.886012,33623.47,34873.25
min,32.0,0.0,2611.0,0.0,0.0
25%,19880.0,0.7,39015.0,1434.0,1485.0
50%,26116.0,3.7,52119.0,2041.0,2119.0
75%,34758.0,14.3,71486.0,2797.0,2894.0
max,254204.0,100.0,249750.0,4979641.0,5126081.0


In [20]:
# Create a list to store variable information
variable_list = []

# Iterate over the columns
for column in combined_dataframe.columns:
    variable_type = combined_dataframe[column].dtype
    
    # Determine the variable type
    if variable_type == 'object':
        variable_info = f"● {column} (Discrete, Categorical)"
    elif variable_type.name == 'category':
        variable_info = f"● {column} (Discrete, Categorical)"
    elif pd.api.types.is_numeric_dtype(variable_type):
        variable_info = f"● {column} (Discrete, Ordinal)"
    elif pd.api.types.is_bool_dtype(variable_type):
        variable_info = f"● {column} (Binary)"
    else:
        variable_info = f"● {column} ({variable_type})"
    
    variable_list.append(variable_info)

# Print the list of variables
print("The variables in this dataset are:")
for variable in variable_list:
    print(variable)


The variables in this dataset are:
● CensusTract (Discrete, Ordinal)
● State (Discrete, Categorical)
● County (Discrete, Categorical)
● TotalPop (Discrete, Ordinal)
● Men (Discrete, Ordinal)
● Women (Discrete, Ordinal)
● Hispanic (Discrete, Ordinal)
● White (Discrete, Ordinal)
● Black (Discrete, Ordinal)
● Native (Discrete, Ordinal)
● Asian (Discrete, Ordinal)
● Pacific (Discrete, Ordinal)
● Citizen (Discrete, Ordinal)
● Income (Discrete, Ordinal)
● IncomeErr (Discrete, Ordinal)
● IncomePerCap (Discrete, Ordinal)
● IncomePerCapErr (Discrete, Ordinal)
● Poverty (Discrete, Ordinal)
● ChildPoverty (Discrete, Ordinal)
● Professional (Discrete, Ordinal)
● Service (Discrete, Ordinal)
● Office (Discrete, Ordinal)
● Construction (Discrete, Ordinal)
● Production (Discrete, Ordinal)
● Drive (Discrete, Ordinal)
● Carpool (Discrete, Ordinal)
● Transit (Discrete, Ordinal)
● Walk (Discrete, Ordinal)
● OtherTransp (Discrete, Ordinal)
● WorkAtHome (Discrete, Ordinal)
● MeanCommute (Discrete, Ordinal)


In [1]:
import pandas as pd 

csv_files = ['communities.csv']

dataframes = [pd.read_csv(file) for file in csv_files]

combined_dataframe = pd.concat(dataframes)

print(combined_dataframe.head())

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x86 in position 11: invalid start byte