In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Configure the "client" deploy mode and get (or create) the SparkContext with it.
cf = SparkConf().set("spark.submit.deployMode", "client")
sc = SparkContext.getOrCreate(cf)

# Get (or create) the SparkSession used for the DataFrame and SQL steps in this notebook.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
                      

## Data Cleaning: Legally_Operating_Businesses_copy.csv

Start by reading 'Legally_Operating_Businesses_copy.csv' into a pandas dataframe. We are going to be looking at years and months, so it is important to convert the date column to a datetime datatype. We will also convert the license status, industry, address state, and address zip columns to strings, so that all of the entries in each of these columns are uniform. We will also rename the columns from words separated by spaces to words separated by underscores; for example, 'Address State' will be renamed 'Address_State'. We only modify these columns because they are the only columns of the dataset we are interested in. All of this makes it easier not only to convert the dataframe into a pyspark dataframe, but also to run SQL queries on the soon-to-be-created pyspark dataframe.

In [2]:
import numpy as np
import pandas as pd

#the only columns this notebook uses, in the order they are kept
LICENSE_COLUMNS = ["License Creation Date", "License Status", "Industry", "Address State", "Address ZIP"]

#read dataset in pandas
#cells equal to "not available" are treated as missing values
#the text columns are declared as strings at parse time so that ZIP codes are never
#inferred as numbers (which would drop leading zeros such as "07726") and pandas does
#not have to guess a dtype for columns with mixed content
business_license = pd.read_csv(
    'data/Legally_Operating_Businesses_copy.csv',
    na_values="not available",
    usecols=LICENSE_COLUMNS,
    dtype={'License Status': 'string', 'Industry': 'string', 'Address State': 'string', 'Address ZIP': 'string'},
)

#parse the creation date so that years and months can be extracted later
business_license['License Creation Date'] = pd.to_datetime(business_license['License Creation Date'])

#usecols keeps the file's own column order, so put the columns in the intended order,
#then rename each column from words separated by spaces to words separated by underscores
business_license = business_license.loc[:, LICENSE_COLUMNS].rename(
    columns={
        "License Creation Date": "License_Creation_Date",
        "License Status": "License_Status",
        "Address State": "Address_State",
        "Address ZIP": "Address_ZIP",
    }
)

business_license.dtypes

  business_license = pd.read_csv('data/Legally_Operating_Businesses_copy.csv', na_values = "not available")


License_Creation_Date    datetime64[ns]
License_Status                   string
Industry                         string
Address_State                    string
Address_ZIP                      string
dtype: object

 
Read the business_license pandas dataframe into a spark dataframe. This way we can use PySpark SQL queries to work further on the data.

In [3]:
#import only the type classes the schema needs instead of a star import
from pyspark.sql.types import DateType, StringType, StructField, StructType

#create schema for your dataframe: one nullable field per pandas column, in column order
schema = StructType([
    StructField("License_Creation_Date", DateType(), True),
    StructField("License_Status", StringType(), True),
    StructField("Industry", StringType(), True),
    StructField("Address_State", StringType(), True),
    StructField("Address_ZIP", StringType(), True),
])

#create spark dataframe using schema
business_license_df = spark.createDataFrame(business_license, schema=schema)
business_license_df.show()

+---------------------+--------------+--------------------+----------------+-----------+
|License_Creation_Date|License_Status|            Industry|   Address_State|Address_ZIP|
+---------------------+--------------+--------------------+----------------+-----------+
|           2010-02-04|      Inactive|Home Improvement ...|              NJ|      07726|
|           2022-01-11|        Active|   Sightseeing Guide|              NY|      11214|
|           2017-10-31|        Active|   Sightseeing Guide|              NY|      10028|
|           2022-04-22|        Active|      Pedicab Driver|              NY|      11214|
|           2018-09-11|        Active|   Sightseeing Guide|British Columbia|      V6Z1C|
|           2017-09-01|      Inactive|Home Improvement ...|              PA|      19013|
|           2011-12-21|      Inactive|    Tow Truck Driver|              NY|      10302|
|           2015-04-20|      Inactive|Home Improvement ...|              NY|      11793|
|           2000-05-2

As we can see not all of the licenses are from New York, so I want to remove the entries of the table that are not businesses from NY. Also we are only focused on New York City, so we will only keep entries that are in the five boroughs (Queens, Manhattan/New York, Brooklyn, Bronx, Staten Island).

In [4]:
#keep only the rows whose Address_State is "NY"
is_new_york = business_license_df.Address_State == "NY"
business_license_df = business_license_df.where(is_new_york)
business_license_df.show()

+---------------------+--------------+--------------------+-------------+-----------+
|License_Creation_Date|License_Status|            Industry|Address_State|Address_ZIP|
+---------------------+--------------+--------------------+-------------+-----------+
|           2022-01-11|        Active|   Sightseeing Guide|           NY|      11214|
|           2017-10-31|        Active|   Sightseeing Guide|           NY|      10028|
|           2022-04-22|        Active|      Pedicab Driver|           NY|      11214|
|           2011-12-21|      Inactive|    Tow Truck Driver|           NY|      10302|
|           2015-04-20|      Inactive|Home Improvement ...|           NY|      11793|
|           2000-05-23|        Active|   Sightseeing Guide|           NY|      10023|
|           2015-10-29|      Inactive|    Tow Truck Driver|           NY|      10460|
|           2022-07-15|        Active|       Ticket Seller|           NY|      10454|
|           2014-04-10|      Inactive|          Auctio

Now, we will filter for businesses that have a NYC zip code

In [5]:
#build a {zip code: borough} lookup from the whitespace-separated zipcodes.txt
zips = {}
with open("zipcodes.txt") as zip_file:
    for raw_line in zip_file:
        #tokens alternate: zip, borough, zip, borough, ...
        tokens = raw_line.strip().split()
        pos = 0
        while pos < len(tokens):
            zips[tokens[pos]] = tokens[pos + 1]
            pos += 2

#every zip code present in the lookup
zips_lst = list(zips)

#fetch all zip codes associated with borough
def borough_zips(zips, borough):
    """Return the keys of `zips` whose value equals `borough`, in dict order."""
    matches = []
    for zip_code, name in zips.items():
        if name == borough:
            matches.append(zip_code)
    return matches

#zip codes by borough; Staten Island is looked up under the label "Staten"
(manhattan_zips,
 brooklyn_zips,
 queen_zips,
 bronx_zips,
 statenisland_zips) = [
    borough_zips(zips, label)
    for label in ("Manhattan", "Brooklyn", "Queens", "Bronx", "Staten")
]

In [6]:
#keep only the rows whose ZIP code appears in the zip lookup
in_nyc = business_license_df.Address_ZIP.isin(zips_lst)
business_license_df = business_license_df.where(in_nyc)
business_license_df.show()

+---------------------+--------------+--------------------+-------------+-----------+
|License_Creation_Date|License_Status|            Industry|Address_State|Address_ZIP|
+---------------------+--------------+--------------------+-------------+-----------+
|           2022-01-11|        Active|   Sightseeing Guide|           NY|      11214|
|           2017-10-31|        Active|   Sightseeing Guide|           NY|      10028|
|           2022-04-22|        Active|      Pedicab Driver|           NY|      11214|
|           2011-12-21|      Inactive|    Tow Truck Driver|           NY|      10302|
|           2000-05-23|        Active|   Sightseeing Guide|           NY|      10023|
|           2015-10-29|      Inactive|    Tow Truck Driver|           NY|      10460|
|           2022-07-15|        Active|       Ticket Seller|           NY|      10454|
|           2014-04-10|      Inactive|          Auctioneer|           NY|      11354|
|           2011-03-02|      Inactive|    Tow Truck Dr

We will also label the borough for each license

In [7]:
from pyspark.sql.functions import when

#tag each license with a borough by testing its ZIP against each borough's list in turn
zip_column = business_license_df.Address_ZIP
borough_label = when(zip_column.isin(manhattan_zips), "Manhattan")
borough_label = borough_label.when(zip_column.isin(brooklyn_zips), "Brooklyn")
borough_label = borough_label.when(zip_column.isin(queen_zips), "Queens")
borough_label = borough_label.when(zip_column.isin(bronx_zips), "Bronx")
borough_label = borough_label.when(zip_column.isin(statenisland_zips), "Staten Island")

business_license_df = business_license_df.withColumn("Borough", borough_label)

Now we want to look at the number of licenses created each year. We use a pyspark SQL query to do this. Afterwards, we store the result of the SQL query as a new pyspark dataframe, which we further convert into a new pandas dataframe, so that we can export the table to a csv file.

In [8]:
from pyspark.sql.functions import year

#SQL that counts the licenses created in each year, oldest year first
yearly_counts_sql = (
    "SELECT YEAR(License_Creation_Date) AS Year, COUNT(*) AS Opened_Licenses "
    "FROM license "
    "GROUP BY YEAR(License_Creation_Date) "
    "ORDER BY year"
)

#register the license frame as the "license" view and run the query
business_license_df.createOrReplaceTempView("license")
business_license_by_year_df = spark.sql(yearly_counts_sql)

#convert to pandas for the csv export and put the space back into the column name
business_license_by_year = business_license_by_year_df.toPandas().rename(
    columns={'Opened_Licenses': 'Opened Licenses'}
)

#preview the years after 2010
after_2010 = business_license_by_year['Year'] > 2010
display(business_license_by_year[after_2010])


Unnamed: 0,Year,Opened Licenses
31,2011,11907
32,2012,12351
33,2013,11890
34,2014,10152
35,2015,10525
36,2016,10247
37,2017,12693
38,2018,12352
39,2019,8032
40,2020,3213


In [9]:
#write the yearly counts to csv (to_csv's defaults are kept, so the index is written too)
by_year_csv_path = 'cleaned_data/business_license_by_year_updated.csv'
business_license_by_year.to_csv(by_year_csv_path)

We are doing the same thing here, except now we are grouping by Borough 

In [10]:
#SQL that counts the licenses created per year and borough, oldest year first
borough_counts_sql = (
    "SELECT YEAR(License_Creation_Date) AS Year, Borough, COUNT(*) AS Opened_Licenses "
    "FROM license "
    "GROUP BY YEAR(License_Creation_Date), Borough "
    "ORDER BY year"
)
business_license_by_borough_df = spark.sql(borough_counts_sql)

#convert to pandas and put the space back into the column name
business_license_by_borough = business_license_by_borough_df.toPandas().rename(
    columns={'Opened_Licenses': 'Opened Licenses'}
)

#preview the years after 2018
after_2018 = business_license_by_borough['Year'] > 2018
display(business_license_by_borough[after_2018])

Unnamed: 0,Year,Borough,Opened Licenses
173,2019,Staten Island,464
174,2019,Brooklyn,2312
175,2019,Manhattan,1570
176,2019,Queens,2373
177,2019,Bronx,1313
178,2020,Staten Island,217
179,2020,Brooklyn,986
180,2020,Bronx,453
181,2020,Queens,948
182,2020,Manhattan,609


In [11]:
#write the per-borough counts to csv (to_csv's defaults are kept, so the index is written too)
by_borough_csv_path = 'cleaned_data/business_license_by_borough_updated.csv'
business_license_by_borough.to_csv(by_borough_csv_path)

Once again we repeat the same steps as above, except now we are grouping by industry.

In [12]:
#SQL that counts the licenses created per year and industry, newest year first
industry_counts_sql = (
    "SELECT YEAR(License_Creation_Date) AS Year, Industry, COUNT(*) AS Opened_Licenses "
    "FROM license "
    "GROUP BY YEAR(License_Creation_Date), Industry "
    "ORDER BY year DESC"
)
business_license_by_ind_df = spark.sql(industry_counts_sql)

#convert to pandas and put the space back into the column name
business_license_by_ind = business_license_by_ind_df.toPandas().rename(
    columns={'Opened_Licenses': 'Opened Licenses'}
)

#preview the years after 2019
after_2019 = business_license_by_ind['Year'] > 2019
display(business_license_by_ind[after_2019])

Unnamed: 0,Year,Industry,Opened Licenses
0,2023,General Vendor,20
1,2023,Car Wash,8
2,2023,Locksmith,37
3,2023,Ticket Seller,44
4,2023,Garage,37
...,...,...,...
165,2020,Booting Company,1
166,2020,Tow Truck Exemption,1
167,2020,Games of Chance,3
168,2020,Debt Collection Agency,15


In [13]:
#write the per-industry counts to csv (to_csv's defaults are kept, so the index is written too)
by_industry_csv_path = 'cleaned_data/business_license_by_industry_updated.csv'
business_license_by_ind.to_csv(by_industry_csv_path)

## Data Cleaning: cases-by-day.csv

First we start by reading the 'cases-by-day.csv' file into a pandas dataframe. We are doing this because we want to set the 'date_of_interest' column into a datetime datatype. This is so that later on when we convert it to a spark dataframe and try to perform SQL queries we can access that data by different dates. We then turn the pandas dataframe into a spark dataframe.

In [14]:
#load the daily case counts and parse 'date_of_interest' as datetimes
covid_numbers = pd.read_csv("data/cases-by-day.csv").assign(
    date_of_interest=lambda daily: pd.to_datetime(daily['date_of_interest'])
)

#hand the frame to spark and register it as the "covid" view for sql queries
covid_df = spark.createDataFrame(covid_numbers)
covid_df.createOrReplaceTempView("covid")
covid_df.show()

+-------------------+----------+-------------------+-------------------+-----------------------+-------------+----------------------+----------------------+--------------------------+-------------+----------------------+----------------------+--------------------------+-------------+----------------------+----------------------+--------------------------+-------------+----------------------+----------------------+--------------------------+-------------+----------------------+----------------------+--------------------------+----------+
|   date_of_interest|CASE_COUNT|PROBABLE_CASE_COUNT|CASE_COUNT_7DAY_AVG|ALL_CASE_COUNT_7DAY_AVG|BX_CASE_COUNT|BX_PROBABLE_CASE_COUNT|BX_CASE_COUNT_7DAY_AVG|BX_ALL_CASE_COUNT_7DAY_AVG|BK_CASE_COUNT|BK_PROBABLE_CASE_COUNT|BK_CASE_COUNT_7DAY_AVG|BK_ALL_CASE_COUNT_7DAY_AVG|MN_CASE_COUNT|MN_PROBABLE_CASE_COUNT|MN_CASE_COUNT_7DAY_AVG|MN_ALL_CASE_COUNT_7DAY_AVG|QN_CASE_COUNT|QN_PROBABLE_CASE_COUNT|QN_CASE_COUNT_7DAY_AVG|QN_ALL_CASE_COUNT_7DAY_AVG|SI_CASE_COUNT

The covid dataset is taken per day. However, what we are trying to do is get an average number of cases per month. To do this we perform a SQL query where we group the data by year and then group by month. For each group we will take the average number of cases for that month. 

In [15]:
#NOTE(review): the SQL string below does not use these imported names, and `max` here
#replaces Python's builtin max for the rest of the notebook - confirm they are needed
from pyspark.sql.functions import year, month, max,concat, col

#per calendar month: the mean daily case count and the total case count, in date order
monthly_cases_sql = (
    "SELECT YEAR(date_of_interest) AS year, MONTH(date_of_interest) AS month, "
    "AVG(CASE_COUNT) as avg_per_day, SUM(CASE_COUNT) as total_cases "
    "FROM covid "
    "GROUP BY YEAR(date_of_interest), MONTH(date_of_interest) "
    "ORDER BY year, month"
)
covid_df_total = spark.sql(monthly_cases_sql)
covid_df_total.show()

+----+-----+------------------+-----------+
|year|month|       avg_per_day|total_cases|
+----+-----+------------------+-----------+
|2020|    2|               1.0|          1|
|2020|    3|2102.6451612903224|      65182|
|2020|    4|            3643.2|     109296|
|2020|    5| 916.6451612903226|      28416|
|2020|    6|361.46666666666664|      10844|
|2020|    7|315.93548387096774|       9794|
|2020|    8|240.19354838709677|       7446|
|2020|    9|             369.9|      11097|
|2020|   10| 545.9677419354839|      16925|
|2020|   11|1492.1666666666667|      44765|
|2020|   12|3154.7419354838707|      97797|
|2021|    1| 4471.774193548387|     138625|
|2021|    2|3089.5714285714284|      86508|
|2021|    3| 2969.516129032258|      92055|
|2021|    4|1811.1666666666667|      54335|
|2021|    5| 437.4193548387097|      13560|
|2021|    6|175.43333333333334|       5263|
|2021|    7| 607.1612903225806|      18822|
|2021|    8|            1498.0|      46438|
|2021|    9|            1302.9| 

Another issue we are facing is that the year and month are now in separate columns. The next thing we do is combine the two columns. We start by converting the pyspark dataframe into a pandas dataframe. Next we join the year and month with a hyphen, parse the result as a datetime, and store it as a new column 'date' in the dataframe, so that year 2020 and month 2 become 2020-02-01. Finally, we select only the two cases columns and the newly created 'date' column. The purpose of combining the month and year columns is so that when we want to plot the data we can have both the year and month included in the plot.

In [16]:
#bring the monthly aggregate back into pandas
covid_numbers = covid_df_total.toPandas()

#combine year and month into one datetime column 'date', e.g. 2020 and 2 -> 2020-02-01
year_text = covid_numbers['year'].astype(str)
month_text = covid_numbers['month'].astype(str)
covid_numbers['date'] = pd.to_datetime(year_text + '-' + month_text)

#keep the two case columns and the new 'date' column
covid_numbers = covid_numbers.loc[:, ['avg_per_day', 'total_cases', 'date']]
covid_numbers

Unnamed: 0,avg_per_day,total_cases,date
0,1.0,1,2020-02-01
1,2102.645161,65182,2020-03-01
2,3643.2,109296,2020-04-01
3,916.645161,28416,2020-05-01
4,361.466667,10844,2020-06-01
5,315.935484,9794,2020-07-01
6,240.193548,7446,2020-08-01
7,369.9,11097,2020-09-01
8,545.967742,16925,2020-10-01
9,1492.166667,44765,2020-11-01


In [17]:
#write the monthly covid numbers to csv (to_csv's defaults are kept, so the index is written too)
covid_csv_path = 'cleaned_data/covid_numbers_updated.csv'
covid_numbers.to_csv(covid_csv_path)

## Data Cleaning: savings.csv

I want to compare the covid case data with the amount of personal savings. From the previous step, 'covid_numbers_updated.csv' stores each month as a date such as '2020-02-01'. The personal savings data ('savings.csv') is not laid out this way: the years are in the column headers and the months are in the first row. In order to compare the two datasets, I want to format the dates of the personal savings data to match the updated covid numbers data. I start by reading the data into a pandas dataframe.

In [18]:
import decimal

#load the raw savings table and preview the first rows
savings_csv_path = "data/savings.csv"
income_disposition = pd.read_csv(savings_csv_path)
income_disposition.head()

Unnamed: 0,Line,Unnamed: 1,2017,2017.1,2017.2,2017.3,2017.4,2017.5,2017.6,2017.7,...,2022.5,2022.6,2022.7,2022.8,2022.9,2022.10,2022.11,2023,2023.1,2023.2
0,Line,,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,...,JUN,JUL,AUG,SEP,OCT,NOV,DEC,JAN,FEB,MAR
1,1,Personal income,16489261,16557491,16609059,16667168,16763660,16791030,16847997,16907887,...,21687046,21852289,21975795,22080374,22283009,22375915,22446071,22575592,22649070,22716956
2,2,Compensation of employees,10194656,10231767,10246421,10298027,10329427,10380266,10432173,10467818,...,13468831,13654085,13754853,13856069,13921039,13977288,14019434,14138891,14178693,14224868
3,3,Wages and salaries,8286937,8318008,8328008,8371638,8396316,8438386,8481031,8508186,...,11103991,11271794,11360707,11450576,11506774,11553505,11587745,11693839,11726779,11765591
4,4,Private industries,6958868,6986215,6993336,7035182,7056407,7093815,7130552,7156181,...,9500072,9656398,9735876,9819494,9872200,9908691,9937328,10033610,10059718,10091831


As we can see, the first row of the table lists the months:"JAN","FEB",etc. We also notice that the labels of the years already indicate the month with a number. Therefore, we do not really need the row with the months so we can delete that. We also don't need the first column of the dataset with the line count. We also notice that there is a row in the data that has empty values. We can either delete it or fill it in with 0's, in this case we fill it in with 0's. Also we want to make sure the data is the correct datatype, so we change every column to a float datatype. 

In [19]:
#drop the 'Line' column, skip the first row (the month names) and fill empty cells with 0
income_disposition = (
    income_disposition
    .drop(columns=['Line'])
    .iloc[1:, :]
    .fillna(0)
)

#every column after the first (the label column) is converted to float
value_columns = income_disposition.columns[1:]
income_disposition = income_disposition.astype({name: "float" for name in value_columns})

income_disposition.head()

Unnamed: 0,Unnamed: 1,2017,2017.1,2017.2,2017.3,2017.4,2017.5,2017.6,2017.7,2017.8,...,2022.5,2022.6,2022.7,2022.8,2022.9,2022.10,2022.11,2023,2023.1,2023.2
1,Personal income,16489261.0,16557491.0,16609059.0,16667168.0,16763660.0,16791030.0,16847997.0,16907887.0,17008347.0,...,21687046.0,21852289.0,21975795.0,22080374.0,22283009.0,22375915.0,22446071.0,22575592.0,22649070.0,22716956.0
2,Compensation of employees,10194656.0,10231767.0,10246421.0,10298027.0,10329427.0,10380266.0,10432173.0,10467818.0,10535107.0,...,13468831.0,13654085.0,13754853.0,13856069.0,13921039.0,13977288.0,14019434.0,14138891.0,14178693.0,14224868.0
3,Wages and salaries,8286937.0,8318008.0,8328008.0,8371638.0,8396316.0,8438386.0,8481031.0,8508186.0,8565557.0,...,11103991.0,11271794.0,11360707.0,11450576.0,11506774.0,11553505.0,11587745.0,11693839.0,11726779.0,11765591.0
4,Private industries,6958868.0,6986215.0,6993336.0,7035182.0,7056407.0,7093815.0,7130552.0,7156181.0,7208255.0,...,9500072.0,9656398.0,9735876.0,9819494.0,9872200.0,9908691.0,9937328.0,10033610.0,10059718.0,10091831.0
5,Government,1328069.0,1331793.0,1334672.0,1336457.0,1339910.0,1344571.0,1350479.0,1352005.0,1357303.0,...,1603920.0,1615396.0,1624831.0,1631082.0,1634574.0,1644813.0,1650417.0,1660229.0,1667061.0,1673760.0


Another thing we notice is that the column labels do not quite match the dates in our covid data. In our covid data, each month is represented in a datetime format (for example 2020-02-01), while this dataset labels it as "2017.1". We will amend this by converting the column labels into the same date format.

In [20]:
def _column_to_date(label):
    """Turn a column label such as "2017" or "2017.3" into a "YYYY-MM-01" string."""
    parts = label.split(".")
    #a bare year gets month 1; otherwise the number after the last "." plus 1 is the month
    month_number = 1 if len(parts) == 1 else int(parts[-1]) + 1
    #months below 10 are zero-padded to two digits
    return f"{parts[0]}-{month_number:02d}-01"

#the first column becomes "type"; every other column label becomes a date string
new_cols = ["type"] + [_column_to_date(label) for label in income_disposition.columns[1:]]

print(new_cols)

['type', '2017-01-01', '2017-02-01', '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01', '2017-07-01', '2017-08-01', '2017-09-01', '2017-10-01', '2017-11-01', '2017-12-01', '2018-01-01', '2018-02-01', '2018-03-01', '2018-04-01', '2018-05-01', '2018-06-01', '2018-07-01', '2018-08-01', '2018-09-01', '2018-10-01', '2018-11-01', '2018-12-01', '2019-01-01', '2019-02-01', '2019-03-01', '2019-04-01', '2019-05-01', '2019-06-01', '2019-07-01', '2019-08-01', '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01', '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01', '2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01', '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01', '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01', '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01', '2021-11-01', '2021-12-01', '2022-01-01', '2022-02-01', '2022-03-01', '2022-04-01', '2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01

In [21]:
#replace the column labels with the names collected in new_cols, then preview the result
income_disposition.columns = new_cols
income_disposition.head()

Unnamed: 0,type,2017-01-01,2017-02-01,2017-03-01,2017-04-01,2017-05-01,2017-06-01,2017-07-01,2017-08-01,2017-09-01,...,2022-06-01,2022-07-01,2022-08-01,2022-09-01,2022-10-01,2022-11-01,2022-12-01,2023-01-01,2023-02-01,2023-03-01
1,Personal income,16489261.0,16557491.0,16609059.0,16667168.0,16763660.0,16791030.0,16847997.0,16907887.0,17008347.0,...,21687046.0,21852289.0,21975795.0,22080374.0,22283009.0,22375915.0,22446071.0,22575592.0,22649070.0,22716956.0
2,Compensation of employees,10194656.0,10231767.0,10246421.0,10298027.0,10329427.0,10380266.0,10432173.0,10467818.0,10535107.0,...,13468831.0,13654085.0,13754853.0,13856069.0,13921039.0,13977288.0,14019434.0,14138891.0,14178693.0,14224868.0
3,Wages and salaries,8286937.0,8318008.0,8328008.0,8371638.0,8396316.0,8438386.0,8481031.0,8508186.0,8565557.0,...,11103991.0,11271794.0,11360707.0,11450576.0,11506774.0,11553505.0,11587745.0,11693839.0,11726779.0,11765591.0
4,Private industries,6958868.0,6986215.0,6993336.0,7035182.0,7056407.0,7093815.0,7130552.0,7156181.0,7208255.0,...,9500072.0,9656398.0,9735876.0,9819494.0,9872200.0,9908691.0,9937328.0,10033610.0,10059718.0,10091831.0
5,Government,1328069.0,1331793.0,1334672.0,1336457.0,1339910.0,1344571.0,1350479.0,1352005.0,1357303.0,...,1603920.0,1615396.0,1624831.0,1631082.0,1634574.0,1644813.0,1650417.0,1660229.0,1667061.0,1673760.0


In [22]:
#write the cleaned savings table to csv (to_csv's defaults are kept, so the index is written too)
savings_out_path = 'cleaned_data/savings_updated.csv'
income_disposition.to_csv(savings_out_path)

## Data Cleaning: us_small_bus.csv

Here is a simple data set about the number of US small businesses over the years. It is a small dataset, that does not require much cleaning. However, there are small things that need to be fixed. We start by reading the dataset into a pandas dataframe. 

In [23]:
#load the raw US small-business table and preview the first rows
us_small_bus_path = "data/us_small_bus.csv"
US_business_df = pd.read_csv(us_small_bus_path)
US_business_df.head()

Unnamed: 0,Line,Unnamed: 1,2014,2015,2016,2017,2018,2019,2020,2021
0,,,,,,,,,,
1,1.0,Self-employed persons1,9358.0,9508.0,9604.0,9525.0,9707.0,9539.0,9253.0,9956.0
2,2.0,"Agriculture, forestry, fishing, and hunting",757.0,843.0,852.0,790.0,766.0,741.0,741.0,737.0
3,3.0,Farms2,680.0,760.0,771.0,711.0,684.0,664.0,669.0,669.0
4,4.0,"Forestry, fishing and related activities",77.0,83.0,81.0,79.0,82.0,77.0,72.0,68.0


As we can see, the first column 'Line' is unnecessary, so we can drop it. There are also rows that are entirely empty. These are unnecessary as well, so we can drop those rows too. We can also see that the second column is named 'Unnamed: 1', which is not very informative, so we rename it to 'industry'. Throughout the dataset, there are some strangely named entries such as "Farms2". Because the dataset is small, we can change the name of each of these entries individually. One last thing that is easy to overlook is that each entry in the 'Unnamed: 1' column (renamed to 'industry') has leading white space. This would cause unnecessary issues later, so we remove the leading white space from each entry.

In [24]:
#drop the rows with index labels 0 and 6 (the narrative treats them as empty rows;
#row 0 is empty in the preview of the raw table)
US_business_df = US_business_df.drop([0, 6])

#drop column 'Line' because it is not necessary
US_business_df = US_business_df.drop(columns=['Line'])

#overwrite three labels by position; positions are counted after the row drops above
#(the first two remove the trailing digit from "Self-employed persons1" and "Farms2")
US_business_df.iloc[0,0] = "Self-employed persons"
US_business_df.iloc[2,0] = "Farms"
US_business_df.iloc[14,0] = "Professional and business services"

#rename first column
US_business_df = US_business_df.rename(columns={'Unnamed: 1': 'industry'})

#remove surrounding white space from every industry label in one vectorized step;
#unlike calling .strip() on each cell, this does not raise on a missing value
US_business_df['industry'] = US_business_df['industry'].str.strip()

US_business_df.head()

Unnamed: 0,industry,2014,2015,2016,2017,2018,2019,2020,2021
1,Self-employed persons,9358.0,9508.0,9604.0,9525.0,9707.0,9539.0,9253.0,9956.0
2,"Agriculture, forestry, fishing, and hunting",757.0,843.0,852.0,790.0,766.0,741.0,741.0,737.0
3,Farms,680.0,760.0,771.0,711.0,684.0,664.0,669.0,669.0
4,"Forestry, fishing and related activities",77.0,83.0,81.0,79.0,82.0,77.0,72.0,68.0
5,Mining,20.0,20.0,17.0,11.0,9.0,14.0,10.0,13.0


In [25]:
#write the cleaned small-business table to csv (to_csv's defaults are kept, so the index is written too)
us_small_bus_out_path = 'cleaned_data/us_small_bus_updated.csv'
US_business_df.to_csv(us_small_bus_out_path)