In [1]:
import pandas as pd

In [2]:
market_df = pd.read_csv('data/farmers_markets_from_usda.csv')
county_df = pd.read_csv('data/county_info.csv')

# Market DF

In [3]:
market_df.head()

Unnamed: 0,FMID,MarketName,Website,Facebook,Twitter,Youtube,OtherMedia,street,city,County,...,Coffee,Beans,Fruits,Grains,Juices,Mushrooms,PetFood,Tofu,WildHarvested,updateTime
0,1018261,Caledonia Farmers Market Association - Danville,https://sites.google.com/site/caledoniafarmers...,https://www.facebook.com/Danville.VT.Farmers.M...,,,,,Danville,Caledonia,...,Y,Y,Y,N,N,Y,Y,N,N,6/20/2017 22:43
1,1018318,Stearns Homestead Farmers' Market,http://www.StearnsHomestead.com,StearnsHomesteadFarmersMarket,,,,6975 Ridge Road,Parma,Cuyahoga,...,N,N,Y,N,N,N,N,N,N,6/21/2017 17:15
2,1009364,106 S. Main Street Farmers Market,http://thetownofsixmile.wordpress.com/,,,,,106 S. Main Street,Six Mile,Pickens,...,,,,,,,,,,2013
3,1010691,10th Steet Community Farmers Market,,,,,http://agrimissouri.com/mo-grown/grodetail.php...,10th Street and Poplar,Lamar,Barton,...,N,N,Y,N,N,N,N,N,N,10/28/2014 9:49
4,1002454,112st Madison Avenue,,,,,,112th Madison Avenue,New York,New York,...,N,N,N,N,N,N,N,N,N,3/1/2012 10:38


In [4]:
market_df.columns

Index(['FMID', 'MarketName', 'Website', 'Facebook', 'Twitter', 'Youtube',
       'OtherMedia', 'street', 'city', 'County', 'State', 'zip', 'Season1Date',
       'Season1Time', 'Season2Date', 'Season2Time', 'Season3Date',
       'Season3Time', 'Season4Date', 'Season4Time', 'x', 'y', 'Location',
       'Credit', 'WIC', 'WICcash', 'SFMNP', 'SNAP', 'Organic', 'Bakedgoods',
       'Cheese', 'Crafts', 'Flowers', 'Eggs', 'Seafood', 'Herbs', 'Vegetables',
       'Honey', 'Jams', 'Maple', 'Meat', 'Nursery', 'Nuts', 'Plants',
       'Poultry', 'Prepared', 'Soap', 'Trees', 'Wine', 'Coffee', 'Beans',
       'Fruits', 'Grains', 'Juices', 'Mushrooms', 'PetFood', 'Tofu',
       'WildHarvested', 'updateTime'],
      dtype='object')

In [24]:
random_cols = ['FMID', 'updateTime']

In [85]:
# Remove the columns listed in random_cols. `columns=` already targets the
# column axis, so the redundant `axis=1` is omitted, and the result is
# reassigned rather than mutating the frame with inplace=True.
market_df = market_df.drop(columns=random_cols)

## Investigate time columns

In [5]:
time_cols = ['Season1Date', 'Season1Time', 'Season2Date', 'Season2Time', 
             'Season3Date', 'Season3Time', 'Season4Date', 'Season4Time']

In [6]:
time_df = market_df.copy()

In [7]:
time_df = time_df[time_cols]
time_df

Unnamed: 0,Season1Date,Season1Time,Season2Date,Season2Time,Season3Date,Season3Time,Season4Date,Season4Time
0,06/14/2017 to 08/30/2017,Wed: 9:00 AM-1:00 PM;,09/06/2017 to 10/18/2017,Wed: 2:00 PM-6:00 PM;,,,,
1,06/24/2017 to 09/30/2017,Sat: 9:00 AM-1:00 PM;,,,,,,
2,,,,,,,,
3,04/02/2014 to 11/30/2014,Wed: 3:00 PM-6:00 PM;Sat: 8:00 AM-1:00 PM;,,,,,,
4,July to November,Tue:8:00 am - 5:00 pm;Sat:8:00 am - 8:00 pm;,,,,,,
...,...,...,...,...,...,...,...,...
8799,07/04/2014 to 10/24/2014,Fri: 4:00 PM-7:00 PM;,,,,,,
8800,06/06/2017 to 10/03/2017,Tue: 2:30 PM-6:00 PM;,,,,,,
8801,05/07/2016 to 10/15/2016,Sat: 9:00 AM-12:00 PM;,,,,,,
8802,,,,,,,,


In [8]:
time_df.isna().sum()

Season1Date    3107
Season1Time    2940
Season2Date    8339
Season2Time    8348
Season3Date    8726
Season3Time    8729
Season4Date    8799
Season4Time    8799
dtype: int64

### Drop time columns

In [9]:
# Remove the Season*Date / Season*Time columns listed in time_cols.
# `columns=` already targets the column axis, so the redundant `axis=1` is
# omitted, and the result is reassigned rather than mutated with inplace=True.
market_df = market_df.drop(columns=time_cols)

## Investigate Social Media

In [10]:
social_cols = ['Website', 'Facebook', 'Twitter', 'Youtube', 'OtherMedia']

In [11]:
social_df = market_df.copy()

In [12]:
social_df = social_df[social_cols]
social_df.head()

Unnamed: 0,Website,Facebook,Twitter,Youtube,OtherMedia
0,https://sites.google.com/site/caledoniafarmers...,https://www.facebook.com/Danville.VT.Farmers.M...,,,
1,http://www.StearnsHomestead.com,StearnsHomesteadFarmersMarket,,,
2,http://thetownofsixmile.wordpress.com/,,,,
3,,,,,http://agrimissouri.com/mo-grown/grodetail.php...
4,,,,,


In [13]:
social_df.isna().sum()

Website       3513
Facebook      4614
Twitter       7778
Youtube       8636
OtherMedia    7933
dtype: int64

### Drop all social; create boolean feature for website

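- Website values are Python `str` objects
- Missing values (NaN) are `float` objects, so checking each value's type for `str` separates markets with a website from those without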

In [14]:
def new_col_boolean(dfName, colName, valType):
    """Build a 0/1 indicator list from the exact runtime type of each value.

    Parameters
    ----------
    dfName : container indexed by column name (e.g. a DataFrame)
    colName : key of the column whose values are scanned
    valType : type that each value's type is compared against

    Returns
    -------
    list of int
        1 where ``type(value) == valType`` and 0 otherwise, in iteration
        order. The comparison is on the exact type, so instances of a
        subclass of ``valType`` yield 0.
    """
    return [1 if type(entry) == valType else 0 for entry in dfName[colName]]


In [15]:
market_df['has_website'] = new_col_boolean(market_df, 'Website', str)

In [16]:
cols = ['Website', 'has_website']
market_df[cols].head()

Unnamed: 0,Website,has_website
0,https://sites.google.com/site/caledoniafarmers...,1
1,http://www.StearnsHomestead.com,1
2,http://thetownofsixmile.wordpress.com/,1
3,,0
4,,0


In [17]:
market_df['has_website'].value_counts()

1    5291
0    3513
Name: has_website, dtype: int64

In [18]:
# Remove the raw social-media columns listed in social_cols.
# `columns=` already targets the column axis, so the redundant `axis=1` is
# omitted, and the result is reassigned rather than mutated with inplace=True.
market_df = market_df.drop(columns=social_cols)
market_df.head()

Unnamed: 0,FMID,MarketName,street,city,County,State,zip,x,y,Location,...,Beans,Fruits,Grains,Juices,Mushrooms,PetFood,Tofu,WildHarvested,updateTime,has_website
0,1018261,Caledonia Farmers Market Association - Danville,,Danville,Caledonia,Vermont,5828.0,-72.140335,44.411037,,...,Y,Y,N,N,Y,Y,N,N,6/20/2017 22:43,1
1,1018318,Stearns Homestead Farmers' Market,6975 Ridge Road,Parma,Cuyahoga,Ohio,,-81.73394,41.374802,,...,N,Y,N,N,N,N,N,N,6/21/2017 17:15,1
2,1009364,106 S. Main Street Farmers Market,106 S. Main Street,Six Mile,Pickens,South Carolina,29682.0,-82.818703,34.804199,,...,,,,,,,,,2013,1
3,1010691,10th Steet Community Farmers Market,10th Street and Poplar,Lamar,Barton,Missouri,64759.0,-94.27462,37.495628,,...,N,Y,N,N,N,N,N,N,10/28/2014 9:49,0
4,1002454,112st Madison Avenue,112th Madison Avenue,New York,New York,New York,10029.0,-73.949303,40.7939,Private business parking lot,...,N,N,N,N,N,N,N,N,3/1/2012 10:38,0


## Investigate Location cols

In [19]:
location_cols = ['street', 'city', 'County', 'State', 'zip', 'x', 'y', 'Location']

In [20]:
location_df = market_df.copy()

In [21]:
location_df = location_df[location_cols]
location_df.head()

Unnamed: 0,street,city,County,State,zip,x,y,Location
0,,Danville,Caledonia,Vermont,5828.0,-72.140335,44.411037,
1,6975 Ridge Road,Parma,Cuyahoga,Ohio,,-81.73394,41.374802,
2,106 S. Main Street,Six Mile,Pickens,South Carolina,29682.0,-82.818703,34.804199,
3,10th Street and Poplar,Lamar,Barton,Missouri,64759.0,-94.27462,37.495628,
4,112th Madison Avenue,New York,New York,New York,10029.0,-73.949303,40.7939,Private business parking lot


In [22]:
location_df.isna().sum()

street       287
city          40
County        36
State          0
zip          948
x             28
y             28
Location    6290
dtype: int64

### Drop Location column

In [23]:
# Remove the free-text 'Location' column. `columns=` already targets the
# column axis, so the redundant `axis=1` is omitted, and the result is
# reassigned rather than mutated with inplace=True.
market_df = market_df.drop(columns='Location')
market_df.head()

Unnamed: 0,FMID,MarketName,street,city,County,State,zip,x,y,Credit,...,Beans,Fruits,Grains,Juices,Mushrooms,PetFood,Tofu,WildHarvested,updateTime,has_website
0,1018261,Caledonia Farmers Market Association - Danville,,Danville,Caledonia,Vermont,5828.0,-72.140335,44.411037,Y,...,Y,Y,N,N,Y,Y,N,N,6/20/2017 22:43,1
1,1018318,Stearns Homestead Farmers' Market,6975 Ridge Road,Parma,Cuyahoga,Ohio,,-81.73394,41.374802,Y,...,N,Y,N,N,N,N,N,N,6/21/2017 17:15,1
2,1009364,106 S. Main Street Farmers Market,106 S. Main Street,Six Mile,Pickens,South Carolina,29682.0,-82.818703,34.804199,Y,...,,,,,,,,,2013,1
3,1010691,10th Steet Community Farmers Market,10th Street and Poplar,Lamar,Barton,Missouri,64759.0,-94.27462,37.495628,Y,...,N,Y,N,N,N,N,N,N,10/28/2014 9:49,0
4,1002454,112st Madison Avenue,112th Madison Avenue,New York,New York,New York,10029.0,-73.949303,40.7939,N,...,N,N,N,N,N,N,N,N,3/1/2012 10:38,0


## Binary Product Columns

In [56]:
product_cols = ['Organic', 'Bakedgoods', 'Cheese', 'Crafts', 'Flowers', 'Eggs', 
                'Seafood', 'Herbs', 'Vegetables', 'Honey', 'Jams', 'Maple', 'Meat', 
                'Nursery', 'Nuts', 'Plants', 'Poultry', 'Prepared', 'Soap', 'Trees', 
                'Wine', 'Coffee', 'Beans', 'Fruits', 'Grains', 'Juices', 
                'Mushrooms', 'PetFood', 'Tofu', 'WildHarvested']

In [57]:
# Encode the 'Y'/'N' product flags as 1/0. Any other value in these columns
# is left untouched by replace().
for product in product_cols:
    # Skip a column that is absent from the frame explicitly, instead of
    # hiding every possible failure behind a bare `except` (which would also
    # swallow KeyboardInterrupt and genuine bugs).
    if product not in market_df.columns:
        continue
    market_df[product] = market_df[product].replace(to_replace=['Y', 'N'], value=[1, 0])

In [109]:
market_df.head()

Unnamed: 0,MarketName,street,city,County,State,zip,x,y,Credit,WIC,...,Beans,Fruits,Grains,Juices,Mushrooms,PetFood,Tofu,WildHarvested,updateTime,has_website
0,Caledonia Farmers Market Association - Danville,,Danville,Caledonia,Vermont,5828.0,-72.140335,44.411037,Y,Y,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,6/20/2017 22:43,1
1,Stearns Homestead Farmers' Market,6975 Ridge Road,Parma,Cuyahoga,Ohio,,-81.73394,41.374802,Y,N,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6/21/2017 17:15,1
2,106 S. Main Street Farmers Market,106 S. Main Street,Six Mile,Pickens,South Carolina,29682.0,-82.818703,34.804199,Y,N,...,,,,,,,,,2013,1
3,10th Steet Community Farmers Market,10th Street and Poplar,Lamar,Barton,Missouri,64759.0,-94.27462,37.495628,Y,N,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,10/28/2014 9:49,0
4,112st Madison Avenue,112th Madison Avenue,New York,New York,New York,10029.0,-73.949303,40.7939,N,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3/1/2012 10:38,0


# Categories

In [31]:
# Maps each product category name to the list of market_df product columns
# grouped under it. The seven lists together hold 30 column names, and each
# entry of product_cols appears in exactly one of them.
categories = {'Produce': ['Vegetables', 'Fruits', 'Mushrooms', 'WildHarvested', 'Beans', 'Grains', 'Herbs', 'Organic', 'Nuts'],
              'Confectionary': ['Bakedgoods', 'Honey', 'Jams', 'Maple', 'Juices'],
              'Household': ['Crafts', 'Soap', 'PetFood'],
              'Dairy': ['Cheese', 'Eggs', 'Tofu'],
              'Plants': ['Flowers', 'Nursery', 'Plants', 'Trees'],
              'Meat': ['Seafood','Meat', 'Poultry'],
              'Other': ['Wine', 'Prepared', 'Coffee']}

In [48]:
def categorize_products():
    """Call iterate_product_list on every category's product list.

    Reads the module-level ``categories`` mapping, passes each of its value
    lists to ``iterate_product_list`` in turn, and prints whatever that call
    returns. Takes no arguments and returns None.
    """
    for product_names in categories.values():
        result = iterate_product_list(product_names)
        print(result)
        

In [69]:
def iterate_product_list(this_list):
    """Print successively longer leading slices of the global ``market_df``.

    Slices of 1, 2, ... up to ``len(market_df) - 1`` rows are printed in
    turn. Returns None.

    NOTE(review): ``this_list`` is never read in the body — presumably it is
    still to be wired in; confirm the intended use before relying on this.
    """
    for stop in range(1, len(market_df)):
        print(market_df[:stop])

In [70]:
categorize_products()

['Vegetables', 'Fruits', 'Mushrooms', 'WildHarvested', 'Beans', 'Grains', 'Herbs', 'Organic', 'Nuts']
['Bakedgoods', 'Honey', 'Jams', 'Maple', 'Juices']
['Crafts', 'Soap', 'PetFood']
['Cheese', 'Eggs', 'Tofu']
['Flowers', 'Nursery', 'Plants', 'Trees']
['Seafood', 'Meat', 'Poultry']
['Wine', 'Prepared', 'Coffee']


In [105]:
this_list = categories['Produce']
print(this_list)

['Vegetables', 'Fruits', 'Mushrooms', 'WildHarvested', 'Beans', 'Grains', 'Herbs', 'Organic', 'Nuts']


iterate_product_list(this_list)

In [107]:
produce_df = market_df[this_list]

In [112]:
produce_df[:1]

Unnamed: 0,Vegetables,Fruits,Mushrooms,WildHarvested,Beans,Grains,Herbs,Organic,Nuts
0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1,0.0
