In [1]:
import sqlite3
import os
import pandas as pd

In [2]:
%reload_ext sql
# %reload_ext lib.sqlite.sqlite_evaluate_magic

In [3]:
%%bash
pwd
ls ../CleanedData

/Users/jchang16/UIUC/cs513-data-cleaning/final_project/SQL
farmersmarkets_location.csv
farmersmarkets_output.csv
farmersmarkets_payments.csv
farmersmarkets_products.csv


In [4]:
df = pd.read_csv("../CleanedData/farmersmarkets_location.csv")

In [5]:
df.shape

(8686, 10)

## Commands for sqllite

In [6]:
%%bash
sqlite3 ./farmersmarkets.db

.header on
.mode csv


CREATE TABLE location(
	"FMID"			INT PRIMARY KEY NOT NULL,
	"MarketName" 	TEXT,
	"street"		TEXT,
	"City"			TEXT,
	"County"		TEXT,
	"State"			TEXT,
	"zip"			TEXT,
	"latitude"		REAL,
	"longitude"		REAL,
	"updateTime"	DATETIME
);
.import /Users/jchang16/UIUC/cs513-data-cleaning/final_project/CleanedData/farmersmarkets_location.csv location

CREATE TABLE payments(
	"FMID"			INT PRIMARY KEY NOT NULL,
	"Credit"		CHAR(1),
	"WIC"			CHAR(1),
	"WICcash"		CHAR(1),
	"SFMNP"			CHAR(1),
	"SNAP"			CHAR(1)
);
.import /Users/jchang16/UIUC/cs513-data-cleaning/final_project/CleanedData/farmersmarkets_payments.csv payments

CREATE TABLE products(
	"FMID"			INT PRIMARY KEY NOT NULL,
	"Organic"		CHAR(1),
	"Bakedgoods"	CHAR(1),
	"Cheese"		CHAR(1),
	"Crafts"		CHAR(1),
	"Flowers"		CHAR(1),
	"Eggs"			CHAR(1),
	"Seafood"		CHAR(1),
	"Herbs"			CHAR(1),
	"Vegetables"	CHAR(1),
	"Honey"			CHAR(1),
	"Jams"			CHAR(1),
	"Maple"			CHAR(1),
	"Meat"			CHAR(1),
	"Nursery"		CHAR(1),
	"Nuts"			CHAR(1),
	"Plants"		CHAR(1),
	"Poultry"		CHAR(1),
	"Prepared"		CHAR(1),
	"Soap"			CHAR(1),
	"Trees"			CHAR(1),
	"Wine"			CHAR(1),
	"Coffee"		CHAR(1),
	"Beans"			CHAR(1),
	"Fruits"		CHAR(1),
	"Grains"		CHAR(1),
	"Juices"		CHAR(1),
	"Mushrooms"		CHAR(1),
	"PetFood"		CHAR(1),
	"Tofu"			CHAR(1),
	"WildHarvested"	CHAR(1)
);
.import /Users/jchang16/UIUC/cs513-data-cleaning/final_project/CleanedData/farmersmarkets_products.csv products

.output ./farmersmarkets.sql
.dump

## Connect to database

In [7]:
# Connect to database. Following command will connect you to the database.
# Any query that you will run after this cell will be run on the `surverys` database.
# You can always change the path in database url and connect to some other sqlite db of your choice.
survey_db_url = 'sqlite:///' + os.path.expanduser('./farmersmarkets.db')
%sql $survey_db_url

'Connected: @./farmersmarkets.db'

In [8]:
%%sql
SELECT * FROM location LIMIT 5;

 * sqlite:///./farmersmarkets.db
Done.


FMID,MarketName,street,City,County,State,zip,latitude,longitude,updateTime
1018261,Caledonia Farmers Market Association - Danville,,DANVILLE,CALEDONIA,VERMONT,5828.0,-72.140337,44.411036,2018-08-06T10:43:57Z
1018318,Stearns Homestead Farmers' Market,6975 RIDGE ROAD,PARMA,CUYAHOGA,OHIO,,-81.7339387,41.3748009,2018-09-06T05:15:01Z
1009364,106 S. Main Street Farmers Market,106 S MAIN STREET,SIX MILE,,SOUTH CAROLINA,29682.0,-82.8187,34.8042,2013-01-01T05:00:00Z
1010691,10th Steet Community Farmers Market,10TH STREET AND POPLAR,LAMAR,BARTON,MISSOURI,64759.0,-94.2746191,37.495628,2016-04-10T09:49:46Z
1002454,112st Madison Avenue,112TH MADISON AVENUE,NEW YORK,NEW YORK,NEW YORK,10029.0,-73.9493,40.7939,2012-01-03T10:38:22Z


## Integrity Constraints

### 1. Check that FMID is an appropriate primary key: non-null and unique
Looks good.

In [9]:
%%sql
SELECT * FROM location where FMID IS NULL OR FMID = '';
SELECT COUNT(distinct FMID) from location;

 * sqlite:///./farmersmarkets.db
Done.
Done.


COUNT(distinct FMID)
8687


### 2. Ensure that data for my use case is non-null. Turns out that we have 29 rows where latitude and longitude information are missing.
* latitude and longitude
* state
* credit

In [10]:
%%sql
SELECT *
FROM location loc
WHERE loc.latitude IS NULL 
        OR loc.latitude = ''
        OR loc.longitude IS NULL
        OR loc.longitude = ''
        LIMIT 5;

 * sqlite:///./farmersmarkets.db
Done.


FMID,MarketName,street,City,County,State,zip,latitude,longitude,updateTime
2000001,Center for Design Practice - Mobile Farmers Market,,,,MARYLAND,,,,2013-01-01T05:00:00Z
1011689,Charlotte Regional Farmers Market,1801 YORKMONT ROAD,CHARLOTTE,MECKLENBURG,NORTH CAROLINA,28217.0,,,2015-11-09T01:23:36Z
2000002,Dig It!,,,,PENNSYLVANIA,,,,2013-01-01T05:00:00Z
1002854,East Goshen Farmers Market,EAST GOSHEN PARK - PAOLI PIKE EAST OF ROUTE 352,WEST CHESTER,CHESTER,PENNSYLVANIA,19380.0,,,2014-06-05T03:45:15Z
2000004,Farm a la Carte,,,,GEORGIA,,,,2013-01-01T05:00:00Z


In [11]:
%%sql
SELECT COUNT(*)
FROM location loc
WHERE loc.latitude IS NULL 
        OR loc.latitude = ''
        OR loc.longitude IS NULL
        OR loc.longitude = '';

 * sqlite:///./farmersmarkets.db
Done.


COUNT(*)
29


In [12]:
%%sql
SELECT *
FROM location loc
WHERE loc.State IS NULL OR loc.State = '';

 * sqlite:///./farmersmarkets.db
Done.


FMID,MarketName,street,City,County,State,zip,latitude,longitude,updateTime


In [13]:
%%sql
SELECT *
FROM Payments pay
WHERE pay.Credit IS NULL OR pay.Credit = '';

 * sqlite:///./farmersmarkets.db
Done.


FMID,Credit,WIC,WICcash,SFMNP,SNAP


### 3. Check that latitude must be in [0,90] and longitude should be [-180, 180]
Looks good.

In [14]:
%%sql
SELECT loc.FMID,
       loc.MarketName, 
       loc.latitude, 
       loc.longitude 
FROM location loc
WHERE  ( Cast(loc.latitude AS FLOAT) > 180 
          OR Cast(loc.latitude AS FLOAT) <- 180 
          OR Cast(loc.longitude AS FLOAT) < 0 
          OR Cast(loc.longitude AS FLOAT) > 90 ) ; 

 * sqlite:///./farmersmarkets.db
Done.


FMID,MarketName,latitude,longitude


### 4. Every FMID has single address (street, City, County, State, zip) if it exists
Looks good.

In [15]:
%%sql
SELECT
    FMID, street, City, County, State, zip, count(*)
FROM
    location
GROUP BY
    FMID, street, City, County, State, zip
HAVING 
    COUNT(*) > 1

 * sqlite:///./farmersmarkets.db
Done.


FMID,street,City,County,State,zip,count(*)
