### Casually exploring the data and building out formulas

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from scipy import stats
import time

# set seaborn settings
# Apply seaborn's default styling to every matplotlib figure drawn in this notebook.
sns.set()
# Force an edge colour on patch artists (bars, histogram bins) so adjacent
# patches stay visually separated under the seaborn theme.
plt.rcParams["patch.force_edgecolor"] = True # set lines

import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides pandas dtype/deprecation warnings raised while
# loading the CSVs -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding one sub-folder per month ("01" .. "12") of 2016 trip CSVs.
# Forward slashes are accepted on Windows as well as POSIX, and match the
# output paths used when the seasonal CSVs are written.
root = "data/2016/"

# Month sub-folder names grouped by season.
# NOTE(review): November ("11") is grouped with winter, so fall covers only
# two months -- confirm this split is intentional.
winterMonths = ["01", "02", "11", "12"]
springMonths = ["03", "04", "05"]
summerMonths = ["06", "07", "08"]
fallMonths = ["09", "10"]

In [3]:
def interestedColumns(df):
    """Return only the columns of *df* that the analysis uses.

    The columns come back in the fixed order listed below. Indexing raises
    ``KeyError`` if *df* lacks any of them.
    """
    keep = [
        "taxi_id",
        "tolls",
        "trip_start_timestamp",
        "trip_end_timestamp",
        "company",
        "dropoff_centroid_longitude",
        "dropoff_centroid_latitude",
        "fare",
        "payment_type",
        "pickup_centroid_latitude",
        "pickup_centroid_longitude",
        "dropoff_community_area",
        "pickup_community_area",
        "tips",
        "trip_miles",
        "trip_seconds",
        "trip_total",
    ]
    return df[keep]

In [4]:
def combineMonths(months):
    """Load every CSV for the given months into a single DataFrame.

    For each entry of *months*, the directory ``join(root, month)`` is
    scanned for regular files. Each file is read with ``pd.read_csv`` and
    reduced with ``interestedColumns``. Progress is printed along the way.

    Parameters
    ----------
    months : iterable of str
        Month sub-folder names under ``root`` (e.g. ``["01", "02"]``).

    Returns
    -------
    pandas.DataFrame
        All rows from all files, stacked in month order and then file-name
        order. The per-file row index is kept as read (not reset). An empty
        DataFrame is returned when no files were found.
    """
    # Collect the pieces and concatenate once at the end: calling pd.concat
    # inside the loop re-copies every previously loaded row for each file.
    frames = []

    for month in months:
        print("Working on", month)
        monthDir = join(root, month)
        # listdir order is arbitrary; sort so the row order is reproducible.
        filesList = sorted(f for f in listdir(monthDir) if isfile(join(monthDir, f)))
        print("-- Found", len(filesList), "files")

        for file in filesList:
            print("----- Working on", file)
            filePath = join(monthDir, file)
            frames.append(interestedColumns(pd.read_csv(filePath)))

        print("")

    # pd.concat raises ValueError on an empty list, so keep the original
    # "empty DataFrame" result when nothing was loaded.
    df = pd.concat(frames) if frames else pd.DataFrame()

    print("***** Row Count:", len(df), "*****")
    print("=============================================")
    return df

In [5]:
# Build one combined DataFrame per season from the month groupings defined
# earlier. Each call reads every CSV in its months' folders and prints
# progress as it goes.
winterDf = combineMonths(winterMonths)
springDf = combineMonths(springMonths)
summerDf = combineMonths(summerMonths)
fallDf = combineMonths(fallMonths)

Working on 01
-- Found 9 files
----- Working on ChiTaxi_2016-01-01_2016-02-01_00000000000000000000.csv
----- Working on ChiTaxi_2016-01-01_2016-02-01_00000000000000200000.csv
----- Working on ChiTaxi_2016-01-01_2016-02-01_00000000000000400000.csv
----- Working on ChiTaxi_2016-01-01_2016-02-01_00000000000000600000.csv
----- Working on ChiTaxi_2016-01-01_2016-02-01_00000000000000800000.csv
----- Working on ChiTaxi_2016-01-01_2016-02-01_00000000000001000000.csv
----- Working on ChiTaxi_2016-01-01_2016-02-01_00000000000001200000.csv
----- Working on ChiTaxi_2016-01-01_2016-02-01_00000000000001400000.csv
----- Working on ChiTaxi_2016-01-01_2016-02-01_00000000000001600000.csv

Working on 02
-- Found 9 files
----- Working on ChiTaxi_2016-02-01_2016-03-01_00000000000000000000.csv
----- Working on ChiTaxi_2016-02-01_2016-03-01_00000000000000200000.csv
----- Working on ChiTaxi_2016-02-01_2016-03-01_00000000000000400000.csv
----- Working on ChiTaxi_2016-02-01_2016-03-01_00000000000000600000.csv
-

In [6]:
# output to csv
# Write each seasonal frame to its own file, leaving out the row index.
seasonalOutputs = (
    (winterDf, "data/2016/winter.csv"),
    (springDf, "data/2016/spring.csv"),
    (summerDf, "data/2016/summer.csv"),
    (fallDf, "data/2016/fall.csv"),
)

for seasonDf, csvPath in seasonalOutputs:
    seasonDf.to_csv(csvPath, index=False)