In [10]:
import pandas as pd

In [11]:
# Load the combined trips + weather table from its parquet file.
df = pd.read_parquet(path='data/final/weather_trips_combined.parquet')

In [12]:
# Weather-side columns: everything from `instant` (the first weather field)
# through the last column. Slicing by label rather than the hard-coded
# position 19 keeps the split correct if trip columns are added or reordered
# upstream. `.copy()` returns an independent frame so later edits to
# `weather` do not touch `df`.
weather = df.loc[:, 'instant':].copy()

In [13]:
# Trip-side columns: everything from the first column through `end_hr`
# (the last trip field). `.loc` label slices include the end label, so this
# selects the same 19 columns as the old positional slice, without the magic
# number. `.copy()` returns an independent frame so later edits to `trips`
# do not touch `df`.
trips = df.loc[:, :'end_hr'].copy()

# Dataset description

## Weather data

In [14]:
# Preview the first five rows of the weather columns.
weather.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,yr_bin,day
0,1.0,2011-01-01,1.0,2011.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,0.0,1.0
1,1.0,2011-01-01,1.0,2011.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,0.0,1.0
2,1.0,2011-01-01,1.0,2011.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,0.0,1.0
3,1.0,2011-01-01,1.0,2011.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,0.0,1.0
4,1.0,2011-01-01,1.0,2011.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,0.0,1.0


| Variable | Description | Type | Range/Values |
|----------|-------------|------|-------------|
| `instant` | Record index | Integer | Sequential ID |
| `dteday` | Date | Date | YYYY-MM-DD |
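| `season` | Season | Categorical | 1: Spring, 2: Summer, 3: Fall, 4: Winter (labels as given in the source documentation; note that the January rows shown above carry `season` = 1, so verify this mapping before interpreting seasons) |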
| `yr` | Year | Categorical | 2011, 2012 |
| `yr_bin` | Year (binary encoding of `yr`) | Binary | 0: 2011, 1: 2012 |
| `mnth` | Month | Integer | 1-12 |
| `hr` | Hour | Integer | 0-23 |
| `holiday` | Holiday indicator | Binary | 0: No, 1: Yes; extracted from http://dchr.dc.gov/page/holiday-schedule |
| `weekday` | Day of week | Integer | 0: Sunday to 6: Saturday |
| `workingday` | Working day indicator | Binary | 1 if the day is neither a weekend nor a holiday, otherwise 0 |
| `weathersit` | Weather situation | Categorical | 1: Clear, Few clouds, Partly cloudy <br> 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist <br> 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds <br> 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog |
| `temp` | Normalized temperature | Float | 0-1 (Normalized) |
| `atemp` | Normalized feeling temperature | Float | 0-1 (Normalized) |
| `hum` | Normalized humidity | Float | 0-1 (Normalized) |
| `windspeed` | Normalized wind speed | Float | 0-1 (Normalized) |
| `casual` | Count of casual users | Integer | Count |
| `registered` | Count of registered users | Integer | Count |
| `cnt` | Total bicycle rentals | Integer | Count (casual + registered) |
| `day` | Day | Integer | 1-31 |

## Trips data

| Variable | Description | Type | Range/Values |
|----------|-------------|------|-------------|
| `duration` | Trip duration | Integer | Time in seconds |
| `start_date` | Start date and time of trip | Datetime | YYYY-MM-DD HH:MM:SS |
| `end_date` | End date and time of trip | Datetime | YYYY-MM-DD HH:MM:SS |
| `start_station_number` | Start station number | Numeric | Station ID (e.g. 31620) |
| `start_station` | Start station name | String | Station name |
| `end_station_number` | End station number | Numeric | Station ID (e.g. 31101) |
| `end_station` | End station name | String | Station name |
| `bike_number` | Unique identifier for bicycle | String | Bike ID |
| `member_type` | Type of user | String | 'Member', 'Casual' |
| `start_yr` | Year trip started | Integer | 2011, 2012 |
| `start_yr_bin` | Year trip started | Binary | 0: 2011, 1: 2012 |
| `start_mnth` | Month trip started | Integer | 1-12 |
| `start_day` | Day of month trip started | Integer | 1-31 |
| `start_hr` | Hour trip started | Integer | 0-23 |
| `end_yr` | Year trip ended | Integer | 2011, 2012 |
| `end_yr_bin` | Year trip ended | Binary | 0: 2011, 1: 2012 |
| `end_mnth` | Month trip ended | Integer | 1-12 |
| `end_day` | Day of month trip ended | Integer | 1-31 |
| `end_hr` | Hour trip ended | Integer | 0-23 |

In [15]:
# Preview the first five rows of the trip columns.
trips.head(n=5)

Unnamed: 0,duration,start_date,end_date,start_station_number,start_station,end_station_number,end_station,bike_number,member_type,start_yr,start_yr_bin,start_mnth,start_day,start_hr,end_yr,end_yr_bin,end_mnth,end_day,end_hr
0,3548,2011-01-01 00:01:29,2011-01-01 01:00:37,31620,5th & F St NW,31620,5th & F St NW,W00247,Member,2011,0,1,1,0,2011,0,1,1,1
1,346,2011-01-01 00:02:46,2011-01-01 00:08:32,31105,14th & Harvard St NW,31101,14th & V St NW,W00675,Casual,2011,0,1,1,0,2011,0,1,1,0
2,562,2011-01-01 00:06:13,2011-01-01 00:15:36,31400,Georgia & New Hampshire Ave NW,31104,Adams Mill & Columbia Rd NW,W00357,Member,2011,0,1,1,0,2011,0,1,1,0
3,434,2011-01-01 00:09:21,2011-01-01 00:16:36,31111,10th & U St NW,31503,Florida Ave & R St NW,W00970,Member,2011,0,1,1,0,2011,0,1,1,0
4,233,2011-01-01 00:28:26,2011-01-01 00:32:19,31104,Adams Mill & Columbia Rd NW,31106,Calvert & Biltmore St NW,W00346,Casual,2011,0,1,1,0,2011,0,1,1,0


## Final dataframe

In [16]:
# Preview the first five rows of the full combined frame (trips + weather).
df.head(n=5)

Unnamed: 0,duration,start_date,end_date,start_station_number,start_station,end_station_number,end_station,bike_number,member_type,start_yr,...,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,yr_bin,day
0,3548,2011-01-01 00:01:29,2011-01-01 01:00:37,31620,5th & F St NW,31620,5th & F St NW,W00247,Member,2011,...,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,0.0,1.0
1,346,2011-01-01 00:02:46,2011-01-01 00:08:32,31105,14th & Harvard St NW,31101,14th & V St NW,W00675,Casual,2011,...,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,0.0,1.0
2,562,2011-01-01 00:06:13,2011-01-01 00:15:36,31400,Georgia & New Hampshire Ave NW,31104,Adams Mill & Columbia Rd NW,W00357,Member,2011,...,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,0.0,1.0
3,434,2011-01-01 00:09:21,2011-01-01 00:16:36,31111,10th & U St NW,31503,Florida Ave & R St NW,W00970,Member,2011,...,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,0.0,1.0
4,233,2011-01-01 00:28:26,2011-01-01 00:32:19,31104,Adams Mill & Columbia Rd NW,31106,Calvert & Biltmore St NW,W00346,Casual,2011,...,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,0.0,1.0


# Exploratory data analysis (EDA)