# Step 2 - Create Database with Pymongo

In [1]:
# Import Dependencies 
import pandas as pd
import pymongo

## Data Q2 of 2019

In [2]:
q219_df = pd.read_csv('Output\q219.csv', index_col=None)
q219_df.head()

Unnamed: 0,trip_id,duration,start_station,end_station,plan_duration,trip_route_category,passholder_type,bike_type
0,313067326,8,3027,3119,30.0,One Way,Indego30,standard
1,313067325,16,3037,3053,30.0,One Way,Indego30,standard
2,313067324,12,3026,3157,30.0,One Way,Indego30,standard
3,313067323,5,3026,3007,30.0,One Way,Indego30,standard
4,313067322,4,3034,3068,30.0,One Way,Indego30,standard


In [3]:
q219_df.count()

trip_id                206354
duration               206354
start_station          206354
end_station            206354
plan_duration          206347
trip_route_category    206354
passholder_type        206319
bike_type              206354
dtype: int64

In [4]:
q219_df.dtypes

trip_id                  int64
duration                 int64
start_station            int64
end_station              int64
plan_duration          float64
trip_route_category     object
passholder_type         object
bike_type               object
dtype: object

## Data Q3 of 2019

In [5]:
q319_df = pd.read_csv('Output\q319.csv', index_col=None)
q319_df.head()

Unnamed: 0,trip_id,duration,start_station,end_station,plan_duration,trip_route_category,passholder_type,bike_type
0,320093117,30,3049,3121,30,One Way,Indego30,standard
1,320093114,26,3119,3118,30,One Way,Indego30,electric
2,320093115,10,3043,3155,365,One Way,Indego365,standard
3,320093116,10,3043,3155,30,One Way,Indego30,standard
4,320093113,15,3040,3007,1,One Way,Day Pass,standard


In [6]:
q319_df.count()

trip_id                275197
duration               275197
start_station          275197
end_station            275197
plan_duration          275197
trip_route_category    275197
passholder_type        275197
bike_type              275197
dtype: int64

## Data Q2 of 2020

In [7]:
q220_df = pd.read_csv('Output\q319.csv', index_col=None)
q220_df.head()

Unnamed: 0,trip_id,duration,start_station,end_station,plan_duration,trip_route_category,passholder_type,bike_type
0,320093117,30,3049,3121,30,One Way,Indego30,standard
1,320093114,26,3119,3118,30,One Way,Indego30,electric
2,320093115,10,3043,3155,365,One Way,Indego365,standard
3,320093116,10,3043,3155,30,One Way,Indego30,standard
4,320093113,15,3040,3007,1,One Way,Day Pass,standard


In [8]:
q220_df.count()

trip_id                275197
duration               275197
start_station          275197
end_station            275197
plan_duration          275197
trip_route_category    275197
passholder_type        275197
bike_type              275197
dtype: int64

## Data Q3 of 2020

In [9]:
q320_df = pd.read_csv('Output\q320.csv', index_col=None)
q320_df.head()

Unnamed: 0,trip_id,duration,start_station,end_station,plan_duration,trip_route_category,passholder_type,bike_type
0,345127322,10,3021,3051,365,One Way,Indego365,standard
1,345127320,112,3067,3104,30,One Way,Indego30,electric
2,345127318,111,3067,3104,30,One Way,Indego30,electric
3,345127315,3,3187,3168,30,One Way,Indego30,standard
4,345127313,42,3056,3017,1,One Way,Day Pass,electric


In [10]:
q320_df.count()

trip_id                269985
duration               269985
start_station          269985
end_station            269985
plan_duration          269985
trip_route_category    269985
passholder_type        269985
bike_type              269985
dtype: int64

# Create Database Using Pymongo

In [11]:
#Establish connection to mongo db
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [12]:
#Create the database indego_db
db = client.indego_db

In [13]:
#Setup collections, if collections exist drop them start anew, 
db.tripQ219.drop()
db.tripQ319.drop()

## Q2 of 2019

In [14]:
#Convert pandas dataframe to dictionary, orient as a record, keeps a copy
# Quarter 2 of 2019
df_dict = q219_df.to_dict(orient='records').copy()

In [15]:
#Insert the dictionary into collection
db.tripQ219.insert_many(df_dict)

<pymongo.results.InsertManyResult at 0x1d9d4f8b300>

In [16]:
#print documents
cursor = db.tripQ219.find({},limit=2)
for document in cursor:
    print(document)

{'_id': ObjectId('6010a8cbcf0a73db476ec950'), 'trip_id': 313067326, 'duration': 8, 'start_station': 3027, 'end_station': 3119, 'plan_duration': 30.0, 'trip_route_category': 'One Way', 'passholder_type': 'Indego30', 'bike_type': 'standard'}
{'_id': ObjectId('6010a8cbcf0a73db476ec951'), 'trip_id': 313067325, 'duration': 16, 'start_station': 3037, 'end_station': 3053, 'plan_duration': 30.0, 'trip_route_category': 'One Way', 'passholder_type': 'Indego30', 'bike_type': 'standard'}


## Read Q2 of 2019 Collection into a Dataframe

In [17]:
#Read tripQ219 collection from mongodb into a dataframe tripQ219_df
tripQ219_df = pd.DataFrame(list(db.tripQ219.find({})))
tripQ219_df.head()

Unnamed: 0,_id,trip_id,duration,start_station,end_station,plan_duration,trip_route_category,passholder_type,bike_type
0,6010a8cbcf0a73db476ec950,313067326,8,3027,3119,30.0,One Way,Indego30,standard
1,6010a8cbcf0a73db476ec951,313067325,16,3037,3053,30.0,One Way,Indego30,standard
2,6010a8cbcf0a73db476ec952,313067324,12,3026,3157,30.0,One Way,Indego30,standard
3,6010a8cbcf0a73db476ec953,313067323,5,3026,3007,30.0,One Way,Indego30,standard
4,6010a8cbcf0a73db476ec954,313067322,4,3034,3068,30.0,One Way,Indego30,standard


## Q3 of 2019

In [18]:
#Convert pandas dataframe to dictionary, orient as a record, keeps a copy
# Quarter 3 of 2019
df_dict2 = q319_df.to_dict(orient='records').copy()

In [19]:
#Insert the dictionary into collection
db.tripQ319.insert_many(df_dict2)

<pymongo.results.InsertManyResult at 0x1d9d9e5a4c0>

In [20]:
#print duration documents
cursor = db.tripQ319.find({},limit=2)
for document in cursor:
    print(document)

{'_id': ObjectId('6010a8d4cf0a73db4771ef62'), 'trip_id': 320093117, 'duration': 30, 'start_station': 3049, 'end_station': 3121, 'plan_duration': 30, 'trip_route_category': 'One Way', 'passholder_type': 'Indego30', 'bike_type': 'standard'}
{'_id': ObjectId('6010a8d4cf0a73db4771ef63'), 'trip_id': 320093114, 'duration': 26, 'start_station': 3119, 'end_station': 3118, 'plan_duration': 30, 'trip_route_category': 'One Way', 'passholder_type': 'Indego30', 'bike_type': 'electric'}


## Read Q3 of 2019 Collection into a Dataframe

In [21]:
#Read tripQ319 collection from mongodb into a dataframe tripQ319_df
tripQ319_df = pd.DataFrame(list(db.tripQ319.find({})))
tripQ319_df.head()

Unnamed: 0,_id,trip_id,duration,start_station,end_station,plan_duration,trip_route_category,passholder_type,bike_type
0,6010a8d4cf0a73db4771ef62,320093117,30,3049,3121,30,One Way,Indego30,standard
1,6010a8d4cf0a73db4771ef63,320093114,26,3119,3118,30,One Way,Indego30,electric
2,6010a8d4cf0a73db4771ef64,320093115,10,3043,3155,365,One Way,Indego365,standard
3,6010a8d4cf0a73db4771ef65,320093116,10,3043,3155,30,One Way,Indego30,standard
4,6010a8d4cf0a73db4771ef66,320093113,15,3040,3007,1,One Way,Day Pass,standard


## Merge Q2 and Q3 2019 into Combined Dataframe

In [22]:
# Stack the DataFrames on top of each other
combined19 = pd.concat([tripQ219_df, tripQ319_df], axis=0)
combined19.head()

Unnamed: 0,_id,trip_id,duration,start_station,end_station,plan_duration,trip_route_category,passholder_type,bike_type
0,6010a8cbcf0a73db476ec950,313067326,8,3027,3119,30.0,One Way,Indego30,standard
1,6010a8cbcf0a73db476ec951,313067325,16,3037,3053,30.0,One Way,Indego30,standard
2,6010a8cbcf0a73db476ec952,313067324,12,3026,3157,30.0,One Way,Indego30,standard
3,6010a8cbcf0a73db476ec953,313067323,5,3026,3007,30.0,One Way,Indego30,standard
4,6010a8cbcf0a73db476ec954,313067322,4,3034,3068,30.0,One Way,Indego30,standard


In [23]:
combined19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 481551 entries, 0 to 275196
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   _id                  481551 non-null  object 
 1   trip_id              481551 non-null  int64  
 2   duration             481551 non-null  int64  
 3   start_station        481551 non-null  int64  
 4   end_station          481551 non-null  int64  
 5   plan_duration        481544 non-null  float64
 6   trip_route_category  481551 non-null  object 
 7   passholder_type      481516 non-null  object 
 8   bike_type            481551 non-null  object 
dtypes: float64(1), int64(4), object(4)
memory usage: 36.7+ MB


## Export 2019 Combined Dataframe into CSV File

In [24]:
# Export as a CSV from the data of the MongoDB document
# Two quarters of 2019
combined19.to_csv("Output\combined19.csv", ",")

## Q2 of 2020

In [25]:
#Setup collections, if collections exist drop them start anew, 
db.tripQ220.drop()
db.tripQ320.drop()

In [26]:
#Convert pandas dataframe to dictionary, orient as a record, keeps a copy
# Quarter 2 of 2020
df_dict3 = q220_df.to_dict(orient='records').copy()

In [27]:
#Insert the dictionary into collection
db.tripQ220.insert_many(df_dict3)

<pymongo.results.InsertManyResult at 0x1d9d757adc0>

In [28]:
#print duration documents
cursor = db.tripQ220.find({},limit=2)
for document in cursor:
    print(document)

{'_id': ObjectId('6010a8e2cf0a73db4776225f'), 'trip_id': 320093117, 'duration': 30, 'start_station': 3049, 'end_station': 3121, 'plan_duration': 30, 'trip_route_category': 'One Way', 'passholder_type': 'Indego30', 'bike_type': 'standard'}
{'_id': ObjectId('6010a8e2cf0a73db47762260'), 'trip_id': 320093114, 'duration': 26, 'start_station': 3119, 'end_station': 3118, 'plan_duration': 30, 'trip_route_category': 'One Way', 'passholder_type': 'Indego30', 'bike_type': 'electric'}


## Read Q2 of 2020 Collection into a Dataframe

In [29]:
#Read tripQ220 collection from mongodb into a dataframe tripQ220_df
tripQ220_df = pd.DataFrame(list(db.tripQ220.find({})))
tripQ220_df.head()

Unnamed: 0,_id,trip_id,duration,start_station,end_station,plan_duration,trip_route_category,passholder_type,bike_type
0,6010a8e2cf0a73db4776225f,320093117,30,3049,3121,30,One Way,Indego30,standard
1,6010a8e2cf0a73db47762260,320093114,26,3119,3118,30,One Way,Indego30,electric
2,6010a8e2cf0a73db47762261,320093115,10,3043,3155,365,One Way,Indego365,standard
3,6010a8e2cf0a73db47762262,320093116,10,3043,3155,30,One Way,Indego30,standard
4,6010a8e2cf0a73db47762263,320093113,15,3040,3007,1,One Way,Day Pass,standard


## Q3 of 2020

In [36]:
#Convert pandas dataframe to dictionary, orient as a record, keeps a copy
# Quarter 3 of 2020
df_dict4 = q320_df.to_dict(orient='records').copy()

In [37]:
#Insert the dictionary into collection
db.tripQ320.insert_many(df_dict4)

<pymongo.results.InsertManyResult at 0x1d9df341380>

In [38]:
# Print duration documents
cursor = db.q320_df.find({},limit=2)
for document in cursor:
    print(document)

{'_id': ObjectId('60109d1a79b21a6ac32f6480'), 'trip_id': 345127322, 'duration': 10, 'start_station': 3021, 'end_station': 3051, 'plan_duration': 365, 'trip_route_category': 'One Way', 'passholder_type': 'Indego365', 'bike_type': 'standard'}
{'_id': ObjectId('60109d1a79b21a6ac32f6481'), 'trip_id': 345127320, 'duration': 112, 'start_station': 3067, 'end_station': 3104, 'plan_duration': 30, 'trip_route_category': 'One Way', 'passholder_type': 'Indego30', 'bike_type': 'electric'}


## Read Q3 of 2020 Collection into a Dataframe

In [39]:
# Read tripQ320 collection from mongodb into a dataframe tripQ320_df
tripQ320_df = pd.DataFrame(list(db.tripQ320.find({})))
tripQ320_df.head()

Unnamed: 0,_id,trip_id,duration,start_station,end_station,plan_duration,trip_route_category,passholder_type,bike_type
0,6010a945cf0a73db477e73fd,345127322,10,3021,3051,365,One Way,Indego365,standard
1,6010a945cf0a73db477e73fe,345127320,112,3067,3104,30,One Way,Indego30,electric
2,6010a945cf0a73db477e73ff,345127318,111,3067,3104,30,One Way,Indego30,electric
3,6010a945cf0a73db477e7400,345127315,3,3187,3168,30,One Way,Indego30,standard
4,6010a945cf0a73db477e7401,345127313,42,3056,3017,1,One Way,Day Pass,electric


## Merge Q2 and Q3 2020 into Combined Dataframe

In [40]:
# Stack the DataFrames on top of each other
combined20 = pd.concat([tripQ220_df, tripQ320_df], axis=0)
combined20.head()

Unnamed: 0,_id,trip_id,duration,start_station,end_station,plan_duration,trip_route_category,passholder_type,bike_type
0,6010a8e2cf0a73db4776225f,320093117,30,3049,3121,30,One Way,Indego30,standard
1,6010a8e2cf0a73db47762260,320093114,26,3119,3118,30,One Way,Indego30,electric
2,6010a8e2cf0a73db47762261,320093115,10,3043,3155,365,One Way,Indego365,standard
3,6010a8e2cf0a73db47762262,320093116,10,3043,3155,30,One Way,Indego30,standard
4,6010a8e2cf0a73db47762263,320093113,15,3040,3007,1,One Way,Day Pass,standard


In [41]:
combined20.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 545182 entries, 0 to 269984
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   _id                  545182 non-null  object
 1   trip_id              545182 non-null  int64 
 2   duration             545182 non-null  int64 
 3   start_station        545182 non-null  int64 
 4   end_station          545182 non-null  int64 
 5   plan_duration        545182 non-null  int64 
 6   trip_route_category  545182 non-null  object
 7   passholder_type      545182 non-null  object
 8   bike_type            545182 non-null  object
dtypes: int64(5), object(4)
memory usage: 41.6+ MB


## Export 2020 Combined Dataframe into CSV File

In [42]:
# Export as a CSV from the data of the MongoDB document
# Two quarters of 2020
combined20.to_csv("Output\combined20.csv", ",")