In [99]:
import numpy as np
import pandas as pd

In [100]:
bikes = pd.read_csv("bluebikes_2021.csv")


  bikes = pd.read_csv("C:/Users/Laura/Documents/DSML/Project/bluebikes_2021.csv")


# Exploration

In [101]:
bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2934388 entries, 0 to 2934387
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0         int64  
 1   start_time         object 
 2   end_time           object 
 3   start_station_id   int64  
 4   start_station_lat  float64
 5   start_station_lon  float64
 6   end_station_id     object 
 7   end_station_lat    float64
 8   end_station_lon    float64
 9   bike_id            int64  
dtypes: float64(4), int64(3), object(3)
memory usage: 223.9+ MB


In [112]:
# explain what we found in the data frame (e.g. inconsistencies with column types, start_time / end-time columns, etc.)


# Data cleaning

In [None]:
# explain how we are going to fix all the mistakes we found

In [113]:
# first we found that there are inconsistencies in the columns start_time and end_time:
# some of the entries don't follow the format year-month-day; hours:minutes:seconds
# to be able to change the format first we need the replace the slashes (/) with colons (:) of some values

def substitute_slashes(data, column):
    data[column] = data[column].str.replace('/', ':')
    return data

substitute_slashes(bikes, 'start_time')
substitute_slashes(bikes, 'end_time')

AttributeError: Can only use .str accessor with string values!

In [103]:
# second, we transform the start_time and end_time columns to the proper format with the pd.to_datatime method

bikes[['start_time','end_time']] = bikes[['start_time','end_time']].apply(pd.to_datetime)

In [114]:
# another mistake is that the variable 'end_station_id' is an object, when it should be an integer like 'start_station_id'
# that is because some of the values in the column are also not in the proper format, instead of being whole numers
# some of them have quote symbols (") which makes the program transform the column into an object type
# so what we need to do is eliminate the quote marks of the numbers with the next function

def remove_quotes(data, column):
    data[column] = data[column].apply(lambda x: x.replace('"', '') if isinstance(x, str) else x)
    return data

bikes = remove_quotes(bikes, 'end_station_id')


In [115]:
# once all the values in the column have the same format, we can proceed to transform 
# the variable type of 'end_sation_id' to integer

def convert_column_to_integer(dataset, column_index):
    dataset[column_index] = pd.to_numeric(dataset[column_index], errors='coerce').astype('int64')
    return dataset

convert_column_to_integer(bikes, 'end_station_id')

Unnamed: 0.1,Unnamed: 0,start_time,end_time,start_station_id,start_station_lat,start_station_lon,end_station_id,end_station_lat,end_station_lon,bike_id
0,0,2021-01-01 00:00:04,2021-01-01 00:15:19,91,42.366277,-71.091690,370,42.350961,-71.077828,5316
1,1,2021-01-01 00:00:21,2021-01-01 00:18:27,370,42.350961,-71.077828,169,42.378965,-71.068607,4917
2,2,2021-01-01 00:00:26,2021-01-01 00:16:12,46,42.343666,-71.085824,21,42.346520,-71.080658,2881
3,3,2021-01-01 00:00:30,2021-01-01 00:06:26,178,42.359573,-71.101295,107,42.362500,-71.088220,4792
4,4,2021-01-01 00:01:11,2021-01-01 00:09:43,386,42.368605,-71.099302,413,42.369553,-71.085790,6062
...,...,...,...,...,...,...,...,...,...,...
2934383,2934373,2021-12-31 23:58:38,2022-01-01 00:06:38,4,42.345392,-71.069616,415,42.349544,-71.072421,4895
2934384,2934374,2021-12-31 23:58:46,2022-01-01 00:11:24,77,42.386844,-71.098120,96,42.373379,-71.111075,3665
2934385,2934375,2021-12-31 23:58:49,2022-01-01 00:03:30,338,42.348359,-71.139972,8,42.353334,-71.137313,6470
2934386,2934376,2021-12-31 23:59:50,2022-01-01 00:01:49,32,42.343691,-71.102353,19,42.347241,-71.105301,6484


In [116]:
# we can use the info() function to confirm that the type of the variable 'end_station_id' has been changed

bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2934388 entries, 0 to 2934387
Data columns (total 10 columns):
 #   Column             Dtype         
---  ------             -----         
 0   Unnamed: 0         int64         
 1   start_time         datetime64[ns]
 2   end_time           datetime64[ns]
 3   start_station_id   int64         
 4   start_station_lat  float64       
 5   start_station_lon  float64       
 6   end_station_id     int64         
 7   end_station_lat    float64       
 8   end_station_lon    float64       
 9   bike_id            int64         
dtypes: datetime64[ns](2), float64(4), int64(4)
memory usage: 223.9 MB


In [None]:
# note: i'm thinking that all of the 'id' variables shouldn't be integers 
# because we are not supposed to operate with those numbers but treat them as names, should be strings ???