# Análisis del uso de las bicis. 

* 1. Carga de los datos. 
* 2. Limpieza de Missings. 
* 3. Cálculo de distancia y de edad.
* 4. Eliminación de outliers.
* 5. Cálculo de variables auxiliares. 
* 6. TADs:
    * a) Compañías.
    * b) Bicis.

In [1]:
import rho_data_eng as rho
import datetime as dt
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_columns', 500)

frames = ['../Exam2/' + f for f in os.listdir('../Exam2/') if '.csv' in f]

df = pd.DataFrame()

# 1. Carga de los datos.
for f in frames:
    df = pd.concat([df, pd.read_csv(f).dropna()])


# 2. Limpieza de Missings.
df = df[df.columns[:15]]
df = df.dropna()

# 3. Cálculo de la distancia y de la edad.
lat1 = 'start station latitude'
lat2 = 'end station latitude'
long1 = 'start station longitude'
long2 = 'end station longitude'
df['distance'] = rho.lat_long_to_distance(df[lat1], df[lat2], df[long1], df[long2])

df['age'] = df['starttime'].map(lambda x:int(x[:4])) - df['birth year']

varc = ['tripduration', 'age', 'distance']
vard = [c for c in df.columns if c not in varc]

rho.missing_zero_values_table(df)

Your selected dataframe has 17 columns and 753373 Rows.
There are 0 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type


In [2]:
print(df.shape)
rho.iqr_(df,varc).shape

(753373, 17)


(580358, 17)

In [3]:
# 4. Eliminación de Outliers.
df = rho.iqr_(df, varc)

In [4]:
df['starttime'] = pd.to_datetime(df.starttime)

In [5]:
# 5. Cálculo de vairables auxiliares. 

# Disc_time
bins = pd.IntervalIndex.from_tuples([(1,6), (6, 12), (12, 20), (20, 25), (0, 1)], closed='left')
df['disc_time'] = pd.cut(df['starttime'].dt.hour, bins)
df['disc_time'] = df['disc_time'].map(dict(zip(bins, ['Madrugada', 'Mañana', 'Tarde', 'Noche', 'Madrugada'])))

# Disc_age
bins = pd.IntervalIndex.from_tuples([(0,30), (30, 60), (60, 100), (100,np.inf)], closed='left')
df['disc_age'] = pd.cut(df['age'], bins)
df['disc_age'] = df['disc_age'].map(dict(zip(bins, ['Joven', 'Adulto', 'Edad avanzada', 'Super Adulto'])))

# Disc_distance
bins = pd.IntervalIndex.from_tuples([(0,.5), (.5, 1), (1, 2), (2,np.inf)], closed='left')
df['disc_distance'] = pd.cut(df['distance'], bins)
df['disc_distance'] = df['disc_distance'].map(dict(zip(bins, ['Viaje Fantasma', 'Viaje Corto', 
                                                             'Viaje Medio', 'Viaje Largo'])))

# Configuración de las variables discretas. 

df['ssid'] = df['start station id'].astype(int).astype(str)
df['esid'] = df['end station id'].astype(int).astype(str)
df['bikeid'] = df['bikeid'].astype(int).astype(str)
df['birth year'] = df['birth year'].astype(int).astype(str)
df['gender'] = df['gender'].astype(int).astype(str)
df['age'] = df['age'].astype(int)

id_names1 = dict(df[['ssid', 'esid', 'start station name', 'end station name']]
                .groupby(['ssid', 'start station name']).agg(pd.Series.mode).index)
id_names2 = dict(df[['ssid', 'esid', 'start station name', 'end station name']]
                .groupby(['esid', 'end station name']).agg(pd.Series.mode).index)

id_names = id_names1
id_names.update(id_names2)

# Estaciones:

stations = list(set(list(df['ssid'].unique()) + list(df['esid'].unique())))
df_stat = pd.DataFrame(stations, index = [i for i in range(len(stations))], columns = ['id_']).sort_values(by='id_')
df_stat.reset_index(inplace=True, drop=True)


df_stat['name'] = df_stat['id_'].map(id_names)

stations_names = list(set(list(df['end station name'].unique()) + list(df['start station name'])))

In [10]:
# Validación de errores para algún tema del id. 

for r,i in df_stat.iterrows():
    name = i['name']
    id_ = i['id_']
    aux1 = df.loc[df['ssid'] == id_]
    aux2 = df.loc[df['esid'] == id_]
    print(len(aux1), sum(np.where(aux1['start station name'] == name, 0, 1)))
    print(len(aux2), sum(np.where(aux2['end station name'] == name, 0, 1)))

4614 0
5503 0
12664 0
13393 0
16039 0
16952 0
62740 0
76580 0
15449 0
15721 0
630 0
807 0
12413 0
12512 0
4395 0
5272 0
10351 0
7277 0
27640 0
27009 0
1807 0
1829 0
1542 0
1428 0
21008 0
19932 0
1679 0
2451 0
25191 0
25093 0
33803 0
32340 0
11985 0
11197 0
4157 0
3536 0
5483 0
4044 0
12900 0
12008 0
4824 0
5136 0
15785 0
16275 0
3456 0
2530 0
12996 0
11582 0
8616 0
8974 0
2748 0
2124 0
6227 0
6405 0
15071 0
13751 0
4580 0
4622 0
13920 0
12443 0
0 0
6 0
12698 0
10406 0
13321 0
12488 0
11328 0
11636 0
12966 0
13088 0
22012 0
21452 0
2361 0
2661 0
11766 0
11063 0
10439 0
9714 0
5408 0
5597 0
1878 0
1955 0
26 0
62 0
5313 0
5275 0
6901 0
6889 0
0 0
1 0
14153 0
13953 0
24666 0
24730 0
8192 0
7945 0
2446 0
2078 0
5117 0
4216 0
5223 0
5371 0
9038 0
8868 0
590 0
746 0
3248 0
3391 0
16555 0
18041 0


In [11]:
# Confirmamos que nuestro dataframe fue calculado correctamente.
df_stat.head(2)

Unnamed: 0,id_,name
0,3183,Exchange Place
1,3184,Paulus Hook


In [12]:
# Variables calculadas con total/suma (2,3)
df_vars = ['ssid', 'esid']
names_ = ['q_trips_start', 'q_trips_end']

df_stat = df_stat.merge(df[df_vars].groupby(df_vars[0]).count(), how = 'left', left_on = 'id_', right_index = True)
df_stat = df_stat.merge(df[df_vars].groupby(df_vars[1]).count(), how = 'left', left_on = 'id_', right_index = True)
df_stat.columns = list(df_stat.columns[:-2]) + names_
df_stat = df_stat.fillna(0)

# Variables calculadas con avg (4-7)
df_vars = ['distance', 'tripduration']
grbvars = ['esid', 'ssid']
names_ = ['avg_ended_distance_per_trip', 'avg_ended_time_per_trip',
         'avg_started_distance_per_trip', 'avg_started_time_per_trip']

df_stat = df_stat.merge(df[df_vars + [grbvars[0]]].groupby(grbvars[0]).mean(),
                        how = 'left', left_on = 'id_', right_index = True)
df_stat = df_stat.merge(df[df_vars + [grbvars[1]]].groupby(grbvars[1]).mean(),
                        how = 'left', left_on = 'id_', right_index = True)
df_stat.columns = list(df_stat.columns[:-4]) + names_
df_stat = df_stat.fillna(0)

# Female-Male
sex_dict = dict(zip(list(df['gender'].unique()), ['Male', 'Female', 'Female']))
df['gender'] = df['gender'].map(sex_dict)
ids_ = ['ssid', 'esid']
cols = 'gender'
agg_func = 'count'
aux_start = pd.pivot_table(df[ids_ + [cols]], columns = cols, index = ids_[0], 
                             aggfunc = agg_func, values = ids_[1])
aux_end = pd.pivot_table(df[ids_ + [cols]], columns = cols, index = ids_[1], 
                             aggfunc = agg_func, values = ids_[0])
aux = aux_end.merge(aux_start, left_index = True, right_index = True, how = 'outer', suffixes = ('_e', '_s'))
aux['Female'] = aux['Female_e'] + aux['Female_s']
aux['Male'] = aux['Male_e'] + aux['Male_s']

aux = aux.fillna(0)

df_stat = df_stat.merge(aux[['Female', 'Male']], how = 'left', right_index = True, left_on = 'id_')

# Subs-customer
ids_ = ['ssid', 'esid']
cols = 'usertype'
agg_func = 'count'
aux_start = pd.pivot_table(df[ids_ + [cols]], columns = cols, index = ids_[0], 
                             aggfunc = agg_func, values = ids_[1])
aux_end = pd.pivot_table(df[ids_ + [cols]], columns = cols, index = ids_[1], 
                             aggfunc = agg_func, values = ids_[0])
aux = aux_end.merge(aux_start, left_index = True, right_index = True, how = 'outer', suffixes = ('_e', '_s'))
aux['Customer'] = aux['Customer_e'] + aux['Customer_s']
aux['Subscriber'] = aux['Subscriber_e'] + aux['Subscriber_s']

df_stat = df_stat.merge(aux[['Subscriber', 'Customer']], how = 'left', right_index = True, left_on = 'id_')

# Months:
#pd.to_datetime(df['starttime']).dt.month
df['Month'] = pd.to_datetime(df['starttime']).dt.strftime('%b')

aux = pd.pivot_table(df[['esid', 'Month', 'tripduration']], 
              columns = 'Month', index = 'esid', values = 'tripduration', aggfunc = 'count', fill_value = 0)

aux.columns = [f'Bikes_arrived_{i}' for i in aux.columns]

df_stat = df_stat.merge(aux, how = 'left', right_index = True, left_on = 'id_')

# Bikes
cols = ['ssid', 'bikeid', 'esid']
#bikes_aux = df[cols].groupby(['ssid', 'bikeid']).count()
bike_start_avg_trips = df[cols].groupby(['ssid', 'bikeid']).count().groupby('ssid').mean()
bike_start_avg_trips.columns = ['bike_start_avg_trips']
bike_end_avg_trips = df[cols].groupby(['esid', 'bikeid']).count().groupby('esid').mean()
bike_end_avg_trips.columns = ['bike_end_avg_trips']


df_stat = df_stat.merge(bike_end_avg_trips, how = 'left', right_index = True, left_on = 'id_')
df_stat = df_stat.merge(bike_start_avg_trips, how = 'left', right_index = True, left_on = 'id_')

# Gender
df_stat['Gender'] = np.where(df_stat['Female']>=df_stat['Male'], 1, 0)

In [13]:
df_stat.head(3)

Unnamed: 0,id_,name,q_trips_start,q_trips_end,avg_ended_distance_per_trip,avg_ended_time_per_trip,avg_started_distance_per_trip,avg_started_time_per_trip,Female,Male,Subscriber,Customer,Bikes_arrived_Apr,Bikes_arrived_Aug,Bikes_arrived_Dec,Bikes_arrived_Feb,Bikes_arrived_Jan,Bikes_arrived_Jul,Bikes_arrived_Jun,Bikes_arrived_Mar,Bikes_arrived_May,Bikes_arrived_Nov,Bikes_arrived_Oct,Bikes_arrived_Sep,bike_end_avg_trips,bike_start_avg_trips,Gender
0,3183,Exchange Place,4614.0,5503,0.826185,344.871888,0.810498,362.802991,2311.0,7806.0,9516.0,601.0,1665,0,0,787,859,0,0,1046,1146,0,0,0,10.918651,9.172962,0
1,3184,Paulus Hook,12664.0,13393,1.019872,489.733816,1.02039,505.139845,7543.0,18514.0,21825.0,4232.0,724,1444,485,773,982,1647,1839,747,1128,865,1353,1406,8.635074,8.13359,0
2,3185,City Hall,16039.0,16952,0.875289,430.655793,0.830824,435.008043,9598.0,23393.0,28520.0,4471.0,947,1760,775,1107,1517,1741,1584,1122,1245,1188,1995,1971,10.368196,9.918986,0


In [14]:
df_stat.shape

(55, 27)

In [41]:
df_stat.to_csv('Stations_Oviedo_Quezada_Rolando.csv', index = False)

# TAD Bici

In [15]:
df.head(1)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,distance,age,disc_time,disc_age,disc_distance,ssid,esid,Month
0,156.0,2020-10-01 00:02:40.260,2020-10-01 00:05:17.0140,3186.0,Grove St PATH,40.719586,-74.043117,3270.0,Jersey & 6th St,40.725289,-74.045572,42293,Subscriber,1996,Male,0.667019,24,Madrugada,Joven,Viaje Corto,3186,3270,Oct


In [39]:
# Índice
df_bici = pd.DataFrame(df['bikeid'].unique(), columns = ['id_'])

# Variables de totales
id_ = 'bikeid'
vars_ = ['distance', 'tripduration']
aux = df[[id_] + vars_].groupby(id_).agg({'distance': ['count', 'sum'], 'tripduration': 'sum'})
aux.columns = ['totalTrips', 'totalDistance', 'totalTime']
df_bici = df_bici.merge(aux, how = 'left', right_index = True, left_on = 'id_')


# Pivot_tables 

vars_ = ['disc_time', 'disc_age']
values = ['distance']

aux = pd.DataFrame()

for m in vars_:
    data = pd.pivot_table(df[values + [id_, m]], index = id_, columns = m, values = values, aggfunc = 'sum')
    data.columns = [(c[1]+c[0].capitalize()).replace(' ', '_') for c in data.columns]
    df_bici = df_bici.merge(data, how = 'left', right_index = True, left_on = 'id_')
    
vars_ = ['Month']
values = ['distance', 'tripduration']
data = pd.pivot_table(df[values + vars_ + [id_]], index = id_, columns = vars_, values = values, aggfunc = 'sum',
              fill_value = 0)
data.columns = [(c[1] + '_Total' + c[0].capitalize()).replace('Tripduration', 'Time') for c in data.columns]
df_bici = df_bici.merge(data, how = 'left', right_index = True, left_on = 'id_')

df_bici.head(2)

Unnamed: 0,id_,totalTrips,totalDistance,totalTime,MadrugadaDistance,MañanaDistance,NocheDistance,TardeDistance,JovenDistance,AdultoDistance,Edad_avanzadaDistance,Super_AdultoDistance,Apr_TotalDistance,Aug_TotalDistance,Dec_TotalDistance,Feb_TotalDistance,Jan_TotalDistance,Jul_TotalDistance,Jun_TotalDistance,Mar_TotalDistance,May_TotalDistance,Nov_TotalDistance,Oct_TotalDistance,Sep_TotalDistance,Apr_TotalTime,Aug_TotalTime,Dec_TotalTime,Feb_TotalTime,Jan_TotalTime,Jul_TotalTime,Jun_TotalTime,Mar_TotalTime,May_TotalTime,Nov_TotalTime,Oct_TotalTime,Sep_TotalTime
0,42293,451,438.119978,246019.0,16.842678,133.99377,42.21245,245.071081,102.986961,322.71954,12.413476,0.0,1.951211,49.240094,11.733574,49.125746,25.503781,53.228018,51.277119,17.215215,21.036792,35.0385,59.811009,62.958918,798,29774,6207,19716,15764,32419,32197,6587,14483,18864,35125,34085
1,44740,314,320.624508,165319.0,13.521952,68.43536,54.776246,183.890951,102.180954,215.322691,3.120863,0.0,0.0,63.215434,10.866483,0.0,22.327264,51.434341,0.0,0.0,0.0,2.919029,85.796143,84.065814,0,28592,5782,0,8930,28679,0,0,0,3541,41910,47885


In [40]:
df_bici.shape

(1987, 36)

In [42]:
df_stat.to_csv('bikes_Oviedo_Quezada_Rolando.csv', index = False)