## 1. IMPORTING PYTHON LIBRARIES

In [1]:
# to work with data: dataframes, statistics & regular expressions
import pandas as pd
import numpy as np
import re
import pandas_profiling as pdp # suggested by Dani in Slack

# for data viz
%matplotlib inline
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

## 2. DATA WRANGLING

In [2]:
def importing_csv(csv_path):
    df = pd.read_csv(csv_path)
    return df

def raw(df):
    print('shape:',df.shape)
    print('\n columns:',df.columns)
    print('\n variables info:')    
    return df.info(),df.describe()

def replace_Nan_with0(data,num_var):
    for nv in num_var: 
        data[nv].fillna("0", inplace = True, downcast='infer')
    return data

def split_datetime(df,datetime):
    year,month,date,time = str(datetime+'_year'),str(datetime+'_month'),str(datetime+'_date'),str(datetime+'_time')
    df[year] = pd.to_datetime(df[datetime]).dt.year
    df[month] = pd.to_datetime(df[datetime]).dt.month
    df[date] = pd.to_datetime(df[datetime]).dt.date
    df[time] = pd.to_datetime(df[datetime]).dt.time
    return df[[datetime,year,month,date,time]]

def second_to_min(df,second):
    minutes = str(second+'_min')
    df[minutes]=df[second]/60
    return df[minutes,second]

def centavos_a_pesos(df,centavos):
    pesos = str(centavos+'_pesos')
    df[pesos] = df[centavos]/100
    return df[pesos,centavos]

def valcount(df, var):
    return df[var].value_counts()

def export_clean_csv(data,csv_name):
    return data.to_csv(str(csv_name+'.csv'))

In [3]:
# importing csv
csv_path = './db/tableau_project.csv'
data = importing_csv(csv_path)

In [None]:
# using my function to explore variables as overview: 
# shape, columns name, variables info (# registers per variable & type of variable) and statistics desciption for numeric variables.
raw(data)

In [None]:
# checking # NaN per variable
data.isna().sum()

In [None]:
# checking how structured is the dataset looking for unique mentions per variable
data.nunique()

In [None]:
# just for testing this library suggested by Dani in Slack **very useful, by the way**
pdp.ProfileReport(data)

In [None]:
# first look at data inside variables:
data.head()

In [None]:
# as we can not see all columns, we will set display option for columns:
pd.set_option('display.max_columns', 50)
data.head()

In [None]:
# checking discount column value counts:
valcount(data, 'discount')

In [None]:
# dropping non relevant columns from dataset
data = data.drop(columns=["Unnamed: 0","discount"])

In [None]:
# replace Nan with '0' value:
num_var = ['distance','driver_waiting_time','rider_waiting_time','price','price_duration','duration','price_distance','price_supplements']
replace_Nan_with0(data,num_var)

In [None]:
# checking again NaN --> OK 
data.isna().sum()

In [None]:
# to convert to category type :
def convert_to_category(data,cat_var):
    for cv in cat_var: 
        data[cv] = data[cv].astype('category')
    return data

# to convert to numeric / int type :
def convert_to_int(data,int_var):
    for iv in int_var:
        data[iv] = data[iv].astype('int64', copy=False)
        #data[iv] = data[iv].apply(lambda x: x.astype('str').astype('int64', copy=False))
    return data

# to replace comma per dot:
def replace_comma_per_dot(data,var):
    for v in var:
        data[v] = data[v].apply(lambda x : str(x).replace(',','.').strip())
    return data

# to remove comma:
def remove_comma(data,var):
    for v in var:
        data[v] = data[v].apply(lambda x : str(x).replace(',','').strip())
    return data

# to convert to numeric / float type (if some NaN)
def convert_to_float(data,float_var):
    for fv in float_var:
        data[fv] = data[fv].apply(pd.to_numeric, errors='coerce')
    return data

# other way to convert to numeric / float type (if no NaN)
def convert_to_float2(data,float_var):
    for fv in float_var:
        data[iv] = data[iv].astype('float64', copy=False)
        #data[fv] = data[fv].apply(lambda x: x.astype('str').astype('float64', copy=False))
    return data

In [None]:
# first we convert categorical variable to category format: 
cat_var = ['vehicle_type_id', 'start_type', 'source','end_state']
convert_to_category(data,cat_var)

# for geospatial data, first we check and replace ',' with '.' if any :
geo_var = ['start_lat','start_lon','end_lat','end_lon']
var = geo_var
replace_comma_per_dot(data,var)

# for numeric data, first  we remove the ',' thousand separator if any :
float_var = ['price','price_distance','price_duration','distance','duration','price_supplements','rating','rider_waiting_time','driver_waiting_time']
var = float_var
remove_comma(data,var)

# then we convert all var (geospatial & numeric) to numeric / float.
float_var = [float_var + geo_var]
convert_to_float(data,float_var)
#convert_to_float2(data,float_var)

In [None]:
# checking all types conversion:
data.dtypes

In [None]:
valcount(data, 'vehicle_type_id')

In [None]:
data.loc[data.vehicle_type_id == '21620ea5749f2e0679a8c72c7fbafb9e', 'vehicle_type_id'] = 'A'
data.loc[data.vehicle_type_id == '077866c3fd1a75f51ca7f8eae166ae32', 'vehicle_type_id'] = 'B'
valcount(data, 'vehicle_type_id')

In [None]:
valcount(data, 'start_type')

In [None]:
valcount(data, 'source')

In [None]:
valcount(data, 'end_state')

In [None]:
data.loc[data['end_state'] == 'no show',['end_state','price','price_distance','price_duration','distance','duration','rider_waiting_time','driver_waiting_time', 'price_supplements']]
# here we see that 'no show' is when the rider had been waiting, the ride took place (distance and duration) but only got a price for duration.

In [None]:
data.loc[data['end_state'] == 'not found',['end_state','price','price_distance','price_duration','distance','duration','rider_waiting_time','driver_waiting_time', 'price_supplements']]
# here we see that 'not found' is when the driver did not find the rider so no ride took place.

In [None]:
data.loc[data['price'] == '4',['end_state','price','price_distance','price_duration','distance','duration','rider_waiting_time','driver_waiting_time', 'price_supplements']].head(20)

In [None]:
data.loc[data['price'] == '4,5',['end_state','price','price_distance','price_duration','distance','duration','rider_waiting_time','driver_waiting_time', 'price_supplements']].head(20)

In [None]:
data.loc[data['price'] > 20000,['end_state','price','price_distance','price_duration','distance','duration','rider_waiting_time','driver_waiting_time', 'price_supplements']]

In [None]:
data.loc[data['price'] == '16',['end_state','price','price_distance','price_duration','distance','duration','rider_waiting_time','driver_waiting_time', 'price_supplements']]

In [None]:
data.loc[data['price'] == '10',['end_state','price','price_distance','price_duration','distance','duration','rider_waiting_time','driver_waiting_time', 'price_supplements']]

In [None]:
data[['price','price_distance','price_duration','price_supplements']].describe()

In [None]:
data['price'].value_counts().head(10)

In [None]:
data['price_distance'].value_counts().head(10)
# price distance seems to be expected values to convert to MXN

In [None]:
data['price_duration'].value_counts().head(10)
# price duration seems to be expected values to convert to MXN

In [None]:
data.loc[data['price_duration'] == '5',['end_state','price','price_distance','price_duration','distance','duration','rider_waiting_time','driver_waiting_time', 'price_supplements']]

In [None]:
data['price_supplements'].value_counts().head(10)
# price supplements seems to be expected values to convert to MXN

In [None]:
data.price.value_counts().head(100)

In [None]:
data.loc[((data['end_state'] == 'rider cancel') & (data['price'] != 0)& (data['price'] != 4)& (data['price'] != 10)),['end_state','price','price_distance','price_duration','price_supplements']]

In [None]:
data.loc[((data['price'] > 0) & (data['price'] < max(len(data['price_distance']),len(data['price_duration']),len(data['price_supplements'])))),['end_state','price','price_distance','price_duration','price_supplements']]

In [None]:
x = ''
if re.match('^\d{2,3}$', x): 
    df[v] = str(df[v]+'00') if df['end_state']=='rider cancel' else str(df[v]+'000')

In [None]:
# we convert currency format: from centavos to pesos
'''
45 pesos (2,12€) seems to be the minimal rate for a drive
https://www.numbeo.com/taxi-fare/in/Mexico-City
https://www.taxi-calculator.com/taxi-rate-mexico_city/350
https://vanguardia.com.mx/articulo/uber-vs-cabify-vs-easy-taxi-cual-es-mejor
https://www.elfinanciero.com.mx/tech/que-te-conviene-taxi-o-uber

price_mini = taxi 8 MXN (800 cents)(basic fee + kms) - UBER 35 MXN (3500 cents) - CABIFY 40 MXN (4000 cents)
price_distance mini = 1 MXN (250m) - 4 MXN (1km)
price_duration_mini = 100 MXN

1 MXN = 100 cents
1.000 cents = 0,47 EUR | 10.000 cents = 4.7 EUR | 100.000 cents = 47 EUR | 1.000.000 cents = 470 EUR
   10 MXN = 0,47 EUR   |    100 MXN = 4.7 EUR   |   1.000 MXN = 47 EUR   |    10.000 MXN = 470 EUR

4 = 4000 cents or 40 MXN --> concat '000'
10 = 1000(0) cents or 10(0) MXN --> concat '00' if rider cancel | --> concat '000' else
100 = 10000(0) cents or 100(0) MXN --> concat '00' if rider cancel | --> concat '000' else

4,5 = 4500 cents or 45 MXN --> concat '00'  & remove ','
16,2 = 1620(0) cents or 16(0).20 MXN --> (concat '0' if rider cancel | --> concat '00' else) & remove ',' 
160,2 = 16020(0) cents or 160(0).20 MXN --> (concat '0' if rider cancel | --> concat '00' else) & remove ','

4,20 = 4200(0) cents or 42(0) MXN  --> concat '0' else) & remove ','  
16,20 = 1620(0) cents or 16(0).20 MXN --> ( NO concat if rider cancel | --> concat '0' else) & remove ',' 
160,20 = 16020(0) cents or 160(0).20 MXN --> ( NO concat if rider cancel | --> concat '0' else) & remove ',' 

4,205 = 4205 cents or 42.05 MXN --> remove ','
14,200 = 14200 cents or 142 MXN --> remove ','
140,200 = 140200 cents or 1402 MXN --> remove ','
'''

# to replace comma per dot:
def replace_comma_per_dot(x):
    return str(x).replace(',','.').strip()

# to remove comma:
def remove_comma(x):
     return str(x).replace(',','').strip()

def correct_price_pattern(df,var):
    for v in var:
        df[v] = df[v].astype('str')
        s = str(df[v])
        rc = remove_comma(s)
        
        re.sub('^\d+\,\d{3}$',rc,df[v])
        re.sub('^\d{1}$',(s+'000'),df[v])
        re.sub('^\d{1}\,\d{1}$',(rc+'00'),df[v])
        re.sub('^\d{1}\,\d{2}$',(rc+'0'),df[v])
        
        if df['end_state']=='rider cancel' or df['end_state']=='no show':
            re.sub('^\d{2,3}$',(s+'00'),df[v])
            re.sub('^\d{2,3}\,\d{1}$',(rc+'0'),df[v])
            re.sub('^\d{2,3}\,\d{2}$',rc,df[v])    
        elif df['end_state']=='drop off':
            re.sub('^\d{2,3}$',(s+'000'),df[v])
            re.sub('^\d{2,3}\,\d{1}$',(rc+'00'),df[v])
            re.sub('^\d{2,3}\,\d{2}$',(rc+'0'),df[v])
        else:      
            df[v]=df[v]
    return df[var]
   

'''   
def correct_price_pattern(df,var):
    for v in var:
        df[v] = df[v].astype('str') 
               
        if re.match('^\d+\,\d{3}$', df[v]): 
            df[v] = remove_comma(df[v])
               
        elif re.match('^\d{1}$', df[v]): 
            df[v] = df[v]+'000'
        elif re.match('^\d{1}\,\d{1}$', df[v]): 
            df[v] = remove_comma(df[v])+'00'
        elif re.match('^\d{1}\,\d{2}$', df[v]): 
            df[v] = remove_comma(df[v])+'0'
               
        elif re.match('^\d{2,3}$', df[v]): 
            df[v] = (df[v]+'00') if df['end_state']=='rider cancel' else (df[v]+'000')        
        elif re.match('^\d{2,3}\,\d{1}$', df[v]):     
            df[v] = (remove_comma(df[v])+'0') if df['end_state']=='rider cancel' else (remove_comma(df[v])+'00')
        elif re.match('^\d{2,3}\,\d{2}$', df[v]):     
            df[v] = remove_comma(df[v]) if df['end_state']=='rider cancel' else (remove_comma(df[v])+'0')
        else:
            df[v] = df[v]
    return df[var]
    
'''
              




In [4]:
def remove_comma(x):
     return str(x).replace(',','').strip()

# to convert to numeric / int type :
def convert_to_int(df,int_var):
    for iv in int_var:
        df[iv] = df[iv].apply(lambda x: x.astype('int64', copy=False))
    return df[int_var]

# to convert to numeric / int or float type (if some NaN)
def convert_vartype_to_numeric(col):
    col = col.apply(pd.to_numeric, errors='coerce')
    return col

# to convert cents MXN to MXN:
def cents_to_MXN(x):
    return x/100

In [None]:
'''
def clean_cents(s):
    rc = remove_comma(s)
    s= re.sub('^\d+\,\d{3}$',rc,s)
    return int(s)

def clean_minimums(s):
    rc = remove_comma(s)
    s= re.sub('^\d{1}$',(s+'000'),s)
    s= re.sub('^\d{1}\,\d{1}$',(rc+'00'),s)
    s= re.sub('^\d{1}\,\d{2}$',(rc+'0'),s)
    return int(s)

def clean_dropoffs(s):
    rc = remove_comma(s)
    s= re.sub('^\d{2,3}$',(s+'000'),s)
    s= re.sub('^\d{2,3}\,\d{1}$',(rc+'00'),s)
    s= re.sub('^\d{2,3}\,\d{2}$',(rc+'0'),s)
    return int(s)

def clean_no_dropoffs(s):
    rc = remove_comma(s)
    s= re.sub('^\d{2,3}$',(s+'00'),s)
    s= re.sub('^\d{2,3}\,\d{1}$',(rc+'0'),s)
    s= re.sub('^\d{2,3}\,\d{2}$',rc,s) 
    return int(s)

def clean_others(s):
    return int(s)

s = clean_no_dropoffs('42,3')
print(s, type(s))'''

In [5]:
def clean_cents(s):
    rc = remove_comma(s)
    s = re.sub('^\d+\,\d{3}$',rc,s)
    return s

def clean_minimums(s):
    rc = remove_comma(s)
    s = re.sub('^\d{1}$',(s+'000'),s)
    s = re.sub('^\d{1}\,\d{1}$',(rc+'00'),s)
    s = re.sub('^\d{1}\,\d{2}$',(rc+'0'),s)
    return s

def clean_dropoffs(s):
    rc = remove_comma(s)
    s = re.sub('^\d{2,3}$',(s+'000'),s)
    s = re.sub('^\d{2,3}\,\d{1}$',(rc+'00'),s)
    s = re.sub('^\d{2,3}\,\d{2}$',(rc+'0'),s)
    return s

def clean_no_dropoffs(s):
    rc = remove_comma(s)
    s = re.sub('^\d{2,3}$',(s+'00'),s)
    s = re.sub('^\d{2,3}\,\d{1}$',(rc+'0'),s)
    s = re.sub('^\d{2,3}\,\d{2}$',rc,s) 
    return s

In [6]:
def clean_shit_string(df,var):
    for v in var:
        df[v] = df[v].astype('str')
        df[v] = df[v].apply(clean_cents)
        df[v] = df[v].apply(clean_minimums)
        df[v] = df[v].loc[(df['end_state'].isin(['rider cancel','no show']))] = df[v].apply(clean_no_dropoffs)
        df[v] = df[v].loc[(df['end_state']=='drop off')] = df[v].apply(clean_dropoffs)
    df[var] = df[var].apply(pd.to_numeric, errors='coerce')
    df[var] = df[var].apply(cents_to_MXN)
    return df[var]

In [7]:
price_var_to_clean = ['price','price_duration','price_distance','price_supplements']
clean_shit_string(data,price_var_to_clean)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,price,price_duration,price_distance,price_supplements
0,58.66,0.00,52.04,662.00
1,0.00,0.00,0.00,0.00
2,40.00,0.00,35.52,0.00
3,0.00,0.00,0.00,0.00
4,40.00,12.00,29.74,0.00
5,40.00,290.00,33.46,0.00
6,0.00,,,0.00
7,64.49,20.82,43.67,0.00
8,0.00,,,0.00
9,80.63,0.00,40.63,0.00


In [29]:
total_prices = data.loc[(data['price'] > 0)]
print(total_prices[price_var_to_clean].describe())
print('\n', total_prices['price'].value_counts().head(20))

             price  price_duration  price_distance  price_supplements
count  2761.000000     2729.000000     2729.000000        2761.000000
mean     68.426204       62.749689       72.430022          10.137917
std      59.777768      176.989486      113.126254          88.238424
min      10.000000        0.000000        0.000000           0.000000
25%      40.000000        0.000000       31.150000           0.000000
50%      43.900000        0.000000       41.690000           0.000000
75%      69.920000       12.350000       63.640000           0.000000
max    1745.540000      991.000000     1241.210000         922.000000

 40.00    1064
45.00     103
10.00      14
42.25       5
44.04       5
44.24       4
45.47       4
45.73       4
42.17       4
42.37       4
45.40       3
40.05       3
46.05       3
41.95       3
41.61       3
51.53       3
40.99       3
41.82       3
45.38       3
45.66       3
Name: price, dtype: int64


In [26]:
no_dropoffs_prices = data.loc[((data['price'] > 0)&(data['end_state'].isin(['rider cancel','no show'])))]
print(no_dropoffs_prices[price_var_to_clean].describe())
print('\n', no_dropoffs_prices['price'].value_counts().head(20))

        Unnamed: 0  start_lat  start_lon    end_lat    end_lon       price  \
count    79.000000  79.000000  79.000000  75.000000  75.000000   79.000000   
mean   1932.139241  19.418704 -99.184573  19.418591 -99.182663   40.419114   
std    1174.578634   0.021290   0.031896   0.023475   0.037913   12.148500   
min      49.000000  19.356000 -99.278691  19.356000 -99.275269   10.000000   
25%     883.500000  19.407004 -99.183325  19.418878 -99.183926   40.000000   
50%    2119.000000  19.423091 -99.175830  19.423091 -99.174865   40.000000   
75%    2845.000000  19.434132 -99.171955  19.429759 -99.169503   40.000000   
max    3863.000000  19.455057 -99.081749  19.441434 -99.071908  101.770000   

       price_distance  price_duration  price_supplements     rating  
count            47.0       47.000000               79.0  79.000000  
mean              0.0       47.527234                0.0   8.227848  
std               0.0      121.829303                0.0   1.300388  
min              

In [28]:
dropoffs_prices = data.loc[((data['price'] > 0)&(data['end_state']=='drop off'))]
print(dropoffs_prices[price_var_to_clean].describe())
print('\n', dropoffs_prices['price'].value_counts().head(20))

             price  price_duration  price_distance  price_supplements
count  2682.000000     2682.000000     2682.000000        2682.000000
mean     69.251171       63.016450       73.699303          10.436536
std      60.420103      177.807871      113.702771          89.511616
min      10.000000        0.000000        0.000000           0.000000
25%      40.000000        0.000000       31.922500           0.000000
50%      44.260000        0.000000       42.020000           0.000000
75%      72.170000       12.237500       64.302500           0.000000
max    1745.540000      991.000000     1241.210000         922.000000

 40.00    1001
45.00      98
10.00       9
42.25       5
44.04       5
44.24       4
45.47       4
45.73       4
42.17       4
42.37       4
40.99       3
40.05       3
46.05       3
41.95       3
45.40       3
51.53       3
41.82       3
45.38       3
45.66       3
50.75       3
Name: price, dtype: int64


In [None]:
'''
def clean_shit_MXN(df,var):
    for v in var:
        df[v] = df[v].apply(convert_vartype_to_numeric)
        df[v] = df[v].apply(cents_to_MXN)
    return df[var]
clean_shit_MXN(data,price_var_to_clean)
'''

In [None]:
def concat0(s, n_zeros):
    # concatena a un string el número de ceros indicado
    zeros = n_zeros * '0'
    return s + zeros
    
s001 = concat0('4,5', 2)
print (s001, type(s001))

In [None]:
def check_comma(s):
    # comprueba si hay coma en el string dado
    if ',' in s:
        return True
    else:
        return False

In [None]:
def count_decimals(s):
    # devuelve el número de decimales de un número en string
    if check_comma(s):
        lista = s.split(',')
        return len(lista[1])
    else:
        return 0
print(count_decimals('4,1'))

In [None]:
def count_integers(s):
    i=s[:s.index(',')]
    return len(i)

f = count_integers('544,4')
print (f, type(f))

In [None]:
def normalize_dropoff_price(s):
    # normalize values of prices from string
    dec = count_decimals(s)
    zeros = 3-dec
    if dec != 0: # if decimals remove comma
        s = s.replace(',','') 
    return concat0(s, zeros)

print (normalize_dropoff_price('4,112'))

In [None]:
def clean_shit(df,lab,var):
    if df.loc[(df[lab].isin(['rider cancel','no show'])),[lab]]:
        if count_decimals()
    
    elif df.loc[(df[lab].isin(['drop off'])),[lab]]:
    
    else:
        df[lab]
df.loc[(df[lab] == 'driver cancel'),[lab]]   
    
check_end_state(data,'end_state')

In [None]:
data.loc[(data['end_state'] == 'driver cancel'),['price']] = 'NaN'
data['end_state'].loc[(data['end_state'].isin(['rider cancel','no show']))]

In [None]:
def clean_shit(df):
    

In [None]:
price_var_to_clean = ['price','price_duration','price_distance','price_supplements']
correct_price_pattern(data,price_var_to_clean)

In [None]:
centavos_a_pesos(data,'price')
centavos_a_pesos(data,'price_duration')
centavos_a_pesos(data,'price_distance')
centavos_a_pesos(data,'price_supplements')

In [None]:
second_to_min(data,'duration')
second_to_min(data,'driver_waiting_time')
second_to_min(data,'rider_waiting_time')

In [None]:
split_datetime(data, 'start_at')

In [None]:
split_datetime(data, 'end_at')

In [None]:
export_clean_csv(data,'clean_data')

### Data wrangling decisions:

* **Missing values:** 
- replace with '0': distance, driver_waiting_time, rider_waiting_time, price, price_duration, price_distance, price supplements
- keep 'NaN': end_lat & end_long (--> can be canceled or rider not found)

* **Variables types:** 
- object: none needed
- category: all _type & _state variables & source
- numeric: 
    + all prices+rating (float) variables
    + duration/time (float or int) variables
    + all geolocation variables: _lon & _lat variables (start & end)
    + datetime : _at variables (start & end)

* **Convert measures /Add new variables:** 
- datetime : extract year, month, day, time
- duration & time var --> second to minutes format
- price var --> centavos to pesos format

* **Change label:** 
- vehicle_type_id: 2 types --> rename with 'A'/'B'
- driver_waiting_time & rider_waiting_time: ' --> from categorical to time format

* **Drop variables:** 
- few values: discount
- not relevant: unnamed_1