In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
import plotly.express as px
import sqlalchemy

In [2]:
# Load the raw trip log. The path is relative, so the CSV is resolved against
# the working directory the notebook is run from.
df = pd.read_csv(r'UberDataset.csv')
df.head()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
0,01-01-2016 21:11,01-01-2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,01-02-2016 01:25,01-02-2016 01:37,Business,Fort Pierce,Fort Pierce,5.0,
2,01-02-2016 20:25,01-02-2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,01-05-2016 17:31,01-05-2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,01-06-2016 14:42,01-06-2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


# Data Exploration

In [12]:
# Normalise the header labels to snake_case: lower-case, trimmed of
# surrounding whitespace, and with inner spaces turned into underscores.
df.columns = [label.lower().strip().replace(' ', '_') for label in df.columns]

# Inspect both ends of the frame; head() needs an explicit display() because
# only the final expression of a cell is rendered automatically.
display(df.head())
df.tail()

Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose
0,01-01-2016 21:11,01-01-2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,01-02-2016 01:25,01-02-2016 01:37,Business,Fort Pierce,Fort Pierce,5.0,
2,01-02-2016 20:25,01-02-2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,01-05-2016 17:31,01-05-2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,01-06-2016 14:42,01-06-2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site
1155,Totals,,,,,12204.7,


In [5]:
# Summary statistics for the numeric column (miles). The max of 12204.7 is the
# 'Totals' row seen at the bottom of df.tail(), not a real trip, so mean/std
# are distorted until that row is removed.
df.describe()

Unnamed: 0,miles
count,1156.0
mean,21.115398
std,359.299007
min,0.5
25%,2.9
50%,6.0
75%,10.4
max,12204.7


In [16]:
# Row/column count followed by per-column dtype and non-null counts.
# df.info() prints its report directly, so it needs no display() wrapper.
print(df.shape)
df.info()

(1156, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   start_date  1156 non-null   object 
 1   end_date    1155 non-null   object 
 2   category    1155 non-null   object 
 3   start       1155 non-null   object 
 4   stop        1155 non-null   object 
 5   miles       1156 non-null   float64
 6   purpose     653 non-null    object 
dtypes: float64(1), object(6)
memory usage: 63.3+ KB


In [7]:
# Count occurrences of each distinct full row; any count above 1 flags an
# exact duplicate record.
df.value_counts()

start_date        end_date          category  start        stop              miles  purpose        
6/28/2016 23:34   6/28/2016 23:59   Business  Durham       Cary              9.9    Meeting            2
01-01-2016 21:11  01-01-2016 21:17  Business  Fort Pierce  Fort Pierce       5.1    Meal/Entertain     1
12/29/2016 15:05  12/29/2016 15:16  Business  Kar?chi      Kar?chi           1.3    Errand/Supplies    1
12/28/2016 8:34   12/28/2016 9:06   Business  Kar?chi      Unknown Location  10.3   Meal/Entertain     1
12/29/2016 0:49   12/29/2016 1:06   Business  Kar?chi      Kar?chi           3.8    Errand/Supplies    1
                                                                                                      ..
1/26/2016 10:41   1/26/2016 10:50   Business  Whitebridge  Hazelwood         2.0    Meal/Entertain     1
1/26/2016 12:33   1/26/2016 12:41   Business  Hazelwood    Whitebridge       2.3    Errand/Supplies    1
1/26/2016 16:24   1/26/2016 16:32   Business  Whitebridge  W

In [10]:
# end_date is still a string (object) column at this point, so this ordering
# is lexicographic rather than chronological - e.g. '9/30/2016' sorts above
# '12/31/2016'.
df.sort_values('end_date', ascending=False)

Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose
780,9/30/2016 20:59,9/30/2016 22:34,Business,Islamabad,Unknown Location,16.7,
779,9/30/2016 17:39,9/30/2016 20:20,Business,Islamabad,Islamabad,37.7,
778,9/29/2016 16:13,9/29/2016 18:47,Business,Unknown Location,Islamabad,12.6,
776,9/27/2016 21:01,9/28/2016 2:37,Business,Unknown Location,Unknown Location,195.6,
777,9/28/2016 17:21,9/28/2016 19:36,Business,Islamabad,Unknown Location,20.5,
...,...,...,...,...,...,...,...
3,01-05-2016 17:31,01-05-2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
2,01-02-2016 20:25,01-02-2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
1,01-02-2016 01:25,01-02-2016 01:37,Business,Fort Pierce,Fort Pierce,5.0,
0,01-01-2016 21:11,01-01-2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain


# Cleaning

In [18]:
# The export ends with a summary row (start_date == 'Totals', miles == 12204.7)
# that is not a trip: it distorts the miles statistics and cannot be parsed as
# a date by the conversion that follows. Filter it out by value rather than by
# position so that re-running this cell can never remove a genuine trip.
df = df[df['start_date'] != 'Totals'].reset_index(drop=True)

# Only a cell's last expression is rendered automatically, so the tail needs
# an explicit display() to be visible alongside the column list.
display(df.tail())
df.columns

Index(['start_date', 'end_date', 'category', 'start', 'stop', 'miles',
       'purpose'],
      dtype='object')

In [21]:
# Convert the date/time columns to a standard datetime dtype.
# The file mixes two layouts ('01-01-2016 21:11' and '6/28/2016 23:34'), so
# format='mixed' is used to let pandas infer the format value by value.
# NOTE(review): a non-date value left in these columns (such as the 'Totals'
# row shown by df.tail()) is expected to make to_datetime raise - confirm that
# row has been removed before this cell runs.

df['start_date'] = pd.to_datetime(df['start_date'], format='mixed') 
df['end_date'] = pd.to_datetime(df['end_date'], format='mixed')


Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


In [24]:
# Give trips without a recorded purpose an explicit label instead of NaN
# (df.info() reported only 653 non-null purpose values), so they are kept as
# their own group in later group-by counts.
df['purpose'] = df['purpose'].fillna('Unspecified')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1155 entries, 0 to 1154
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   start_date  1155 non-null   datetime64[ns]
 1   end_date    1155 non-null   datetime64[ns]
 2   category    1155 non-null   object        
 3   start       1155 non-null   object        
 4   stop        1155 non-null   object        
 5   miles       1155 non-null   float64       
 6   purpose     1155 non-null   object        
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 63.3+ KB


In [29]:
# Overall summary of every column. Wrapped in display() because a bare
# expression that is not the last line of a cell is evaluated and discarded.
display(df.describe(include='all'))

# Number of trips per category.
display(df.groupby('category').size())

# Number of trips per purpose (rendered as the cell's final expression).
df.groupby('purpose').size()

category
Business    1078
Personal      77
dtype: int64

purpose
Airport/Travel       3
Between Offices     18
Charity ($)          1
Commute              1
Customer Visit     101
Errand/Supplies    128
Meal/Entertain     160
Meeting            187
Moving               4
Temporary Site      50
Unspecified        502
dtype: int64

# MySQL push and CSV export

mysql+mysqldb://username:password@host:port/db

In [30]:
# Database credentials are read from the environment so they are never stored
# in the notebook or its version history. Export UBER_DB_USER and
# UBER_DB_PASSWORD before starting Jupyter; a missing variable raises KeyError
# here rather than failing later with an opaque authentication error.
# quote_plus() escapes characters such as '@', '/' or ':' that would otherwise
# corrupt the URL.
# NOTE(review): the password that used to be hardcoded in this cell should be
# treated as leaked and rotated.
con_str = (
    'mysql+mysqldb://'
    f"{quote_plus(os.environ['UBER_DB_USER'])}"
    f":{quote_plus(os.environ['UBER_DB_PASSWORD'])}"
    '@localhost/new_schema'
)

my_con = sqlalchemy.create_engine(con_str)

In [31]:
# Write the frame to the 'uber_rides' table through the engine created above.
# if_exists='replace' overwrites any existing table of that name, and
# index=False keeps the DataFrame index out of the table.
df.to_sql('uber_rides', con=my_con, index=False, if_exists='replace')

1155

In [32]:
# Read the table back into a separate frame.
# NOTE(review): df_2 is not used by any later cell visible here - presumably a
# round-trip check; confirm, or compare it against df explicitly.
df_2 = pd.read_sql('SELECT * FROM uber_rides', con = my_con)

In [33]:
# Export the frame to CSV in the working directory, without the index column.
df.to_csv('uber_rides_clean.csv', index=False)