In [1]:
# This is a Python-based solution to the Tableau Preppin' Data Challenge #2 of 2020. To learn more, see:
# https://preppindata.blogspot.com/2020/01/2020-week-2.html

# First things first, let's get the appropriate packages.  

import pandas as pd
import datetime as dt
import numpy as np
from datetime import datetime

In [4]:
# Load the challenge CSV into a pandas DataFrame.
# Every later cell reads from and writes back to the `C2` variable.

source_csv = 'Challenge 2 Data.csv'
C2 = pd.read_csv(source_csv)

In [5]:
# The raw values can carry commas, semi-colons, spaces or colons. Remove each of
# them up front so later steps only deal with digits and am/pm letters.
# Note: the replacement is applied to every string cell of the frame, not just 'Time'.

for separator in (',', ';', ' ', ':'):
    C2 = C2.replace(regex=[separator], value='')

In [6]:
# 'Time' contains letters (am/pm markers); force them to lower case so a single
# spelling has to be matched from here on.
C2 = C2.assign(Time=C2['Time'].str.lower())

In [7]:
# Helper column "First": the first 2 characters of each 'Time' value.
# It is recomputed further down, once the times have been zero-padded, before it is used
# for the hour arithmetic.

C2['First'] = C2['Time'].str.slice(stop=2)

In [8]:
# Convert the midnight hour ("12xx am") to 24-hour notation ("00xx").  This step has a few
# parts, so they are separated accordingly.

# A value is in the midnight hour when it starts with "12", followed by either nothing or
# exactly two minute digits, and then the 'a' of am.  Anchoring on the start of the string
# matters twice over:
#   * only the leading hour is rewritten ("1212am" -> "0012am", not "0000am");
#   * a colon-stripped "1:2x am" such as "121am" is NOT mistaken for midnight.
MIDNIGHT_HOUR = r'^12(?=(?:\d{2})?a)'

# Boolean mask of the midnight-hour rows; na=False keeps missing values out of the mask.
am = C2['Time'].str.contains(MIDNIGHT_HOUR, na=False)

# The same values with the leading "12" swapped for "00".
fix = C2['Time'].str.replace(MIDNIGHT_HOUR, '00', regex=True)

# np.where picks element-wise: the fixed value where the mask is True, the untouched value
# otherwise.  The result is a numpy array, not a pandas Series.
New = np.where(am, fix, C2['Time'])

In [7]:
# np.where returned a plain numpy array, so display it to check the result.
# The next-to-last element reads "0001a": one minute past midnight, now in 24-hour form.
New

array(['0145', '0817', '0836', '1018', '1152', '1407', '1601', '1607',
       '1626', '0804', '1825', '1913', '2117', '2230', '2255', '2353',
       '1917', '719a', '1152am', '1244', '1325', '204pm', '1508', '658pm',
       '1921', '2033', '2059', '2225', '2351', '0930', '0943', '0040',
       '1629', '1638', '516pm', '613pm', '736pm', '1929', '2100', '2318',
       '0611', '0725', '0925', '0835', '1241', '251pm', '1548', '1711',
       '0001a', '0857'], dtype=object)

In [8]:
# Write the array computed above back into the dataframe as the 'Time' column
# (pandas accepts the numpy array directly).
C2 = C2.assign(Time=New)

In [9]:
# Midnight is squared away, so the am marker carries no further information: strip every
# 'a' and 'm' from 'Time'.  This also turns "pm" into a bare "p", which is kept for now so
# the afternoon rows can still be identified.
# The replacement is scoped to 'Time' only, so other columns (e.g. 'Date') are never altered.
C2['Time'] = C2['Time'].str.replace(r'[am]', '', regex=True)

In [10]:
# Flag the afternoon/evening rows: 'PM' is True wherever 'Time' still contains a 'p'.
C2 = C2.assign(PM=C2['Time'].str.contains('p'))

In [11]:
# One of the last clean-ups: the 'PM' flag now records which rows are pm, so drop the 'p'
# itself and leave only digits in 'Time'.
# Scoped to 'Time' only, so a 'p' in any other column (e.g. 'Date') is left alone.
C2['Time'] = C2['Time'].str.replace('p', '', regex=False)

In [12]:
# Every time must be 4 characters long (HHMM).  Shorter values are right-justified and
# filled with leading zeros until they reach that width.

C2['Time'] = C2['Time'].str.rjust(4, fillchar='0')

In [13]:
# 'Time' is now 4 characters wide and the pm rows are flagged, so rebuild the "First"
# helper column: with the padding in place, its 2 characters are the hour digits.

C2['First'] = C2['Time'].str.slice(0, 2)

In [14]:
# Use numpy and pandas together again to convert the pm rows to 24-hour notation.

# Hour digits as integers, used for the comparison and the arithmetic below.
hour = C2['First'].astype('int32')

# Only flagged rows whose hour is 1-11 need shifting:
#   * hour 12 pm is already correct (12 + 12 = 24 is not a valid hour and breaks '%H' parsing);
#   * hours of 13 and above are already in 24-hour form;
#   * hour 0 is left untouched, as before.
pm = C2['PM'].eq(True) & hour.between(1, 11)

# The shifted hour, kept as text so "First" stays a column of 2-character strings
# (1-11 plus 12 is always two digits).
fix = (hour + 12).astype('str')

# Put the array straight into the dataframe: shifted hour for pm rows, original otherwise.
C2['First'] = np.where(pm, fix, C2['First'])

In [15]:
# Rebuild 'Time' from the (possibly shifted) hour in "First" plus the remainder of the old
# 'Time'.  Note the slicing: the first two characters are skipped here, keeping everything
# from the third character onwards (the minutes of a 4-character value).

hour_digits = C2['First'].astype('str')
minute_digits = C2['Time'].str.slice(start=2)
C2['Time'] = hour_digits + minute_digits

In [16]:
# The helper columns "First" and "PM" have served their purpose, so remove them.
# drop() returns a new frame, hence the re-assignment to C2.

C2 = C2.drop(['First', 'PM'], axis='columns')

In [17]:
# Parse the HHMM strings with pandas and keep only the time-of-day part, so 'Time' holds
# real time objects instead of text.
parsed_times = pd.to_datetime(C2['Time'], format='%H%M')
C2['Time'] = parsed_times.dt.time

In [18]:
# Next to last step: parse "Date" with pandas' to_datetime and write it back as
# day/month/year text.
# NOTE(review): no explicit format or dayfirst is passed, so pandas infers the day/month
# order of the input. Confirm this matches the source data.

parsed_dates = pd.to_datetime(C2['Date'])
C2['Date'] = parsed_dates.dt.strftime('%d/%m/%Y')

In [19]:
# Last step: build the "Date Time" column by joining the text of "Date" and "Time"
# with a comma and a space.

C2['Date Time'] = C2['Date'].astype('str').str.cat(C2['Time'].astype('str'), sep=', ')

In [20]:
# Re-arrange the columns into the order of the expected output.  "Time" is left in its
# current form, seconds included, for completeness' sake.

output_columns = ['Date Time', 'Date', 'Time']
C2 = C2.loc[:, output_columns]
C2

Unnamed: 0,Date Time,Date,Time
0,"01/01/2019, 01:45:00",01/01/2019,01:45:00
1,"01/01/2019, 08:17:00",01/01/2019,08:17:00
2,"01/01/2019, 08:36:00",01/01/2019,08:36:00
3,"01/01/2019, 10:18:00",01/01/2019,10:18:00
4,"01/01/2019, 11:52:00",01/01/2019,11:52:00
5,"01/01/2019, 14:07:00",01/01/2019,14:07:00
6,"01/01/2019, 16:01:00",01/01/2019,16:01:00
7,"01/01/2019, 16:07:00",01/01/2019,16:07:00
8,"01/01/2019, 16:26:00",01/01/2019,16:26:00
9,"01/01/2019, 08:04:00",01/01/2019,08:04:00
