In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/LinkedInLearning/data_cleaning_python_2883183/main/Ch05/challenge/workshops.csv')

In [3]:
df

Unnamed: 0,Year,Month,Start,End,Name,Earnings
0,2021.0,,,,,
1,,June,,,,
2,,,1.0,3.0,gRPC in Go,"$33,019"
3,,,7.0,10.0,Optimizing Python,"$42,238"
4,,,28.0,30.0,python Foundations,"$24,372"
5,,July,,,,
6,,,5.0,8.0,go concurrency,"$46,382"
7,,,21.0,22.0,Writing Secure Go,"$27,038"


In [4]:
df.dtypes

Year        float64
Month        object
Start       float64
End         float64
Name         object
Earnings     object
dtype: object

"""

Fix the data frame. At the end, row should have the following columns:
- start: pd.Timestemap
- end: pd.Timestamp
- name: str
- topic: str (python or go)
- earnings: np.float64

"""

In [5]:
# use forward fill to fill in the missing year and months
df['Year'].fillna(method = 'ffill',inplace=True)
df['Month'].fillna(method = 'ffill',inplace=True)

In [6]:
df

Unnamed: 0,Year,Month,Start,End,Name,Earnings
0,2021.0,,,,,
1,2021.0,June,,,,
2,2021.0,June,1.0,3.0,gRPC in Go,"$33,019"
3,2021.0,June,7.0,10.0,Optimizing Python,"$42,238"
4,2021.0,June,28.0,30.0,python Foundations,"$24,372"
5,2021.0,July,,,,
6,2021.0,July,5.0,8.0,go concurrency,"$46,382"
7,2021.0,July,21.0,22.0,Writing Secure Go,"$27,038"


In [7]:
# select the earnings with not null value and copy the dataframe
df = df[pd.notnull(df['Earnings'])].copy()

In [8]:
df

Unnamed: 0,Year,Month,Start,End,Name,Earnings
2,2021.0,June,1.0,3.0,gRPC in Go,"$33,019"
3,2021.0,June,7.0,10.0,Optimizing Python,"$42,238"
4,2021.0,June,28.0,30.0,python Foundations,"$24,372"
6,2021.0,July,5.0,8.0,go concurrency,"$46,382"
7,2021.0,July,21.0,22.0,Writing Secure Go,"$27,038"


In [9]:
def as_date(row, col):
    year = int(row['Year'])
    month = row['Month']
    day = int(row[col])
    ts = f'{month} {day}, {year}'
    return pd.to_datetime(ts, format='%B %d, %Y')

In [10]:
df['start'] = df.apply(as_date, axis=1, args=('Start',))
df['end'] = df.apply(as_date, axis=1, args=('End',))
df

Unnamed: 0,Year,Month,Start,End,Name,Earnings,start,end
2,2021.0,June,1.0,3.0,gRPC in Go,"$33,019",2021-06-01,2021-06-03
3,2021.0,June,7.0,10.0,Optimizing Python,"$42,238",2021-06-07,2021-06-10
4,2021.0,June,28.0,30.0,python Foundations,"$24,372",2021-06-28,2021-06-30
6,2021.0,July,5.0,8.0,go concurrency,"$46,382",2021-07-05,2021-07-08
7,2021.0,July,21.0,22.0,Writing Secure Go,"$27,038",2021-07-21,2021-07-22


In [11]:
# if name has go return go, if python return python
# then add a topic column 
# convert the name column into lowercase and apply topic definition

def topic(name):
    if 'go' in name:
        return 'go'
    if 'python' in name:
        return 'python'

df['topic'] = df['Name'].str.lower().apply(topic)
df

Unnamed: 0,Year,Month,Start,End,Name,Earnings,start,end,topic
2,2021.0,June,1.0,3.0,gRPC in Go,"$33,019",2021-06-01,2021-06-03,go
3,2021.0,June,7.0,10.0,Optimizing Python,"$42,238",2021-06-07,2021-06-10,python
4,2021.0,June,28.0,30.0,python Foundations,"$24,372",2021-06-28,2021-06-30,python
6,2021.0,July,5.0,8.0,go concurrency,"$46,382",2021-07-05,2021-07-08,go
7,2021.0,July,21.0,22.0,Writing Secure Go,"$27,038",2021-07-21,2021-07-22,go


In [12]:
import numpy as np

In [13]:
# fix the earning, remove dollar sign and comma.
# convert it to float64 value or datatype

df['earnings'] = pd.to_numeric(
    df['Earnings'].str.replace(r'[$,]', '')
).astype(np.float64)
df

  df['Earnings'].str.replace(r'[$,]', '')


Unnamed: 0,Year,Month,Start,End,Name,Earnings,start,end,topic,earnings
2,2021.0,June,1.0,3.0,gRPC in Go,"$33,019",2021-06-01,2021-06-03,go,33019.0
3,2021.0,June,7.0,10.0,Optimizing Python,"$42,238",2021-06-07,2021-06-10,python,42238.0
4,2021.0,June,28.0,30.0,python Foundations,"$24,372",2021-06-28,2021-06-30,python,24372.0
6,2021.0,July,5.0,8.0,go concurrency,"$46,382",2021-07-05,2021-07-08,go,46382.0
7,2021.0,July,21.0,22.0,Writing Secure Go,"$27,038",2021-07-21,2021-07-22,go,27038.0


In [14]:
df.dtypes

Year               float64
Month               object
Start              float64
End                float64
Name                object
Earnings            object
start       datetime64[ns]
end         datetime64[ns]
topic               object
earnings           float64
dtype: object

In [15]:
# take only topics that are interesting
# rename the columns

df = df[['start', 'end', 'Name', 'topic', 'earnings']]
df.rename(columns={'Name': 'name'}, inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'Name': 'name'}, inplace=True)


Unnamed: 0,start,end,name,topic,earnings
2,2021-06-01,2021-06-03,gRPC in Go,go,33019.0
3,2021-06-07,2021-06-10,Optimizing Python,python,42238.0
4,2021-06-28,2021-06-30,python Foundations,python,24372.0
6,2021-07-05,2021-07-08,go concurrency,go,46382.0
7,2021-07-21,2021-07-22,Writing Secure Go,go,27038.0
