# Data preparation

## Setup

In [1]:
import pandas as pd

## Data

### Data import

In [2]:
# Location of the source CSV in the kirenz/datasets GitHub repository
URL = 'https://raw.githubusercontent.com/kirenz/datasets/master/6_2_data.csv'

# Download the file and parse it into a DataFrame
df = pd.read_csv(filepath_or_buffer=URL)

In [3]:
# Preview the first five rows of the imported data
df.head(n=5)

Unnamed: 0,2019,Dials Made,Accounts Worked,Not Worked,Penetration Rate
0,JAN,450000,225000,225000,5
1,FEB,390000,185000,205000,4743589744
2,MAR,330000,138000,192000,4181818182
3,APR,320000,137000,183000,428125
4,MAY,365000,150000,215000,4109589041


In [4]:
# Print column names, non-null counts and dtypes of the imported frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   2019              12 non-null     object
 1   Dials Made        12 non-null     int64 
 2   Accounts Worked   12 non-null     int64 
 3   Not Worked        12 non-null     int64 
 4   Penetration Rate  12 non-null     object
dtypes: int64(3), object(2)
memory usage: 612.0+ bytes


### Data transformation

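First, we prepare the date variable by combining the year with the month abbreviations in the `2019` column and parsing the result into a datetime column named `Date`: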

In [9]:
# Create Date: prefix the month abbreviation stored in column '2019'
# (e.g. 'JAN') with the year and parse it as a datetime (-> 2019-01-01),
# then drop the original '2019' column.
# Both steps are done in one chained expression that returns a new frame
# instead of mutating `df` with `inplace=True`, so the cell never leaves the
# frame in a half-transformed state.
# NOTE: this cell consumes the '2019' column; re-run the data import cell
# before running it a second time.
df = (
    df
    .assign(Date=lambda d: pd.to_datetime('2019' + d['2019'], format='%Y%b'))
    .drop(columns='2019')
)

df

Unnamed: 0,Dials Made,Accounts Worked,Not Worked,Penetration Rate,Date
0,450000,225000,225000,5,2019-01-01
1,390000,185000,205000,4743589744,2019-02-01
2,330000,138000,192000,4181818182,2019-03-01
3,320000,137000,183000,428125,2019-04-01
4,365000,150000,215000,4109589041,2019-05-01
5,335000,115000,220000,3432835821,2019-06-01
6,275000,100000,175000,3636363636,2019-07-01
7,270000,108000,162000,4,2019-08-01
8,250000,90000,160000,36,2019-09-01
9,255000,95000,160000,3725490196,2019-10-01


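Next, we use the [melt](https://pandas.pydata.org/docs/reference/api/pandas.melt.html) function to reshape the data from wide to long format: the columns `Accounts Worked` and `Not Worked` are stacked into a single `Accounts` column, with their numbers stored in a `Value` column.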

In [None]:
# Reshape from wide to long: one row per (Date, Accounts) pair, where
# 'Accounts' holds the former column name and 'Value' holds its number.
df = pd.melt(
    df,
    id_vars=['Date'],
    value_vars=['Accounts Worked', 'Not Worked'],
    var_name='Accounts',
    value_name='Value',
)
df

Unnamed: 0,Date,Accounts,Value
0,2019-01-01,Accounts Worked,225000
1,2019-02-01,Accounts Worked,185000
2,2019-03-01,Accounts Worked,138000
3,2019-04-01,Accounts Worked,137000
4,2019-05-01,Accounts Worked,150000
5,2019-06-01,Accounts Worked,115000
6,2019-07-01,Accounts Worked,100000
7,2019-08-01,Accounts Worked,108000
8,2019-09-01,Accounts Worked,90000
9,2019-10-01,Accounts Worked,95000


In [None]:
# Convert the labels in the 'Accounts' column to upper case
df = df.assign(Accounts=lambda frame: frame["Accounts"].str.upper())

df.head()

Unnamed: 0,Date,Accounts,Value
0,2019-01-01,ACCOUNTS WORKED,225000
1,2019-02-01,ACCOUNTS WORKED,185000
2,2019-03-01,ACCOUNTS WORKED,138000
3,2019-04-01,ACCOUNTS WORKED,137000
4,2019-05-01,ACCOUNTS WORKED,150000


In [None]:
# Print column names, non-null counts and dtypes of the current frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      24 non-null     datetime64[ns]
 1   Accounts  24 non-null     object        
 2   Value     24 non-null     int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 708.0+ bytes


In [None]:
# Write the frame to a CSV file in the working directory, without the row
# index. `index` is documented as a boolean; the previous `index=None` only
# worked because None is falsy.
df.to_csv("6_2_data.csv", index=False)