In [1]:
# Obtaining the Underlying Data Types

import pandas as pd
import numpy as np

In [6]:
# Euro Sign = Alt + 0128

column_names = ["Customer ID", "Customer Name",\
                "2018 Revenue", "2019 Revenue",\
                "Growth", "Start Year", "Start Month",\
                "Start Day", "New Customer"]
row1 = list([1001.0, 'Pandas Banking',\
             '€235000', '€248000',\
             '5.5%', 2013, 3, 10, 0])
row2 = list([1002.0, 'Pandas Grovcery',\
             '€196000', '€205000',\
             '4.5%', 2016, 4, 30, 0])
row3 = list([1003.0, 'Pandas Telecom',\
             '€167000', '€193000', '15.5%',\
             2010, 11, 24, 0])
row4 = list([1004.0, 'Pandas Transport',\
             '€79000', '€90000', '13.9%',\
             2018, 1, 15, 1])
row5 = list([1005.0, 'Pandas Insurance',\
             '€241000', '€264000', '9.5%',\
             2009, 6, 1, 0])

data_frame = pd.DataFrame(data=[row1, row2, row3, row4, row5],\
                                columns = column_names)
                                
data_frame

Unnamed: 0,Customer ID,Customer Name,2018 Revenue,2019 Revenue,Growth,Start Year,Start Month,Start Day,New Customer
0,1001.0,Pandas Banking,€235000,€248000,5.5%,2013,3,10,0
1,1002.0,Pandas Grovcery,€196000,€205000,4.5%,2016,4,30,0
2,1003.0,Pandas Telecom,€167000,€193000,15.5%,2010,11,24,0
3,1004.0,Pandas Transport,€79000,€90000,13.9%,2018,1,15,1
4,1005.0,Pandas Insurance,€241000,€264000,9.5%,2009,6,1,0


In [8]:
data_frame['2018 Revenue'] + data_frame['2019 Revenue']

0    €235000€248000
1    €196000€205000
2    €167000€193000
3      €79000€90000
4    €241000€264000
dtype: object

In [9]:
data_frame.dtypes

Customer ID      float64
Customer Name     object
2018 Revenue      object
2019 Revenue      object
Growth            object
Start Year         int64
Start Month        int64
Start Day          int64
New Customer       int64
dtype: object

In [10]:
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Customer ID    5 non-null      float64
 1   Customer Name  5 non-null      object 
 2   2018 Revenue   5 non-null      object 
 3   2019 Revenue   5 non-null      object 
 4   Growth         5 non-null      object 
 5   Start Year     5 non-null      int64  
 6   Start Month    5 non-null      int64  
 7   Start Day      5 non-null      int64  
 8   New Customer   5 non-null      int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 492.0+ bytes


In [11]:
# Converting from One Type into Another (You have 3 Options)
# These can be found in Chapter 4 of The Pandas Workshop book.

# Use the astype() function to force an appropriate dtype.
# Use pandas functions such as to_numeric() or to_datetime().
# Create a custom function to convert the data type

In [12]:
# This is the astype() example.

data_frame["Customer ID"] = data_frame['Customer ID'].astype('int')

data_frame["Customer ID"]

0    1001
1    1002
2    1003
3    1004
4    1005
Name: Customer ID, dtype: int64

In [13]:
# This is the astype() example.
# The following code will cause an error due to € Currency.

data_frame['2018 Revenue'] = data_frame['2018 Revenue'].astype('int')

data_frame['2018 Revenue']

ValueError: invalid literal for int() with base 10: '€235000'

In [16]:
# The following code is for a custom function to remove the € and covnvert
# the remaining number into an integer.
# There is a potential typo in the book at this point.

def remove_currency(column):
    new_column = column.replace('€',"") 
    return int(new_column)

In [18]:
data_frame['2018 Revenue'] =\
data_frame['2018 Revenue'].apply(remove_currency)

data_frame["2018 Revenue"]

0    235000
1    196000
2    167000
3     79000
4    241000
Name: 2018 Revenue, dtype: int64

In [19]:
# This can be repeated for the "2019 Revenue".

data_frame['2019 Revenue'] =\
data_frame['2019 Revenue'].apply(remove_currency)

data_frame["2019 Revenue"]

0    248000
1    205000
2    193000
3     90000
4    264000
Name: 2019 Revenue, dtype: int64

In [39]:
# This function will be created for the Growth column to remove
# the % and convert it from an object into a float.
# This is another typo with the following line of code:
# new_column = column.replace('%', ")

def remove_percentage(column):
    new_column = column.replace('%', ' ')
    return float(new_column)

In [41]:
data_frame['Growth'] = data_frame['Growth'].apply(remove_percentage)
data_frame['Growth']

AttributeError: 'float' object has no attribute 'replace'

In [35]:
data_frame.rename(columns={'Start Year': 'year',\
                           'Start Month': 'month',\
                           'Start Day': 'day'},\
                           inplace = True)
                           
data_frame['Starting Date']=\
pd.to_datetime(data_frame[['day', 'month', 'year']])
                           
data_frame['Starting Date']                           

0   2013-03-10
1   2016-04-30
2   2010-11-24
3   2018-01-15
4   2009-06-01
Name: Starting Date, dtype: datetime64[ns]

In [36]:
# Converting New Customer from an int64 into a bool.

data_frame["New Customer"] =\
data_frame['New Customer'].astype('bool')

data_frame["New Customer"]

0    False
1    False
2    False
3     True
4    False
Name: New Customer, dtype: bool

In [37]:
# Converting Customer Name from an object into a category.

data_frame["Customer Name"] =\
data_frame['Customer Name'].astype('category')

data_frame["Customer Name"]

0      Pandas Banking
1     Pandas Grovcery
2      Pandas Telecom
3    Pandas Transport
4    Pandas Insurance
Name: Customer Name, dtype: category
Categories (5, object): ['Pandas Banking', 'Pandas Grovcery', 'Pandas Insurance', 'Pandas Telecom', 'Pandas Transport']

In [38]:
data_frame.dtypes

Customer ID               int64
Customer Name          category
2018 Revenue              int64
2019 Revenue              int64
Growth                  float64
year                      int64
month                     int64
day                       int64
New Customer               bool
Starting Date    datetime64[ns]
dtype: object