Data Set Demo

In [None]:
import numpy as np
import pandas as pd
# Load the sales data from a CSV file into a DataFrame (read_csv, not Excel)
df = pd.read_csv("./data/sales_data_types.csv")

In [None]:
# Show the freshly loaded frame. A bare last expression uses the notebook's
# rich table display, which is easier to read than print()'s plain text.
df

In [None]:
# Check the dtype of each column
print(df.dtypes)

# Summarise the frame (columns, non-null counts, dtypes).
# DataFrame.info() writes its report itself and returns None, so it is called
# bare: wrapping it in print() would add a stray "None" line to the output.
df.info()

In [None]:
# Cast the 'Customer Number' column to int with Series.astype() and store the
# result back in the frame, then print the dtypes to confirm the change.
df['Customer Number'] = df['Customer Number'].astype(int)
print(df.dtypes)

In [None]:
# Show the frame after the 'Customer Number' cast. A bare last expression uses
# the notebook's rich table display instead of print()'s plain text.
df

In [None]:
# Demonstration: astype('float') is expected to fail with a ValueError on this
# column (see the conversion function below, which strips "$" and ","
# first). The error is caught and printed so that Restart & Run All can
# continue past this cell instead of stopping on the exception.
try:
    df['2016'].astype('float')
except ValueError as err:
    print(f"ValueError: {err}")

`astype()` will only work if:

- the data is clean and can be simply interpreted as a number, or
- you want to convert a numeric value to a string object.

Custom Conversion Functions

In [None]:
def convert_currency(val):
    """
    Convert a currency-formatted value to a float.

    Strings have every "$" and "," removed before being parsed, so
    "$125,000.00" becomes 125000.0.

    Parameters
    ----------
    val : str or number
        Currency text, or a value that is already numeric.

    Returns
    -------
    float
        The numeric value.

    Raises
    ------
    ValueError
        If the text is still not a valid number after "$" and "," are removed.
    """
    # Values that are already numeric have no .replace(); hand them straight
    # to float() so that applying this function to a column that was converted
    # earlier (e.g. when a cell is re-run) does not fail.
    if not isinstance(val, str):
        return float(val)
    return float(val.replace(',', '').replace('$', ''))

In [None]:
# Preview only: run the converter over every '2016' value and display the
# result without storing it back in df.
df['2016'].map(convert_currency)

In [None]:
# Replace both yearly sales columns with their float equivalents.
for year_col in ('2016', '2017'):
    df[year_col] = df[year_col].apply(convert_currency)


In [None]:
# Confirm the new dtypes, then show the frame itself. Ending the cell with a
# bare `df` gives the rich table display rather than print()'s plain text.
print(df.dtypes)
df

In [None]:
def convert_percent(val):
    """
    Turn a percentage string into its decimal fraction.

    Every "%" character is stripped from ``val``, the remainder is parsed
    with float(), and the result is divided by 100, so "30.00%" gives 0.3.
    """
    digits = val.replace('%', '')
    fraction = float(digits) / 100
    return fraction

In [None]:
# Store 'Percent Growth' as decimal fractions, then display the frame.
df['Percent Growth'] = df['Percent Growth'].map(convert_percent)
df

In [None]:
# Change the Y/N strings to Python booleans.
# The comparison already yields a boolean Series, so np.where() is not needed.
# The conversion is skipped when the column is already boolean: re-running
# this cell would otherwise compare True/False with "Y" and silently set
# every row to False.
if not pd.api.types.is_bool_dtype(df["Active"]):
    df["Active"] = df["Active"] == "Y"
df

In [None]:
# Print the frame followed by its column dtypes, one after the other.
print(df, df.dtypes, sep="\n")

In [None]:
# Rename the 'Jan Units' column to 'Units'.
# With inplace=True, rename() modifies df directly (no copy is made) and
# returns None. With the default inplace=False it would instead return a new,
# renamed DataFrame and leave df unchanged.
df.rename({'Jan Units': 'Units'}, axis='columns', inplace=True)
df

In [None]:
# Convert a column or Series to a numeric data type
# Demonstration: with the default errors='raise', to_numeric() raises as soon
# as it meets a value it cannot parse. The error is caught and printed so that
# Restart & Run All can continue past this cell.
try:
    pd.to_numeric(df['Units'])
except ValueError as err:
    print(f"ValueError: {err}")

In [None]:
# Handling errors with the `errors` parameter of pd.to_numeric():
#    errors='raise' (default): raise an exception if a value cannot be converted.
#    errors='coerce': replace values that cannot be converted with NaN.
#    errors='ignore': return the original input unchanged if conversion fails
#                     (deprecated since pandas 2.2).

In [None]:
# errors='coerce' turns every value that cannot be parsed into NaN instead of
# raising. The result is only displayed here; df['Units'] is not modified.
pd.to_numeric(df['Units'], errors='coerce')

In [None]:
# Because errors='coerce' is passed, the invalid "Closed" value is replaced
# with NaN. That NaN can be left in place, or filled with 0 using fillna(0)
# as done here.
pd.to_numeric(df['Units'], errors='coerce').fillna(0)

In [None]:
# Assemble one datetime Series from the separate 'Month', 'Day' and 'Year'
# columns: to_datetime() is given a three-column DataFrame of date components.
# The result is only displayed here, not stored.
pd.to_datetime(df[['Month', 'Day', 'Year']])

In [None]:
# Make 'Units' numeric: unparseable entries become NaN and are then set to 0.
df["Units"] = pd.to_numeric(df["Units"], errors="coerce").fillna(0)
# Add a Start_Date column built from the Month / Day / Year columns.
df["Start_Date"] = pd.to_datetime(df[["Month", "Day", "Year"]])

In [None]:
# Display the frame after adding Start_Date and converting Units
df

In [None]:
# Final check of the column dtypes after all conversions
print(df.dtypes)