In [1]:
import pandas

# Read the Brisbane weather data from the csv file into a DataFrame, then display it
weather = pandas.read_csv('BrisbaneWeather.csv')
weather

Unnamed: 0,Month,MinTemp,MaxTemp,Rainfall
0,,Celcius,Celcius,Millimetres
1,January,20.7,29.4,159.6
2,February,20.6,29,158.3
3,March,19.4,28,140.7
4,April,16.6,26.1,92.5
5,May,13.3,23.2,73.7
6,June,10.9,20.9,67.8
7,July,9.5,20.4,56.5
8,August,10.3,21.8,45.9
9,September,12.9,24,45.7


In [2]:
# The above looks OK, but if we look carefully at the types of our columns, they all contain values of type object (i.e. strings and not numbers)
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Month     12 non-null     object
 1   MinTemp   13 non-null     object
 2   MaxTemp   13 non-null     object
 3   Rainfall  13 non-null     object
dtypes: object(4)
memory usage: 544.0+ bytes


In [4]:
# As a consequence, we can't compute the mean of the Rainfall column (because they are not numbers)
weather.Rainfall.mean()

TypeError: Could not convert Millimetres159.6158.3140.792.573.767.856.545.945.775.497133.3 to numeric

In [5]:
# We see, for example, that the Rainfall in June is the string '67.8'
weather.loc[6,'Rainfall']

'67.8'

In [6]:
# which is of type string
type(weather.loc[6,'Rainfall'])

str

In [7]:
# The cause of the problem in this example is the 2nd row that contains the units (Celcius and Millimetres) rather than actual data values.
# So, the simple solution for this particular example is to skip row 1 (rows are numbered from 0, so this is the 2nd line of the file) when importing the data from the csv file
weather = pandas.read_csv('BrisbaneWeather.csv', skiprows=[1])
weather

Unnamed: 0,Month,MinTemp,MaxTemp,Rainfall
0,January,20.7,29.4,159.6
1,February,20.6,29.0,158.3
2,March,19.4,28.0,140.7
3,April,16.6,26.1,92.5
4,May,13.3,23.2,73.7
5,June,10.9,20.9,67.8
6,July,9.5,20.4,56.5
7,August,10.3,21.8,45.9
8,September,12.9,24.0,45.7
9,October,15.8,26.1,75.4


In [8]:
# Which now gives us numeric types in the Rainfall column
type(weather.loc[6,'Rainfall'])

numpy.float64

In [9]:
# However, if the error(s) were spread over many rows, then we'd need a different approach.
# Our first attempt is to tell pandas to try to convert all values to type float
# In this example it fails because the string 'January' can't be implicitly converted to a float
weather = pandas.read_csv('BrisbaneWeather.csv', dtype=float)

ValueError: could not convert string to float: 'January'

In [10]:
# Let's instead only try to convert the Rainfall column to type float (by using a dictionary)
# This also fails when we try to convert the string 'Millimetres' to a float
weather = pandas.read_csv('BrisbaneWeather.csv', dtype={'Rainfall':float})

ValueError: could not convert string to float: 'Millimetres'

In [11]:
# We can instead create our own custom function to convert values in the Rainfall column of the csv file
# As a very simple starting point, we'll ignore the input parameter str and just return the constant value 3.14
def my_converter(str):
    """Placeholder converter: ignore the incoming cell text and always return 3.14.

    The parameter is named ``str`` (which shadows the built-in type inside this
    function); it is deliberately unused in this first version.
    """
    placeholder_value = 3.14
    return placeholder_value

In [12]:
# Tell pandas to use our my_converter function to convert the Rainfall column
weather = pandas.read_csv('BrisbaneWeather.csv', converters={'Rainfall': my_converter})
weather

Unnamed: 0,Month,MinTemp,MaxTemp,Rainfall
0,,Celcius,Celcius,3.14
1,January,20.7,29.4,3.14
2,February,20.6,29,3.14
3,March,19.4,28,3.14
4,April,16.6,26.1,3.14
5,May,13.3,23.2,3.14
6,June,10.9,20.9,3.14
7,July,9.5,20.4,3.14
8,August,10.3,21.8,3.14
9,September,12.9,24,3.14


In [13]:
# Now let's replace the value 3.14 by something more sensible.
# Let's try to convert each input str value to a float
def my_converter(str):
    """Convert one raw cell value (text) to a float.

    Raises ValueError when the text is not a valid number.
    """
    as_number = float(str)
    return as_number

weather = pandas.read_csv('BrisbaneWeather.csv', converters={'Rainfall': my_converter})
weather

# this fails as previously when we try to convert the string 'Millimetres' to a float

ValueError: could not convert string to float: 'Millimetres'

In [14]:
# Instead, we adapt our converter function to 'catch' the generated ValueError and return, say, math.nan (or any other value we choose) whenever a ValueError occurs
import math

def my_converter(str):
    """Convert one raw cell value to a float, substituting NaN for bad input.

    Returns float(str) when the text parses as a number. When float() raises
    ValueError (e.g. for the units text 'Millimetres') math.nan is returned instead.
    """
    try:
        number = float(str)
    except ValueError:
        # Not a number: mark the value as missing rather than aborting the import
        number = math.nan
    return number

weather = pandas.read_csv('BrisbaneWeather.csv', converters={'Rainfall': my_converter})
weather

Unnamed: 0,Month,MinTemp,MaxTemp,Rainfall
0,,Celcius,Celcius,
1,January,20.7,29.4,159.6
2,February,20.6,29,158.3
3,March,19.4,28,140.7
4,April,16.6,26.1,92.5
5,May,13.3,23.2,73.7
6,June,10.9,20.9,67.8
7,July,9.5,20.4,56.5
8,August,10.3,21.8,45.9
9,September,12.9,24,45.7


In [15]:
# The Rainfall in June is now a number (and not a string)
weather.loc[6,'Rainfall']

67.8

In [16]:
type(weather.loc[6,'Rainfall'])

numpy.float64

In [17]:
# Now that all the values in the Rainfall column are numbers, we can perform arithmetic calculations such as finding the mean Rainfall.
weather.Rainfall.mean()

95.53333333333332

In [18]:
# Or the sum of all Rainfall
weather.Rainfall.sum()

1146.4

In [19]:
# In the above example, we performed the conversion when we read the data in from the csv file.
# An alternative approach is to read it in as type string and do the conversion later ...
weather = pandas.read_csv('BrisbaneWeather.csv')
weather

Unnamed: 0,Month,MinTemp,MaxTemp,Rainfall
0,,Celcius,Celcius,Millimetres
1,January,20.7,29.4,159.6
2,February,20.6,29,158.3
3,March,19.4,28,140.7
4,April,16.6,26.1,92.5
5,May,13.3,23.2,73.7
6,June,10.9,20.9,67.8
7,July,9.5,20.4,56.5
8,August,10.3,21.8,45.9
9,September,12.9,24,45.7


In [20]:
# The to_numeric function attempts to convert a data series to be numeric values
pandas.to_numeric(weather.Rainfall)

# In this case, the conversion fails when trying to convert the string 'Millimetres' to a numeric type

ValueError: Unable to parse string "Millimetres" at position 0

In [21]:
# We can, however, specify what to do if an error occurs. In this example we say to coerce (or force) the value to a numeric type.
# In our example, the string value 'Millimetres' is coerced to the special numeric value NaN
pandas.to_numeric(weather.Rainfall, errors='coerce')

0       NaN
1     159.6
2     158.3
3     140.7
4      92.5
5      73.7
6      67.8
7      56.5
8      45.9
9      45.7
10     75.4
11     97.0
12    133.3
Name: Rainfall, dtype: float64

In [22]:
# Overwrite the Rainfall column (which held strings) with its values coerced to numeric;
# any value that cannot be parsed, such as the units text, becomes NaN
weather['Rainfall'] = pandas.to_numeric(weather['Rainfall'], errors='coerce')

In [23]:
weather

Unnamed: 0,Month,MinTemp,MaxTemp,Rainfall
0,,Celcius,Celcius,
1,January,20.7,29.4,159.6
2,February,20.6,29,158.3
3,March,19.4,28,140.7
4,April,16.6,26.1,92.5
5,May,13.3,23.2,73.7
6,June,10.9,20.9,67.8
7,July,9.5,20.4,56.5
8,August,10.3,21.8,45.9
9,September,12.9,24,45.7


In [24]:
# We see now that the Rainfall column contains values of type float64
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Month     12 non-null     object 
 1   MinTemp   13 non-null     object 
 2   MaxTemp   13 non-null     object 
 3   Rainfall  12 non-null     float64
dtypes: float64(1), object(3)
memory usage: 544.0+ bytes


In [25]:
# Let's try a new example with some different types of data cleaning needed
D202 = pandas.read_csv('D202.csv')
D202

Unnamed: 0,TYPE,DATE,START TIME,END TIME,USAGE,UNITS,COST,NOTES
0,Electric usage,10/22/2016,0:00,0:14,0.01,kWh,$0.00,
1,Electric usage,10/22/2016,0:15,0:29,0.01,kWh,$0.00,
2,Electric usage,10/22/2016,0:30,0:44,0.01,kWh,$0.00,
3,Electric usage,10/22/2016,0:45,0:59,0.01,kWh,$0.00,
4,Electric usage,10/22/2016,1:00,1:14,0.01,kWh,$0.00,
...,...,...,...,...,...,...,...,...
70363,Electric usage,10/24/2018,22:45,22:59,0.02,kWh,$0.00,
70364,Electric usage,10/24/2018,23:00,23:14,0.03,kWh,$0.01,
70365,Electric usage,10/24/2018,23:15,23:29,0.03,kWh,$0.01,
70366,Electric usage,10/24/2018,23:30,23:44,0.03,kWh,$0.01,


In [26]:
# We see in this example that the values stored in the COST column are of type object (i.e. of type string and not of a numeric type)
D202.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70368 entries, 0 to 70367
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TYPE        70368 non-null  object 
 1   DATE        70368 non-null  object 
 2   START TIME  70368 non-null  object 
 3   END TIME    70368 non-null  object 
 4   USAGE       70368 non-null  float64
 5   UNITS       70368 non-null  object 
 6   COST        70368 non-null  object 
 7   NOTES       0 non-null      float64
dtypes: float64(2), object(6)
memory usage: 4.3+ MB


In [None]:
# This prevents us from performing numeric calculations on the COST column: the values are strings, so sum() would join them together rather than add them up
D202.COST.sum()

In [28]:
# The problem is the dollar signs in the COST column.
# So, we can compute a new series that replaces each dollar sign character by the empty string ''
# regex=False tells pandas to treat '$' as a literal character: as a regular expression '$' means
# "end of string", and the default for the regex argument differs between pandas versions
D202.COST.str.replace('$', '', regex=False)

  D202.COST.str.replace('$','')


0        0.00 
1        0.00 
2        0.00 
3        0.00 
4        0.00 
         ...  
70363    0.00 
70364    0.01 
70365    0.01 
70366    0.01 
70367    0.01 
Name: COST, Length: 70368, dtype: object

In [29]:
# The above output looks OK (without the dollar signs), but the values are still of type object (i.e. strings and not numbers).
# We can use the astype function to convert these strings into floats (which works now that the dollar signs are removed).
# regex=False makes pandas treat '$' as a literal character rather than a regular expression
D202.COST.str.replace('$', '', regex=False).astype('float')

  D202.COST.str.replace('$','').astype('float')


0        0.00
1        0.00
2        0.00
3        0.00
4        0.00
         ... 
70363    0.00
70364    0.01
70365    0.01
70366    0.01
70367    0.01
Name: COST, Length: 70368, dtype: float64

In [30]:
# We can then use this approach to redefine the COST column (which did contain strings), to now contain the values converted to floats.
# regex=False makes pandas treat '$' as a literal character rather than a regular expression.
# NOTE: run this cell only once - afterwards COST holds floats, so the .str accessor is no longer available
D202.COST = D202.COST.str.replace('$', '', regex=False).astype('float')

  D202.COST = D202.COST.str.replace('$','').astype('float')


In [31]:
# We see that the values in the COST column are now of type float64
D202.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70368 entries, 0 to 70367
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TYPE        70368 non-null  object 
 1   DATE        70368 non-null  object 
 2   START TIME  70368 non-null  object 
 3   END TIME    70368 non-null  object 
 4   USAGE       70368 non-null  float64
 5   UNITS       70368 non-null  object 
 6   COST        70368 non-null  float64
 7   NOTES       0 non-null      float64
dtypes: float64(3), object(5)
memory usage: 4.3+ MB


In [32]:
# Which allows us to perform numeric calculations such as sum
D202.COST.sum()

1736.9500000000003

In [None]:
# Try creating some examples of your own ...