# Data types

### Converting data types

In [1]:
import pandas as pd
from numpy import nan

In [2]:
# Read tips.csv (path is relative to the working directory) into a DataFrame
tips = pd.read_csv(filepath_or_buffer='tips.csv')

In [3]:
# info() writes its report straight to stdout and returns None, so call it
# bare: wrapping it in print() appends a stray "None" line after the report.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null object
smoker        244 non-null object
day           244 non-null object
time          244 non-null object
size          244 non-null int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.4+ KB
None


In [4]:
# Show the first five rows as plain text
print(tips.head(n=5))

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [5]:
# Store 'sex' as a categorical column instead of generic object values
tips['sex'] = tips['sex'].astype('category')

In [6]:
# Store 'smoker' as a categorical column instead of generic object values
tips['smoker'] = tips['smoker'].astype('category')

In [7]:
# Print the info of tips to confirm the new category dtypes.
# info() prints the report itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line after the report.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null category
smoker        244 non-null category
day           244 non-null object
time          244 non-null object
size          244 non-null int64
dtypes: category(2), float64(2), int64(1), object(2)
memory usage: 10.3+ KB
None


### Working with numeric data

In [8]:
# Coerce 'total_bill' to a numeric dtype; values that cannot be parsed become NaN
tips['total_bill'] = pd.to_numeric(arg=tips['total_bill'], errors='coerce')

In [9]:
# Coerce 'tip' to a numeric dtype; values that cannot be parsed become NaN
tips['tip'] = pd.to_numeric(arg=tips['tip'], errors='coerce')

In [10]:
# Print the info of tips to check the numeric columns.
# info() prints the report itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line after the report.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null category
smoker        244 non-null category
day           244 non-null object
time          244 non-null object
size          244 non-null int64
dtypes: category(2), float64(2), int64(1), object(2)
memory usage: 10.3+ KB
None


### String parsing with regular expressions

In [11]:
# Import the regular expression module
import re

In [12]:
# Compile the pattern: prog
# Three digits, a hyphen, three digits, a hyphen, four digits.
# Written as a raw string so the backslashes reach the regex engine unchanged;
# in a plain string literal '\d' and '\-' are invalid escape sequences, which
# newer Python versions warn about.
prog = re.compile(r'\d{3}\-\d{3}\-\d{4}')

In [13]:
# Try the pattern against a well-formed number; match() only looks for a
# match starting at the first character and returns None when there is none
result = prog.match('123-456-7890')
print(result is not None)

True


In [14]:
# Try the pattern against a number with four leading digits; match() returns
# None because the pattern cannot match starting at the first character
result2 = prog.match('1123-456-7890')
print(result2 is not None)

False


### Extracting numerical values from strings

In [15]:
# Import the regular expression module
import re

In [16]:
# Find the numeric values: matches
# findall() returns every non-overlapping run of digits as a list of strings.
# The pattern is a raw string because '\d' in a plain string literal is an
# invalid escape sequence that newer Python versions warn about.
matches = re.findall(r'\d+', 'the recipe calls for 10 strawberries and 1 banana')

In [17]:
# Print the matches (a list of digit strings, not integers)
print(matches)

['10', '1']


### Pattern matching

In [18]:
# Write the first pattern: a phone number shaped like ddd-ddd-dddd.
# Raw string, because '\d' and '\-' in a plain string literal are invalid
# escape sequences that newer Python versions warn about.
pattern1 = bool(re.match(pattern=r'\d{3}\-\d{3}\-\d{4}', string='123-456-7890'))
print(pattern1)

True


In [19]:
# Write the second pattern: a dollar sign, any number of digits, a literal
# dot, then two digits.
# Raw string, because '\$', '\d' and '\.' in a plain string literal are
# invalid escape sequences that newer Python versions warn about.
pattern2 = bool(re.match(pattern=r'\$\d*\.\d{2}', string='$123.45'))
print(pattern2)

True


In [20]:
# Write the third pattern: an uppercase ASCII letter followed by any number
# of word characters.
# Raw string, because '\w' in a plain string literal is an invalid escape
# sequence that newer Python versions warn about.
pattern3 = bool(re.match(pattern=r'[A-Z]\w*', string='Australia'))
print(pattern3)

True


### Custom functions to clean data

In [21]:
# Define recode_gender()
def recode_gender(gender):
    """Recode a gender label as a number.

    Parameters
    ----------
    gender : object
        Label to recode. Only the exact strings 'Female' and 'Male' are
        recognised (the comparison is case-sensitive).

    Returns
    -------
    int or float
        0 for 'Female', 1 for 'Male', NaN for any other value.
    """
    # Return 0 if gender is 'Female'
    if gender == 'Female':
        return 0

    # Return 1 if gender is 'Male'
    if gender == 'Male':
        return 1

    # Anything else is treated as missing. `nan` is the name this notebook
    # imports with `from numpy import nan`; the `np` alias is not imported
    # here, so `np.nan` would raise NameError on the first unrecognised label.
    return nan

In [22]:
# Run recode_gender over the 'sex' column and keep the result as 'recode'
tips['recode'] = tips['sex'].apply(func=recode_gender)

In [23]:
# Show the first five rows, now including the 'recode' column
print(tips.head(n=5))

   total_bill   tip     sex smoker  day    time  size recode
0       16.99  1.01  Female     No  Sun  Dinner     2      0
1       10.34  1.66    Male     No  Sun  Dinner     3      1
2       21.01  3.50    Male     No  Sun  Dinner     3      1
3       23.68  3.31    Male     No  Sun  Dinner     2      1
4       24.59  3.61  Female     No  Sun  Dinner     4      0


### Lambda functions

In [27]:
# Build a string column with a '$' prefixed to each bill amount
tips['total_dollar'] = tips['total_bill'].apply(lambda amount: '${!s}'.format(amount))

In [29]:
# Strip the '$' from each value with str.replace inside a lambda
tips['total_dollar_replace'] = tips['total_dollar'].apply(
    lambda price: price.replace('$', '')
)

In [30]:
# Display the first five rows using the notebook's rich table output
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,recode,total_dollar,total_dollar_replace
0,16.99,1.01,Female,No,Sun,Dinner,2,0,$16.99,16.99
1,10.34,1.66,Male,No,Sun,Dinner,3,1,$10.34,10.34
2,21.01,3.5,Male,No,Sun,Dinner,3,1,$21.01,21.01
3,23.68,3.31,Male,No,Sun,Dinner,2,1,$23.68,23.68
4,24.59,3.61,Female,No,Sun,Dinner,4,0,$24.59,24.59


In [31]:
# Write the lambda function using regular expressions: keep the first
# "digits.digits" run found in each value.
# The pattern is a raw string because '\d' and '\.' in a plain string literal
# are invalid escape sequences that newer Python versions warn about.
# Note: the [0] raises IndexError for any value that contains no such run.
tips['total_dollar_re'] = tips['total_dollar'].apply(
    lambda x: re.findall(r'\d+\.\d+', x)[0]
)

In [32]:
# Show the first five rows with all of the derived columns
print(tips.head(n=5))

   total_bill   tip     sex smoker  day    time  size recode total_dollar  \
0       16.99  1.01  Female     No  Sun  Dinner     2      0       $16.99   
1       10.34  1.66    Male     No  Sun  Dinner     3      1       $10.34   
2       21.01  3.50    Male     No  Sun  Dinner     3      1       $21.01   
3       23.68  3.31    Male     No  Sun  Dinner     2      1       $23.68   
4       24.59  3.61  Female     No  Sun  Dinner     4      0       $24.59   

  total_dollar_replace total_dollar_re  
0                16.99           16.99  
1                10.34           10.34  
2                21.01           21.01  
3                23.68           23.68  
4                24.59           24.59  
