In [1]:
import numpy as np
import pandas as pd

## Load data from a csv  file

a csv file contains comma-separated values (CSV) <br>
https://en.wikipedia.org/wiki/Comma-separated_values

In [3]:
df = pd.read_csv('patient_record.csv', sep=',')
df

Unnamed: 0,Age,Sex,Tumor_size_mm
0,30,M,1.0
1,40,F,2.0
2,85,F,0.1
3,75,M,1.0
4,95,F,3.0


In [None]:
df.shape

In [None]:
df.columns # column indexes

In [None]:
df.index  # row indexes

In [None]:
df.iloc[0,:] # the first row, data type is Series

In [None]:
type(df.iloc[0,:])

In [None]:
df.iloc[0:1,:] # the first row, data type is dataframe

In [None]:
type(df.iloc[0:1,:])

### add a new column to the dataframe

In [None]:
df['smoking'] = [0, 1, 7, 1, 0]
#0: never smoke cigarettes
#1: 1 day per week
#7: everyday (7 days per week)
df

In [None]:
df=df.append({'Age':50.0, 'Sex':'F', 'Tumor_size_mm':1.0, 'smoking':7},ignore_index=True)
df

In [None]:
df['height'] = [1.6, 1.7, 1.8, 1.5, 1.9] # unit: meter
df['weight'] = [60, 70, 80, 50, 90] # unit:kg
df

In [None]:
# BMI (Body mass index) = weight /(height*height)
df['BMI'] = df['weight']/df['height']**2
df

In [None]:
df[['Age', 'BMI']] # select two columns

convert the two columns 'Age' & 'BMI' to a Numpy array

In [None]:
df[['Age', 'BMI']].values

### Save the new data to a csv file

In [None]:
#save the new Dataframe df_new to a csv file
#set index=False, so the row indexes will not be saved  
df.to_csv('patient_record_new.csv', index=False, sep=',')

## Combine two Series objects

In [None]:
s1 = pd.Series(['A', 'B', 'C'], index=[0, 1, 2])
s2 = pd.Series(['D', 'E', 'F'], index=[3, 4, 5])
s12=pd.concat([s1, s2])
s12

In [None]:
# if we do not set index ..
s1 = pd.Series(['A', 'B', 'C'])
s2 = pd.Series(['D', 'E', 'F'])
s12=pd.concat([s1, s2])
s12

In [None]:
s12[0] # an index -> two values, this is weird...

## Combine Two Dataframe Objects

In [None]:
Matrix = [[1, 2],
          [3, 4],
          [5, 6]]
df1 = pd.DataFrame(Matrix, columns=['A', 'B'], index=[0, 1, 2]) 
df2 = pd.DataFrame(Matrix, columns=['C', 'D'], index=[3, 4, 5]) 
df12=pd.concat([df1, df2], axis=0)
df12

In [None]:
Matrix = [[1, 2],
          [3, 4],
          [5, 6]]
# both df1 and df2 have the column index 'A'
df1 = pd.DataFrame(Matrix, columns=['A', 'B'], index=[0, 1, 2]) 
df2 = pd.DataFrame(Matrix, columns=['A', 'D'], index=[3, 4, 5]) 
df12=pd.concat([df1, df2], axis=0)
df12

In [None]:
Matrix = [[1, 2],
          [3, 4],
          [5, 6]]
# both df1 and df2 have the the row index 0
df1 = pd.DataFrame(Matrix, columns=['A', 'B'], index=[0, 1, 2]) 
df2 = pd.DataFrame(Matrix, columns=['C', 'D'], index=[0, 4, 5]) 
df12=pd.concat([df1, df2], axis=0)
df12

the above dataframe is weird ...
two rows have the same the row index

### Merge two dataframe objects using `pd.merge`
if some row/column indexes of two dataframes are the same, it is better to use the function pd.merge

In [None]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df1

In [None]:
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'], 
                    'hire_date': [2004, 2008, 2012, 2014]})
df2

In [None]:
df3 = pd.merge(df1, df2)
df3

In [None]:
df_weird = pd.concat([df1, df2], axis=0, sort=False)
df_weird

In [None]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})
df4 

In [None]:
df3 # df3 is pd.merge(df1, df2)

df3 and df4 have the column index 'group' <br>
the 'group' columns in df3 an df4 are called key-columns  <br>
pandas will merge df3 an df4 by the key-columns

In [None]:
df34 = pd.merge(df3, df4)
df34

## Handle Missing Data

`None` is used a placeholder of missing data/object

In [None]:
A = np.array([0, None, 2, 3], dtype='object')
A

In [None]:
print(A[1]) # the value is missing

In [None]:
# try to compute the sum
A.sum() # error!

`nan` is used to represent a numerical missing value <br>
type of `nan` is float

In [None]:
A = np.array([0, np.nan, 2, 3], dtype='float64')
A

In [None]:
A.sum()

In [None]:
1 + np.nan

In [None]:
10 * np.nan

### Handle none and nan in Series

In [None]:
ser= pd.Series([1, np.nan, 'hello', None])
ser

In [None]:
ser.isnull() # null refers to nan and None

In [None]:
ser.notnull()

remove nan and None from the Series

In [None]:
ser[ser.notnull()]

If a Series only contains numerical data, we can convert it to array, <br> then handle missing data using Numpy functions

In [3]:
arr = pd.Series([0, np.nan, None, 1, 100])
arr = arr.values # None -> nan
arr

array([  0.,  nan,  nan,   1., 100.])

In [5]:
arr.dtype

dtype('float64')

In [6]:
np.isnan(arr)

array([False,  True,  True, False, False])

In [7]:
# np.is_not_nan
~np.isnan(arr) # the tilde symbol/operator performs logical not

array([ True, False, False,  True,  True])

remove nan from the array

In [8]:
arr[~np.isnan(arr)]

array([  0.,   1., 100.])

#### Find and Replace none and nan in Series

In [9]:
ser= pd.Series([1, np.nan, 'hello', None])
ser[ser.isnull()] = 'nothing'
ser

0          1
1    nothing
2      hello
3    nothing
dtype: object

In [12]:
# If a Series only contains numerical data, we can convert it to array
# then handle missing data using Numpy functions
ser = pd.Series([0, np.nan, None, 1, 100])
arr = ser.values
print(type(arr))
arr

<class 'numpy.ndarray'>


array([  0.,  nan,  nan,   1., 100.])

In [11]:
# repalce nan with 0 (or other number)
arr[np.isnan(arr)]= 0
arr

array([  0.,   0.,   0.,   1., 100.])

## Handle nan in DataFrame

In [None]:
df = pd.read_csv('patient_record_missing_data.csv', sep=',')
#null -> nana, open the file in a text editor
df

Suppose that we are developing a machine learning algorithm that will predict the outcome of brain tumor surgery based on Age and Sex <br>
<br>
First, we need to find the rows that have missing data

In [None]:
#check if a number is nan
x = np.nan
print(x == np.nan)

nan is not equal to nan

In [2]:
#check if a number is nan
x = np.nan
np.isnan(x)
df.shape[0]

NameError: name 'df' is not defined

In [None]:
# write a program to find the rows with missing data (nan and None)
bad_row_index_list=[]
for n in range(0, df.shape[0]):
    row=df.iloc[n,:].isnull().values
    if np.any(row==True):
        bad_row_index_list.append(n)
bad_row_index_list

How should we handle the data of the patient-0 (row-0) and patient-4(row-4) ?


In [None]:
#we could remove all of the 'bad' rows
df_clean = df.drop(bad_row_index_list, axis=0)
df_clean

In [None]:
#we could modify some of the 'bad' rows
row = df.iloc[4,:]
row

In [None]:
row['Age']= 75 # replace nan with the average age

In [None]:
df.iloc[4,:]=row
df

In [None]:
df_clean=df.drop(0, axis=0) # remove the first row
df_clean

Now, the data is clean and ready for machine learning