#### DataFrame.shape

In [27]:
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
iris_df = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])

iris_df.shape

(150, 4)

#### DataFrame.columns

In [28]:
iris_df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

#### DataFrame.info

In [29]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


#### DataFrame.describe()

In [30]:
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


#### DataFrame.head

In [31]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


#### From the example given above, it is clear that, by default, DataFrame.head returns the first five rows of a DataFrame. In the code cell below, can you figure out a way to display more than five rows?

In [32]:
iris_df.head(10)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


#### DataFrame.tail

In [33]:
iris_df.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


#### None: non-float missing data

In [34]:
import numpy as np

example1 = np.array([2, None, 6, 8])
example1

array([2, None, 6, 8], dtype=object)

#### NaN: missing float values

In [36]:
np.nan + 1

nan

In [37]:
np.nan * 0

nan

#### What happens if you add np.nan and None together?

In [38]:
np.nan + None

TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'

#### NaN and None: null values in pandas

In [39]:
int_series = pd.Series([1, 2, 3], dtype=int)
int_series

0    1
1    2
2    3
dtype: int32

#### Now set an element of int_series equal to None.

In [17]:
import pandas as pd

# Creating a pandas Series
int_series = pd.Series([1, 2, 3], dtype=int)

# Setting an element of the Series to None
int_series[1] = None

# Displaying the modified Series
print(int_series)



0    1.0
1    NaN
2    3.0
dtype: float64


#### Detecting null values

In [41]:
example3 = pd.Series([0, np.nan, '', None])

example3.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [42]:
example3.isnull().sum()

2

#### Try running example3[example3.notnull()].

In [18]:
import pandas as pd
import numpy as np

example3 = pd.Series([0, np.nan, '', None])

result = example3[example3.notnull()]

print(result)


0    0
2     
dtype: object


#### Dropping null values

In [43]:
example3 = example3.dropna()
example3

0    0
2     
dtype: object

In [44]:
example4 = pd.DataFrame([[1,      np.nan, 7], 
                         [2,      5,      8], 
                         [np.nan, 6,      9]])
example4

Unnamed: 0,0,1,2
0,1.0,,7
1,2.0,5.0,8
2,,6.0,9


In [45]:
example4.dropna()

Unnamed: 0,0,1,2
1,2.0,5.0,8


In [46]:
example4.dropna(axis='columns')

Unnamed: 0,2
0,7
1,8
2,9


In [47]:
example4[3] = np.nan
example4

Unnamed: 0,0,1,2,3
0,1.0,,7,
1,2.0,5.0,8,
2,,6.0,9,


####  How might you go about dropping just column 3?
#### Hint: remember that you will need to supply both the axis parameter and the how parameter.

In [62]:
import pandas as pd
import numpy as np

example4 = pd.DataFrame([[1, np.nan, 7], 
                         [2, 5,      8], 
                         [np.nan, 6,  9]])

# Drop column 3 (indexing starts from 0)
example4.drop(columns=[2], axis=1, inplace=True)

print(example4)


     0    1
0  1.0  NaN
1  2.0  5.0
2  NaN  6.0


In [63]:
example4.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1


#### Categorical Data(Non-numeric)

In [64]:
fill_with_mode = pd.DataFrame([[1,2,"True"],
                               [3,4,None],
                               [5,6,"False"],
                               [7,8,"True"],
                               [9,10,"True"]])

fill_with_mode

Unnamed: 0,0,1,2
0,1,2,True
1,3,4,
2,5,6,False
3,7,8,True
4,9,10,True


In [65]:
fill_with_mode[2].value_counts()

2
True     3
False    1
Name: count, dtype: int64

In [67]:
fill_with_mode[2].fillna('True',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fill_with_mode[2].fillna('True',inplace=True)


In [66]:
fill_with_mode

Unnamed: 0,0,1,2
0,1,2,True
1,3,4,
2,5,6,False
3,7,8,True
4,9,10,True


#### Numeric Data

In [68]:
fill_with_mean = pd.DataFrame([[-2,0,1],
                               [-1,2,3],
                               [np.nan,4,5],
                               [1,6,7],
                               [2,8,9]])

fill_with_mean

Unnamed: 0,0,1,2
0,-2.0,0,1
1,-1.0,2,3
2,,4,5
3,1.0,6,7
4,2.0,8,9


In [69]:
np.mean(fill_with_mean[0])

0.0

In [70]:
fill_with_mean[0].fillna(np.mean(fill_with_mean[0]),inplace=True)
fill_with_mean

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fill_with_mean[0].fillna(np.mean(fill_with_mean[0]),inplace=True)


Unnamed: 0,0,1,2
0,-2.0,0,1
1,-1.0,2,3
2,0.0,4,5
3,1.0,6,7
4,2.0,8,9


In [71]:
fill_with_median = pd.DataFrame([[-2,0,1],
                               [-1,2,3],
                               [0,np.nan,5],
                               [1,6,7],
                               [2,8,9]])

fill_with_median

Unnamed: 0,0,1,2
0,-2,0.0,1
1,-1,2.0,3
2,0,,5
3,1,6.0,7
4,2,8.0,9


In [72]:
fill_with_median[1].median()

4.0

In [73]:
fill_with_median[1].fillna(fill_with_median[1].median(),inplace=True)
fill_with_median

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fill_with_median[1].fillna(fill_with_median[1].median(),inplace=True)


Unnamed: 0,0,1,2
0,-2,0.0,1
1,-1,2.0,3
2,0,4.0,5
3,1,6.0,7
4,2,8.0,9


#### As we can see, the NaN value has been replaced by the median of the column

In [74]:
example5 = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
example5

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

#### You can fill all of the null entries with a single value, such as 0:

In [75]:
example5.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

#### What happens if you try to fill null values with a string, like ''?

In [76]:
filled_example5 = example5.fillna('')

print(filled_example5)


a    1.0
b       
c    2.0
d       
e    3.0
dtype: object


#### You can forward-fill null values, which is to use the last valid value to fill a null:

In [77]:
example5.fillna(method='ffill')

  example5.fillna(method='ffill')


a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

#### You can also back-fill to propagate the next valid value backward to fill a null:

In [78]:
example5.fillna(method='bfill')

  example5.fillna(method='bfill')


a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

#### You can also specify an axis along which to fill null values:

In [79]:
example4.fillna(method='ffill', axis=1)

  example4.fillna(method='ffill', axis=1)


Unnamed: 0,0,1
0,1.0,1.0
1,2.0,5.0
2,,6.0


#### What output does example4.fillna(method='bfill', axis=1) produce?

In [21]:
example4.fillna(method='bfill', axis=1)


  example4.fillna(method='bfill', axis=1)


Unnamed: 0,0,1,2
0,1.0,7.0,7.0
1,2.0,5.0,8.0
2,6.0,6.0,9.0


#### What about example4.fillna(method='ffill') or example4.fillna(method='bfill')?

In [22]:
example4.fillna(method='ffill')  # Forward fill


  example4.fillna(method='ffill')  # Forward fill


Unnamed: 0,0,1,2
0,1.0,,7
1,2.0,5.0,8
2,2.0,6.0,9


In [23]:
example4.fillna(method='bfill')  # Backward fill

  example4.fillna(method='bfill')  # Backward fill


Unnamed: 0,0,1,2
0,1.0,5.0,7
1,2.0,5.0,8
2,,6.0,9


#### Can you think of a longer code snippet to write that can fill all of the null values in example4?

In [24]:
import pandas as pd
import numpy as np

example4 = pd.DataFrame([[1, np.nan, 7], 
                         [2, 5,      8], 
                         [np.nan, 6, 9]])

# Fill null values using forward fill along columns, then backward fill along columns
filled_columns = example4.fillna(method='ffill', axis=0).fillna(method='bfill', axis=0)

# Fill remaining null values using forward fill along rows, then backward fill along rows
filled_all = filled_columns.fillna(method='ffill', axis=1).fillna(method='bfill', axis=1)

print(filled_all)


     0    1    2
0  1.0  5.0  7.0
1  2.0  5.0  8.0
2  2.0  6.0  9.0


  filled_columns = example4.fillna(method='ffill', axis=0).fillna(method='bfill', axis=0)
  filled_all = filled_columns.fillna(method='ffill', axis=1).fillna(method='bfill', axis=1)
  filled_all = filled_columns.fillna(method='ffill', axis=1).fillna(method='bfill', axis=1)


In [80]:
example4.fillna(example4.mean())

Unnamed: 0,0,1
0,1.0,5.5
1,2.0,5.0
2,1.5,6.0


#### LABEL ENCODING

In [81]:
label = pd.DataFrame([
                      [10,'business class'],
                      [20,'first class'],
                      [30, 'economy class'],
                      [40, 'economy class'],
                      [50, 'economy class'],
                      [60, 'business class']
],columns=['ID','class'])
label

Unnamed: 0,ID,class
0,10,business class
1,20,first class
2,30,economy class
3,40,economy class
4,50,economy class
5,60,business class


#### To perform label encoding on the 1st column, we have to first describe a mapping from each class to a number, before replacing

In [82]:
class_labels = {'business class':0,'economy class':1,'first class':2}
label['class'] = label['class'].replace(class_labels)
label

  label['class'] = label['class'].replace(class_labels)


Unnamed: 0,ID,class
0,10,0
1,20,2
2,30,1
3,40,1
4,50,1
5,60,0


#### ONE HOT ENCODING

In [83]:
one_hot = pd.DataFrame([
                      [10,'business class'],
                      [20,'first class'],
                      [30, 'economy class'],
                      [40, 'economy class'],
                      [50, 'economy class'],
                      [60, 'business class']
],columns=['ID','class'])
one_hot


Unnamed: 0,ID,class
0,10,business class
1,20,first class
2,30,economy class
3,40,economy class
4,50,economy class
5,60,business class


#### Let us perform one hot encoding on the 1st column

In [84]:
one_hot_data = pd.get_dummies(one_hot,columns=['class'])
one_hot_data

Unnamed: 0,ID,class_business class,class_economy class,class_first class
0,10,True,False,False
1,20,False,False,True
2,30,False,True,False
3,40,False,True,False
4,50,False,True,False
5,60,True,False,False


#### Identifying duplicates: duplicated

In [85]:
example6 = pd.DataFrame({'letters': ['A','B'] * 2 + ['B'],
                         'numbers': [1, 2, 1, 3, 3]})
example6

Unnamed: 0,letters,numbers
0,A,1
1,B,2
2,A,1
3,B,3
4,B,3


In [86]:
example6.duplicated()

0    False
1    False
2     True
3    False
4     True
dtype: bool

#### Dropping duplicates: drop_duplicates

In [87]:
example6.drop_duplicates()

Unnamed: 0,letters,numbers
0,A,1
1,B,2
3,B,3


#### Dropping duplicate letters


In [88]:
example6.drop_duplicates(['letters'])

Unnamed: 0,letters,numbers
0,A,1
1,B,2
