In [None]:
import numpy as np
import pandas as pd

- In Data Analysis, a significant amount of time is spent on data preparation: loading, cleaning, transforming, and rearranging.
    - Such tasks are often reported to take up 80% or more of an analyst’s time.
- Pandas provides you with a high-level, flexible, and fast set of tools to enable you to manipulate data into the right form.
- Much of the design and implementation of pandas has been driven by the needs of real-world applications.

## Handling Missing Data
- Missing data occurs commonly in many data analysis applications.
    - All of the descriptive statistics on pandas objects exclude missing data by default.
- The missing data is represented in pandas with the floating-point value NaN (Not a Number).
    - We call this a sentinel value that can be easily detected.

- In pandas, missing data is also referred to as NA (i.e., "not available").
- In statistics applications, NA means
    - data that does not exist
    - data that exists but was not observed

- When cleaning up data for analysis, it is often important to do analysis on the missing data itself to identify data
collection problems or potential biases in the data caused by missing data.
- The built-in Python None value is also treated as NA

In [None]:
# Series of strings with a single missing entry (NaN) at position 2.
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)

In [None]:
string_data

In [None]:
string_data.isnull()

In [None]:
# Python's None is treated as NA as well, so position 0 now counts as missing.
string_data[0] = None

In [None]:
string_data

In [None]:
string_data.isnull()

### Filtering Out Missing Data

- dropna: Remove missing values.

In [None]:
# Numeric Series with missing values at positions 1 and 3.
data = pd.Series(
    data=[1, np.nan, 3.5, np.nan, 7],
)

In [None]:
data

In [None]:
# dropna() returns a new Series without the NaN entries; data itself is not modified.
data.dropna()

In [None]:
data

In [None]:
data.notnull()

In [None]:
# dropna() is equivalent to boolean indexing with the notnull() mask:
data[data.notnull()]

- dropna on a DataFrame: by default it drops every row that contains at least one missing value.

In [None]:
# 4x3 frame: row 0 is complete, row 2 is entirely NaN, rows 1 and 3 are partial.
data = pd.DataFrame(
    data=[
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ],
)

In [None]:
data

In [None]:
data.dropna()

In [None]:
# axis=1 drops columns instead of rows. Every column of data contains at
# least one NaN, so no columns survive here.
cleaned = data.dropna(axis=1)

In [None]:
cleaned

In [None]:
data

- Passing how='all' will only drop rows that are all NA

In [None]:
# Only the row whose values are all NaN (row 2) is removed.
data.dropna(how='all')

In [None]:
# how='any' is the default: a row is dropped as soon as it contains one NaN.
data.dropna(how='any')    # default

- Pass thresh=n if you want to keep only the rows that contain at least n non-missing observations.

In [None]:
# 7x3 frame of standard-normal draws, with the top of columns 1 and 2 blanked out.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
df.iloc[0:4, 1] = np.nan  # first four rows of column 1
df.iloc[0:2, 2] = np.nan  # first two rows of column 2

In [None]:
df

In [None]:
# thresh=2 keeps only the rows that have at least two non-NA values.
df.dropna(thresh=2)

### Filling In Missing Data

- *fillna*: Fill NA/NaN values using the specified method

In [None]:
df

In [None]:
# Replace every NaN with the constant 42; the result is a new DataFrame.
df.fillna(42)

In [None]:
df

In [None]:
# A dict maps column label -> fill value, so each column can get its own
# filler: NaNs in column 1 become 0.5 and NaNs in column 2 become 0.
df.fillna({1: 0.5, 2: 0})

In [None]:
df

In [None]:
# inplace=True modifies df itself instead of returning a new object.
df.fillna(0, inplace=True)

In [None]:
df

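- Filling with a computed value: propagate the previous observation (forward fill) or use a statistic such as the column mean.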

In [None]:
# 6x3 frame of standard-normal draws with NaN runs at the bottom of
# column 1 (rows 2 onward) and column 2 (rows 4 onward).
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))
df.iloc[2:6, 1] = np.nan
df.iloc[4:6, 2] = np.nan
df

In [None]:
# Forward fill: propagate the last valid observation down each column.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases.
df.ffill()

In [None]:
# limit caps forward (and backward) filling: at most this many consecutive
# NaNs are filled after each valid value.
# ffill(limit=...) replaces the deprecated fillna(method='ffill', limit=...).
df.ffill(limit=2)

In [None]:
# The fill value can itself be computed: df.mean() yields one mean per
# column, and each column's NaNs are filled with that column's mean.
df.fillna(df.mean())

## Data Transformation


### Removing Duplicates

- *duplicated*
- *drop_duplicates*

In [None]:
# Seven rows whose last row repeats the row before it ('two', 4).
data = pd.DataFrame(dict(
    k1=['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    k2=[1, 1, 2, 3, 3, 4, 4],
))
data

In [None]:
# True for each row that repeats an earlier row (all columns are compared).
data.duplicated()

In [None]:
# Restrict the comparison to column k1; equivalent to data['k1'].duplicated().
data.duplicated(['k1'])

In [None]:
# Returns a new DataFrame keeping the first occurrence of each distinct row.
data.drop_duplicates()

In [None]:
# Add a column of distinct values, so that whole rows no longer repeat.
data['v1'] = range(7)
data

In [None]:
# Judge duplicates on k1 and k2 only, and keep the last occurrence of each
# combination instead of the first.
data.drop_duplicates(['k1', 'k2'], keep='last')

### Transforming Data Using a Function or Mapping

- map: Map values of Series according to input correspondence.

In [None]:
# Device names deliberately use inconsistent capitalisation
# (e.g. 'galaxy tab', 'Macbook air').
data = pd.DataFrame(dict(
    device=[
        'iPhone', 'Galaxy S21', 'MacBook Pro',
        'Pixel 6', 'ThinkPad', 'iPad',
        'Surface Pro', 'galaxy tab', 'Macbook air',
    ],
    price=[
        999, 799, 1299,
        599, 1099, 799,
        999, 649, 999,
    ],
))

1. passing dictionary

In [None]:
# Lower-cased device name -> brand, expanded from a brand -> devices table.
device_to_brand = {
    device: brand
    for brand, devices in (
        ('Apple', ('iphone', 'macbook pro', 'macbook air', 'ipad')),
        ('Samsung', ('galaxy s21', 'galaxy tab')),
        ('Google', ('pixel 6',)),
        ('Lenovo', ('thinkpad',)),
        ('Microsoft', ('surface pro',)),
    )
    for device in devices
}

In [None]:
# The mapping keys are all lower-case while the device column is mixed-case,
# so normalise the case before looking each device up.
lowered_devices = data['device'].str.lower()
data['brand'] = lowered_devices.map(device_to_brand)
data

### Replacing Values

- *replace*: Replace values given in *to_replace* with value.

In [None]:
# Measurements in which -999 and -1000 act as sentinel codes.
data = pd.Series(
    data=[1.0, -999.0, 2.0, -999.0, -1000.0, 3.0],
)
data

In [None]:
# Replace a single sentinel value with NaN.
data.replace(-999, np.nan)

In [None]:
# A list of values to replace: both sentinels become NaN.
data.replace([-999, -1000], np.nan)

In [None]:
# Parallel lists give a different replacement per value: -999 -> NaN, -1000 -> 0.
data.replace([-999, -1000], [np.nan, 0])

In [None]:
# The same per-value mapping written as a dict.
data.replace({-999: np.nan, -1000: 0})

### Discretization and Binning

- *cut*: Bin values into discrete intervals. Use cut when you need to segment and sort data values into bins

In [None]:
# Sample ages to be grouped into bins.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 59, 45, 41, 32]

In [None]:
# Bin edges. With pd.cut's default right=True they define the intervals
# (18, 25], (25, 35], (35, 60] and (60, 100].
bins = [18, 25, 35, 60, 100]

In [None]:
# Assign each age to its interval; for a plain list the result is a Categorical.
values = pd.cut(ages, bins)

In [None]:
values

In [None]:
type(values)

In [None]:
values.codes

In [None]:
values.categories

In [None]:
values

In [None]:
# Bin counts for the result of pandas.cut, most populated bin first.
# The top-level pd.value_counts() helper is deprecated in recent pandas
# releases, so wrap the Categorical in a Series and use the method instead.
pd.Series(values).value_counts()

In [None]:
# In interval notation a parenthesis marks an open (exclusive) side and a
# square bracket a closed (inclusive) side.
# Passing right=False closes the left side instead of the right one, giving
# [18, 26), [26, 36), [36, 61) and [61, 100).
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

In [None]:
# Custom bin names: pass one label per bin, in bin order, through `labels`
# to replace the default interval notation.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
new_ages = pd.cut(x=ages, bins=bins, labels=group_names)

In [None]:
new_ages.categories

In [None]:
# Counts per named age group, most populated group first.
# pd.value_counts() is deprecated at the top level in recent pandas
# releases; the Series method is the supported equivalent.
pd.Series(new_ages).value_counts()

In [None]:
# 20 uniform draws from [0, 1). They are used to show pd.cut with an integer
# `bins` argument, which computes equal-length bins from the minimum and
# maximum values in the data.
data = np.random.random_sample(size=20)
data

In [None]:
# An integer `bins` asks for that many equal-width bins spanning the data's
# range; .codes gives the bin index (0-12) of each value.
pd.cut(data, 13).codes

### Computing Indicator/Dummy Variables/ One Hot Encoding (OHE)
- This converts a categorical variable into a “dummy” or “indicator” matrix.

- If a column in a DataFrame has k distinct values,
    - you would derive a matrix or DataFrame with k columns containing all 1s and 0s.
- pandas has a get_dummies function for doing this

In [None]:
# Categorical column 'key' with three distinct values, plus a numeric column.
df = pd.DataFrame(dict(
    key=['b', 'b', 'a', 'c', 'a', 'b'],
    data1=range(6),
))

In [None]:
df

In [None]:
# One indicator column per distinct value of 'key' (a, b, c).
pd.get_dummies(df['key'])

In [None]:
# dtype='int' forces 0/1 integer indicators
# (NOTE: recent pandas versions return booleans by default - confirm for your version).
pd.get_dummies(df['key'], dtype='int')

In [None]:
# prefix='key' names the indicator columns key_a, key_b and key_c.
dummies = pd.get_dummies(df['key'], prefix='key')

In [None]:
dummies

In [None]:
df[['data1']]

In [None]:
# Attach the indicator columns to the remaining data column of the original
# frame; join aligns the two frames on their index.
df_with_dummy = df.loc[:, ['data1']].join(dummies, how='left')
df_with_dummy