# JedLueng 18 Feb 2022

In [1]:
import pandas as pd
import numpy as np
#import matplotlib

# Create Pandas Series


**A series can be created using pandas function Series with python list or numpy 1-D array as the argument. By default, each item will receive an numeric index label starting from 0.**


In [3]:
# Pandas +  Numpy 
s1 = pd.Series([1, 2, 3])
s2 = pd.Series(np.array([1, 2, 3, 4, 5]))
print(s1, s2)

0    1
1    2
2    3
dtype: int64 0    1
1    2
2    3
3    4
4    5
dtype: int64


**You may compare Series with ndarray.**

In [4]:
np.array([1, 2, 3, 4, 5])

array([1, 2, 3, 4, 5])

**An explicit index can also be specified when creating the series by providing the index with a list as the second argument. This is often called label.**

In [5]:
s3 = pd.Series([1, 2, 3, 'a', 'b', 'c'], index=['A', 'B', 'C', 'D', 'E', 'F'])
s3

A    1
B    2
C    3
D    a
E    b
F    c
dtype: object

**When a dictionary is provided as the argument, the key will be used as the index.**

In [6]:
s4 = pd.Series({'A': 1, 'B': 2, 'C': 3})
s4

A    1
B    2
C    3
dtype: int64

Each index label needs to be unique?


In [7]:
s5 = pd.Series({'A': 1, 'B': 2, 'A': 3})
s5

A    3
B    2
dtype: int64

**Data in the series can be accessed similar to that in a Python list when having the default numeric index.**

In [8]:
s2[2]

3

In [9]:
s2[:2]

0    1
1    2
dtype: int64

**Data in the series can be accessed similar to that in a Python dictionary when having specified index label.**

In [10]:
s3['A']

1

**You can retrieve multiple data by providing a list of "keys"/labels.**

In [11]:
s3['A', 'B']  #two pairs of square brackets

KeyError: 'key of type tuple not found and not a MultiIndex'

# Create DataFrame

**You can create a DataFrame from dictionary of narrays/lists/series. Keys will be used as the column labels by default. Values become the columns corresponding to the key.**



In [12]:
d1 = pd.DataFrame([1, 2, 3])

In [13]:
d1 = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})
d1

Unnamed: 0,A,B
0,1,2
1,2,3
2,3,4


**You may also specify index label for the rows.**

In [14]:
d2 = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]}, index=['X', 'Y', 'Z'])
d2

Unnamed: 0,A,B
X,1,2
Y,2,3
Z,3,4


In [15]:
d3 = pd.DataFrame({
    'A': np.array([1, 2, 3]),
    'B': [2, 3, 4]
},
                  index=['X', 'Y', 'Z'])  #dictionary with mixed data types
d3

Unnamed: 0,A,B
X,1,2
Y,2,3
Z,3,4


In [None]:
d4 = pd.DataFrame({'A': [1, 2, 3], 'B': s1})
d4

**Items in the dictionary must have the same length unless they are all series.**

In [None]:
d5 = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4, 5]})  #error

In [None]:
d6 = pd.DataFrame({'A': s1, 'B': [1, 2]})
d6

In [None]:
s2

In [None]:
d6 = pd.DataFrame({'A': s1, 'B': s2})
d6

**When series have different length, Python will try to match their index to create the dataframe and NaN (Not a Number) is appended in missing areas.**

In [None]:
d7 = pd.DataFrame({'A': s1, 'B': s2})  #using default numeric index
d7

In [None]:
data = {
    'A': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    'B': pd.Series([1, 2, 3, 4], index=['b', 'c', 'd', 'e'])
}  #using specified index
df8 = pd.DataFrame(data)
df8

**A DataFrame can be created using a single list or a list of lists.**

In [None]:
d9 = pd.DataFrame([1, 2, 3, 4, 5, 6])
d9

In [None]:
s9 = pd.Series([1, 2, 3, 4, 5, 6])  #compare with a series
s9

In [None]:
d10 = pd.DataFrame([[1, 2, 3], [2, 3, 4], [3, 4, 5]])  #a list of lists
d10

Numeric labels will be created for row and column by default. You can also specify the labels for columns and index (row).

In [None]:
d11 = pd.DataFrame([[1, 2, 3], [2, 3, 4], [3, 4, 5]], columns=['A', 'B', 'C'])
d11

In [None]:
d12 = pd.DataFrame([[1, 2, 3], [2, 3, 4], [3, 4, 5]],
                   columns=['A', 'B', 'C'],
                   index=['X', 'Y', 'Z'])
d12

**You can create a DataFrame from a list of dictionaries. Keys will be used as the column labels by default.**

In [None]:
d13 = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 5, 'b': 10}])
d13

In [None]:
d14 = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 5, 'b': 10}], index=['A', 'B'])
d14

**Each item in the list is like a row in a table. Items in the list can have different length.**
What would the df_test look like?

In [None]:
df_test = pd.DataFrame([{'a': 1, 'b': 2}, {'b': 5, 'c': 10, 'd': 15}])

df_test

In [None]:
df_test = pd.DataFrame([[1, 2], [2, 3], [3, 4, 5]])
df_test

**When no specific column label is provided, Python will match the default labels (number index or keys) to create the dataframe and NaN is appended in missing areas.**

**When column labels are specified, Python will create DataFrame based on the column labels and try to match keys with the labels. Values with non-match keys will be ignored. **

In [None]:
df_test = pd.DataFrame([{
    'a': 1,
    'b': 2
}, {
    'b': 5,
    'c': 10,
    'd': 15
}],
                       columns=['b', 'd', 'e'])
df_test

In [None]:
df_test = pd.DataFrame({
    'A': np.array([1, 2, 3]),
    'B': [2, 3, 4]
},
                       index=['X', 'Y', 'Z'],
                       columns=['aa', 'bb'])
df_test

# Create DataFrame from files

**Pandas can read data directly from a wide range of file formats, such as csv, Excel, JSON, SQL database, Stata, SAS, etc. We will focus on csv files in this class. **

**Use read_csv() function. Filename is the only required argument. **

In [19]:
df_tips = pd.read_csv('tips.csv')
df_tips

FileNotFoundError: [Errno 2] No such file or directory: 'tips.csv'

In [20]:
df_tips = pd.read_csv('tips.csv', header=None)
df_tips

Unnamed: 0,0,1,2,3,4,5,6
0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2,Female,Yes,Sat,Dinner,2
242,22.67,2,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


In [None]:
df_tips = pd.read_csv('tips.csv', header=None, names=[1, 2])
df_tips

In [None]:
df_tips = pd.read_csv('tips.csv', index_col=0)
df_tips

In [None]:
df_tips = pd.read_csv('tips.csv', index_col='tip')
df_tips

In [None]:
df_tips = pd.read_csv('tips.csv', usecols=[1, 2, 3])
df_tips

In [None]:
df_tips = pd.read_csv('tips.csv', skiprows=3, nrows=10)
df_tips

# Conversion between DataFrame and ndarray

In [None]:
import numpy as np
array = np.array([[1, 2, 3], [3, 4, 5], [6, 7, 7]])
df_array = pd.DataFrame(data=array)
df_array

In [None]:
df_array.to_numpy()

# **Column selection and deletion**

**Column selection using the column label**

In [16]:
df_tips['tip']  #column label as the key, return a series.

NameError: name 'df_tips' is not defined

In [None]:
df_tips[['tip']]  #return a dataframe.

**Add new column with label, similar as adding new item to a dictionary**

In [None]:
df_tips['f'] = pd.Series([10, 10])
df_tips

**New column can be added by calculating existing columns**

In [None]:
df_tips['total'] = df_tips['total_bill'] + df_tips['tip']
df_tips

**del to delete a column**

In [None]:
del df_tips['f']
df_tips

# Row Selection

**Row selection by passing row labels to loc[] method. The row will be returned as a series or a dataframe:**

In [None]:
df_tips.loc[1]  #column labels will be used as row index.

**Multiple rows can be selected**

In [None]:
df_tips.loc[[1, 2, 3]]  #a list of row indexes/labels, returns a dataframe

In [None]:
df_tips.loc[1:3]  #slice, both start and end included.

**Column labels can be provided to filter the results**

In [None]:
df_tips.loc[[1, 2, 3], 'tip']

**Select rows with Boolean list indicating whether to be selected**

In [None]:
df_tips.loc[[True, False, False, True, False]]  #error

**Select rows with Boolean expression passed as series**

In [None]:
df_tips['tip'] > 2  #element-wise operation

In [None]:
df_tips.loc[df_tips['tip'] > 2]

In [None]:
df_tips.loc[df_tips['tip'] > 2, 'total']

**You may also try:**

In [None]:
df_tips[df_tips['tip'] > 2]

**Select rows by passing integer position (index) to method iloc[].**

In [None]:
df_tips.iloc[1]

In [None]:
df_tips.loc[1]

In [None]:
df_tips.iloc[1:3]

In [None]:
df_tips.loc[1:3]

# **Add Rows**

**Add new rows with append() method at the end of a dataframe.**

In [None]:
df_tips.append([99, 99])  #existing dataframe not updated.
df_tips

In [None]:
df_tips2 = df_tips.append([99, 99])
df_tips2

In [None]:
df_tips2 = df_tips.append(pd.DataFrame([99, 99]))
df_tips2

**Python will align matching columns. **

In [None]:
df_tips2 = df_tips.append(pd.DataFrame([99, 99], columns=['size']))  #error
df_tips2

In [None]:
df_tips2 = df_tips.append([[99, 99]])
df_tips2

In [None]:
df_tips2 = df_tips.append(pd.DataFrame([[99, 99]], columns=['A', 'B']))
df_tips2

**You can reset row labels by setting ignore_index=True (default is False).**

In [None]:
df_tips2 = df_tips.append(pd.DataFrame([[99, 99]], columns=['A', 'B']),
                          ignore_index=True)
df_tips2

# **Row Deletion**

**You can remove rows by drop() method with a list of row labels.**

In [None]:
df_tips.drop([2])  #same as df_tips.drop(4)
df_tips

In [None]:
df_tips.drop([0:2]) #error [0:2] is not a list. Alternatively, df_tips = df_tips[2:].
df_tips

In [None]:
df_tips.drop([0, 1, 2])
df_tips

In [None]:
df_tips.drop(range(3))  #existing dataframe not updated
df_tips

In [None]:
df_tips = df_tips.drop([2])
df_tips

In [None]:
df_tips.drop([4], inplace=True)  # existing dataframe updated.
df_tips

# **Important DataFrame Attributes**

**.index returns a "list" of row indexes.**

In [None]:
df_tips.index

In [None]:
df_tips.loc[df_tips['tip'] > 2].index

**Very handy to remove rows based on condition.**

In [None]:
df_tips3 = df_tips.drop(df_tips.loc[df_tips['tip'] > 2].index)
df_tips3

# **Scalar operation**

**Basic arithmetic and Boolean operations with scalar data are element-wise.**

In [None]:
df_tips * 2  #broadcasting

In [None]:
df_tips.add(2)  #error

In [None]:
df_tips > 2  #error

In [None]:
df_tips['tip'] * 2

**Arithmetic and Boolean operations with another list or Series will be performed based on matching labels (columns). **

In [None]:
d10 = pd.DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]])

In [None]:
d10 - [1, 2, 3]  #default to compare by column.

In [None]:
d10 > [3, 3, 3]

In [None]:
d10 - [1, 2]  #error, different length

In [None]:
d10 - pd.Series([1, 2])  # NaN for no-match column.

**For operations by row, you need to specify axis to be "index".**

In [None]:
d10.sub(pd.Series([1, 2, 3]), axis='index')

In [None]:
d10.sub(pd.Series([1, 2, 3], index=[1, 2, 3]),
        axis='index')  # NaN for no-match rows.

In [None]:
d10.mul(pd.Series([1, 2, 3], index=[1, 2, 3]),
        axis='index')  # NaN for no-match rows.

# **Data Processing**

## **Explore Dataset**

In [None]:
df_tips.shape

In [None]:
df_tips.size

In [None]:
df_tips.info()

In [None]:
df_tips

In [None]:
df_tips.head(3)

In [None]:
df_tips.tail(3)

## **Titanic Case**

In [None]:
df_titan = pd.read_csv('titanic_train.csv')
df_titan.head()

In [None]:
df_titan.info()

**Descriptive statistics**

In [None]:
df_titan.describe()

In [None]:
df_titan.describe(include='all')

In [None]:
df_titan.describe(include='object')

In [None]:
df_titan.describe(include='number')

In [None]:
df_titan.mean()

By default, these statistics are for each column. You can get row-based statistics by specifying axis=1. 


In [None]:
df_titan.mean(axis=1)  # default axis = 0

To get the unique values and the corresponding counts for a column:


In [None]:
df_titan['Pclass'].value_counts()  #only works for individual column.

## **Exploration via Visualization**

In [None]:
df_titan.hist()

In [None]:
df_titan.hist('Survived', by='Sex')

## **You can also try seaborn**

In [None]:
import seaborn as sns
sns.catplot(x='Survived', kind='count', hue='Sex', data=df_titan)

In [None]:
sns.distplot(df_titan['Age'])

In [None]:
sns.distplot(df_titan['Fare'])

In [None]:
sns.pairplot(df_titan, hue='Sex')

In [None]:
sns.catplot(x='Survived', y='Fare', kind='box', data=df_titan)

# DataCleaning

## **Missing Data**

**Check if there is any missing data**

From descriptive statistics

In [None]:
df_titan.describe(include='all')

Another quick way to find out.

In [None]:
df_titan.isnull().sum()

## **Deal with Missing Data by Removing**


Removing rows with missing data by using dropna().




In [None]:
df_titan.dropna()  # original dataframe will not be replaced automatically.

In [None]:
df_titan.dropna().isnull().sum()

In [None]:
df_titan.dropna().describe()

Removing columns with missing data by specifying axis=1

In [None]:
df_tips.dropna(axis=1)  # age and cabin dropped.

In [None]:
df_titan.dropna(axis=1).isnull().sum()

If you know which column to drop.

In [None]:
df_titan.drop(columns=['Age']).isnull().sum()

Set threshhold to remove.


In [None]:
# keep rows with at least 11 non-missing data. i.e. rows missing both age and cabin get dropped.
df_titan.dropna(thresh=11).isnull().sum()

In [None]:
df_titan.dropna(thresh=11).info()

Original dataframe remains unchanged

In [None]:
df_titan.isnull().sum()  #inplace=True

## **Dealing with Missing Data by Imputing**

### Impute with a constant with fillna().

In [None]:
df_titan.fillna(1)  #fill all NaN with 1.

In [None]:
# different value for two columns.
df_titan.fillna({'Age': 20, 'Cabin': 'B96'})

In [None]:
df_titan.fillna({'Age': 20, 'Cabin': 'B96'}).isnull().sum()

In [None]:
df_titan.fillna({'Age': 20, 'Cabin': 'B96'}).describe()

In [None]:
df_titan.describe()

### Impute with mean/median/mode.


In [None]:
df_titan.fillna({
    'Age': df_titan['Age'].mean(),
    "Cabin": df_titan['Cabin'].mode()
}).head()

In [None]:
df_titan.fillna({
    'Age': df_titan['Age'].mean(),
    "Cabin": df_titan['Cabin'].mode()
}).head().describe()

### Exercise

Replace all missing ages with mean and drop the Cabin column.


In [None]:
df_titan1 = df_titan.fillna({'Age': df_titan['Age'].mean()}).dropna(axis=1)

In [None]:
df_titan1.isnull().sum()

### Removing Unnecessary Data

Remove irrelevant columns (variables) using drop(columns = [column names]). 


In [None]:
df_titan.drop(columns=['Ticket']).head()

In [None]:
df_titan.info()

In [None]:
df_titan_cleaned = df_titan.fillna({
    'Age': df_titan['Age'].mean(),
    'Embarked': df_titan['Embarked'].mode()
}).dropna(axis=1)
df_titan_cleaned.head()

In [None]:
df_titan_cleaned.info()

In [None]:
df_titan_cleaned.drop(columns=['Ticket'], inplace=True)

In [None]:
df_titan_cleaned.info()

### Dealing with Outliers

In [None]:
top = df_titan['Age'].mean() + 2 * df_titan['Age'].std()
top

In [None]:
bot = df_titan['Age'].mean() - 2 * df_titan['Age'].std()
bot

In [None]:
df_titan['Age'].quantile()

In [None]:
df_titan_cleaned[df_titan_cleaned['Age'] > top].info()

In [None]:
df_titan_cleaned[df_titan_cleaned['Age'] < bot].info()

In [None]:
df_titan_cleaned = df_titan_cleaned.drop(
    df_titan_cleaned[df_titan_cleaned['Age'] > top].index)
df_titan_cleaned = df_titan_cleaned.drop(
    df_titan_cleaned[df_titan_cleaned['Age'] < bot].index)
df_titan_cleaned.info()

In [None]:
df_titan_cleaned.describe(include='all')