# Recall By Judin

## What is pandas?

pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.

In [1]:
import pandas as pd
import numpy as np

## Data structures in pandas

- Series - "column"
- DataFrame - "table"

In [5]:
# Sample data: five students and the letter grade each one received.
name = ["Ah Meng", "Abu", "Ahmad", "Siva", "Ah Kau"]
grades = ["A", "B", "C", "A", "A"]

# A Series is a single labelled column; build one from each list.
name_series, grades_series = pd.Series(name), pd.Series(grades)

# A DataFrame is a table: each key becomes a column header and the Series
# passed for that key supplies the column's values.
student_df = pd.DataFrame(dict(name=name_series, grade=grades_series))
student_df

Unnamed: 0,name,grade
0,Ah Meng,A
1,Abu,B
2,Ahmad,C
3,Siva,A
4,Ah Kau,A


## Index

- The index is used to reference rows
- By default the index is numeric, starting from 0
- Any column (Series) can be set as the index

In [6]:
# Promote the "name" column to be the row labels. The result is stored
# under a new name, so student_df keeps its default numeric index.
student_df_name_index = student_df.set_index(keys="name")
student_df_name_index

Unnamed: 0_level_0,grade
name,Unnamed: 1_level_1
Ah Meng,A
Abu,B
Ahmad,C
Siva,A
Ah Kau,A


## Locating data

- A "coordinate system" for the DataFrame: by referencing the row and column, we can get to the data we want
- Data can be referenced by numeric position or by label (word keys)
- `df.iloc` selects by position; `df.loc` selects by label

In [7]:
# iloc selects by integer position: [row position, column position].
# Column position 1 is the "grade" column of student_df.
ahmeng_grade = student_df.iloc[0, 1] # row 0 -> "A"
abu_grade = student_df.iloc[1, 1] # row 1 -> "B"
# ":" keeps every row, so this is the whole grade column as a Series.
all_grades = student_df.iloc[:, 1]
print("Ah Meng's grade", ahmeng_grade)
print("Abu's grade", abu_grade)
print("All Grades")
print(all_grades)

Ah Meng's grade A
Abu's grade B
All Grades
0    A
1    B
2    C
3    A
4    A
Name: grade, dtype: object


In [8]:
# Pair every value fetched by position (iloc) with the same cell fetched by
# label (loc) and print whether the two agree. The last pair compares whole
# columns, so it prints an element-wise boolean Series.
for by_position, by_label in (
    (ahmeng_grade, student_df.loc[0, 'grade']),
    (abu_grade, student_df.loc[1, 'grade']),
    (all_grades, student_df.loc[:, 'grade']),
):
    print(by_position == by_label)

True
True
0    True
1    True
2    True
3    True
4    True
Name: grade, dtype: bool


In [9]:
# Boolean mask: True for every row whose grade is "A".
is_grade_a = student_df["grade"].eq('A')
print(is_grade_a)

# Indexing with the mask keeps only the rows where it is True.
student_with_a = student_df.loc[is_grade_a]
student_with_a

0     True
1    False
2    False
3     True
4     True
Name: grade, dtype: bool


Unnamed: 0,name,grade
0,Ah Meng,A
3,Siva,A
4,Ah Kau,A


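## Difference between iloc and loc

`iloc` slices by position and excludes the end position; `loc` slices by label and includes the end label.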

In [10]:
# iloc slices by position and, like a Python list slice, stops before the
# end position: this takes row positions 0, 1 and 2 of column position 0
# (the "name" column).
ahmeng_to_ahmad = student_df.iloc[:3, 0]
ahmeng_to_ahmad

0    Ah Meng
1        Abu
2      Ahmad
Name: name, dtype: object

In [11]:
# loc slices by label and includes the end label: this takes every row up to
# and including the one labelled 2, from the "name" column.
ahmeng_to_ahmad_take_two = student_df.loc[:2, "name"]
ahmeng_to_ahmad_take_two

0    Ah Meng
1        Abu
2      Ahmad
Name: name, dtype: object

## Adding and dropping columns

- Adding a new column is similar to adding a new key-value pair to a dict
- Drop columns (or rows) with the DataFrame `drop` method

In [12]:
#Add column
student_df["passed"] = student_df.grade == "A"
student_df

Unnamed: 0,name,grade,passed
0,Ah Meng,A,True
1,Abu,B,False
2,Ahmad,C,False
3,Siva,A,True
4,Ah Kau,A,True


In [13]:
#Drop column
student_df = student_df.drop("passed", axis=1)
student_df

Unnamed: 0,name,grade
0,Ah Meng,A
1,Abu,B
2,Ahmad,C
3,Siva,A
4,Ah Kau,A


In [14]:
# Drop a row (Ah Meng): index=0 targets the row whose index label is 0, not a
# column. student_df is rebound to the result, so from here on it has four
# rows labelled 1 to 4.
# NOTE: re-running this cell without re-creating student_df raises KeyError,
# because label 0 is already gone.
student_df = student_df.drop(index=0)
student_df

Unnamed: 0,name,grade
1,Abu,B
2,Ahmad,C
3,Siva,A
4,Ah Kau,A


## Merging dataframes

In [15]:
# Two small frames that share the column "b" (with identical values), used
# below to contrast concat and merge.
df_a = pd.DataFrame(dict(a=[1, 2, 3], b=[3, 4, 5]))
df_b = pd.DataFrame(dict(b=[3, 4, 5], c=[1, 2, 3]))

In [16]:
# Show both frames as plain text, one after the other.
print(df_a, df_b, sep="\n")

   a  b
0  1  3
1  2  4
2  3  5
   b  c
0  3  1
1  4  2
2  5  3


In [17]:
# Place the two frames side by side, aligned on the row index. Nothing is
# matched on values, so the shared column "b" appears twice in the result.
pd.concat([df_a, df_b], axis="columns")

Unnamed: 0,a,b,b.1,c
0,1,3,3,1
1,2,4,4,2
2,3,5,5,3


In [19]:
# Join the two frames on matching values of "b" (an inner join, which is the
# default), so "b" appears only once in the result.
pd.merge(df_a, df_b, on="b", how="inner")

Unnamed: 0,a,b,c
0,1,3,1
1,2,4,2
2,3,5,3


## Data Exploration

In [20]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
student_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 1 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
 1   grade   4 non-null      object
dtypes: object(2)
memory usage: 96.0+ bytes


In [22]:
# Summary statistics per column. Both columns hold text here, so the summary
# reports count, unique, top (most frequent value) and freq.
student_df.describe()

Unnamed: 0,name,grade
count,4,4
unique,4,3
top,Abu,A
freq,1,2


In [23]:
# Pearson correlation is only defined for numeric data, so restrict the frame
# to its numeric columns before calling corr(). At this point student_df holds
# only text columns (name, grade), so the result is an empty frame.
# Calling corr() on the text columns directly raises ValueError on pandas 2.x,
# where non-numeric columns are no longer dropped silently.
student_df.select_dtypes(include="number").corr(method='pearson')

In [24]:
# Distinct grades, in order of first appearance.
student_df["grade"].unique()

array(['B', 'C', 'A'], dtype=object)

In [25]:
# How many students received each grade, most common grade first.
student_df["grade"].value_counts()

A    2
B    1
C    1
Name: grade, dtype: int64

## Missing Values

In [26]:
# Add a numeric "mark" column; np.nan marks the last student's score as missing.
# The list needs one value per row currently in student_df (four at this point).
student_df["mark"] = [96, 79, 68, np.nan]
student_df

Unnamed: 0,name,grade,mark
1,Abu,B,96.0
2,Ahmad,C,79.0
3,Siva,A,68.0
4,Ah Kau,A,


In [27]:
# Show the frame without any row that contains a missing value. The result is
# only displayed, not assigned, so student_df itself is unchanged.
student_df.dropna(axis="index", how="any")

Unnamed: 0,name,grade,mark
1,Abu,B,96.0
2,Ahmad,C,79.0
3,Siva,A,68.0


In [28]:
# Show the frame with every missing value replaced by the placeholder -1.
# The result is only displayed, not assigned, so student_df is unchanged.
student_df.fillna(value=-1)

Unnamed: 0,name,grade,mark
1,Abu,B,96.0
2,Ahmad,C,79.0
3,Siva,A,68.0
4,Ah Kau,A,-1.0


## Aggregation

In [29]:
# Group the rows by grade and count the non-null values in each remaining
# column. Missing marks are not counted, so grade A shows 2 names but 1 mark.
student_df.groupby(by="grade").count()

Unnamed: 0_level_0,name,mark
grade,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2,1
B,1,1
C,1,1


In [30]:
# Iterating over a groupby yields (group key, sub-frame) pairs: one grade
# together with the rows of the students who received it.
for grade, members in student_df.groupby("grade"):
    print(grade, members)

A      name grade  mark
3    Siva     A  68.0
4  Ah Kau     A   NaN
B   name grade  mark
1  Abu     B  96.0
C     name grade  mark
2  Ahmad     C  79.0
