# Session 15 : Dictionary and Pandas

1. Understanding Dictionaries
2. Creating Dictionary
3. Accessing Dictionary
4. Dictionary Manipulation
5. Understanding Pandas
6. DataFrame
7. CSV to DataFrame
8. Using Square Brackets
9. Using loc and iloc

In [1]:
import numpy as np
import pandas as pd

## Dictionary

In [2]:
# Map each name to a score; keyword order fixes the insertion order of the keys.
dict_score = dict(Alex=80, Ben=97, Charlie=85, Daniel=95, Edy=80)
dict_score

{'Alex': 80, 'Ben': 97, 'Charlie': 85, 'Daniel': 95, 'Edy': 80}

### Change Values

In [3]:
# Change the score for Alex: assigning to an existing key overwrites its value in place
dict_score['Alex'] = 85
dict_score

{'Alex': 85, 'Ben': 97, 'Charlie': 85, 'Daniel': 95, 'Edy': 80}

### Remove Values

In [4]:
# Remove the entry for Ben; pop() also returns the removed value, which is discarded here
dict_score.pop('Ben')
dict_score

{'Alex': 85, 'Charlie': 85, 'Daniel': 95, 'Edy': 80}

In [5]:
# Remove the most recently inserted entry; popitem() returns it as a (key, value) tuple
dict_score.popitem()

('Edy', 80)

In [6]:
# Show what is left in the dictionary after popitem()
dict_score

{'Alex': 85, 'Charlie': 85, 'Daniel': 95}

In [7]:
# Delete the entry for Alex with the del statement (a statement, so the cell shows no output)
del dict_score['Alex']

In [8]:
# Show what is left in the dictionary after deleting 'Alex'
dict_score

{'Charlie': 85, 'Daniel': 95}

### Nested Dictionary

In [9]:
# Nested dictionary: every outer key maps to an inner dict with the keys 'nama' and 'umur'.
my_family = {
    child_key: {'nama': nama, 'umur': umur}
    for child_key, nama, umur in [
        ('child1', 'Andi', 20),
        ('child2', 'Budi', 18),
        ('child3', 'Caca', 23),
    ]
}

my_family

{'child1': {'nama': 'Andi', 'umur': 20},
 'child2': {'nama': 'Budi', 'umur': 18},
 'child3': {'nama': 'Caca', 'umur': 23}}

In [10]:
# Look up the inner dictionary stored under the outer key 'child1'
my_family['child1']

{'nama': 'Andi', 'umur': 20}

In [11]:
# Chain two lookups: the outer key 'child1' first, then the inner key 'nama'
my_family['child1']['nama']

'Andi'

## Series

#### Creating Series Using Various Ways

In [12]:
# Sample inputs for the Series examples: the same three numbers as a list,
# a NumPy array and a dict keyed by the labels.
the_label = ['Pertama', 'Kedua', 'Ketiga']
list_num = [100, 200, 300]
array_num = np.array(list_num)
dict_num = dict(zip(the_label, list_num))

In [13]:
# Build a Series from a Python list, with the_label supplying the index labels
pd.Series(data=list_num, index=the_label)

Pertama    100
Kedua      200
Ketiga     300
dtype: int64

In [14]:
# Build a Series from a NumPy array with the same index labels
# NOTE(review): the recorded dtype is int32 here but int64 for the list/dict versions —
# presumably the array's platform-default integer dtype is kept; confirm on your machine
pd.Series(data=array_num, index=the_label)

Pertama    100
Kedua      200
Ketiga     300
dtype: int32

In [15]:
# Build a Series from a dict: the keys become the index labels and the values become the data
pd.Series(dict_num)

Pertama    100
Kedua      200
Ketiga     300
dtype: int64

#### Accessing Values From A Series

In [16]:
# Prices labelled by item name; building from a dict keeps each label next to its value.
series_harga = pd.Series({
    'Sepatu': 35000,
    'Tas': 20000,
    'Celana': 40000,
    'Kemeja': 15000,
})
series_harga

Sepatu    35000
Tas       20000
Celana    40000
Kemeja    15000
dtype: int64

In [17]:
# The data of the Series as a NumPy array, without the index labels
series_harga.values

array([35000, 20000, 40000, 15000], dtype=int64)

In [18]:
# The index labels of the Series
series_harga.keys()

Index(['Sepatu', 'Tas', 'Celana', 'Kemeja'], dtype='object')

In [19]:
# Select a single value by its index label
series_harga['Sepatu']

35000

In [20]:
# Select several values at once by passing a list of labels; the result is again a Series
series_harga[['Sepatu', 'Celana']]

Sepatu    35000
Celana    40000
dtype: int64

## DataFrame

### Creating DataFrame

In [21]:
# Heights keyed by name: dict -> Series -> one-column DataFrame named 'tinggi_badan'.
dict_tinggi = dict(Adi=180, Budi=175, Citra=167)
series_tinggi = pd.Series(dict_tinggi)
df_tinggi = series_tinggi.to_frame(name='tinggi_badan')

In [22]:
# Display the one-column DataFrame
df_tinggi

Unnamed: 0,tinggi_badan
Adi,180
Budi,175
Citra,167


In [23]:
# Add four columns in one step; each list is matched to the rows by position,
# so it must have one value per existing row, in row order.
df_tinggi = df_tinggi.assign(
    berat_badan=[65, 50, 60],
    universitas=['ITB', 'UI', 'UGM'],
    ipk=[3.7, 3.9, 3.4],
    umur=[23, 28, 19],
)
df_tinggi

Unnamed: 0,tinggi_badan,berat_badan,universitas,ipk,umur
Adi,180,65,ITB,3.7,23
Budi,175,50,UI,3.9,28
Citra,167,60,UGM,3.4,19


### Import CSV File

In [24]:
# Read the CSV file into a DataFrame (the path is relative to the current working directory)
df_titanic = pd.read_csv('data_titanic.csv')
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [25]:
# Slice by integer position: the first 10 rows (positions 0-9) and the first 3 columns (positions 0-2)
df_titanic.iloc[:10, :3]

Unnamed: 0,PassengerId,Survived,Pclass
0,1,0,3
1,2,1,1
2,3,1,3
3,4,1,1
4,5,0,3
5,6,0,3
6,7,0,1
7,8,0,3
8,9,1,3
9,10,1,2


In [26]:
# Slice by integer position: the first 10 rows, and the columns at positions 3, 5 and 6
df_titanic.iloc[:10, [3, 5, 6]]

Unnamed: 0,Name,Age,SibSp
0,"Braund, Mr. Owen Harris",22.0,1
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1
2,"Heikkinen, Miss. Laina",26.0,0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1
4,"Allen, Mr. William Henry",35.0,0
5,"Moran, Mr. James",,0
6,"McCarthy, Mr. Timothy J",54.0,0
7,"Palsson, Master. Gosta Leonard",2.0,3
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0,0
9,"Nasser, Mrs. Nicholas (Adele Achem)",14.0,1


In [27]:
# Select by label: rows up to index label 10, and the listed column names
# NOTE: unlike iloc, a loc slice includes its end label, so `:10` covers labels 0 through 10
df_titanic.loc[:10, ['Name', 'Sex', 'Age']]

Unnamed: 0,Name,Sex,Age
0,"Braund, Mr. Owen Harris",male,22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0
2,"Heikkinen, Miss. Laina",female,26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0
4,"Allen, Mr. William Henry",male,35.0
5,"Moran, Mr. James",male,
6,"McCarthy, Mr. Timothy J",male,54.0
7,"Palsson, Master. Gosta Leonard",male,2.0
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0
9,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0


In [28]:
# Select by label: rows up to index label 10, and the columns from 'Name' through 'Ticket'
# NOTE: loc slices include both endpoints, for the row labels and for the column names
df_titanic.loc[:10, 'Name':'Ticket']

Unnamed: 0,Name,Sex,Age,SibSp,Parch,Ticket
0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599
2,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803
4,"Allen, Mr. William Henry",male,35.0,0,0,373450
5,"Moran, Mr. James",male,,0,0,330877
6,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463
7,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742
9,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736


In [29]:
# Show the first five rows (head() is called without an argument here)
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [30]:
# Show the last five rows (tail() returns rows from the end, not the top)
df_titanic.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [31]:
# Descriptive statistics for the numerical columns
df_titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [32]:
# Descriptive statistics for the non-numerical columns (exclude='number' leaves out numeric dtypes)
df_titanic.describe(exclude='number')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Moran, Mr. James",male,1601,G6,S
freq,1,577,7,4,644


In [33]:
# Summarize the DataFrame: column names, non-null counts, dtypes and memory usage
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [34]:
# Count how many rows there are for each distinct value of the 'Sex' column
df_titanic['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

## Latihan

In [35]:
# Display the DataFrame again before starting the exercises
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


1. Passenger paling banyak berada pada passenger class mana?

In [36]:
# Count passengers per class; value_counts() lists the most frequent class first
df_titanic['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

2. Berapa banyak passenger male dan female yang survived dan tidak?

In [37]:
# Sex breakdown among the rows where Survived == 1 (row mask and column picked in one .loc call)
df_titanic.loc[df_titanic['Survived'] == 1, 'Sex'].value_counts()

female    233
male      109
Name: Sex, dtype: int64

In [38]:
# Sex breakdown among the rows where Survived == 0 (row mask and column picked in one .loc call)
df_titanic.loc[df_titanic['Survived'] == 0, 'Sex'].value_counts()

male      468
female     81
Name: Sex, dtype: int64

3. Berapa banyak passenger yang umurnya di bawah 25 tahun?

In [39]:
# Number of rows whose Age is below 25; rows with a missing Age compare as False and are not counted
int((df_titanic['Age'] < 25).sum())

278

4. Munculkan semua row yang memiliki setidaknya satu nilai NaN!

In [40]:
# Keep the rows where at least one of Cabin, Age or Embarked is missing
df_titanic[df_titanic[['Cabin', 'Age', 'Embarked']].isna().any(axis=1)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


In [41]:
# Keep the rows with at least one missing value in any column (axis=1 applies any() across each row)
df_titanic[df_titanic.isna().any(axis=1)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


In [42]:
# For each column, report whether it holds at least one missing value (axis=0 applies any() down each column)
df_titanic.isna().any(axis=0)

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool