# Pandas - Data Analysis Library

## Import Pandas

In [2]:
import pandas as pd

In [3]:
# Cek versi
pd.__version__

'1.1.3'

## Series

In [4]:
# Membuat Series
# Data dalam bentuk list
pd.Series(data=[1,2,3,4])

0    1
1    2
2    3
3    4
dtype: int64

## DataFrame

In [5]:
# Membuat DataFrame
# Data dalam bentuk dictionary
pd.DataFrame(data = {'Nama':['Tariq', 'Fitria'], 'Umur':[12,13]})

Unnamed: 0,Nama,Umur
0,Tariq,12
1,Fitria,13


### Membuat Data dari Dictionary

In [6]:
df = {'col 1':[3,2,1,0], 'col 2':['a','b','c','d']}
df0 = pd.DataFrame(df)
df0

Unnamed: 0,col 1,col 2
0,3,a
1,2,b
2,1,c
3,0,d


In [7]:
# Menggunakan orient='index' untuk mengubah orientasi DF
df1 = pd.DataFrame.from_dict(df, orient = 'index')
df1

Unnamed: 0,0,1,2,3
col 1,3,2,1,0
col 2,a,b,c,d


In [8]:
# Ketika menggunakan index, nama kolom juga dapat diubah 
# Dengan columns = []
df2 = pd.DataFrame.from_dict(df, orient = 'index', columns = ['A','B','C','D'])
df2

Unnamed: 0,A,B,C,D
col 1,3,2,1,0
col 2,a,b,c,d


### Mengubah Nama Kolom

In [9]:
# Cek kolom DF
df2.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

#### Cara 1: Assignment

In [10]:
df2.columns = ['first','second','third','fourth']
df2

Unnamed: 0,first,second,third,fourth
col 1,3,2,1,0
col 2,a,b,c,d


#### Cara 2 :Indexing

In [11]:
# Menggunakan method values dan index
df2.columns.values[0] = 'zero'
df2

Unnamed: 0,zero,second,third,fourth
col 1,3,2,1,0
col 2,a,b,c,d


#### Cara 3 : Dictionary

In [12]:
df2.columns

Index(['zero', 'second', 'third', 'fourth'], dtype='object')

In [13]:
# Definisikan dictionary dengan key = nama kolom yang ingin diganti
# dan Value = nama untuk mengganti
# Pakai method rename untuk mengganti
col = {'zero':'satu'}
df2.rename(columns = col)

Unnamed: 0,satu,second,third,fourth
col 1,3,2,1,0
col 2,a,b,c,d


In [14]:
df2

Unnamed: 0,zero,second,third,fourth
col 1,3,2,1,0
col 2,a,b,c,d


In [15]:
# Dapat dilihat bahaw nama kolom belum ganti ketika dipanggil
# Tambahkan argumen inplace = True untuk mengganti permanen
df2.rename(columns = col, inplace = True)
df2

Unnamed: 0,satu,second,third,fourth
col 1,3,2,1,0
col 2,a,b,c,d


## Excercise 1

1. Create the matrix

In [16]:
data = {'Age':[24,13,53], 'Location':['New York','Paris','Berlin'], 'Name':['John','Anna','Peter']}
df3 = pd.DataFrame(df3)
df3

NameError: name 'df3' is not defined

2. Change Location to City

In [17]:
df3.columns.values[1] = 'City'
df3

NameError: name 'df3' is not defined

## Open CSV File

Untuk mencari data bisa dari Kaggle.com. 

Kaggle adalah platform pembelajaran dan pengumpulan data dari Google yang cocok jadi wadah pembelajaran ML.

In [18]:
# Di sini akan digunakan data Uber Drive
# Gunakan method read_csv
# Apabila tidak ada dalam suatu folder yang sama -> tulis nama foldernya
# nama_folder/nama_file_csv
data = pd.read_csv('datasets/My Uber Drives - 2016.csv')
data

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit
...,...,...,...,...,...,...,...
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site


## Basic Operation

### Head

In [19]:
# Melihat data teratas - default 5 teratas
data.head()

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


In [20]:
# Memasukkan jumlah data teratas
data.head(8)

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit
5,1/6/2016 17:15,1/6/2016 17:19,Business,West Palm Beach,West Palm Beach,4.3,Meal/Entertain
6,1/6/2016 17:30,1/6/2016 17:35,Business,West Palm Beach,Palm Beach,7.1,Meeting
7,1/7/2016 13:27,1/7/2016 13:33,Business,Cary,Cary,0.8,Meeting


### Tail

In [21]:
# Melihat data terbawah
data.tail()

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site
1155,Totals,,,,,12204.7,


### Shape

In [22]:
# Melihat bentuk DF
data.shape

(1156, 7)

### dtypes

In [23]:
# MElihat tipe data tiap kolom
data.dtypes

START_DATE*     object
END_DATE*       object
CATEGORY*       object
START*          object
STOP*           object
MILES*         float64
PURPOSE*        object
dtype: object

## Konversi Tipe Data

In [24]:
# Gunakan data dummy
data1 = pd.DataFrame({"Cost":["5","5","7"],"Amount":[11,12,13],"Date": ["11-10-2020","12-10-2020","13-10-2020"]})
data1

Unnamed: 0,Cost,Amount,Date
0,5,11,11-10-2020
1,5,12,12-10-2020
2,7,13,13-10-2020


In [25]:
data1.dtypes

Cost      object
Amount     int64
Date      object
dtype: object

### to_numeric

In [26]:
# Akan diubah tipe data Cost dari 'object' ke 'integer/numerik'
data1['Cost'] = pd.to_numeric(data1['Cost'])

### to_datetime

In [27]:
# Akan diubah tipe data Cost dari 'object' ke 'datetime'
data1['Date'] = pd.to_datetime(data1['Date'])

In [28]:
data1

Unnamed: 0,Cost,Amount,Date
0,5,11,2020-11-10
1,5,12,2020-12-10
2,7,13,2020-10-13


In [29]:
data1.dtypes

Cost               int64
Amount             int64
Date      datetime64[ns]
dtype: object

#### Aplikasikan cara di atas ke DF sebelumnya

In [30]:
data.head()

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


In [31]:
data.tail()
# Lihat bahwa ada data 'Total' di bawah

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site
1155,Totals,,,,,12204.7,


In [32]:
data.dtypes

START_DATE*     object
END_DATE*       object
CATEGORY*       object
START*          object
STOP*           object
MILES*         float64
PURPOSE*        object
dtype: object

In [33]:
# Konversi data START_DATE* dari 'object' ke 'datetime'
# format dapat digunakan untuk mengubah tampilan datetime
# %m:bulan; %d:tanggal; %Y:Tahun lengkap; %H:jam; %M:menit
data['START_DATE*'] = pd.to_datetime(data['START_DATE*'], format='%m/%d/%Y %H:%M')

ValueError: time data 'Totals' does not match format '%m/%d/%Y %H:%M' (match)

In [34]:
# Data total membuat error
# Gunakan errors = 'coerce' untuk mengabaikan error
data['START_DATE*'] = pd.to_datetime(data['START_DATE*'], format='%m/%d/%Y %H:%M', errors = 'coerce')
data['END_DATE*'] = pd.to_datetime(data['END_DATE*'], format='%m/%d/%Y %H:%M', errors = 'coerce')

In [35]:
data.dtypes

START_DATE*    datetime64[ns]
END_DATE*      datetime64[ns]
CATEGORY*              object
START*                 object
STOP*                  object
MILES*                float64
PURPOSE*               object
dtype: object

In [36]:
data.head()

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


## Dataset Summarization

### Describe

In [37]:
# Menampilkan parameter statistika DF
# Secara default menampilkan data numerik
data.describe()

Unnamed: 0,MILES*
count,1156.0
mean,21.115398
std,359.299007
min,0.5
25%,2.9
50%,6.0
75%,10.4
max,12204.7


In [38]:
# include = 'all' untuk menampilkan seluruh DF
data.describe(include = 'all')

  data.describe(include = 'all')
  data.describe(include = 'all')


Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
count,1155,1155,1155,1155,1155,1156.0,653
unique,1154,1154,2,177,188,,10
top,2016-06-28 23:34:00,2016-06-28 23:59:00,Business,Cary,Cary,,Meeting
freq,2,2,1078,201,203,,187
first,2016-01-01 21:11:00,2016-01-01 21:17:00,,,,,
last,2016-12-31 22:08:00,2016-12-31 23:51:00,,,,,
mean,,,,,,21.115398,
std,,,,,,359.299007,
min,,,,,,0.5,
25%,,,,,,2.9,


### Info

In [39]:
# Menampilkan informasi data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   START_DATE*  1155 non-null   datetime64[ns]
 1   END_DATE*    1155 non-null   datetime64[ns]
 2   CATEGORY*    1155 non-null   object        
 3   START*       1155 non-null   object        
 4   STOP*        1155 non-null   object        
 5   MILES*       1156 non-null   float64       
 6   PURPOSE*     653 non-null    object        
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 63.3+ KB


### Value_counts

In [40]:
# Menampilkan nilai unik dari kolom
data['START*'].value_counts()

Cary                   201
Unknown Location       148
Morrisville             85
Whitebridge             68
Islamabad               57
                      ... 
Austin                   1
Columbia Heights         1
Daytona Beach            1
Jamestown Court          1
Lake Wellingborough      1
Name: START*, Length: 177, dtype: int64

## Excercise 2

1. Buat DF berikut dengan tipe data 'Umur' diubah dari 'object' ke 'integer'

In [41]:
df = {'Nama':['Ahmad','Joko','Adi'], 'Umur':['12','13','15'], 'Kelas':[6,7,8]}
data2 = pd.DataFrame(df)
data2

Unnamed: 0,Nama,Umur,Kelas
0,Ahmad,12,6
1,Joko,13,7
2,Adi,15,8


In [42]:
data2.dtypes

Nama     object
Umur     object
Kelas     int64
dtype: object

In [43]:
data2['Umur']=pd.to_numeric(data2['Umur'])
data2.dtypes

Nama     object
Umur      int64
Kelas     int64
dtype: object

2. Pergi ke Kaggle dan unduh data Titanic dan lakukan eksplorasi data dasar: head, tail, describe, info, size, shape

In [44]:
data3 = pd.read_csv('datasets/train.csv')
data3.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
data3.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [46]:
data3.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [47]:
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [48]:
data3.size

10692

In [49]:
data3.shape

(891, 12)

## Data Manipulation Task

### 1. Selecting/Indexing

In [50]:
data.head()

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


#### iloc : Positional indexing (berdasarkan index)

In [51]:
# Mengambil baris 1-2, kolom 1-2
data.iloc[1:3, [1,3]]

Unnamed: 0,END_DATE*,START*
1,2016-01-02 01:37:00,Fort Pierce
2,2016-01-02 20:38:00,Fort Pierce


In [52]:
# Mengambil semua baris, kolom 3-5
data.iloc[:, 3:6]

Unnamed: 0,START*,STOP*,MILES*
0,Fort Pierce,Fort Pierce,5.1
1,Fort Pierce,Fort Pierce,5.0
2,Fort Pierce,Fort Pierce,4.8
3,Fort Pierce,Fort Pierce,4.7
4,Fort Pierce,West Palm Beach,63.7
...,...,...,...
1151,Kar?chi,Unknown Location,3.9
1152,Unknown Location,Unknown Location,16.2
1153,Katunayake,Gampaha,6.4
1154,Gampaha,Ilukwatta,48.2


#### loc : label Indexing (berdasarkan label

In [53]:
# Di sini, 0:5 dibaca sebagai label, bukan index
# Sehingga semua bilangan masuk
# Mengambil semua kolom awal sebelum START*
data.loc[0:5, :'START*']

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce
5,2016-01-06 17:15:00,2016-01-06 17:19:00,Business,West Palm Beach


In [54]:
data.loc[:, ['START_DATE*', 'MILES*']]

Unnamed: 0,START_DATE*,MILES*
0,2016-01-01 21:11:00,5.1
1,2016-01-02 01:25:00,5.0
2,2016-01-02 20:25:00,4.8
3,2016-01-05 17:31:00,4.7
4,2016-01-06 14:42:00,63.7
...,...,...
1151,2016-12-31 13:24:00,3.9
1152,2016-12-31 15:03:00,16.2
1153,2016-12-31 21:32:00,6.4
1154,2016-12-31 22:08:00,48.2


#### Tanpa loc dan iloc

In [55]:
data[['START_DATE*', 'MILES*']]

Unnamed: 0,START_DATE*,MILES*
0,2016-01-01 21:11:00,5.1
1,2016-01-02 01:25:00,5.0
2,2016-01-02 20:25:00,4.8
3,2016-01-05 17:31:00,4.7
4,2016-01-06 14:42:00,63.7
...,...,...
1151,2016-12-31 13:24:00,3.9
1152,2016-12-31 15:03:00,16.2
1153,2016-12-31 21:32:00,6.4
1154,2016-12-31 22:08:00,48.2


#### Pengaruh Bentuk List pada Tipe Data

In [56]:
a = data.loc[:,'START*']
a

0            Fort Pierce
1            Fort Pierce
2            Fort Pierce
3            Fort Pierce
4            Fort Pierce
              ...       
1151             Kar?chi
1152    Unknown Location
1153          Katunayake
1154             Gampaha
1155                 NaN
Name: START*, Length: 1156, dtype: object

In [57]:
type(a)

pandas.core.series.Series

In [58]:
# Masukkan START* dalam bentuk list
b = data.loc[:,['START*']]
b

Unnamed: 0,START*
0,Fort Pierce
1,Fort Pierce
2,Fort Pierce
3,Fort Pierce
4,Fort Pierce
...,...
1151,Kar?chi
1152,Unknown Location
1153,Katunayake
1154,Gampaha


In [59]:
type(b)

pandas.core.frame.DataFrame

## Excercise 3

1. Select columns : START_DATE*, START*, STOP* using loc and iloc

In [60]:
# Menggunakan loc
data.loc[:, ['START_DATE*', 'START*', 'STOP*']]

Unnamed: 0,START_DATE*,START*,STOP*
0,2016-01-01 21:11:00,Fort Pierce,Fort Pierce
1,2016-01-02 01:25:00,Fort Pierce,Fort Pierce
2,2016-01-02 20:25:00,Fort Pierce,Fort Pierce
3,2016-01-05 17:31:00,Fort Pierce,Fort Pierce
4,2016-01-06 14:42:00,Fort Pierce,West Palm Beach
...,...,...,...
1151,2016-12-31 13:24:00,Kar?chi,Unknown Location
1152,2016-12-31 15:03:00,Unknown Location,Unknown Location
1153,2016-12-31 21:32:00,Katunayake,Gampaha
1154,2016-12-31 22:08:00,Gampaha,Ilukwatta


In [61]:
# Menggunakan iloc
data.iloc[:, [0,3,4]]

Unnamed: 0,START_DATE*,START*,STOP*
0,2016-01-01 21:11:00,Fort Pierce,Fort Pierce
1,2016-01-02 01:25:00,Fort Pierce,Fort Pierce
2,2016-01-02 20:25:00,Fort Pierce,Fort Pierce
3,2016-01-05 17:31:00,Fort Pierce,Fort Pierce
4,2016-01-06 14:42:00,Fort Pierce,West Palm Beach
...,...,...,...
1151,2016-12-31 13:24:00,Kar?chi,Unknown Location
1152,2016-12-31 15:03:00,Unknown Location,Unknown Location
1153,2016-12-31 21:32:00,Katunayake,Gampaha
1154,2016-12-31 22:08:00,Gampaha,Ilukwatta


2. Extract the first and last 10 rows of the previous columns

In [62]:
# 10 baris awal
data.loc[:10, ['START_DATE*', 'START*', 'STOP*']]

Unnamed: 0,START_DATE*,START*,STOP*
0,2016-01-01 21:11:00,Fort Pierce,Fort Pierce
1,2016-01-02 01:25:00,Fort Pierce,Fort Pierce
2,2016-01-02 20:25:00,Fort Pierce,Fort Pierce
3,2016-01-05 17:31:00,Fort Pierce,Fort Pierce
4,2016-01-06 14:42:00,Fort Pierce,West Palm Beach
5,2016-01-06 17:15:00,West Palm Beach,West Palm Beach
6,2016-01-06 17:30:00,West Palm Beach,Palm Beach
7,2016-01-07 13:27:00,Cary,Cary
8,2016-01-10 08:05:00,Cary,Morrisville
9,2016-01-10 12:17:00,Jamaica,New York


In [63]:
# 10 baris akhir
data.iloc[-10:, [0,3,4]]

Unnamed: 0,START_DATE*,START*,STOP*
1146,2016-12-30 11:31:00,Kar?chi,Kar?chi
1147,2016-12-30 15:41:00,Kar?chi,Kar?chi
1148,2016-12-30 16:45:00,Kar?chi,Kar?chi
1149,2016-12-30 23:06:00,Kar?chi,Kar?chi
1150,2016-12-31 01:07:00,Kar?chi,Kar?chi
1151,2016-12-31 13:24:00,Kar?chi,Unknown Location
1152,2016-12-31 15:03:00,Unknown Location,Unknown Location
1153,2016-12-31 21:32:00,Katunayake,Gampaha
1154,2016-12-31 22:08:00,Gampaha,Ilukwatta
1155,NaT,,


### 2. Filtering

In [64]:
# Argumen 1 : Conditional
# Argumen 2 : Kolom yang mau ditampilkan
# Menampilkan kolom MILES* untuk jarak tempuh di atas 10 Miles
df4 = data.loc[data['MILES*']>10, ['MILES*']]
df4

Unnamed: 0,MILES*
4,63.7
9,16.5
10,10.8
22,15.1
23,11.2
...,...
1134,11.9
1144,12.9
1152,16.2
1154,48.2


In [65]:
# Menampilkan kolom START*
df5 = data.loc[data['MILES*']>10, ['START*']]
df5

Unnamed: 0,START*
4,Fort Pierce
9,Jamaica
10,New York
22,New York
23,Downtown
...,...
1134,Unknown Location
1144,Unknown Location
1152,Unknown Location
1154,Gampaha


#### Pengaruh filtering pada indexing

In [66]:
# Dapat dilihat bahwa hasil filter membuat beberapa baris hilang
# Dengan loc, maka tidak akan muncul karena label hilang
df6 = df5.loc[0:3]
df6

Unnamed: 0,START*


In [67]:
# Sementara dengan iloc, data akan muncul karena berbasis index
df6 = df5.iloc[0:5]
df6

Unnamed: 0,START*
4,Fort Pierce
9,Jamaica
10,New York
22,New York
23,Downtown


#### Find all rids that is greater tan 10 miles

In [68]:
# Jika argumen 2 tidak diisi, maka semua kolom tertampil
data.loc[data['MILES*']>10]

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit
9,2016-01-10 12:17:00,2016-01-10 12:44:00,Business,Jamaica,New York,16.5,Customer Visit
10,2016-01-10 15:08:00,2016-01-10 15:51:00,Business,New York,Queens,10.8,Meeting
22,2016-01-12 16:02:00,2016-01-12 17:00:00,Business,New York,Queens County,15.1,Meeting
23,2016-01-13 13:54:00,2016-01-13 14:07:00,Business,Downtown,Gulfton,11.2,Meeting
...,...,...,...,...,...,...,...
1134,2016-12-29 11:28:00,2016-12-29 12:00:00,Business,Unknown Location,Kar?chi,11.9,Meal/Entertain
1144,2016-12-29 23:14:00,2016-12-29 23:47:00,Business,Unknown Location,Kar?chi,12.9,Meeting
1152,2016-12-31 15:03:00,2016-12-31 15:38:00,Business,Unknown Location,Unknown Location,16.2,Meeting
1154,2016-12-31 22:08:00,2016-12-31 23:51:00,Business,Gampaha,Ilukwatta,48.2,Temporary Site


#### Find All Rides from New York

In [69]:
# All Columns
data.loc[data['START*'] == 'New York']

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
10,2016-01-10 15:08:00,2016-01-10 15:51:00,Business,New York,Queens,10.8,Meeting
22,2016-01-12 16:02:00,2016-01-12 17:00:00,Business,New York,Queens County,15.1,Meeting
106,2016-02-14 16:35:00,2016-02-14 17:02:00,Business,New York,Long Island City,13.0,Meeting
423,2016-06-10 15:19:00,2016-06-10 16:28:00,Business,New York,Jamaica,16.3,Meeting


In [70]:
# Columns : MILES* and STOP*
data.loc[data['START*'] == 'New York', ['MILES*', 'STOP*']]

Unnamed: 0,MILES*,STOP*
10,10.8,Queens
22,15.1,Queens County
106,13.0,Long Island City
423,16.3,Jamaica


#### Find All RIdes from Cary and Morrisville

#### isin : untuk kondisi jamak

In [71]:
# match multiple condition -> gunakan method isin
st = data[data['START*'].isin(['Cary', 'Morrisville'])]
st

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
7,2016-01-07 13:27:00,2016-01-07 13:33:00,Business,Cary,Cary,0.8,Meeting
8,2016-01-10 08:05:00,2016-01-10 08:25:00,Business,Cary,Morrisville,8.3,Meeting
27,2016-01-15 00:41:00,2016-01-15 01:01:00,Business,Morrisville,Cary,8.0,Errand/Supplies
28,2016-01-15 11:43:00,2016-01-15 12:03:00,Business,Cary,Durham,10.4,Meal/Entertain
30,2016-01-18 14:55:00,2016-01-18 15:06:00,Business,Cary,Cary,4.8,Meal/Entertain
...,...,...,...,...,...,...,...
1050,2016-12-14 16:52:00,2016-12-14 17:10:00,Business,Cary,Cary,3.4,
1051,2016-12-14 17:22:00,2016-12-14 17:34:00,Business,Cary,Cary,3.3,
1052,2016-12-14 17:50:00,2016-12-14 18:00:00,Business,Cary,Morrisville,3.0,Meal/Entertain
1053,2016-12-14 20:24:00,2016-12-14 20:40:00,Business,Morrisville,Cary,3.1,Customer Visit


#### reset_index : untuk membuat index baru

In [72]:
st.reset_index()

Unnamed: 0,index,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,7,2016-01-07 13:27:00,2016-01-07 13:33:00,Business,Cary,Cary,0.8,Meeting
1,8,2016-01-10 08:05:00,2016-01-10 08:25:00,Business,Cary,Morrisville,8.3,Meeting
2,27,2016-01-15 00:41:00,2016-01-15 01:01:00,Business,Morrisville,Cary,8.0,Errand/Supplies
3,28,2016-01-15 11:43:00,2016-01-15 12:03:00,Business,Cary,Durham,10.4,Meal/Entertain
4,30,2016-01-18 14:55:00,2016-01-18 15:06:00,Business,Cary,Cary,4.8,Meal/Entertain
...,...,...,...,...,...,...,...,...
281,1050,2016-12-14 16:52:00,2016-12-14 17:10:00,Business,Cary,Cary,3.4,
282,1051,2016-12-14 17:22:00,2016-12-14 17:34:00,Business,Cary,Cary,3.3,
283,1052,2016-12-14 17:50:00,2016-12-14 18:00:00,Business,Cary,Morrisville,3.0,Meal/Entertain
284,1053,2016-12-14 20:24:00,2016-12-14 20:40:00,Business,Morrisville,Cary,3.1,Customer Visit


In [73]:
# Dapat dilihat bahwa index lama masih ada
# Untuk menghapus ini gunakan drop = True dan inplace=True
st.reset_index(drop = True, inplace = True)

In [74]:
st

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,2016-01-07 13:27:00,2016-01-07 13:33:00,Business,Cary,Cary,0.8,Meeting
1,2016-01-10 08:05:00,2016-01-10 08:25:00,Business,Cary,Morrisville,8.3,Meeting
2,2016-01-15 00:41:00,2016-01-15 01:01:00,Business,Morrisville,Cary,8.0,Errand/Supplies
3,2016-01-15 11:43:00,2016-01-15 12:03:00,Business,Cary,Durham,10.4,Meal/Entertain
4,2016-01-18 14:55:00,2016-01-18 15:06:00,Business,Cary,Cary,4.8,Meal/Entertain
...,...,...,...,...,...,...,...
281,2016-12-14 16:52:00,2016-12-14 17:10:00,Business,Cary,Cary,3.4,
282,2016-12-14 17:22:00,2016-12-14 17:34:00,Business,Cary,Cary,3.3,
283,2016-12-14 17:50:00,2016-12-14 18:00:00,Business,Cary,Morrisville,3.0,Meal/Entertain
284,2016-12-14 20:24:00,2016-12-14 20:40:00,Business,Morrisville,Cary,3.1,Customer Visit


## Excercise 4

1. Find all trips that is greater than 10 miles and originated from New York and Morrisville

In [103]:
exrcs_data0 = data[data['START*'].isin(['New York','Morrisville'])]
exrcs_data0 = exrcs_data0[exrcs_data0['MILES*']>10]
exrcs_data0.head()

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*,DISTANCE*
10,2016-01-10 15:08:00,2016-01-10 15:51:00,Business,New York,Queens,10.8,Meeting,Long Trip
22,2016-01-12 16:02:00,2016-01-12 17:00:00,Business,New York,Queens County,15.1,Meeting,Long Trip
97,2016-02-12 11:14:00,2016-02-12 11:35:00,Business,Morrisville,Raleigh,17.0,Customer Visit,Long Trip
100,2016-02-12 15:33:00,2016-02-12 16:06:00,Business,Morrisville,Cary,11.5,Customer Visit,Long Trip
106,2016-02-14 16:35:00,2016-02-14 17:02:00,Business,New York,Long Island City,13.0,Meeting,Long Trip


### 3. Sorting

#### sort_values : mengurutkan data

In [75]:
# Mengurutkan data dari nilai terbesar ke terkecil (descends)
data.sort_values(by=['MILES*'], ascending = False)

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
1155,NaT,NaT,,,,12204.7,
269,2016-03-25 16:52:00,2016-03-25 22:22:00,Business,Latta,Jacksonville,310.3,Customer Visit
270,2016-03-25 22:54:00,2016-03-26 01:39:00,Business,Jacksonville,Kissimmee,201.0,Meeting
881,2016-10-30 15:22:00,2016-10-30 18:23:00,Business,Asheville,Mebane,195.9,
776,2016-09-27 21:01:00,2016-09-28 02:37:00,Business,Unknown Location,Unknown Location,195.6,
...,...,...,...,...,...,...,...
1121,2016-12-27 12:53:00,2016-12-27 12:57:00,Business,Kar?chi,Kar?chi,0.6,Meal/Entertain
1110,2016-12-24 22:04:00,2016-12-24 22:09:00,Business,Lahore,Lahore,0.6,Errand/Supplies
44,2016-01-26 17:27:00,2016-01-26 17:29:00,Business,Cary,Cary,0.5,Errand/Supplies
420,2016-06-08 17:16:00,2016-06-08 17:18:00,Business,Soho,Tribeca,0.5,Errand/Supplies


In [76]:
# Mengurutkan dari data terkecil ke terbesar (ascends)
data.sort_values(by=['MILES*'], ascending = True)

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
420,2016-06-08 17:16:00,2016-06-08 17:18:00,Business,Soho,Tribeca,0.5,Errand/Supplies
44,2016-01-26 17:27:00,2016-01-26 17:29:00,Business,Cary,Cary,0.5,Errand/Supplies
120,2016-02-17 16:38:00,2016-02-17 16:43:00,Business,Katunayaka,Katunayaka,0.5,Errand/Supplies
1111,2016-12-25 00:10:00,2016-12-25 00:14:00,Business,Lahore,Lahore,0.6,Errand/Supplies
1110,2016-12-24 22:04:00,2016-12-24 22:09:00,Business,Lahore,Lahore,0.6,Errand/Supplies
...,...,...,...,...,...,...,...
776,2016-09-27 21:01:00,2016-09-28 02:37:00,Business,Unknown Location,Unknown Location,195.6,
881,2016-10-30 15:22:00,2016-10-30 18:23:00,Business,Asheville,Mebane,195.9,
270,2016-03-25 22:54:00,2016-03-26 01:39:00,Business,Jacksonville,Kissimmee,201.0,Meeting
269,2016-03-25 16:52:00,2016-03-25 22:22:00,Business,Latta,Jacksonville,310.3,Customer Visit


### 4. Conditionally Adding Column

In [97]:
# Gunakan Numpy
import numpy as np

#### np.where : menambah data sesuai kondisi

In [78]:
# argumen (kondisi, nilai jika benar, nilai jika salah)
data['DISTANCE*'] = np.where(data['MILES*'] > 5, 'Long Trip', 'Short Trip')
data

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*,DISTANCE*
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain,Long Trip
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,,Short Trip
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies,Short Trip
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,Meeting,Short Trip
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit,Long Trip
...,...,...,...,...,...,...,...,...
1151,2016-12-31 13:24:00,2016-12-31 13:42:00,Business,Kar?chi,Unknown Location,3.9,Temporary Site,Short Trip
1152,2016-12-31 15:03:00,2016-12-31 15:38:00,Business,Unknown Location,Unknown Location,16.2,Meeting,Long Trip
1153,2016-12-31 21:32:00,2016-12-31 21:50:00,Business,Katunayake,Gampaha,6.4,Temporary Site,Long Trip
1154,2016-12-31 22:08:00,2016-12-31 23:51:00,Business,Gampaha,Ilukwatta,48.2,Temporary Site,Long Trip


In [79]:
x = np.where(data['MILES*'] > 5, 'long trip', 'short trip')
x

array(['long trip', 'short trip', 'short trip', ..., 'long trip',
       'long trip', 'long trip'], dtype='<U10')

#### insert : memasukkan data ke DF

In [80]:
# Memasukkan data dengan label 'CAT_DISTANCE*'
# Di kolom urutan ke 7 (index ke-6)
# Dengan nilai dari array x
data.insert(6, 'CAT_DISTANCE*', x)

In [81]:
# Lihat data yang dibuat
data.head()

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,CAT_DISTANCE*,PURPOSE*,DISTANCE*
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,long trip,Meal/Entertain,Long Trip
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,short trip,,Short Trip
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,short trip,Errand/Supplies,Short Trip
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,short trip,Meeting,Short Trip
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,long trip,Customer Visit,Long Trip


In [82]:
data['DISTANCE*'].value_counts()

Long Trip     654
Short Trip    502
Name: DISTANCE*, dtype: int64

#### Menggunakan np.array

In [83]:
data['Year*'] = np.array('2020')
data

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,CAT_DISTANCE*,PURPOSE*,DISTANCE*,Year*
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,long trip,Meal/Entertain,Long Trip,2020
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,short trip,,Short Trip,2020
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,short trip,Errand/Supplies,Short Trip,2020
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,short trip,Meeting,Short Trip,2020
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,long trip,Customer Visit,Long Trip,2020
...,...,...,...,...,...,...,...,...,...,...
1151,2016-12-31 13:24:00,2016-12-31 13:42:00,Business,Kar?chi,Unknown Location,3.9,short trip,Temporary Site,Short Trip,2020
1152,2016-12-31 15:03:00,2016-12-31 15:38:00,Business,Unknown Location,Unknown Location,16.2,long trip,Meeting,Long Trip,2020
1153,2016-12-31 21:32:00,2016-12-31 21:50:00,Business,Katunayake,Gampaha,6.4,long trip,Temporary Site,Long Trip,2020
1154,2016-12-31 22:08:00,2016-12-31 23:51:00,Business,Gampaha,Ilukwatta,48.2,long trip,Temporary Site,Long Trip,2020


In [84]:
data['TIME_CAT*'] = np.where(data['START_DATE*'] > '2016-05-01', 'New Trip', 'Old Trip')
data

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,CAT_DISTANCE*,PURPOSE*,DISTANCE*,Year*,TIME_CAT*
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,long trip,Meal/Entertain,Long Trip,2020,Old Trip
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,short trip,,Short Trip,2020,Old Trip
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,short trip,Errand/Supplies,Short Trip,2020,Old Trip
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,short trip,Meeting,Short Trip,2020,Old Trip
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,long trip,Customer Visit,Long Trip,2020,Old Trip
...,...,...,...,...,...,...,...,...,...,...,...
1151,2016-12-31 13:24:00,2016-12-31 13:42:00,Business,Kar?chi,Unknown Location,3.9,short trip,Temporary Site,Short Trip,2020,New Trip
1152,2016-12-31 15:03:00,2016-12-31 15:38:00,Business,Unknown Location,Unknown Location,16.2,long trip,Meeting,Long Trip,2020,New Trip
1153,2016-12-31 21:32:00,2016-12-31 21:50:00,Business,Katunayake,Gampaha,6.4,long trip,Temporary Site,Long Trip,2020,New Trip
1154,2016-12-31 22:08:00,2016-12-31 23:51:00,Business,Gampaha,Ilukwatta,48.2,long trip,Temporary Site,Long Trip,2020,New Trip


## Excercise 5

Create a new column with the following condition: \
a) >10 : Long Trip \
b) 5-10 : Medium Trip \
c) <5 : Short Trip 

In [98]:
data['DISTANCE*'] = np.where(data['MILES*'] < 5, 'Short Trip', np.where(data['MILES*'] > 10, 'Long Trip', 'Medium Trip'))
data

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*,DISTANCE*
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain,Medium Trip
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,,Medium Trip
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies,Short Trip
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,Meeting,Short Trip
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit,Long Trip
...,...,...,...,...,...,...,...,...
1151,2016-12-31 13:24:00,2016-12-31 13:42:00,Business,Kar?chi,Unknown Location,3.9,Temporary Site,Short Trip
1152,2016-12-31 15:03:00,2016-12-31 15:38:00,Business,Unknown Location,Unknown Location,16.2,Meeting,Long Trip
1153,2016-12-31 21:32:00,2016-12-31 21:50:00,Business,Katunayake,Gampaha,6.4,Temporary Site,Medium Trip
1154,2016-12-31 22:08:00,2016-12-31 23:51:00,Business,Gampaha,Ilukwatta,48.2,Temporary Site,Long Trip


### Groupby Summarize

#### groupby : mengelompokkan data berdasar kriteria

In [85]:
# Mengelompokkan data di kolom START* dari rata-rata kolom MILES*
data.groupby('START*')['MILES*'].agg(['mean'])

Unnamed: 0_level_0,mean
START*,Unnamed: 1_level_1
Agnew,2.775000
Almond,15.200000
Apex,5.341176
Arabi,17.000000
Arlington,4.900000
...,...
West University,2.200000
Weston,4.000000
Westpark Place,2.182353
Whitebridge,4.020588


In [86]:
# Menemukan nilai mean dan total distance travelled
data.groupby('START*')['MILES*'].agg(['mean', 'sum'])

Unnamed: 0_level_0,mean,sum
START*,Unnamed: 1_level_1,Unnamed: 2_level_1
Agnew,2.775000,11.1
Almond,15.200000,15.2
Apex,5.341176,90.8
Arabi,17.000000,17.0
Arlington,4.900000,4.9
...,...,...
West University,2.200000,4.4
Weston,4.000000,8.0
Westpark Place,2.182353,37.1
Whitebridge,4.020588,273.4


In [87]:
# Mengambil 5 data teratas
data.groupby('START*')['MILES*'].agg(['mean', 'sum']).head()

Unnamed: 0_level_0,mean,sum
START*,Unnamed: 1_level_1,Unnamed: 2_level_1
Agnew,2.775,11.1
Almond,15.2,15.2
Apex,5.341176,90.8
Arabi,17.0,17.0
Arlington,4.9,4.9


## Save DF to CSC file

In [88]:
# Menyimpan dengan nama data baru
st.to_csv('data_baru.csv')

In [89]:
# Menyimpan tanpa index
st.to_csv('data_baru2.csv', index = False)