# Install and import Pandas

In [1]:
!pip install pandas



In [2]:
import pandas as pd

# Creating DataFrames from scratch

In [3]:
data = {
       'apples': [3, 2, 0, 1], 
       'oranges': [0, 3, 7, 2]
       }

In [4]:
purchases = pd.DataFrame(data)

In [5]:
purchases

Unnamed: 0,apples,oranges
0,3,0
1,2,3
2,0,7
3,1,2


In [6]:
purchases = pd.DataFrame(data, index=['June', 'Robert', 'Lily', 'David'])

In [7]:
purchases

Unnamed: 0,apples,oranges
June,3,0
Robert,2,3
Lily,0,7
David,1,2


In [8]:
purchases.loc['Robert']

apples     2
oranges    3
Name: Robert, dtype: int64

# How to read in data

**Reading data from CSVs**

In [9]:
df = pd.read_csv('purchases.csv') # relative path

In [10]:
df

Unnamed: 0.1,Unnamed: 0,apples,oranges
0,June,3,0
1,Robert,2,3
2,Lily,0,7
3,David,1,2


In [11]:
import pandas as pd
df2 = pd.read_csv('D:/23001042-15-val-project-gold/CH3/purchases.csv', index_col=0)  # absolute path

In [12]:
df2

Unnamed: 0,apples,oranges
June,3,0
Robert,2,3
Lily,0,7
David,1,2


**Reading data from JSON**

In [13]:
df = pd.read_json('purchases.json')

In [14]:
df

Unnamed: 0,apples,oranges
June,3,0
Robert,2,3
Lily,0,7
David,1,2


**Reading from SqlLite**

In [15]:
import sqlite3
import pandas as pd

cnx = sqlite3.connect('data.db')

df = pd.read_sql_query("SELECT * FROM tweets", cnx)

In [16]:
df

Unnamed: 0,username,text,retweet_count
0,kembarannyaimas,RT @septian: Sekadar mengingatkan: hasil uji k...,0
1,Rian170363Az,"RT @MrsRachelIn: Vaksin corona tiba, Jokowi mi...",0
2,ayu_yude,"RT @jokowi: Semenjak dari awal, saya mengingat...",0
3,cak_iwan,RT @septian: Sekadar mengingatkan: hasil uji k...,0
4,vayakikin,RT @AndreasMarbun_: tanggung jawab pemberantas...,0
...,...,...,...
5634,endanghidayat,"RT @CNNIndonesia: Bentrok Polisi-FPI, KAMI Des...",0
5635,Akun_Purwokerto,RT @SOERYAWAD1: Dimohon kpd para buruh dan tem...,0
5636,AA_Gum_,RT @MegaSimarmata: Mundurlah Bapak Kapolda Met...,0
5637,BaksoAc10516490,@detikcom seiring di sahkannya omnibuslaw ini ...,0


In [17]:
df_2 = pd.read_sql_query("SELECT username FROM tweets", cnx)
df_2

Unnamed: 0,username
0,kembarannyaimas
1,Rian170363Az
2,ayu_yude
3,cak_iwan
4,vayakikin
...,...
5634,endanghidayat
5635,Akun_Purwokerto
5636,AA_Gum_
5637,BaksoAc10516490


# Converting back to a CSV, JSON, or SQL

In [18]:
!pip install openpyxl 



In [19]:
df.to_excel('new_purchases.xlsx')

In [20]:
df.to_json('new_purchases.json')

# Most important DataFrame operations

In [21]:
import pandas as pd
df = pd.read_csv("IMDB-Movie-Data.csv")

**Viewing your data**

In [22]:
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [23]:
# bowo
# df.head(0:2)

In [24]:
# ghiyats
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0


In [25]:
df.tail(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
997,998,Step Up 2: The Streets,"Drama,Music,Romance",Romantic sparks occur between two dance studen...,Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,70699,58.01,50.0
998,999,Search Party,"Adventure,Comedy",A pair of friends embark on a mission to reuni...,Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,4881,,22.0
999,1000,Nine Lives,"Comedy,Family,Fantasy",A stuffy businessman finds himself trapped ins...,Barry Sonnenfeld,"Kevin Spacey, Jennifer Garner, Robbie Amell,Ch...",2016,87,5.3,12435,19.64,11.0


**Getting info about your data**

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


In [27]:
# expected output: melihat data Revenue dan Metascore yang
test_df = df[['Actors', 'Rating','Revenue (Millions)']]
test_df

Unnamed: 0,Actors,Rating,Revenue (Millions)
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",8.1,333.13
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",7.0,126.46
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",7.3,138.12
3,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",7.2,270.32
4,"Will Smith, Jared Leto, Margot Robbie, Viola D...",6.2,325.02
...,...,...,...
995,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",6.2,
996,"Lauren German, Heather Matarazzo, Bijou Philli...",5.5,17.54
997,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",6.2,58.01
998,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",5.6,


In [28]:
null_df = test_df[test_df['Revenue (Millions)'].isnull()]

In [29]:
null_df.head(20)

Unnamed: 0,Actors,Rating,Revenue (Millions)
7,"Essie Davis, Andrea Riseborough, Julian Barrat...",6.4,
22,"Emma Booth, Ashleigh Cummings, Stephen Curry,S...",6.7,
25,"Fiona Gordon, Dominique Abel,Emmanuelle Riva, ...",6.8,
39,"John Francis Daley, Austin Pendleton, Colleen ...",7.1,
42,"Brittany Blanton, Ayse Howard, Roman Jossart,N...",2.7,
47,"Hermione Corfield, Addison Timlin, Joely Richa...",5.6,
49,"Charlize Theron, Javier Bardem, Adèle Exarchop...",3.7,
61,"Brian Cox, Emile Hirsch, Ophelia Lovibond, Mic...",6.8,
70,"Oscar Isaac, Charlotte Le Bon, Christian Bale,...",5.9,
103,"Mario Casas, Ana Wagener, José Coronado, Bárba...",7.9,


In [30]:
# bowo: expected output ingin fill nilai yang NaN
# contoh kasus pada fitur Actors, Rating, dan Revenue

# Mengisi nilai NaN dalam kolom 'Revenue (Millions)' 
#dengan rata-rata dari film-film yang memiliki nilai 'Rating' yang sama
# Mengisi nilai NaN dengan rata-rata sesuai dengan nilai Rating yang sama
null_df['Revenue (Millions)'] = null_df['Revenue (Millions)'].fillna(df.groupby('Rating')['Revenue (Millions)'].transform('mean'))
null_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_df['Revenue (Millions)'] = null_df['Revenue (Millions)'].fillna(df.groupby('Rating')['Revenue (Millions)'].transform('mean'))


Unnamed: 0,Actors,Rating,Revenue (Millions)
7,"Essie Davis, Andrea Riseborough, Julian Barrat...",6.4,74.114000
22,"Emma Booth, Ashleigh Cummings, Stephen Curry,S...",6.7,87.636047
25,"Fiona Gordon, Dominique Abel,Emmanuelle Riva, ...",6.8,57.301212
39,"John Francis Daley, Austin Pendleton, Colleen ...",7.1,73.456739
42,"Brittany Blanton, Ayse Howard, Roman Jossart,N...",2.7,9.350000
...,...,...,...
977,"Jason Biggs, Janet Montgomery,Ashley Tisdale, ...",5.0,64.510000
978,"Nathalie Baye, Vincent Cassel, Marion Cotillar...",7.0,93.951860
988,"Morjana Alaoui, Mylène Jampanoï, Catherine Bég...",7.1,73.456739
995,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",6.2,79.861176


In [31]:
null_df.head(20)

Unnamed: 0,Actors,Rating,Revenue (Millions)
7,"Essie Davis, Andrea Riseborough, Julian Barrat...",6.4,74.114
22,"Emma Booth, Ashleigh Cummings, Stephen Curry,S...",6.7,87.636047
25,"Fiona Gordon, Dominique Abel,Emmanuelle Riva, ...",6.8,57.301212
39,"John Francis Daley, Austin Pendleton, Colleen ...",7.1,73.456739
42,"Brittany Blanton, Ayse Howard, Roman Jossart,N...",2.7,9.35
47,"Hermione Corfield, Addison Timlin, Joely Richa...",5.6,36.773846
49,"Charlize Theron, Javier Bardem, Adèle Exarchop...",3.7,
61,"Brian Cox, Emile Hirsch, Ophelia Lovibond, Mic...",6.8,57.301212
70,"Oscar Isaac, Charlotte Le Bon, Christian Bale,...",5.9,53.689333
103,"Mario Casas, Ana Wagener, José Coronado, Bárba...",7.9,131.0755


In [32]:
# regresi
# uji statistik
null_df.isna().sum()

Actors                0
Rating                0
Revenue (Millions)    8
dtype: int64

In [33]:
# tipe data --> ghiyats
# data bukan null --> bowo

In [34]:
df.shape

(1000, 12)

**Handling duplicates**

In [35]:
# data belum diduplikasi
df.shape

(1000, 12)

In [36]:
## data setelah diduplikasi --> Deprecated, ganti dengan concat
# temp_df = df.append(df)
# temp_df.shape

In [37]:
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'), index=['x', 'y'])
df

Unnamed: 0,A,B
x,1,2
y,3,4


In [38]:
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'), index=['x', 'y'])
df.append(df2)

AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
# mba claire
temp_df = pd.concat([df, df], ignore_index=True)
temp_df.shape

In [None]:
# parameter keep [first, last]
# first: menaruh duplikasi nya dibawah,
# last: menaruh duplikasi nya diatas.
# subset: label column, optional

# cara cek ada duplicate
temp_df.duplicated(subset = None, keep = 'first')

In [None]:
# proses menghilangkan duplikasi
temp_df = temp_df.drop_duplicates()
temp_df.shape

In [None]:
# cek duplikasi
temp_df.duplicated(subset = None, keep = 'first')

**Column cleanup**

In [None]:
movies_df = df

In [None]:
movies_df.columns

In [None]:
movies_df.rename(columns={
        'Runtime (Minutes)': 'Runtime', 
        'Revenue (Millions)': 'Revenue_millions'
    }, inplace=True)

In [None]:
movies_df.columns

**How to work with missing values**

In [None]:
df.isnull()

In [None]:
movies_df.isnull().sum()

In [None]:
df.isnull().sum()

In [None]:
# claire: lihat data yang null

In [None]:
null_rows = df[df[['Revenue_millions', 'Metascore']].isnull()]
null_rows

In [None]:
df_check_null = df[df['Revenue_millions'].isna() & df['Metascore'].isna()]
df_check_null

In [None]:
df_deleted_null = df.dropna()

In [None]:
df_deleted_null.isnull().sum()

# DataFrame slicing, selecting, extracting

In [None]:
subset = df[['Genre', 'Rating']]

In [None]:
subset.head()

In [None]:
# .loc dipakai untuk mengakses baris berdasarkan label indeks
# .iloc digunakan untuk mengakses baris berdasarkan posisi indeks

prom = df.loc["Song"]

prom

In [None]:
df.head(2)

In [None]:
prom = df.iloc[1]

prom

In [None]:
df_filter = df[df['Director'] == "Ridley Scott"]
df_filter = df_filter[['Title', 'Director']]
df_filter.head()

loc (label-based location)

iloc (integer-based location)

condition

In [None]:
df_filter = df[df['Rating'] >= 8.6]
df_filter = df_filter[['Title', 'Rating']]
df_filter.head()

In [None]:
movies_df[(movies_df['Director'] == 'Christopher Nolan') | (movies_df['Director'] == 'Ridley Scott')].head()

In [None]:
df_filter = df[df['Director'].isin(['Christopher Nolan', 'Ridley Scott'])]
df_filter = df_filter[['Title', 'Director']]
df_filter.head()

In [None]:
movies_df[
    ((movies_df['Year'] >= 2005) & (movies_df['Year'] <= 2010))
    & (movies_df['Rating'] > 8.0)]

# Applying functions

In [None]:
def rating_function(x):
    if x >= 8.0:
        return "good"
    else:
        return "bad"

In [None]:
df_filter = df[['Title', 'Rating']]
df_filter.head()

In [None]:
df_filter["Rating_category"] = df_filter["Rating"].apply(rating_function)

In [None]:
df_filter.head()

In [None]:
movies_df["Rating_category"] = movies_df["Rating"].apply(lambda x: 'good' if x >= 8.0 else 'bad')

movies_df.head(2)

# Cleansing Example

In [None]:
# Selanjutnya kita akan memulai cleansing (pembersihan) pada kolom "text" dari DataFrame.
# Untuk melakukan cleansing, terlebih dahulu kita perlu mendefinisikan sebuah fungsi.
# Fungsi ini diberi nama "cleansing"

# Terlebih dahulu kita import library bernama "RegEx".
# "RegEx" merupakan library yang digunakan untuk memanipulasi data text berdasarkan pola text-nya
# Kita panggil library "RegEx" sebagai berikut:
import re

# Selanjutnya berbagai jenis cleansing yang digunakan sebagai berikut
def cleansing(sent):
    # Mengubah kata menjadi huruf kecil semua dengan menggunakan fungsi lower()
    string = str(sent).lower()
    # Menghapus emoticon dan tanda baca menggunakan "RegEx" dengan script di bawah
    string = re.sub(r'[^a-zA-Z0-9]', '', string)
    return string

In [None]:
# Buat DataFrame dengan satu kolom 'text'
data = pd.DataFrame({'text': ["HALO, WORLD! 😊", "Ini adalah nomorku.", "0812345678", "Apa kabar?"]})
data_from_csv = pd.read_csv('Exercise Chapter 3 Topic 2.csv')

In [None]:
data_from_csv.head(1)

In [None]:
# Setelah selesai mendefinisikan fungsi "cleansing", selanjutnya kita aplikasikan ke dalam kolom text pada DataFrame
# Caranya menjalankan script di bawah
data['text_clean'] = data.text.apply(cleansing)
data

In [None]:
data_from_csv['text_clean'] = data_from_csv['Text Tweet'].apply(cleansing)
data_from_csv.head(1)

# Brief Plotting

In [None]:
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 12, 'figure.figsize': (5, 4)}) # set font and plot size to be larger

In [None]:
# movies_df.columns

df.plot(kind='scatter', x='Rating', y='Revenue_millions', 
        title='Revenue (millions) vs Rating');

In [None]:
movies_df['Rating'].plot(kind='hist', title='Rating')

In [None]:
movies_df['Rating'].describe()

In [None]:
movies_df['Rating'].plot(kind="box");

In [None]:
# movies_df.columns
movies_df.boxplot(column='Revenue_millions', by='Rating_category');