# Generating DataFrame

## 1. using Dictionary, value = list (index auto)

In [1]:
import pandas as pd 
import numpy as np

data = {
    '縣市': ['台北','台北','台北','新竹','新竹','新竹'], 
    # each key:value pair now becomes a Series => Series.name : Series.values
    '年份': [2000,2001,2002,2001,2002,2003],
    '人口': [2.2,2.5,2.7,1.1,1.5,2.0]
}

frame = pd.DataFrame(data)
frame

# note: how the data is transposed!

Unnamed: 0,縣市,年份,人口
0,台北,2000,2.2
1,台北,2001,2.5
2,台北,2002,2.7
3,新竹,2001,1.1
4,新竹,2002,1.5
5,新竹,2003,2.0


In [2]:
# creating df using only some of the data by specifying column name

data = {
    '縣市': ['台北','台北','台北','新竹','新竹','新竹'], 
    '年份': [2000,2001,2002,2001,2002,2003],
    '人口': [2.2,2.5,2.7,1.1,1.5,2.0]
}

frame = pd.DataFrame(data, columns=['縣市','年份'])
frame

# note: how the data is transposed!

Unnamed: 0,縣市,年份
0,台北,2000
1,台北,2001
2,台北,2002
3,新竹,2001
4,新竹,2002
5,新竹,2003


In [3]:
# by default, row index is a range
# customize row index:

data = {
    '縣市': ['台北','台北','台北','新竹','新竹','新竹'], 
    '年份': [2000,2001,2002,2001,2002,2003],
    '人口': [2.2,2.5,2.7,1.1,1.5,2.0]
}

frame = pd.DataFrame(data, 
                     columns=['縣市','人口','年份'], 
                     index=['one','two','three','four','five','six'])
frame

Unnamed: 0,縣市,人口,年份
one,台北,2.2,2000
two,台北,2.5,2001
three,台北,2.7,2002
four,新竹,1.1,2001
five,新竹,1.5,2002
six,新竹,2.0,2003


In [4]:
frame.columns

Index(['縣市', '人口', '年份'], dtype='object')

In [5]:
frame.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

## 2. using dictionary, value = dictionary (custom index)

In [20]:
# generating dataframe with dictionary within dictionary:

pop = {'台北':{2001:2.4, 2002:2.9},
      '高雄':{2000:1.5,2001:1.7,2002:3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,台北,高雄
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [23]:
#giving names to the index and columns
frame3.index.name = '年份'
frame3.columns.name = '縣市'
frame3

縣市,台北,高雄
年份,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


## 3. using ndarray

In [34]:
# using ndarray to generate dataframe:

import numpy as np
import pandas as pd

scores = np.random.randint(50,101,(50,5))
scores_df = pd.DataFrame(scores, columns = ['國文','英文','數學','地理','歷史'],
                        index = range(1,51))
scores_df.index.name = '座號'
scores_df.columns.name = '科目'
scores_df.head()

科目,國文,英文,數學,地理,歷史
座號,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,76,80,71,76,71
2,79,93,80,83,61
3,89,58,60,64,53
4,59,89,85,83,51
5,80,82,81,70,75


In [25]:
import numpy as np
import pandas as pd

np1 = np.array([[1,2,3],[4,5,6]])
frame = pd.DataFrame(np1, 
                     index = [5,6],
                     columns = ['台北','台中','高雄'],
                     copy = True)
frame

Unnamed: 0,台北,台中,高雄
5,1,2,3
6,4,5,6


In [36]:
frame = pd.DataFrame(data = np.arange(9).reshape(3,3),
                    index = ['a','b','c'],
                    columns = ['台北','台中','高雄'])
frame

Unnamed: 0,台北,台中,高雄
a,0,1,2
b,3,4,5
c,6,7,8


In [None]:
frame.reindex(index=['a','b'],columns)

### Note: pd.DataFrame is default to copy = false!

In [32]:
import numpy as np
import pandas as pd

arr1 = [[1,2,3],[4,5,6]]
ndarr1 = np.array(arr1)  # copy is default to true

df = pd.DataFrame(ndarr1,columns = ['a','b','c']) # copy is default to false!!

display(df)
print(df['a'][1])   # column name a, index 1 item
df['a'][1] = -1
display(df)

ndarr1  # note!!! ndarr1 is manipulated!!

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


4


Unnamed: 0,a,b,c
0,1,2,3
1,-1,5,6


array([[ 1,  2,  3],
       [-1,  5,  6]])

In [33]:
# creating DataFrame without manipulation == copy=True

arr1 = [[1,2,3],[4,5,6]]
ndarr1 = np.array(arr1)  # copy is default to true

df = pd.DataFrame(ndarr1,columns = ['a','b','c'], copy=True) # set copy to True!!

df['a'][1] = -1
display(df)

ndarr1  # note!!! ndarr1 is no longer manipulated!!

Unnamed: 0,a,b,c
0,1,2,3
1,-1,5,6


array([[1, 2, 3],
       [4, 5, 6]])

## 4.  csv --> 2D list --> DataFrame

In [35]:
import csv
import numpy as np
import pandas as pd

def convertToInt(string):  #note!! as data may contain none integer value, here use function to try/except
    try:
        return int(string)
    except:
        return 0
    
def convertToFloat(string):
    try:
        return float(string)
    except:
        return 0


datalist = []
with open("各鄉鎮市區人口密度.csv", encoding = "utf-8") as file:
    rows = csv.reader(file)
    for row in rows:
        if row[0] == '107' or row[0] == '統計年':
            if row[0] == '107':
                row[2] = convertToInt(row[2])
                row[3] = convertToFloat(row[3])
                row[4] = convertToInt(row[4])
            datalist.append(row)

'''
#更精簡的寫法！用comprehesion! 
with open("各鄉鎮市區人口密度.csv", encoding = "utf-8") as file:
    datalist = [item for item in list(csv.reader(file)) if item[0] == '107' or item[0] == '統計年']
for row in datalist:
    row[2] = convertToInt(row[2])
    row[3] = convertToFloat(row[3])
    row[4] = convertToInt(row[4])
'''

title = datalist.pop(0) # extracting title row, now datalist has only data
df = pd.DataFrame(datalist, columns=title)
df

Unnamed: 0,統計年,區域別,年底人口數,土地面積,人口密度
0,107,新北市板橋區,554742,23.1373,23976
1,107,新北市三重區,385826,16.3170,23646
2,107,新北市中和區,412486,20.1440,20477
3,107,新北市永和區,221098,5.7138,38695
4,107,新北市新莊區,417754,19.7383,21165
...,...,...,...,...,...
365,107,連江縣北竿鄉,2425,9.9000,245
366,107,連江縣莒光鄉,1618,4.7000,344
367,107,連江縣東引鄉,1350,3.8000,355
368,107,東沙群島,0,2.3800,0


## 5. use pd.read_csv

In [38]:
import pandas as pd
df = pd.read_csv("各鄉鎮市區人口密度.csv")
df

Unnamed: 0,statistic_yyy,site_id,people_total,area,population_density
0,統計年,區域別,年底人口數,土地面積,人口密度
1,107,新北市板橋區,554742,23.1373,23976
2,107,新北市三重區,385826,16.317,23646
3,107,新北市中和區,412486,20.144,20477
4,107,新北市永和區,221098,5.7138,38695
...,...,...,...,...,...
370,107,南沙群島,…,0.5045,…
371,,,,,
372,,說明：1.人口密度係指每單位土地面積內之人口數。,,,
373,,2.96年12月起，我國土地面積增列東沙群島(2.38平方公里)及南沙群島(0.4896平方公里),,,


In [39]:
# but note : using pd.read_csv, come in all string. cannot do comparisons
df.dtypes

statistic_yyy         object
site_id               object
people_total          object
area                  object
population_density    object
dtype: object

In [7]:
!type 各鄉鎮市區人口密度.csv
# nice way to read whats in the file before-hand!

﻿statistic_yyy,site_id,people_total,area,population_density
統計年,區域別,年底人口數,土地面積,人口密度
107,新北市板橋區,554742,23.1373,23976
107,新北市三重區,385826,16.317,23646
107,新北市中和區,412486,20.144,20477
107,新北市永和區,221098,5.7138,38695
107,新北市新莊區,417754,19.7383,21165
107,新北市新店區,302231,120.2255,2514
107,新北市樹林區,183946,33.1288,5552
107,新北市鶯歌區,86361,21.1248,4088
107,新北市三峽區,115820,191.4508,605
107,新北市淡水區,173502,70.6565,2456
107,新北市汐止區,200535,71.2354,2815
107,新北市瑞芳區,39982,70.7336,565
107,新北市土城區,236901,29.5578,8015
107,新北市蘆洲區,201332,7.4351,27079
107,新北市五股區,86329,34.8632,2476
107,新北市泰山區,78708,19.1603,4108
107,新北市林口區,110081,54.1519,2033
107,新北市深坑區,23634,20.5787,1148
107,新北市石碇區,7731,144.3498,54
107,新北市坪林區,6612,170.835,39
107,新北市三芝區,22978,65.9909,348
107,新北市石門區,12115,51.2645,236
107,新北市八里區,38906,39.4933,985
107,新北市平溪區,4666,71.3382,65
107,新北市雙溪區,8860,146.2484,61
107,新北市貢寮區,12301,99.9734,123
107,新北市金山區,21774,49.2132,442
107,新北市萬里區,22068,63.3766,348
107,新北市烏來區,6438,321.1306,20
107,臺北市松山區,205702,9.2878,22148
107,臺北市信義區,223406,

In [8]:
# use arguments of pd.read_csv to do initial manipulations of data!

def convertToInt(string):  #note!! as data may contain none integer value, here use function to try/except
    try:
        return int(string)
    except:
        return 0
    
def convertToFloat(string):
    try:
        return float(string)
    except:
        return 0

df = pd.read_csv("各鄉鎮市區人口密度.csv", 
                 header = 1, 
                 converters = {'年底人口數':convertToInt,'土地面積':convertToFloat,'人口密度':convertToInt},
                 nrows = 370)
df

Unnamed: 0,統計年,區域別,年底人口數,土地面積,人口密度
0,107,新北市板橋區,554742,23.1373,23976
1,107,新北市三重區,385826,16.3170,23646
2,107,新北市中和區,412486,20.1440,20477
3,107,新北市永和區,221098,5.7138,38695
4,107,新北市新莊區,417754,19.7383,21165
...,...,...,...,...,...
365,107,連江縣北竿鄉,2425,9.9000,245
366,107,連江縣莒光鄉,1618,4.7000,344
367,107,連江縣東引鄉,1350,3.8000,355
368,107,東沙群島,0,2.3800,0


## 6. using pd.Series

In [3]:
import pandas as pd
a = pd.Series([1,2,3],name = 'a')
b = pd.Series([4,5,6],name = 'b')

display(pd.DataFrame((a,b))) # merge as rows

display(pd.concat((a,b),axis = 1)) # merge as columns

Unnamed: 0,0,1,2
a,1,2,3
b,4,5,6


Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6
