# Data Process
   1. Fetch Raw Data: read json from url with category code 1 - 20 and other data
   2. (Optional) Save to Local: save fetched data to local to avoid repeated download
   3. Data Cleaning: 
        a. remove duplicated data with same UID
        b. select useful columns
        c. extract and flatten json
   4. Save to RDBMS: save JSON data to Postgres
   5. Analysis:
        a. Use SQL to observe data from Postgres
        b. load data as dataframe from Postgres

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Import package
import pandas as pd
import numpy as np
import json
import matplotlib.pylab as plt
from tqdm import tqdm
import urllib.request, json 
from datetime import datetime
from collections.abc import Iterable

# Fetch raw data from source

In [2]:
# Read dataset from URL: every endpoint returns JSON records that are
# accumulated into one flat list.
data = []

URL = 'https://cloud.culture.tw/frontsite/trans/SearchShowAction.do?method=doFindTypeJ&category='
# NOTE(review): range(1, 20) requests categories 1..19 only, while the notebook
# introduction mentions codes 1 - 20 -- confirm which range is intended.
all_dataset = (
    [f'{URL}{code}' for code in range(1, 20)]
    + [URL + 'all']
    + ['https://cloud.culture.tw/frontsite/trans/SearchShowAction.do?method=doFindNewResidentTypeJ']
)
dataset_issue = 'https://cloud.culture.tw/frontsite/trans/SearchShowAction.do?method=doFindIssueTypeJ'

for dataset_url in tqdm(all_dataset):
    with urllib.request.urlopen(dataset_url) as response:
        data.extend(json.loads(response.read().decode()))

# The "issue" endpoint wraps its records under the 'issue' key.
with urllib.request.urlopen(dataset_issue) as response:
    data.extend(json.loads(response.read().decode())['issue'])

# Write out file (alternative Google Drive location kept for reference)
# with open('drive/MyDrive/2021-General/Taiwan_Concert_Analysis/campaign_dataset_raw.json', 'w') as f:
with open('campaign_dataset_raw.json', 'w') as raw_file:
    json.dump(data, raw_file)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:39<00:00,  1.88s/it]


# Save raw data to local

# Data cleaning
1. Remove duplicate UID
2. Flatten columns with JSON format into more columns

## Remove duplicate UID

In [3]:
def dropDuplicate(tmp_dataset:list, dataset:list, uid_set:set)->None:
    """
      Append to `dataset` every record of `tmp_dataset` whose 'UID' has not
      been seen yet; both `dataset` and `uid_set` are modified in place.

      Parameter:
        tmp_dataset: list of dictionary to scan, each having a 'UID' key
        dataset: list of dictionary receiving the first record of each UID
        uid_set: set of uid occurred so far
      Return:
        None
    """
    for record in tmp_dataset:
        uid = record['UID']
        if uid in uid_set:
            # A record with this UID was already kept; skip the repeat.
            continue
        uid_set.add(uid)
        dataset.append(record)

In [4]:
# Drop duplicated data: keep the first record seen for each UID
UID_set, data_noduplicate = set(), []
dropDuplicate(tmp_dataset=data, dataset=data_noduplicate, uid_set=UID_set)

# Write out file (alternative Google Drive location kept for reference)
# with open('drive/MyDrive/2021-General/Taiwan_Concert_Analysis/campaign_dataset_noduplicate.json', 'w') as f:
with open('campaign_dataset_noduplicate.json', 'w') as noduplicate_file:
    json.dump(data_noduplicate, noduplicate_file)

### Load from local file

In [863]:
# Load both JSON snapshots (raw and de-duplicated) back from local files
# data_source_raw =  'drive/MyDrive/2021-General/Taiwan_Concert_Analysis/campaign_dataset_raw.json'
data_source_raw = 'campaign_dataset_raw.json'
# data_source_noduplicate = 'drive/MyDrive/2021-General/Taiwan_Concert_Analysis/campaign_dataset_noduplicate.json'
data_source_noduplicate = 'campaign_dataset_noduplicate.json'

data_raw, data_noduplicate = [], []
with open(data_source_raw) as raw_file:
    data_raw = json.load(raw_file)
with open(data_source_noduplicate) as noduplicate_file:
    data_noduplicate = json.load(noduplicate_file)

# One DataFrame per snapshot; the printed tuples are (rows, columns)
df_raw = pd.json_normalize(data_raw)
df = pd.json_normalize(data_noduplicate)
for label, frame in (('Original amount of row:', df_raw), ('Processed amount of row:', df)):
    print(label, frame.shape)

Original amount of row: (7815, 21)
Processed amount of row: (3862, 21)


## Parse JSON to flatten columns and assign new data types
記得要寫函式檢驗showInfo的List中的是否只有一個元素
Advanced: 用正規表示法找string裡的資訊ex: 票價1800、2800、3800、4800、5800

In [864]:
# Determine which keys hold lists or dicts in the JSON records
# case 1: [{}]  list whose first element is a dict         -> key_listdict
# case 2: []    any other list, including an empty list     -> key_list
# case 3: {}    dict                                         -> key_dict
# A key can land in more than one set when its values differ between records
# (e.g. an empty list in one record and a list of dicts in another).
key_listdict = set()
key_list = set()
key_dict = set()
for record in data_noduplicate:
    for key, value in record.items():
        # No "already classified" guard is needed: set.add is idempotent, and
        # every record must be inspected because a key's shape may vary.
        if isinstance(value, list):
            if value and isinstance(value[0], dict):
                key_listdict.add(key)
            else:
                key_list.add(key)
        elif isinstance(value, dict):
            key_dict.add(key)

print(key_listdict)
print(key_list)
print(key_dict)

{'showInfo'}
{'subUnit', 'showInfo', 'masterUnit', 'otherUnit', 'supportUnit'}
set()


In [865]:
# Reference only: the bare string below documents the expected record schema
# (JSON field -> database type:Python type). It is not parsed by any code; as
# the last expression of the cell it is simply echoed as the cell output.
"""
All data columns and their dtype:
{'version': String:str,
 'UID': String:str,
 'title': String:str,
 'category': String:str,
 'showInfo': [
     { 'time': DateTime:datetime, # with HMS
       'location': String:str,
       'locationName': String:str,
       'onSales': Boolean:bool,
       'price': String:str,# but should be in [price1, price2...]
       'latitude': Float:float,
       'longitude': Float:float,
       'endTime': DateTime:datetime # with HMS
       }],
 'showUnit': String:str,
 'discountInfo': String:str,
 'descriptionFilterHtml': String:str,
 'imageUrl': String:str,
 'masterUnit': ARRAY(String):list,
 'subUnit': ARRAY(String):list,
 'supportUnit': ARRAY(String):list,
 'otherUnit': ARRAY(String):list,
 'webSales': String:str,
 'sourceWebPromote': String:str,
 'comment': String:str,
 'editModifyDate': DateTime:datetime, # with HMS
 'sourceWebName': String:str,
 'startDate': DateTime:datetime,
 'endDate': 'DateTime:datetime,
 'hitRate': Integer:int}
 """

"\nAll data columns and their dtype:\n{'version': String:str,\n 'UID': String:str,\n 'title': String:str,\n 'category': String:str,\n 'showInfo': [\n     { 'time': DateTime:datetime, # with HMS\n       'location': String:str,\n       'locationName': String:str,\n       'onSales': Boolean:bool,\n       'price': String:str,# but should be in [price1, price2...]\n       'latitude': Float:float,\n       'longitude': Float:float,\n       'endTime': DateTime:datetime # with HMS\n       }],\n 'showUnit': String:str,\n 'discountInfo': String:str,\n 'descriptionFilterHtml': String:str,\n 'imageUrl': String:str,\n 'masterUnit': ARRAY(String):list,\n 'subUnit': ARRAY(String):list,\n 'supportUnit': ARRAY(String):list,\n 'otherUnit': ARRAY(String):list,\n 'webSales': String:str,\n 'sourceWebPromote': String:str,\n 'comment': String:str,\n 'editModifyDate': DateTime:datetime, # with HMS\n 'sourceWebName': String:str,\n 'startDate': DateTime:datetime,\n 'endDate': 'DateTime:datetime,\n 'hitRate': Int

In [866]:
# flatten dict
def transformFlatten(data)->list:
    '''
    Flatten the list-valued fields of every record into a single-level dict.

    For each list-valued key the (possibly nested) list is walked depth-first:
      * dict items are merged into the output row, one output key per dict key
        (when several dicts share a key, the value of the first dict wins);
      * every other item is collected, in its original order, into a flat list
        stored under the original key; a key whose list yields no such items
        is omitted from the output row.
    Non-list values are copied unchanged. When 'showInfo' is empty or contains
    only falsy items, all showInfo fields are set to None so that every row
    exposes the same columns.

    Input: list of dict (the input rows are not modified)
    Return: list of dict, one flattened row per input row
    '''
    element_showInfo = ['time','location','locationName','onSales','price','latitude','longitude','endTime']
    new_data = []
    for row in data:
        new_row = {}
        for key, value in row.items():
            if isinstance(value, list):
                scalars, dicts = [], []
                stack = [value]
                while stack:
                    curr = stack.pop()
                    if isinstance(curr, list):
                        # Push children reversed so that pop() visits them in
                        # their original order (keeps e.g. otherUnit ordering).
                        stack.extend(reversed(curr))
                    elif isinstance(curr, dict):
                        dicts.append(curr)
                    else:
                        scalars.append(curr)
                # Merge later dicts first so the first dict wins on collisions.
                for nested in reversed(dicts):
                    new_row.update(nested)
                if scalars:
                    new_row[key] = scalars
            else:
                new_row[key] = value
            if (key == 'showInfo') and (not any(value)):
                for e in element_showInfo:
                    new_row[e] = None
        new_data.append(new_row)
    return new_data

In [867]:
# Flatten every de-duplicated record into a single-level dict
data_flat = transformFlatten(data_noduplicate)

In [868]:
# Generate columns
def to_dtype(element, dtype):
    """
    Convert one raw value to the requested target type.

    Parameter:
      element: raw value taken from a flattened record
      dtype: conversion selector, one of
        int / float   -> int(element) / float(element)
        'dt_detail'   -> datetime parsed with "%Y/%m/%d %H:%M:%S"
        'dt'          -> datetime parsed with "%Y/%m/%d"
        bool          -> True only when element == 'Y'
        str           -> text with every whitespace run collapsed to one space
        list          -> element returned unchanged
        None          -> no conversion requested, element returned unchanged
    Return:
      The converted value. None is returned when element is None or an
      iterable without any truthy item ('' or [] for instance). Any other
      unrecognised selector yields the string 'dtype not found!'.
    Raise:
      ValueError / TypeError from int(), float() or datetime.strptime() when
      the value does not fit the requested type.
    """
    # '' / [] / all-falsy iterables are treated as missing values.
    if isinstance(element, Iterable) and not any(element):
        return None
    if element is None:
        return None
    # No target type: keep the value instead of overwriting real data with the
    # 'dtype not found!' marker.
    if dtype is None:
        return element
    if dtype == int:
        return int(element)
    if dtype == float:
        return float(element)
    if dtype == 'dt_detail':
        return datetime.strptime(element, "%Y/%m/%d %H:%M:%S")
    if dtype == 'dt':
        return datetime.strptime(element, "%Y/%m/%d")
    if dtype == bool:
        return element == 'Y'
    if dtype == str:
        # split() already breaks on '\r' and '\n' as well as spaces.
        return " ".join(element.split())
    if dtype == list:
        return element
    return 'dtype not found!'

def transformType(data:list)->list:
    """
    Build a new list of rows in which every value went through to_dtype.

    The target type is chosen per key from a fixed table (hitRate -> int,
    time/endTime/editModifyDate -> 'dt_detail', startDate/endDate -> 'dt',
    onSales -> bool, latitude/longitude -> float). Keys outside the table use
    str for string values and list for list values; for any other value
    to_dtype is called with dtype None.

    Parameter:
      data: list of dictionary (flattened records); not modified
    Return:
      list of dictionary with the same keys and converted values
    """
    dtype_by_key = {
        'hitRate': int,
        'time': 'dt_detail',
        'endTime': 'dt_detail',
        'editModifyDate': 'dt_detail',
        'startDate': 'dt',
        'endDate': 'dt',
        'onSales': bool,
        'latitude': float,
        'longitude': float,
    }

    new_data = []
    for row in data:
        converted = {}
        for key, value in row.items():
            target_type = dtype_by_key.get(key)
            if target_type is None:
                # Key has no fixed type: fall back on the value's own type.
                if isinstance(value, str):
                    target_type = str
                elif isinstance(value, list):
                    target_type = list
            converted[key] = to_dtype(value, target_type)
        new_data.append(converted)
    return new_data

In [869]:
# Convert the flattened records' values to their target Python types.
# NOTE: this rebinds `data`, which held the raw fetched records in the fetch cell.
data = transformType(data_flat)

In [870]:
# import copy
# data = copy.deepcopy(data_noduplicate)
# changeColumnType(data)

In [871]:
# Inspect one flattened record (index 1) to eyeball the output of transformFlatten
data_flat[1]

{'version': '1.4',
 'UID': '60315cf3d083a396f8aed7f3',
 'title': '街頭藝人-1月三坑生態公園',
 'category': '1',
 'time': '2021/01/02 12:00:00',
 'location': '桃園市龍潭區',
 'locationName': '龍潭區（桃園市）=',
 'onSales': 'N',
 'price': '',
 'latitude': None,
 'longitude': None,
 'endTime': '2021/12/31 15:00:00',
 'showUnit': '(中華民國)微笑二人組;(中華民國)潘穎文;(中華民國)真薩皮樂團;(中華民國)黃政魁',
 'discountInfo': '',
 'descriptionFilterHtml': '微笑二人組：古典吉他演奏\r\n潘穎文：薩克斯風吹奏\r\n劉政魁：薩克斯風吹奏',
 'imageUrl': '',
 'masterUnit': ['桃園市龍潭區公所'],
 'webSales': '',
 'sourceWebPromote': '',
 'comment': '',
 'editModifyDate': '',
 'sourceWebName': '全國藝文活動資訊系統',
 'startDate': '2021/01/02',
 'endDate': '2021/12/31',
 'hitRate': 3}

In [872]:
import re
def finder(regex, text):
    """
    Find every match of `regex` in `text` and strip the matched spans.

    Parameter:
      regex: regular expression pattern
      text: string to search
    Return:
      (matches, new_text): `matches` is the result of re.findall(regex, text);
      `new_text` is `text` with every character covered by a match removed.
    """
    matches = re.findall(regex, text)
    kept_parts = []
    cursor = 0
    # finditer yields non-overlapping matches from left to right, so keeping
    # the slices between consecutive spans removes exactly the matched text.
    for hit in re.finditer(regex, text):
        start, end = hit.span()
        kept_parts.append(text[cursor:start])
        cursor = end
    kept_parts.append(text[cursor:])
    return matches, "".join(kept_parts)

def _parsePrices(price_text):
    """Extract the sorted list of prices (> 9) found in free-form price text.

    Returns [0] when no price is found but the leftover text mentions '免費'
    (free), and None when nothing can be derived.
    """
    # Patterns are applied in order and each one consumes its matches, so later
    # patterns only see text that earlier ones left behind:
    #   1,000 / 500,800 -> "$200" -> dates such as 10/31 -> "100/" -> plain numbers
    handle_regex = [r'\d+[,]\d+', r'(?<=[$])\d+', r'\d+[/]\d+', r'\d+[/]', r'\d{2,}(?=[; 元、.，↑（,])']
    prices = set()
    text = price_text + ' '  # trailing space lets the last pattern match a final number
    for pattern in handle_regex:
        found, text = finder(pattern, text)
        for token in found:
            if ',' in token:
                digits = token.replace(',', '')
                if len(digits) > 5:
                    # Too long for one amount: treat the comma as a separator.
                    prices.update(int(part) for part in token.split(','))
                else:
                    # Thousands separator, e.g. "1,000".
                    prices.add(int(digits))
            elif '/' in token:
                # "100/" (price per unit) is kept; "10/31" (a date) is only
                # consumed so that it is not mistaken for prices later.
                if token.endswith('/'):
                    prices.add(int(token[:-1]))
            elif int(token) < 100000:
                prices.add(int(token))

    result = sorted(price for price in prices if price > 9)
    if result:
        return result
    return [0] if re.search('免費', text) else None


def _parseCityRegion(total_location):
    """Return (city, region): the first two distinct administrative names found, else None."""
    names = re.findall(r'[^ 0-9]{1,2}[區鄉鎮市縣]', total_location)
    distinct = list(dict.fromkeys(names))  # de-duplicate, keep first-seen order
    city = distinct[0] if distinct else None
    region = distinct[1] if len(distinct) >= 2 else None
    return city, region


def transformDetail(data:list)->list:
    """
    Derive detail columns for every row and return the enriched copies.

    Added / rewritten keys:
      priceinfo: the original value of 'price'
      price: sorted list of integer prices parsed from priceinfo, [0] when the
             text only mentions '免費', otherwise None
      city, region: first and second distinct administrative names found in
             'location' + 'locationName' ('台' is normalised to '臺'), else None
      isOnline: True when the combined location text contains '線上'

    Parameter:
      data: list of dictionary; rows are copied, not modified. A missing
            'price', 'location' or 'locationName' key is treated as None.
    Return:
      list of dictionary
    """
    new_data = []
    for row in data:
        new_row = dict(row)

        # Handle detail of price
        price_text = new_row.get('price')
        new_row['priceinfo'] = price_text
        new_row['price'] = None if price_text is None else _parsePrices(price_text)

        # Handle county, city
        value_addr = new_row.get('location') or ''
        value_name = new_row.get('locationName') or ''
        total_location = " ".join([value_addr, value_name]).replace('台','臺')
        new_row['city'], new_row['region'] = _parseCityRegion(total_location)
        # TODO: missing latitude/longitude could be completed with a geocoding
        # API, or by searching keywords from the coordinates.

        # Handle Online
        new_row['isOnline'] = bool(re.search('線上', total_location))

        new_data.append(new_row)
    return new_data

In [873]:
# Derive priceinfo / price / city / region / isOnline for every row.
# Built from data_flat rather than from `data` itself so that re-running this
# cell is idempotent: feeding already-enriched rows back into transformDetail
# would treat the parsed price list as price text and fail.
data = transformDetail(transformType(data_flat))

In [874]:
# # process order:
# # $200
# # 1,000; 2000; 3000,500,800元
# # 10/31
# # 100/人
# # \d{2,}(?=[; 元、.，↑（])
# # 免費
# handle_regex = ['(?<=[$])\d+', '\d+[,]\d+','\d+[/]\d+','\d+[/]','\d{2,}(?=[; 元、.，↑（,])']
# text = '100/人材料1,150元費:999元1000↑實體$3400請洽C5、NT$400（3800、2,000元、1,000元+100;12次學生或6歲65歲8折優惠300, 500,800,1200元或85折@ojc3045ew團票20張25人10/12～11/30、週五10:00在921地震115公分全票300 '
# for ex in handle_regex:
#     result, text = finder(ex, text)
#     print(result, '\n', text)
#     print('-----------------------')

In [875]:
# handle_regex = ['(?<=[$])\d+[,]*\d+', '\d+[,]\d+','\d+[/]\d+','\d+[/]','\d{2,}(?=[; 元、.，↑（,])']
# text = '$2,680元100/人材料1,150元費:999元1000↑實體$3400請洽C5、NT$400（3800、2,000元、1,000元+100;12次學生或6歲65歲8折優惠300, 500,800,1200元或85折@ojc3045ew團票20張25人10/12～11/30、週五10:00在921地震115公分全票300 '
# ans = set()
# for ex in regex_list:
#     result, text = finder(ex, text)
#     print(result, text)
#     for e in result:
#         if ',' in e:
#             if len("".join(e.split(','))) > 5:
#                 for i in e.split(','):
#                     ans.add(int(i))
#             else:
#                 ans.add(int("".join(e.split(','))))
#         elif '/' in e:
#             if re.search('\d+[/]$',e):
#                 ans.add(int(e.split('/')[0]))
#         else:
#             ans.add(int(e))
#     print(ans)
# print(sorted(list(ans)))

# Push data into RDBMS (PostgreSQL)

In [880]:
# send JSON to Postgres
from sqlalchemy import Table, Column, MetaData, ForeignKey, create_engine
from sqlalchemy import Integer, String, Date, DateTime, ARRAY, Boolean, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import mapper
from sqlalchemy.dialects.postgresql import JSONB

def connect(username, password, database, host='localhost', port=5432):
    """
    Create the SQLAlchemy objects used to talk to a PostgreSQL database.

    Parameter:
      username, password: database credentials
      database: database name
      host: server host name
      port: server port
    Return:
      (engine, meta, base, session): the Engine, a MetaData bound to it, a
      declarative base class, and a sessionmaker factory bound to the engine.
    """
    # NOTE(review): credentials are interpolated verbatim; a password containing
    # URL-special characters such as '@' or '/' would need escaping first.
    engine = create_engine(f'postgresql+psycopg2://{username}:{password}@{host}:{port}/{database}')
    base = declarative_base()
    session = sessionmaker(bind=engine)
    # NOTE(review): MetaData(bind=...) is rejected by newer SQLAlchemy releases --
    # confirm the installed version before upgrading.
    meta = MetaData(bind=engine)

    return engine, meta, base, session

In [881]:
import os

# Connection settings are read from the environment when available; the
# literals are fallbacks that keep the previous behaviour.
# FIXME(security): the password fallback is a credential committed to the
# notebook -- set CAMPAIGN_DB_PASSWORD, rotate the password, then drop the literal.
engine, meta, Base, Session = connect(
    os.environ.get('CAMPAIGN_DB_USER', 'yugachang'),
    os.environ.get('CAMPAIGN_DB_PASSWORD', 'yuga'),
    os.environ.get('CAMPAIGN_DB_NAME', 'campaigndb'),
)

In [883]:
# Column names and their SQLAlchemy types, kept as two parallel lists:
# column_type[i] is the type of column_name[i], so both lists must stay the
# same length and in the same order.
column_name = ['UID', 'version','title','category', 'showUnit','discountInfo',\
               'descriptionFilterHtml','imageUrl','masterUnit','subUnit','supportUnit','otherUnit',\
               'webSales', 'sourceWebPromote', 'comment', 'editModifyDate', 'sourceWebName', \
               'startDate', 'endDate', 'hitRate', 'time', 'location', 'locationName', 'onSales',\
               'price', 'latitude', 'longitude', 'endTime', 'priceinfo', 'city', 'region', 'isOnline']
column_type = [String, String, String, String, String, String, \
               String, String, ARRAY(String),ARRAY(String),ARRAY(String),ARRAY(String),\
               String, String, String, DateTime, String,\
               DateTime, DateTime, Integer, DateTime, String, String, Boolean,\
               ARRAY(Integer), Float, Float, DateTime, String, String, String, Boolean]


class Campaign(Base):
    """ORM model for table 'campaign_all'.

    Only the surrogate primary key is declared here; the data columns are
    attached right after the class definition from column_name / column_type.
    """
    __tablename__ = 'campaign_all'
    id = Column(Integer, primary_key = True, autoincrement=True)
    
# Attach one Column per (name, type) pair to the mapped class.
for name, dtype in zip(column_name, column_type):
    setattr(Campaign, name, Column(dtype))
    
# Create the tables registered on Base.metadata in the connected database.
Base.metadata.create_all(engine)

In [884]:
# Insert every processed row as one Campaign record in a single transaction.
# NOTE: re-running this cell inserts the same rows again (no uniqueness check).
session = Session()
try:
    session.add_all(Campaign(**row) for row in data)
    session.commit()
except Exception:
    # Leave the database untouched when any row fails.
    session.rollback()
    raise
finally:
    # Release the connection whether or not the insert succeeded.
    session.close()

In [None]:
# Number of raw rows per UID; a count above 1 means the UID was fetched more than once
df_raw.groupby('UID').size()

In [None]:
# Show all column names of the de-duplicated frame as a plain list
df.columns.tolist()

In [None]:
# Rows whose supportUnit list is empty. Comparing the column with `== []`
# raises "Lengths must match to compare", so test the element length instead.
df[df['supportUnit'].str.len() == 0]

In [885]:
# Keep only the rows whose title is shared by at least 4 rows
df.groupby('title').filter(lambda group: len(group) >= 4)

Unnamed: 0,version,UID,title,category,showInfo,showUnit,discountInfo,descriptionFilterHtml,imageUrl,masterUnit,...,supportUnit,otherUnit,webSales,sourceWebPromote,comment,editModifyDate,sourceWebName,startDate,endDate,hitRate
1228,1.4,616b233dd083a38f6c36c7d8,南投縣熊愛閱讀說情畫意110-1期縣長獎得獎作品巡迴展,6,"[{'time': '2021/12/01 08:30:00', 'location': '...",(中華民國)南投縣政府文化局,,南投縣政府文化局與18度C文化基金會合作辦理「熊愛閱讀」閱讀護照計畫，自108年7月1日起開...,,[南投縣政府文化局],...,[],"[(合辦)財團法人18℃文化基金會, (指導)南投縣政府]",,https://www.nthcc.gov.tw/,,,全國藝文活動資訊系統,2021/12/01,2021/12/31,0
1229,1.4,616b233dd083a38f6c36c7d9,南投縣熊愛閱讀說情畫意110-1期縣長獎得獎作品巡迴展,6,"[{'time': '2022/01/01 08:30:00', 'location': '...",(中華民國)南投縣政府文化局,,南投縣政府文化局與18度C文化基金會合作辦理「熊愛閱讀」閱讀護照計畫，自108年7月1日起開...,,[南投縣政府文化局],...,[],"[(合辦)財團法人18℃文化基金會, (指導)南投縣政府]",,https://www.nthcc.gov.tw/,,,全國藝文活動資訊系統,2022/01/01,2022/01/10,0
1230,1.4,616b233dd083a38f6c36c7da,南投縣熊愛閱讀說情畫意110-1期縣長獎得獎作品巡迴展,6,"[{'time': '2022/01/12 08:00:00', 'location': '...",(中華民國)南投縣政府文化局,,南投縣政府文化局與18度C文化基金會合作辦理「熊愛閱讀」閱讀護照計畫，自108年7月1日起開...,,[南投縣政府文化局],...,[],"[(合辦)財團法人18℃文化基金會, (指導)南投縣政府]",,https://www.nthcc.gov.tw/,,,全國藝文活動資訊系統,2022/01/12,2022/03/17,2
1231,1.4,616b233dd083a38f6c36c7db,南投縣熊愛閱讀說情畫意110-1期縣長獎得獎作品巡迴展,6,"[{'time': '2022/03/19 08:00:00', 'location': '...",(中華民國)南投縣政府文化局,,南投縣政府文化局與18度C文化基金會合作辦理「熊愛閱讀」閱讀護照計畫，自108年7月1日起開...,,[南投縣政府文化局],...,[],"[(合辦)財團法人18℃文化基金會, (指導)南投縣政府]",,https://www.nthcc.gov.tw/,,,全國藝文活動資訊系統,2022/03/19,2022/05/31,0
1496,1.4,618d6a42d083a37aa0dcae03,「人在六堆」戶外影像藝術展,6,"[{'time': '2021/09/17 09:00:00', 'location': '...",(中華民國)劉安明;(中華民國)李秀雲,,結合當代藝術策展人、攝影家與藝術家的不同視野，並連結六堆各堆的文化地景，創作了9座戶外影像藝...,,[屏東縣政府],...,[],[(指導)客家委員會],,,,,全國藝文活動資訊系統,2021/09/17,2021/12/31,4
1498,1.4,618d6a42d083a37aa0dcae05,「人在六堆」戶外影像藝術展,6,"[{'time': '2021/09/17 06:00:00', 'location': '...",(中華民國)劉安明;(中華民國)李秀雲,,結合當代藝術策展人、攝影家與藝術家的不同視野，並連結六堆各堆的文化地景，創作了9座戶外影像藝...,,[屏東縣政府],...,[],[(指導)客家委員會],,,,,全國藝文活動資訊系統,2021/09/17,2021/12/31,3
1630,1.4,6196a4e6d083a37aa0dcb19d,「人在六堆」戶外影像藝術展,6,"[{'time': '2022/01/01 06:00:00', 'location': '...",(中華民國)劉安明;(中華民國)李秀雲,,結合當代藝術策展人、攝影家與藝術家的不同視野，並連結六堆各堆的文化地景，創作了9座戶外影像藝...,,[屏東縣政府],...,[],[(指導)客家委員會],,,,,全國藝文活動資訊系統,2022/01/01,2022/03/31,3
1631,1.4,6196a4e6d083a37aa0dcb1a2,「人在六堆」戶外影像藝術展,6,"[{'time': '2021/06/17 06:00:00', 'location': '...",(中華民國)陳昌仁,,展出主題-先鋒堆<傘立方舟>\r\n\r\n為紀念六堆300年，以建構六堆文化之意象，邀請策...,,[屏東縣政府],...,[],[(指導)客家委員會],,,,,全國藝文活動資訊系統,2021/06/17,2021/12/31,1


# Pull data from database

# Analyze data

In [149]:
# Preview the first 10 rows of the de-duplicated frame
df.head(10)

Unnamed: 0,version,UID,title,category,showInfo,showUnit,discountInfo,descriptionFilterHtml,imageUrl,masterUnit,...,supportUnit,otherUnit,webSales,sourceWebPromote,comment,editModifyDate,sourceWebName,startDate,endDate,hitRate
0,1.4,5f906d3fd083a35edcf6bece,2021黃詩雅小提琴獨奏會-唯美的音符溫度,1,"[{'time': '2022/04/03 19:30:00', 'location': '...",,,,,[],...,[],[],https://ticket.com.tw/Application/UTK02/UTK020...,https://ticket.com.tw/Application/UTK02/UTK020...,,,年代,2022/04/03,2022/04/03,134
1,1.4,60315cf3d083a396f8aed7f3,街頭藝人-1月三坑生態公園,1,"[{'time': '2021/01/02 12:00:00', 'location': '...",(中華民國)微笑二人組;(中華民國)潘穎文;(中華民國)真薩皮樂團;(中華民國)黃政魁,,微笑二人組：古典吉他演奏\r\n潘穎文：薩克斯風吹奏\r\n劉政魁：薩克斯風吹奏,,[桃園市龍潭區公所],...,[],[],,,,,全國藝文活動資訊系統,2021/01/02,2021/12/31,3
2,1.4,6037f493d083a396f8aed9bd,詩弦胡琴樂團音樂會,1,"[{'time': '2022/01/09 13:00:00', 'location': '...",(中華民國)詩弦胡琴樂團老師暨全體學員,,二胡班成果發表\r\n上半場曲目：\r\n1.春吟2.天空之城3.阿美族舞曲4.三吋天堂5....,,[],...,[],[],,,,,全國藝文活動資訊系統,2022/01/09,2022/01/09,2
3,1.4,605339fed083a370c878dae8,全本音樂劇《貓》CATS,1,"[{'time': '2022/02/10 19:30:00', 'location': '...",,,,,[],...,[],[],https://kham.com.tw/application/UTK02/UTK0201_...,https://kham.com.tw/application/UTK01/UTK0101_...,,,寬宏售票,2022/02/10,2022/03/20,677
4,1.4,605cc565d083a33764cdc059,寶吉祥交響樂團-春之協奏曲音樂會,1,"[{'time': '2022/01/05 19:30:00', 'location': '...",,,,,[],...,[],[],https://ticket.com.tw/Application/UTK02/UTK020...,https://ticket.com.tw/Application/UTK02/UTK020...,,,年代,2022/01/05,2022/01/05,52
5,1.4,6075ecb7d083a3a724cd58d2,2021苗栗縣原住民族賽夏假日藝文系列活動：街頭藝人展演（110年12月，1場次）,1,"[{'time': '2021/12/25 14:20:00', 'location': '...",(中華民國)梁智凱（阿智Zpower）,,苗栗縣府原住民族事務中心規劃「原住民族假日藝文系列活動」，活動期間自農曆新年初一至初五，並延...,,[苗栗縣政府原住民族事務中心],...,[],[(指導)苗栗縣政府],,https://www.miaoli.gov.tw/,,,全國藝文活動資訊系統,2021/12/25,2021/12/25,65
6,1.4,607b32ced083a3a724cd5b0a,鐵花村音樂聚落【唱作聚家】12月,1,"[{'time': '2021/12/01 20:00:00', 'location': '...",(中華民國)台東鐵花村,,🎤鐵花村的唱作聚家🎤\r\n這是一個鐵花村禮拜三固定的音樂節目，\r\n在晚上七點鐘會開放六...,,[台東鐵花村],...,[],[],,http://www.tiehua.com.tw/calendar.php?month=-1...,,,全國藝文活動資訊系統,2021/12/01,2021/12/29,25
7,1.4,607d50ddd083a37388433308,2021正港雄有戲-嵬舞劇場舞蹈團《野獸的咆哮》,1,"[{'time': '2022/07/01 19:30:00', 'location': '...",高雄市政府文化局,,,,[],...,[],[],https://www.opentix.life/program/1331221006681...,https://www.opentix.life/program/1331221006681...,,,OPENTIX兩廳院文化生活,2022/07/01,2022/07/03,52
8,1.4,607d50dfd083a37388433349,2021KSAF春藝小劇場《幸福代理人》都會暖心音樂劇,1,"[{'time': '2022/03/25 19:30:00', 'location': '...",高雄市政府文化局,,,,[],...,[],[],https://www.opentix.life/program/1357266706820...,https://www.opentix.life/program/1357266706820...,,,OPENTIX兩廳院文化生活,2022/03/25,2022/03/27,121
9,1.4,607d50e0d083a37388433365,為偏鄉醫療多走一里路音樂會,1,"[{'time': '2022/05/14 19:30:00', 'location': '...",,,,,[],...,[],[],https://www.opentix.life/program/1364491804291...,https://www.opentix.life/program/1364491804291...,,,OPENTIX兩廳院文化生活,2022/05/14,2022/05/14,68


In [None]:
# # select row not null
# df[df.price.notnull()]
# df[df.price.isna()]
# df[df.price.notna()]

# # 各縣市各區域舉辦各類活動count
# # 各縣市免費場次count
# # 各縣市

In [None]:
# Keep only the rows whose title is shared by at least 4 rows
df.groupby('title').filter(lambda group: len(group) >= 4)

In [None]:
# df[df['price'].str.len()>1]