In [25]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.display import Image

import os, sys, re, datetime
from pathlib import Path

# Project layout, resolved relative to the kernel's working directory.
# NOTE(review): assumes the notebook runs from a folder directly below the project root — confirm.
# `Path.cwd().parent` also works when the working directory is the filesystem root,
# where `parents[0]` would raise IndexError.
pj_dir = Path.cwd().parent
data_dir = pj_dir / 'data'
img_dir = pj_dir / 'images'
src_dir = pj_dir / 'src'

# Make modules under src/ importable. The guard keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))

from matplotlib import pyplot as plt
plt.style.use("bmh")
import numpy as np
import pandas as pd
import dask.dataframe as dd
from scipy import stats 
import sqlalchemy as  sa
from google.cloud import bigquery

from tqdm import tqdm
from dotenv import load_dotenv

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
from itertools import product

In [65]:
# Load the talk list CSV from the project's data directory.
df = pd.read_csv(data_dir.joinpath('pyconjp/talklist_20180907.csv'))

In [66]:
# Short room id -> room name.
id_room_ref = {
    'ra': 'A+B会議室',
    'rb': '小展示ホール',
    'rc': '特別会議室',
    'rd1': 'コンベンションホール',
    'rd2': 'コンベンションホール 梅',
    'rd3': 'コンベンションホール 鶯',
    're': 'C会議室',
    'rf': 'D会議室',
}

# Reverse lookup: room name -> short room id (room names are unique, so nothing collapses).
room_id_ref = dict(zip(id_room_ref.values(), id_room_ref.keys()))

In [67]:
# Extra alias: the shortened room name resolves to the same id ('rb') as '小展示ホール'.
# NOTE(review): presumably the CSV uses this shortened spelling for some rows — confirm.
room_id_ref.update({'小展示': 'rb'})

In [68]:
# Every distinct (day, time slot) pair present in the talk list.
timelist = [*df.groupby(['発表日', '時間']).groups]

In [69]:
# Session number within its day, keyed by the day label concatenated
# (without a separator) with the time-slot label.
tt_no_dict = {
    # day 1
    '1日目13:30 - 14:15': 1,
    '1日目14:30 - 15:15': 2,
    '1日目15:45 - 16:15': 3,
    '1日目16:30 - 17:00': 4,
    # day 2
    '2日目11:15 - 12:00': 1,
    '2日目13:30 - 14:15': 2,
    '2日目14:30 - 15:00': 3,
    '2日目15:45 - 16:15': 4,
    '2日目16:30 - 17:00': 5,
}

In [70]:
# Talk format value per slot, keyed the same way as the session-number table
# (day label + time-slot label, no separator).
# NOTE(review): the values match the slot lengths, so presumably minutes — confirm.
talk_format_dict = {
    # day 1
    '1日目13:30 - 14:15': 45,
    '1日目14:30 - 15:15': 45,
    '1日目15:45 - 16:15': 30,
    '1日目16:30 - 17:00': 30,
    # day 2
    '2日目11:15 - 12:00': 45,
    '2日目13:30 - 14:15': 45,
    '2日目14:30 - 15:00': 30,
    '2日目15:45 - 16:15': 30,
    '2日目16:30 - 17:00': 30,
}

In [71]:
# Session number of each talk within its day, looked up from day label + time-slot label.
# `.map` is the direct whole-value lookup. Unlike `.replace`, it does not depend on the
# object -> int downcast that pandas 2.2 deprecates, and a label missing from `tt_no_dict`
# becomes NaN instead of silently staying a string in a numeric column.
df['no'] = (df['発表日'] + df['時間']).map(tt_no_dict)
assert df['no'].notna().all(), 'day/time slot missing from tt_no_dict'

In [72]:
# Translate each room name into its short room id; names that have no entry in
# `room_id_ref` are kept as they are.
df['room_id'] = df['部屋'].replace(to_replace=room_id_ref)

In [73]:
# Numeric conference day derived from the day label.
# `.map` avoids the object -> int downcast that `.replace` relies on (deprecated in
# pandas 2.2) and turns an unexpected label into NaN rather than leaving a string behind.
df['day'] = df['発表日'].map({'1日目': 1, '2日目': 2})
assert df['day'].notna().all(), 'unexpected day label in 発表日'

In [74]:
# Display the column labels of df after adding 'no', 'room_id' and 'day'.
df.columns

Index(['id', '名前', 'プロフィール/Your Profile', 'タイトル', 'トークの概要/Talk abstract',
       'audience_level', '発表資料の言語/Language of presentation materials',
       '発表で使用する言語/Presentation language', '発表日', '時間', '部屋', '階数',
       'description', 'tag', 'location', 'bio', 'organization', 'no',
       'room_id', 'day'],
      dtype='object')

In [75]:
# Slide-language answer as it appears in the CSV -> short language code.
lang_of_slide_dict = dict([
    ('日本語のみ/Japanese Only', 'ja'),
    ('English Only', 'en'),
    ('日本語と英語/Japanese and English', 'ja_en'),
])

In [76]:
# Spoken-language answer as it appears in the CSV -> short language code.
lang_of_talk_dict = dict([
    ('日本語/Japanese', 'ja'),
    ('英語/English', 'en'),
])

In [77]:
# English-named copies of the source columns (the originals are left in place).
df['name'] = df['名前']
df['room'] = df['部屋']
df['floor'] = df['階数']
df['title'] = df['タイトル']
df['abstract'] = df['トークの概要/Talk abstract']  # was assigned twice; once is enough
df['profile'] = df['プロフィール/Your Profile']

# Talk format looked up from day label + time-slot label.
# `.map` avoids the object -> int downcast that `.replace` relies on (deprecated in
# pandas 2.2); a slot missing from `talk_format_dict` becomes NaN and trips the assert.
df['talk_format'] = (df['発表日'] + df['時間']).map(talk_format_dict)
assert df['talk_format'].notna().all(), 'day/time slot missing from talk_format_dict'

# Language answers -> short codes; answers without a dictionary entry are kept as they are.
df['lang_of_slide'] = df['発表資料の言語/Language of presentation materials'].replace(lang_of_slide_dict)
df['lang_of_talk'] = df['発表で使用する言語/Presentation language'].replace(lang_of_talk_dict)
df['tags'] = df['tag']

# 二日目のコンベンションルーム対応

In [78]:
# Number of talks per session number (both days combined).
df['no'].value_counts()

3    13
2    13
4    12
1    12
5     6
Name: no, dtype: int64

In [79]:
# Index label of the first row for day 2, session 1 in room id "rd1".
# The variable name (sic) is kept because the next cell reads it.
taget_index = df.loc[
    (df['day'] == 2) & (df['no'] == 1) & (df['room_id'] == 'rd1')
].index.values[0]

In [80]:
# Move that single talk from room id 'rd1' to 'rd2'.
df.at[taget_index, 'room_id'] = 'rd2'

In [81]:
# Display the column labels again, now including the English-named columns.
df.columns

Index(['id', '名前', 'プロフィール/Your Profile', 'タイトル', 'トークの概要/Talk abstract',
       'audience_level', '発表資料の言語/Language of presentation materials',
       '発表で使用する言語/Presentation language', '発表日', '時間', '部屋', '階数',
       'description', 'tag', 'location', 'bio', 'organization', 'no',
       'room_id', 'day', 'name', 'room', 'floor', 'title', 'abstract',
       'profile', 'talk_format', 'lang_of_slide', 'lang_of_talk', 'tags'],
      dtype='object')

In [82]:
# Columns written to the exported CSV, in output order.
cols = [
    'id',
    'name',
    'title',
    'abstract',
    'talk_format',
    'profile',
    'room',
    'day',
    'no',
    'room_id',
    'lang_of_slide',
    'lang_of_talk',
    'audience_level',
    'tags',
    'description',
]

In [83]:
# Write the selected columns to CSV, omitting the DataFrame index.
df.loc[:, cols].to_csv(data_dir.joinpath('pyconjp/talkApiData.csv'), index=False)

In [86]:
# Human-readable slot label: day label, a single space, then the time-slot label.
df['date'] = df['発表日'].add(' ').add(df['時間'])

In [87]:
# Distinct slot labels, in order of first appearance.
dates = df['date'].drop_duplicates().tolist()

In [90]:
# Display the list of distinct slot labels.
dates

['1日目 13:30 - 14:15',
 '1日目 14:30 - 15:15',
 '1日目 15:45 - 16:15',
 '1日目 16:30 - 17:00',
 '2日目 11:15 - 12:00',
 '2日目 13:30 - 14:15',
 '2日目 14:30 - 15:00',
 '2日目 15:45 - 16:15',
 '2日目 16:30 - 17:00']

In [91]:
# Pick the first slot label as the example slot for the lookup below.
date_ = dates[0]

In [94]:
# Spoken language of every talk scheduled in the selected slot.
df.loc[df['date'] == date_, 'lang_of_talk']

4     en
12    ja
26    ja
43    ja
44    en
51    ja
Name: lang_of_talk, dtype: object