In [2]:
# pip install pdf2image

In [3]:
# pip show pdf2image

In [4]:
import os
import pathlib
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
import sys
import pyocr
import pyocr.builders
import pathlib
import glob

In [1]:
# pip install pyocr

In [5]:
def pdf_to_image():
    # poppler/binを環境変数Pathに追加する(一時的に)
    # Path("__file__").parent.resolve()で.pyファイルの親フォルダ絶対パスを返す
    poppler_dir = pathlib.Path("__file__").parent.resolve() / "poppler/bin"
    # pathsepは環境変数に追加するときの区切り；
    os.environ["PATH"] += os.pathsep + str(poppler_dir)

    # PDFファイルのパス
    pdf_dir = pathlib.Path('./pdf_file')
    # globでディレクトリ内のpdfファイルをリストで取得
    pdf_path = list(pdf_dir.glob('**/*.pdf'))

    # PDF -> Imageに変換(200dpi)
    pages = convert_from_path(str(pdf_path[0]))

    # 画像ファイルを１ページづつ保存
    image_dir = pathlib.Path("./image_file")
    for i, page in enumerate(pages):  # enumerate関数でpagesのpage数を取得
        # .stemでpathの末尾を表示（pathlib)
        file_name = pdf_path[0].stem + "_{:02d}".format(i + 1) + ".jpeg"
        image_path = image_dir / file_name
        # JPEGで保存
        page.save(str(image_path), "JPEG")

In [9]:
pdf_to_image()

In [10]:
def image_ocr():

    # tesseract-OCRのパスを通す
    tessera_path = "C:\Program Files\Tesseract-OCR"
    # pathsepは環境変数に追加するときの区切り；
    os.environ["PATH"] += os.pathsep + str(tessera_path)

    tools = pyocr.get_available_tools()
    if len(tools) == 0:
        print("No OCR tool found")
        sys.exit(1)  # 引数1は終了ステータスで１を返す

    tool = tools[0]

    # ocr対象のファイルがあるディレクトリ
    image_dir = pathlib.Path('./image_file')
    # globでディレクトリ内のjpegファイルをリストで取得
    jpg_path = list(image_dir.glob('**/*.jpeg'))

    for i in jpg_path:
        # ocrした内容を変数txtにする
        txt = tool.image_to_string(
        Image.open(str(i)),
        lang="jpn",
        builder=pyocr.builders.TextBuilder(tesseract_layout=6)
        )
        # 変数txtをtxt_fileディレクトリにtxtファイルで保存
        with open('./txt_file/' + str(i.stem) + '.txt', mode='wt') as t:
            t.write(txt)

In [12]:
image_ocr()

In [16]:
import pandas as pd

In [6]:
# pip install pandas

In [22]:
col_names = [i for i in range(10)]

play_by_play1 = pd.read_csv('./txt_file/rise_ibm_05.txt', names=col_names, encoding="shift-jis")

In [24]:
play_by_play1.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,IBM BigBlue(BB) vs ノジマ相模原ライズ(NR),,,,,,,,,
1,2020/11/23(月) 会場 : 富士通スタジアム川崎,,,,,,,,,
2,Play by Play ...,,,,,,,,,
3,NR 35 Kick-off 、#3 鈴木 健太 キック …BB 14,#35 Gamboa Herbert 17yリターン(#1 藤本 遼),,,,,,,,
4,IBM BigBlue 12:00,,,,,,,,,
5,1&10-BB31 R PASS #3 Kevin Craft パス失敗,,,,,,,,,
6,2&10-BB31 R PASS 、#3 Kevin Craft 一 #85 鈴木...,,,,,,,,,
7,1&10-BB42 L PASS #3 Kevin Craft 一 #14 遠藤...,,,,,,,,,
8,2&3 -BB49 R PASS 、#3 Kevin Craft 一 #85 鈴木...,,,,,,,,,
9,1&10-NR41 L RUN #7 山中 大輔 3ッヶラン(#5 田中 喜貴),,,,,,,,,


In [7]:
path = './txt_file/rise_ibm_05.txt'

In [8]:
with open(path) as f:
    lines = f.readlines()

In [9]:
lines_strip = [line.strip() for line in lines]

In [10]:
l_XXX = [line for line in lines_strip if ('&' in line) or ('Penalty' in line) or ('Kick-off' in line)]

In [11]:
remove_space = [line.replace(' ', '') for line in l_XXX]

In [12]:
remove_space

['NR35Kick-off、#3鈴木健太キック…BB14,#35GamboaHerbert17yリターン(#1藤本遼)',
 '1&10-BB31RPASS#3KevinCraftパス失敗',
 '2&10-BB31RPASS、#3KevinCraft一#85鈴木隆中11yパスP1',
 '1&10-BB42LPASS#3KevinCraft一#14遠藤健史7yパス',
 '2&3-BB49RPASS、#3KevinCraft一#85鈴木隆貴10yパスP2',
 '1&10-NR41LRUN#7山中大輔3ッヶラン(#5田中喜貴)',
 '2&7-NR38LPASS、#3KevinCraft一#40JohnStanton-2yパス(#18増山純季)',
 '3&9-NR40RRUN#32元山伊織5yラン',
 '4&4-NR35MPASS、#3KevinCraft一#82白根混12yパス(#3佐久間徹)P3',
 '1&10-NR23RPASS、#3KevinCraftパス失敗-NoPlay',
 '+PenaltyNRオフサイド5y久退',
 '1&5-NR18RRUN#32元山伊織-1yラン(#5田中豆貴)',
 '2&6-NR19RPASS#3KevinCraftパス失敗',
 '3&6-NR19RPASS#3KevinCraftパス失敗',
 '4&6-NR19RPASS#3KevinCraftパス失敗',
 '1&10-NR19LRUN#6KURTPALANDECH8yラン',
 '2&2-NR27LRUN#20DERECKAKIRAWILLIAMS0yラン(#9植村佳史)',
 '3&2-NR27LRUN#20DERECKAKIRAWILLIAMS0yラン',
 '4&2-NR27MPUNT結3鈴木健太パント…BB23,#35鈴木隆貴10yリターン(#3佐久間徹)',
 '1&10-BB33RPASS#3KevinCraftパス失敗',
 '2&10-BB33RRUN#47山中大輔1ラン(#44小宮洋平,#56浦野雄大)',
 '3&9-BB34MPASS、#3KevinCraft一#87松岡直希34yパス(#33山口遇一郎)P4',
 '1&10-NR32RPASS、#3KevinCraft一#82白根混19yパス(#3佐久間徹)P5',
 '1&10

In [68]:
df = pd.DataFrame(remove_space, columns=["test"])

In [69]:
df.head()

Unnamed: 0,test
0,"NR35Kick-off、#3鈴木健太キック…BB14,#35GamboaHerbert17..."
1,1&10-BB31RPASS#3KevinCraftパス失敗
2,2&10-BB31RPASS、#3KevinCraft一#85鈴木隆中11yパスP1
3,1&10-BB42LPASS#3KevinCraft一#14遠藤健史7yパス
4,2&3-BB49RPASS、#3KevinCraft一#85鈴木隆貴10yパスP2


In [75]:
# df["down"] = df["test"].str.split("&").str.get(0)

In [118]:
import re
down_list = []
dist_list = []
jinti_list = []
ball_on = []
hash_list = []
playtype_list = []
for i in range(len(df["test"])):
    k = re.split("[&-]", df["test"][i])
#     print(re.split("[&-]", df["test"][i]))
#     print(k)
    down_list.append(k[0])
    if len(k) >= 2:
        dist_list.append(k[1])
    else:
        dist_list.append("nan")
    if len(k) >= 3:
        jinti_list.append(k[2][:2])
    else:
        jinti_list.append("nan")
    if len(k) >= 3:
        ball_on.append(k[2][2:4])
        hash_list.append(k[2][4])
        if k[2][5:9] == "PASS":
            playtype_list.append("PASS")
        elif k[2][5:8] =="RUN":
            playtype_list.append("RUN")
        elif k[2][5:7] == "FG":
            playtype_list.append("FG")
        elif k[2][5:8] == "PUNT":
            playtype_list.append("PUNT")
        else:
            playtype_list.append("?")
    else:
        ball_on.append("nan")
        hash_list.append("nan")
        playtype_list.append("nan")

In [119]:
playtype_list

['nan',
 'PASS',
 'PASS',
 'PASS',
 'PASS',
 'RUN',
 'PASS',
 'RUN',
 'PASS',
 'PASS',
 'nan',
 'RUN',
 'PASS',
 'PASS',
 'PASS',
 'RUN',
 'RUN',
 'RUN',
 '?',
 'PASS',
 'RUN',
 'PASS',
 'PASS',
 'RUN',
 'RUN',
 'PASS',
 'FG',
 'nan',
 'RUN',
 'PASS',
 'PASS',
 'RUN']

In [96]:
df["down"] = down_list

In [92]:
df["dist"] = dist_list

In [101]:
df["jinti"] = jinti_list

In [108]:
df["ball_on"] = ball_on

In [112]:
df["hash"] = hash_list

In [120]:
df["play_type"] = playtype_list

In [123]:
df.to_csv("df_test.csv")

In [27]:
import re
stats_list = []
for i in remove_space:
    j = re.split("[&-]", i)
    stats_list.append(j)
print(stats_list)

[['NR35Kick', 'off、#3鈴木健太キック…BB14,#35GamboaHerbert17yリターン(#1藤本遼)'], ['1', '10', 'BB31RPASS#3KevinCraftパス失敗'], ['2', '10', 'BB31RPASS、#3KevinCraft一#85鈴木隆中11yパスP1'], ['1', '10', 'BB42LPASS#3KevinCraft一#14遠藤健史7yパス'], ['2', '3', 'BB49RPASS、#3KevinCraft一#85鈴木隆貴10yパスP2'], ['1', '10', 'NR41LRUN#7山中大輔3ッヶラン(#5田中喜貴)'], ['2', '7', 'NR38LPASS、#3KevinCraft一#40JohnStanton', '2yパス(#18増山純季)'], ['3', '9', 'NR40RRUN#32元山伊織5yラン'], ['4', '4', 'NR35MPASS、#3KevinCraft一#82白根混12yパス(#3佐久間徹)P3'], ['1', '10', 'NR23RPASS、#3KevinCraftパス失敗', 'NoPlay'], ['+PenaltyNRオフサイド5y久退'], ['1', '5', 'NR18RRUN#32元山伊織', '1yラン(#5田中豆貴)'], ['2', '6', 'NR19RPASS#3KevinCraftパス失敗'], ['3', '6', 'NR19RPASS#3KevinCraftパス失敗'], ['4', '6', 'NR19RPASS#3KevinCraftパス失敗'], ['1', '10', 'NR19LRUN#6KURTPALANDECH8yラン'], ['2', '2', 'NR27LRUN#20DERECKAKIRAWILLIAMS0yラン(#9植村佳史)'], ['3', '2', 'NR27LRUN#20DERECKAKIRAWILLIAMS0yラン'], ['4', '2', 'NR27MPUNT結3鈴木健太パント…BB23,#35鈴木隆貴10yリターン(#3佐久間徹)'], ['1', '10', 'BB33RPASS#3KevinCraftパス失敗'], ['2', '10', 'BB33RRU

In [60]:
with open("output.csv", "w", encoding="utf-8") as f:
#     writer = csv.writer(f)
    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(["down", "distance", "test", "playtype"])
    writer.writerows(stats_list)

In [61]:
import pandas as pd
df = pd.read_csv("output.csv", error_bad_lines=False)

In [62]:
df

Unnamed: 0,down,distance,test,playtype
0,NR35Kick,"off、#3鈴木健太キック…BB14,#35GamboaHerbert17yリターン(#1藤本遼)",,
1,1,10,BB31RPASS#3KevinCraftパス失敗,
2,2,10,BB31RPASS、#3KevinCraft一#85鈴木隆中11yパスP1,
3,1,10,BB42LPASS#3KevinCraft一#14遠藤健史7yパス,
4,2,3,BB49RPASS、#3KevinCraft一#85鈴木隆貴10yパスP2,
5,1,10,NR41LRUN#7山中大輔3ッヶラン(#5田中喜貴),
6,2,7,NR38LPASS、#3KevinCraft一#40JohnStanton,2yパス(#18増山純季)
7,3,9,NR40RRUN#32元山伊織5yラン,
8,4,4,NR35MPASS、#3KevinCraft一#82白根混12yパス(#3佐久間徹)P3,
9,1,10,NR23RPASS、#3KevinCraftパス失敗,NoPlay
