In [1]:
import os
import pathlib
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
import sys
import pyocr
import pyocr.builders
import pathlib
import glob
import pandas as pd
import re
import numpy as np

In [2]:
def pdf_to_image():
    """Convert the first PDF found under ./pdf_file into one JPEG per page.

    ``poppler/bin`` is appended to the ``PATH`` environment variable so that
    pdf2image can find the poppler executables. Every page of the first PDF
    returned by the recursive glob is saved to ``./image_file`` as
    ``<pdf stem>_<NN>.jpeg`` where ``NN`` is the 1-based, zero-padded page
    number.

    Raises:
        FileNotFoundError: If no ``*.pdf`` file exists under ``./pdf_file``.
    """
    # NOTE(review): this is the literal string "__file__", not the __file__
    # variable, so .parent is "." and resolve() yields the current working
    # directory.
    poppler_dir = pathlib.Path("__file__").parent.resolve() / "poppler/bin"
    # Append only once so that re-running the cell does not keep growing PATH.
    if str(poppler_dir) not in os.environ["PATH"].split(os.pathsep):
        os.environ["PATH"] += os.pathsep + str(poppler_dir)

    # Collect every PDF below ./pdf_file (recursive); only the first is used.
    pdf_dir = pathlib.Path('./pdf_file')
    pdf_path = list(pdf_dir.glob('**/*.pdf'))
    if not pdf_path:
        # Fail with an explicit message instead of a bare IndexError below.
        raise FileNotFoundError("No PDF file found under {}".format(pdf_dir))

    # No dpi argument is passed, so pdf2image's default resolution applies
    # (the original note says 200 dpi).
    pages = convert_from_path(str(pdf_path[0]))

    # Make sure the output directory exists before saving into it.
    image_dir = pathlib.Path("./image_file")
    image_dir.mkdir(parents=True, exist_ok=True)
    for page_no, page in enumerate(pages, start=1):
        file_name = "{}_{:02d}.jpeg".format(pdf_path[0].stem, page_no)
        page.save(str(image_dir / file_name), "JPEG")

In [9]:
pdf_to_image()

In [3]:
def image_ocr():
    """Run OCR (language ``jpn``) on every JPEG under ./image_file.

    The Tesseract install directory is appended to ``PATH``, the first tool
    reported by ``pyocr.get_available_tools()`` is used, and the recognised
    text of each image is written to ``./txt_file/<image stem>.txt``.

    If pyocr reports no available tool, a message is printed and
    ``sys.exit(1)`` is called.
    """
    # Raw string: "\P" and "\T" are not valid escape sequences in a normal
    # string literal. The resulting value is unchanged.
    tessera_path = r"C:\Program Files\Tesseract-OCR"
    # Append only once so that re-running the cell does not keep growing PATH.
    if tessera_path not in os.environ["PATH"].split(os.pathsep):
        os.environ["PATH"] += os.pathsep + tessera_path

    tools = pyocr.get_available_tools()
    if len(tools) == 0:
        print("No OCR tool found")
        sys.exit(1)  # exit status 1

    tool = tools[0]

    # Collect every *.jpeg below ./image_file (recursive).
    image_dir = pathlib.Path('./image_file')
    jpg_path = list(image_dir.glob('**/*.jpeg'))

    # Make sure the output directory exists before writing into it.
    txt_dir = pathlib.Path('./txt_file')
    txt_dir.mkdir(parents=True, exist_ok=True)

    for image_file in jpg_path:
        # Open the image in a context manager so the file handle is released.
        with Image.open(str(image_file)) as img:
            txt = tool.image_to_string(
                img,
                lang="jpn",
                builder=pyocr.builders.TextBuilder(tesseract_layout=6)
            )
        # Save the recognised text next to the other results, one file per image.
        with open(str(txt_dir / (image_file.stem + '.txt')), mode='wt') as t:
            t.write(txt)

In [12]:
image_ocr()

In [12]:
path = './txt_file/rise_ibm_06.txt'

In [13]:
# Load the OCR text file as a list of lines (line endings are kept).
with open(path) as f:
    lines = list(f)

In [14]:
# Drop leading/trailing whitespace (including the newline) from every line.
lines_strip = list(map(str.strip, lines))

In [15]:
lines_strip

['IBM BigBlue(BB) vs ノジマ相模原ライズ(NR)',
 '2020/11/23(月) 会場 : 富士通スタジアム川崎',
 'Play by Play                                   Second Quarter',
 'ノジマ相模原ライズ 12:00',
 '2&12-NR42 M      RUN  #6 KURT PALANDECH 0yラン(#35 Gamboa Herbert)- No Play',
 '+Penalty NR #67 ホールディング 10y 久退',
 '2&22-NR32 M     PASS  #6 KURT PALANDECH パス失敗',
 '3&22-NR32 M     PASS  #6 KURT PALANDECH パス失敗',
 '4&22-NR32 M    PUNT 約3 鈴木 健太 パント, ボールデッド',
 '#0 資格没収',
 '+Penalty NR #O0 パーソナルファウル 15y 久退',
 'IBM BigBlue 11:14',
 '1&10-BB25 L      PASS 、#3 Kevin Craft 一 #80 河村 暁光 8yパス(18 増山 純季)',
 '2&2 -BB33 R    RUN  #4 鈴木 恵多 1yラン',
 '3&1 -BB34 R     RUN  #47 山中 大輔 -1yラン(#5 田中 喜貴)',
 '4&2 -BB33 R      PUNT  #5 Kevin Coghlan パント …NR 33, #23 伊藤 雅恭 yリターン(#18 小川 知輝)',
 'ノジマ相模原ライズ 9:36',
 '1&10-NR40 L      PASS  #6 KURT PALANDECH 一 #26 森本 統介 60yパス, TOUCHDOWN                                      P2',
 'Extra Point, #59 笹尾 健 キック(H/#23 伊藤 雅奈) GOOD',
 'NR 35            Kick-off  #3 鈴木 健太 キック …BB 0, #21 佐藤 航生 24ッリターン(#31 新井 直樹)',
 'IBM BigBlue 

In [16]:
# Keep only the lines that contain at least one of these markers.
l_XXX = [
    line
    for line in lines_strip
    if any(
        marker in line
        for marker in ('&', 'Penalty', 'Kick-off', 'Extra Point', 'TIMEOUT', ':')
    )
]

In [17]:
# Remove every ASCII space from each kept line.
remove_space = ["".join(line.split(' ')) for line in l_XXX]

In [18]:
pd.Series(remove_space[0])

0    2020/11/23(月)会場:富士通スタジアム川崎
dtype: object

In [19]:
# For each line, print whether it contains a "<digits>:<digits>" pattern.
# The pattern is a raw string: "\d" is an invalid escape sequence in a normal
# string literal.
for i in range(len(remove_space)):
    print(pd.Series(remove_space[i]).str.contains(r"\d+:\d+").values)

[False]
[ True]
[False]
[False]
[False]
[False]
[False]
[False]
[ True]
[False]
[False]
[False]
[False]
[ True]
[False]
[False]
[False]
[ True]
[False]
[False]
[False]
[ True]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[ True]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[ True]
[False]
[False]
[False]
[False]
[ True]
[False]
[False]
[False]
[False]


In [25]:
remove_space

['2020/11/23(月)会場:富士通スタジアム川崎',
 'ノジマ相模原ライズ12:00',
 '2&12-NR42MRUN#6KURTPALANDECH0yラン(#35GamboaHerbert)-NoPlay',
 '+PenaltyNR#67ホールディング10y久退',
 '2&22-NR32MPASS#6KURTPALANDECHパス失敗',
 '3&22-NR32MPASS#6KURTPALANDECHパス失敗',
 '4&22-NR32MPUNT約3鈴木健太パント,ボールデッド',
 '+PenaltyNR#O0パーソナルファウル15y久退',
 'IBMBigBlue11:14',
 '1&10-BB25LPASS、#3KevinCraft一#80河村暁光8yパス(18増山純季)',
 '2&2-BB33RRUN#4鈴木恵多1yラン',
 '3&1-BB34RRUN#47山中大輔-1yラン(#5田中喜貴)',
 '4&2-BB33RPUNT#5KevinCoghlanパント…NR33,#23伊藤雅恭yリターン(#18小川知輝)',
 'ノジマ相模原ライズ9:36',
 '1&10-NR40LPASS#6KURTPALANDECH一#26森本統介60yパス,TOUCHDOWNP2',
 'ExtraPoint,#59笹尾健キック(H/#23伊藤雅奈)GOOD',
 'NR35Kick-off#3鈴木健太キック…BB0,#21佐藤航生24ッリターン(#31新井直樹)',
 'IBMBigBlue9:23',
 '1&10-BB24MPASS#3KevinCraftパス失敗',
 '2&10-BB24MPASS#3KevinCraftパス失敗',
 '3&10-BB24MPASS#3KevinCraftパス失敗INTERCEPT#5田中喜貴…BB49、0yリターン',
 'ノジマ相模原ライズ8:51',
 '1&10-BB49LRUN#6KURTPALANDECH21yランR3',
 '1&10-BB28RRUN#26森本統介8yラン(#35GamboaHerbert)',
 '2&2-BB20RRUN#20DERECKAKIRAWILLIAMS1yラン',
 '3&1-BB19RRUN#20DERECKAKIRAWILLIAMS0yラン(#35Gam

In [20]:
# Flag lines containing a "<digits>:<digits>" pattern in one vectorised call.
# The pattern is a raw string: "\d" is an invalid escape sequence in a normal
# string literal.
pd.Series(remove_space).str.contains(r"\d+:\d+")

0     False
1      True
2     False
3     False
4     False
5     False
6     False
7     False
8      True
9     False
10    False
11    False
12    False
13     True
14    False
15    False
16    False
17     True
18    False
19    False
20    False
21     True
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34     True
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42     True
43    False
44    False
45    False
46    False
47     True
48    False
49    False
50    False
51    False
dtype: bool

In [19]:
df = pd.DataFrame(remove_space, columns=["test"])

In [20]:
df

Unnamed: 0,test
0,2020/11/23(月)会場:富士通スタジアム川崎
1,ノジマ相模原ライズ12:00
2,2&12-NR42MRUN#6KURTPALANDECH0yラン(#35GamboaHerb...
3,+PenaltyNR#67ホールディング10y久退
4,2&22-NR32MPASS#6KURTPALANDECHパス失敗
5,3&22-NR32MPASS#6KURTPALANDECHパス失敗
6,"4&22-NR32MPUNT約3鈴木健太パント,ボールデッド"
7,+PenaltyNR#O0パーソナルファウル15y久退
8,IBMBigBlue11:14
9,1&10-BB25LPASS、#3KevinCraft一#80河村暁光8yパス(18増山純季)


In [21]:
s = df["test"][2]

In [22]:
f = df["test"][6]

In [23]:
k = df["test"][0]

In [29]:
o = df["test"][1]

In [30]:
o

'ノジマ相模原ライズ12:00'

In [24]:
s

'2&12-NR42MRUN#6KURTPALANDECH0yラン(#35GamboaHerbert)-NoPlay'

In [25]:
k

'2020/11/23(月)会場:富士通スタジアム川崎'

In [26]:
f

'4&22-NR32MPUNT約3鈴木健太パント,ボールデッド'

In [30]:
print("PASS" in s)

True


In [49]:
import re
# Find the first "<digits>y" token in the sample line, then pull the digits
# out of that token.
m = re.compile(r'\d+y').search(s)
l = re.compile(r'\d+').search(m.group())

In [50]:
l.group()

'11'

In [55]:
# Find the first "<1-4>&<digits>" token in the sample line and split it at "&".
q = re.search(r'[1234]&\d+', s)
h = q.group().split("&")
print(h)

['2', '10']


In [52]:
q.group()

'2&10'

In [67]:
# Find the first "<2 word chars><digits><L|M|R>" token in the sample line and
# show the token, its first two characters, its digits and its last character.
p = re.search(r'\w\w\d+[LMR]', s)
l = p.group()
h = re.search(r'\d+', l)
print(l, l[:2], h.group(), l[-1], sep="\n")

BB31R
BB
31
R


In [15]:
# Collect (line, index) for every line containing a "<digits>:<digits>" pattern.
# A plain re.search with a raw-string pattern replaces the one-element Series
# built per line; the old non-raw "\d" was an invalid escape sequence.
test_list = [
    (line, idx)
    for idx, line in enumerate(remove_space)
    if re.search(r"\d+:\d+", line)
]

In [16]:
test_list

[('ノジマ相模原ライズ12:00', 1),
 ('IBMBigBlue11:14', 8),
 ('ノジマ相模原ライズ9:36', 13),
 ('IBMBigBlue9:23', 17),
 ('ノジマ相模原ライズ8:51', 21),
 ('IBMBigBlue5:25', 34),
 ('ノジマ相模原ライズ3:56', 42),
 ('IBMBigBlue2:34', 47)]

In [31]:
o, s, k, f

('ノジマ相模原ライズ12:00',
 '2&12-NR42MRUN#6KURTPALANDECH0yラン(#35GamboaHerbert)-NoPlay',
 '2020/11/23(月)会場:富士通スタジアム川崎',
 '4&22-NR32MPUNT約3鈴木健太パント,ボールデッド')

In [42]:
# Locate the "<digits>:<digits>" part of the sample header line, then take the
# first run of non-digit characters from the same line.
ot = re.compile(r'\d+:\d+').search(o)
print(ot)
oo = re.compile(r'\D+').search(o)
oo.group()

<re.Match object; span=(9, 14), match='12:00'>


'ノジマ相模原ライズ'

In [17]:
# Print every line containing a "<digits>:<digits>" pattern together with its
# index. A plain re.search with a raw-string pattern replaces the one-element
# Series built per line; the old non-raw "\d" was an invalid escape sequence.
for i, line in enumerate(remove_space):
    if re.search(r"\d+:\d+", line):
        print(line, i)

ノジマ相模原ライズ12:00 1
IBMBigBlue11:14 8
ノジマ相模原ライズ9:36 13
IBMBigBlue9:23 17
ノジマ相模原ライズ8:51 21
IBMBigBlue5:25 34
ノジマ相模原ライズ3:56 42
IBMBigBlue2:34 47


In [43]:
def get_offense_team(s):
    """Return the leading non-digit text of a line that contains a clock.

    Args:
        s: One line of play-by-play text.

    Returns:
        The first run of non-digit characters in ``s`` when ``s`` contains a
        ``<digits>:<digits>`` pattern, otherwise ``np.nan``.
    """
    if re.search(r'\d+:\d+', s) is None:
        return np.nan
    return re.search(r'\D+', s).group()

In [46]:
get_offense_team(o)

'ノジマ相模原ライズ'

In [47]:
get_offense_team(s)

nan

In [57]:
def get_fieldposition(s):
    """Extract the field-position token from a play line.

    The token is the first occurrence of two word characters, one or more
    digits, and one of ``L``/``M``/``R`` (for example ``NR42M``).

    Args:
        s: One line of play-by-play text.

    Returns:
        A tuple ``(position, yard, hash)`` of strings: the two-character
        prefix, the digits that follow it, and the trailing letter. Each
        element is ``"?"`` when no token is found.
    """
    position = "?"
    y = "?"
    Hash = "?"
    # Capture the three parts directly. Re-searching the whole token for digits
    # would pick up a digit inside the two-character prefix (e.g. "B825L"
    # would give yard "825" instead of "25").
    fp = re.search(r'(\w\w)(\d+)([LMR])', s)
    if fp is not None:
        position, y, Hash = fp.groups()
    return position, y, Hash

In [58]:
def get_downdistance(s):
    """Extract down and distance from a play line.

    Args:
        s: One line of play-by-play text.

    Returns:
        A tuple ``(down, distance)`` of strings taken from the first
        ``<1-4>&<digits>`` token in ``s``, or ``("?", "?")`` when there is
        no such token.
    """
    found = re.search(r'[1234]&\d+', s)
    if found is None:
        return "?", "?"
    # The matched token contains exactly one "&", so partition splits it cleanly.
    down, _, distance = found.group().partition("&")
    return down, distance

In [59]:
def get_gain(s):
    """Extract the yardage figure from a play line.

    Args:
        s: One line of play-by-play text.

    Returns:
        The text of the first token made of optional ``-`` characters, one or
        more digits and a trailing ``y``, with the ``y`` removed (for example
        ``"-1"``). Returns ``"?"`` when there is no such token.
    """
    token = re.search(r'-*\d+y', s)
    if token is None:
        return "?"
    # Everything before the trailing "y" is the dashes-and-digits part.
    return token.group()[:-1]

In [60]:
def get_playtype(s):
    """Classify a play line by the first play-type marker it contains.

    Markers are checked in the order RUN, PASS, FG, PUNT, Kick-off; the first
    one found as a substring of ``s`` decides the result.

    Args:
        s: One line of play-by-play text.

    Returns:
        ``"RUN"``, ``"PASS"``, ``"FG"``, ``"PUNT"`` or ``"KICK OFF"``, or
        ``"?"`` when none of the markers occurs in ``s``.
    """
    markers = (
        ("RUN", "RUN"),
        ("PASS", "PASS"),
        ("FG", "FG"),
        ("PUNT", "PUNT"),
        ("Kick-off", "KICK OFF"),
    )
    for marker, label in markers:
        if marker in s:
            return label
    return "?"

In [61]:
def get_stats_dict(s):
    """Collect every parsed field of one play line into a dict.

    Args:
        s: One line of play-by-play text.

    Returns:
        A dict with the keys ``position``, ``yard``, ``hash``, ``down``,
        ``distance``, ``gain`` and ``play_type`` (in that order), filled from
        ``get_fieldposition``, ``get_downdistance``, ``get_gain`` and
        ``get_playtype``.
    """
    position, y, Hash = get_fieldposition(s)
    down, distance = get_downdistance(s)
    return {
        "position": position,
        "yard": y,
        "hash": Hash,
        "down": down,
        "distance": distance,
        "gain": get_gain(s),
        "play_type": get_playtype(s),
    }

In [99]:
# Parse every line of the "test" column into a stats dict, in row order.
stats_list = list(map(get_stats_dict, df["test"]))

In [116]:
# Parse every line of the "test" column into a stats dict, in row order.
stats_list = [get_stats_dict(play) for play in df["test"]]

In [66]:
# Print the parsed stats dict of every line in the "test" column.
for play in df["test"]:
    print(get_stats_dict(play))

{'position': '?', 'yard': '?', 'hash': '?', 'down': '?', 'distance': '?', 'gain': '?', 'play_type': '?'}
{'position': '?', 'yard': '?', 'hash': '?', 'down': '?', 'distance': '?', 'gain': '?', 'play_type': '?'}
{'position': 'NR', 'yard': '42', 'hash': 'M', 'down': '2', 'distance': '12', 'gain': '0', 'play_type': 'RUN'}
{'position': '?', 'yard': '?', 'hash': '?', 'down': '?', 'distance': '?', 'gain': '10', 'play_type': '?'}
{'position': 'NR', 'yard': '32', 'hash': 'M', 'down': '2', 'distance': '22', 'gain': '?', 'play_type': 'PASS'}
{'position': 'NR', 'yard': '32', 'hash': 'M', 'down': '3', 'distance': '22', 'gain': '?', 'play_type': 'PASS'}
{'position': 'NR', 'yard': '32', 'hash': 'M', 'down': '4', 'distance': '22', 'gain': '?', 'play_type': 'PUNT'}
{'position': '?', 'yard': '?', 'hash': '?', 'down': '?', 'distance': '?', 'gain': '15', 'play_type': '?'}
{'position': '?', 'yard': '?', 'hash': '?', 'down': '?', 'distance': '?', 'gain': '?', 'play_type': '?'}
{'position': 'BB', 'yard': '25

In [56]:
# Build one row per parsed play from the list of stats dicts.
# NOTE: `stats_list` must already exist in the kernel; the NameError recorded
# below comes from running this cell before it was defined.
stats1 = pd.DataFrame(stats_list)

NameError: name 'stats_list' is not defined

In [117]:
stats_list

[{'position': '?',
  'yard': '?',
  'hash': '?',
  'down': '?',
  'distance': '?',
  'gain': '17',
  'play_type': 'KICK OFF'},
 {'position': 'BB',
  'yard': '31',
  'hash': 'R',
  'down': '1',
  'distance': '10',
  'gain': '?',
  'play_type': 'PASS'},
 {'position': 'BB',
  'yard': '31',
  'hash': 'R',
  'down': '2',
  'distance': '10',
  'gain': '11',
  'play_type': 'PASS'},
 {'position': 'BB',
  'yard': '42',
  'hash': 'L',
  'down': '1',
  'distance': '10',
  'gain': '7',
  'play_type': 'PASS'},
 {'position': 'BB',
  'yard': '49',
  'hash': 'R',
  'down': '2',
  'distance': '3',
  'gain': '10',
  'play_type': 'PASS'},
 {'position': 'NR',
  'yard': '41',
  'hash': 'L',
  'down': '1',
  'distance': '10',
  'gain': '?',
  'play_type': 'RUN'},
 {'position': 'NR',
  'yard': '38',
  'hash': 'L',
  'down': '2',
  'distance': '7',
  'gain': '-2',
  'play_type': 'PASS'},
 {'position': 'NR',
  'yard': '40',
  'hash': 'R',
  'down': '3',
  'distance': '9',
  'gain': '5',
  'play_type': 'RUN'},


In [103]:
stats1

Unnamed: 0,position,yard,hash,down,distance,gain,play_type
0,?,?,?,?,?,17,KICK OFF
1,BB,31,R,1,10,?,PASS
2,BB,31,R,2,10,11,PASS
3,BB,42,L,1,10,7,PASS
4,BB,49,R,2,3,10,PASS
5,NR,41,L,1,10,?,RUN
6,NR,38,L,2,7,-2,PASS
7,NR,40,R,3,9,5,RUN
8,NR,35,M,4,4,12,PASS
9,NR,23,R,1,10,?,PASS


In [94]:
get_stats_dict(f)

{'position': 'NR',
 'yard': '38',
 'hash': 'L',
 'down': '2',
 'distance': '7',
 'gain': '-2',
 'play_type': 'PASS'}

In [96]:
pd.DataFrame([get_stats_dict(f), get_stats_dict(s), get_stats_dict(k)])

Unnamed: 0,position,yard,hash,down,distance,gain,play_type
0,NR,38,L,2,7,-2,PASS
1,BB,31,R,2,10,11,PASS
2,?,?,?,?,?,17,KICK OFF


In [87]:
get_playtype(s)

'PASS'

In [75]:
get_gain(s)

'11'

In [11]:
import re

# Split-based parse of every line in df["test"]: the line is split on "&" and
# "-" and the pieces are read by position.
#   piece 0 -> down, piece 1 -> distance,
#   piece 2 -> 2-char field side, 2-char ball-on yard, 1-char hash, play type.
# Lines with fewer pieces get the string "nan" for the missing fields.
down_list = []
dist_list = []
jinti_list = []
ball_on = []
hash_list = []
playtype_list = []
for play in df["test"]:
    # Named `parts` rather than `k` so the sample line that other cells store
    # in `k` and pass to get_stats_dict is not overwritten with a list.
    parts = re.split("[&-]", play)
    down_list.append(parts[0])
    dist_list.append(parts[1] if len(parts) >= 2 else "nan")
    if len(parts) >= 3:
        field = parts[2]
        jinti_list.append(field[:2])
        ball_on.append(field[2:4])
        # Slice instead of field[4] so a short fragment yields "?" rather than
        # raising IndexError.
        hash_list.append(field[4:5] or "?")
        # The play type starts right after the hash character. startswith
        # compares the full marker; the old 3-character slice could never
        # equal "PUNT".
        rest = field[5:]
        playtype_list.append(
            next(
                (marker for marker in ("PASS", "RUN", "FG", "PUNT")
                 if rest.startswith(marker)),
                "?",
            )
        )
    else:
        jinti_list.append("nan")
        ball_on.append("nan")
        hash_list.append("nan")
        playtype_list.append("nan")

In [12]:
df["down"] = down_list

In [13]:
df["dist"] = dist_list

In [14]:
df["jinti"] = jinti_list

In [15]:
df["ball_on"] = ball_on

In [16]:
df["hash"] = hash_list

In [17]:
df["play_type"] = playtype_list

In [107]:
df

Unnamed: 0,test
0,"NR35Kick-off、#3鈴木健太キック…BB14,#35GamboaHerbert17..."
1,1&10-BB31RPASS#3KevinCraftパス失敗
2,2&10-BB31RPASS、#3KevinCraft一#85鈴木隆中11yパスP1
3,1&10-BB42LPASS#3KevinCraft一#14遠藤健史7yパス
4,2&3-BB49RPASS、#3KevinCraft一#85鈴木隆貴10yパスP2
5,1&10-NR41LRUN#7山中大輔3ッヶラン(#5田中喜貴)
6,2&7-NR38LPASS、#3KevinCraft一#40JohnStanton-2yパス...
7,3&9-NR40RRUN#32元山伊織5yラン
8,4&4-NR35MPASS、#3KevinCraft一#82白根混12yパス(#3佐久間徹)P3
9,1&10-NR23RPASS、#3KevinCraftパス失敗-NoPlay


In [1]:
# df.to_csv("df_test.csv")