In [2]:
import os
import pathlib
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
import sys
import pyocr
import pyocr.builders
import pathlib
import glob
import pandas as pd
import re
import numpy as np

In [3]:
def pdf_to_image():
    """Convert the first PDF found under ./pdf_file into one JPEG per page.

    The bundled ``poppler/bin`` directory (relative to the current working
    directory) is added to ``PATH`` for this process so that pdf2image can
    locate the poppler executables. Each page is written to ``./image_file``
    as ``<pdf stem>_<NN>.jpeg`` where ``NN`` is the 1-based page number.

    Only the first path returned by the recursive glob is converted; any
    other PDFs in the directory are ignored.

    Raises:
        FileNotFoundError: if no ``*.pdf`` file exists under ``./pdf_file``.
    """
    # "__file__" is a string literal here (notebooks have no __file__), so
    # its parent resolves to the current working directory.
    poppler_dir = pathlib.Path("__file__").parent.resolve() / "poppler/bin"
    # Append to PATH only once so repeated calls do not keep growing it.
    current_path = os.environ.get("PATH", "")
    if str(poppler_dir) not in current_path.split(os.pathsep):
        os.environ["PATH"] = current_path + os.pathsep + str(poppler_dir)

    # Collect the PDF files below the input directory (recursive).
    pdf_dir = pathlib.Path('./pdf_file')
    pdf_path = list(pdf_dir.glob('**/*.pdf'))
    if not pdf_path:
        # Fail with a clear message instead of an IndexError on pdf_path[0].
        raise FileNotFoundError("No PDF file found under {}".format(pdf_dir))

    # PDF -> list of page images at pdf2image's default resolution
    # (the original note states 200 dpi).
    pages = convert_from_path(str(pdf_path[0]))

    # Save the pages one by one, creating the output directory if needed.
    image_dir = pathlib.Path("./image_file")
    image_dir.mkdir(parents=True, exist_ok=True)
    for page_number, page in enumerate(pages, start=1):
        # .stem is the file name without its extension.
        file_name = pdf_path[0].stem + "_{:02d}".format(page_number) + ".jpeg"
        image_path = image_dir / file_name
        page.save(str(image_path), "JPEG")

In [4]:
def image_ocr():
    """Run Tesseract OCR on every JPEG under ./image_file.

    For each ``*.jpeg`` found recursively below ``./image_file`` the
    recognised text (language ``jpn``, Tesseract layout mode 6) is written to
    ``./txt_file/<image stem>.txt``. The Tesseract install directory is added
    to ``PATH`` for this process so pyocr can discover the executable.

    Exits the interpreter with status 1 (via ``sys.exit``) when pyocr reports
    no available OCR tool.
    """
    # Raw string: "\P" and "\T" are invalid escape sequences in a normal
    # string literal and trigger a warning on recent Python versions.
    tessera_path = r"C:\Program Files\Tesseract-OCR"
    # Append to PATH only once so repeated calls do not keep growing it.
    current_path = os.environ.get("PATH", "")
    if tessera_path not in current_path.split(os.pathsep):
        os.environ["PATH"] = current_path + os.pathsep + tessera_path

    tools = pyocr.get_available_tools()
    if len(tools) == 0:
        print("No OCR tool found")
        sys.exit(1)  # exit status 1 signals failure

    tool = tools[0]

    # Directory holding the images to OCR (searched recursively).
    image_dir = pathlib.Path('./image_file')
    jpg_path = list(image_dir.glob('**/*.jpeg'))

    # Make sure the output directory exists before writing into it.
    txt_dir = pathlib.Path('./txt_file')
    txt_dir.mkdir(parents=True, exist_ok=True)

    for image_path in jpg_path:
        # Open the image in a context manager so the file handle is released
        # as soon as the OCR call returns.
        with Image.open(str(image_path)) as image:
            txt = tool.image_to_string(
                image,
                lang="jpn",
                builder=pyocr.builders.TextBuilder(tesseract_layout=6)
            )
        # NOTE: the file is written with the platform default encoding, which
        # is also what get_data() uses to read it back; change both together.
        with open('./txt_file/' + str(image_path.stem) + '.txt', mode='wt') as t:
            t.write(txt)

In [5]:
def get_fieldposition(s):
    """Extract the field-position token from one play line.

    The token has the shape ``<2 word characters><digits><L|M|R>``
    (e.g. ``NR44L``). The three parts are taken from capture groups, so the
    yard digits never include digits that belong to the two-character
    prefix.

    Args:
        s: a play line (the caller strips spaces beforehand).

    Returns:
        Tuple ``(position, y, Hash)`` of strings: the two-character prefix,
        the yard-line digits and the trailing letter (presumably the
        left/middle/right hash mark). All three are ``np.nan`` when the line
        contains no such token.
    """
    position = np.nan
    y = np.nan
    Hash = np.nan
    fp = re.search(r'(\w\w)(\d+)([LMR])', s)
    if fp is not None:
        position, y, Hash = fp.groups()
    return position, y, Hash

In [6]:
def get_downdistance(s):
    """Pull the ``<down>&<distance>`` token out of a play line.

    Args:
        s: a play line with spaces already removed.

    Returns:
        Tuple ``(down, distance)`` of digit strings taken from the first
        ``[1-4]&<digits>`` occurrence, or ``(np.nan, np.nan)`` when the line
        has no such token.
    """
    token = re.search(r'[1234]&\d+', s)
    if token is None:
        return np.nan, np.nan
    # The matched text contains exactly one "&", so a single split suffices.
    down, _, distance = token.group().partition("&")
    return down, distance

In [7]:
def get_gain(s):
    """Return the yardage that precedes a ``y`` marker in a play line.

    Args:
        s: a play line with spaces already removed.

    Returns:
        The signed digit string in front of the first ``<digits>y``
        occurrence (leading ``-`` characters are kept), or ``np.nan`` when
        the line has no such marker.
    """
    marked = re.search(r'-*\d+y', s)
    if marked is None:
        return np.nan
    # Re-scan the matched text to drop the trailing "y".
    return re.search(r'-*\d+', marked.group()).group()

In [8]:
def get_playtype(s):
    """Classify a play line by the first known keyword it contains.

    Keywords are tested in a fixed priority order, so a line containing
    several of them is labelled by the earliest entry in the table.

    Args:
        s: a play line.

    Returns:
        One of ``"Run"``, ``"Pass"``, ``"FG"``, ``"Punt"``, ``"KO"``,
        ``"Extra Pt."``, or ``np.nan`` when no keyword is present.
    """
    keyword_labels = (
        ("RUN", "Run"),
        ("PASS", "Pass"),
        ("FG", "FG"),
        ("PUNT", "Punt"),
        ("Kick-off", "KO"),
        ("Extra", "Extra Pt."),
    )
    for keyword, label in keyword_labels:
        if keyword in s:
            return label
    return np.nan

In [9]:
def get_offense_team(s):
    """Return the leading non-digit text of a line that carries a clock time.

    A line is considered only when it contains a ``<digits>:<digits>``
    pattern; the result is then the first run of non-digit characters found
    anywhere in the line.

    Args:
        s: a play line with spaces already removed.

    Returns:
        The first non-digit run as a string, or ``np.nan`` when the line has
        no ``<digits>:<digits>`` pattern.
    """
    if not re.search(r'\d+:\d+', s):
        return np.nan
    # A match above guarantees at least one non-digit (the colon) exists.
    return re.search(r'\D+', s).group()

In [25]:
def get_quarter(s):
    """Map a quarter heading line to its number.

    Args:
        s: a play line.

    Returns:
        ``"1"`` to ``"4"`` when the line contains ``Quarter`` together with
        ``First``, ``Second``, ``Third`` or ``Fourth`` (checked in that
        order); ``np.nan`` otherwise.
    """
    if not re.search(r'Quarter', s):
        return np.nan
    ordinals = (
        (r'First', "1"),
        (r'Second', "2"),
        (r'Third', "3"),
        (r'Fourth', "4"),
    )
    for ordinal, number in ordinals:
        if re.search(ordinal, s):
            return number
    return np.nan

In [26]:
def get_stats_dict(s):
    """Run every field extractor on one play line and collect the results.

    Args:
        s: a play line with spaces already removed.

    Returns:
        A dict with the keys ``position``, ``YARD LN``, ``HASH``, ``DN``,
        ``DIST``, ``GN/LS``, ``PLAY TYPE``, ``offense team`` and ``QTR`` (in
        that order); each value is whatever the matching extractor returned,
        i.e. a string or ``np.nan``.
    """
    field_prefix, yard_line, hash_mark = get_fieldposition(s)
    down_number, to_go = get_downdistance(s)
    return {
        "position": field_prefix,
        "YARD LN": yard_line,
        "HASH": hash_mark,
        "DN": down_number,
        "DIST": to_go,
        "GN/LS": get_gain(s),
        "PLAY TYPE": get_playtype(s),
        "offense team": get_offense_team(s),
        "QTR": get_quarter(s),
    }

In [33]:
def get_data(path):
    """Parse one OCR text file into a DataFrame with one row per play line.

    Only lines containing at least one of the markers ``&``, ``Penalty``,
    ``Kick-off``, ``Extra Point``, ``TIMEOUT``, ``:`` or ``Quarter`` are
    kept. Spaces are removed from each kept line before it is handed to
    ``get_stats_dict``. The ``offense team`` and ``QTR`` columns are then
    forward-filled, so a row without its own value inherits the most recent
    one above it; rows before the first known value stay NaN.

    Args:
        path: path of the text file to read (opened with the platform
            default encoding).

    Returns:
        DataFrame with the columns ``position``, ``YARD LN``, ``HASH``,
        ``DN``, ``DIST``, ``GN/LS``, ``PLAY TYPE``, ``offense team`` and
        ``QTR``. It has zero rows, but still all columns, when the file
        contains no matching line.
    """
    markers = ('&', 'Penalty', 'Kick-off', 'Extra Point', 'TIMEOUT', ':',
               'Quarter')
    # Same keys, same order, as the dicts produced by get_stats_dict. Naming
    # them here keeps the columns present even when no line matches.
    columns = ["position", "YARD LN", "HASH", "DN", "DIST", "GN/LS",
               "PLAY TYPE", "offense team", "QTR"]

    with open(path) as f:
        stripped_lines = [line.strip() for line in f]

    play_lines = [
        line.replace(' ', '')
        for line in stripped_lines
        if any(marker in line for marker in markers)
    ]

    stats_df = pd.DataFrame(
        [get_stats_dict(line) for line in play_lines],
        columns=columns,
    )

    # Forward-fill through whole-column assignment. The previous element-wise
    # `stats_df[col][i] = value` was chained indexing, which raises
    # SettingWithCopyWarning and does not write through under Copy-on-Write.
    stats_df["offense team"] = stats_df["offense team"].ffill()
    stats_df["QTR"] = stats_df["QTR"].ffill()

    # TODO: an "ODK" column was sketched here and left disabled: "O"/"D" by
    # offense team name, "K" for Punt / Extra Pt. / KO play types.
    return stats_df

In [34]:
# Spot-check the parser on a single OCR'd text file; the returned DataFrame
# is displayed as this cell's output.
get_data('./txt_file/rise_ibm_08.txt')

Unnamed: 0,position,YARD LN,HASH,DN,DIST,GN/LS,PLAY TYPE,offense team,QTR
0,,,,,,,,,
1,,,,,,,,,3.0
2,,,,,,,KO,,3.0
3,,,,,,,,ノジマ相模原ライズ,3.0
4,NR,44.0,L,1.0,10.0,5.0,Run,ノジマ相模原ライズ,3.0
5,NR,49.0,L,2.0,5.0,1.0,Run,ノジマ相模原ライズ,3.0
6,,,,,,15.0,,ノジマ相模原ライズ,3.0
7,BB,35.0,M,1.0,10.0,-1.0,Pass,ノジマ相模原ライズ,3.0
8,BB,36.0,L,2.0,11.0,7.0,Run,ノジマ相模原ライズ,3.0
9,BB,29.0,M,3.0,4.0,10.0,Pass,ノジマ相模原ライズ,3.0


In [20]:
# Collect every file under ./txt_file. glob returns paths in arbitrary order,
# so sort them: the zero-padded "_NN" page suffix written by pdf_to_image
# makes lexicographic order equal to page order.
path_list = sorted(glob.glob('./txt_file/*'))

In [94]:
# Parse every text file, then concatenate once. Growing a DataFrame with
# pd.concat inside the loop is quadratic, and concatenating onto an empty
# frame is deprecated in recent pandas.
page_frames = []
for path in path_list:
    page_df = get_data(path)
    if not page_df.empty:
        page_frames.append(page_df)

if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
else:
    # Nothing parsed: keep the same empty frame the loop used to start from.
    df = pd.DataFrame(columns=["position", "YARD LN", "HASH", "DN", "DIST", "GN/LS", "PLAY TYPE"])

# Drop rows where every field is missing, then renumber the index.
df = df.dropna(how='all')
df = df.reset_index(drop=True)

In [95]:
# Write the combined play table to test1.csv in the working directory.
# With to_csv's defaults the row index is written as the first, unnamed column.
df.to_csv("test1.csv")