In [847]:
import pandas as pd
import camelot
import time
import numpy as np
import glob 

In [848]:
%%time
# Read in original pdf file using camelot and transform it into dataframes.

# tables = camelot.read_pdf("raw.pdf", flavor='lattice', pages='1-end')

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.2 µs


In [849]:
# Save parsed dataframes into csv.

# for i, table in enumerate(tables):
#     table.df.to_csv(f"data/{i}.csv", index=False, header=False)

# Read in saved CSV files

In [860]:
# Read in all csv files and save as list of dataframes.

filenames = glob.glob("data/*.csv")
onlynumbers = [filename[5:-4] for filename in filenames]
numbers = sorted([int(num) for num in onlynumbers])

dataframes = []
for num in numbers:
    dataframes.append(pd.read_csv(f"data/{num}.csv", header=None))

In [861]:
# Remove the first three tables, which are not real data, but an example tables from the document.

dataframes = dataframes[3:]

In [862]:
# Fix mismatching number of columns by dropping first column from dataframes that have 10 columns.
# The first column of the dataframe () is not relevant to the analysis.

# Drop first row (header) in each dataframe

for dataframe in dataframes:
    dataframe.drop(index=dataframe.index[0], axis=0, inplace=True)
    if len(dataframe.columns) == 9:
        dataframe.drop(columns=dataframe.columns[0], axis=1, inplace=True)
    if len(dataframe.columns) == 10:
        dataframe.drop(columns=dataframe.columns[[0, 1]], axis=1, inplace=True)

In [863]:
# Concatenate all tables into a single master df

df = pd.concat(dataframes, ignore_index=True)

In [864]:
# Create column names

df.columns = ['owner', 'company', 'address', 'type', 'capacity_MW', 'approval_date', 'prep_time', 'note', 'note2']

In [865]:
# Remove all rows with no cell values

df = df[~df.isnull().all(axis=1)]

In [867]:
# Shift misaligned rows into the right index.

noname = df[df.owner.isnull()]

noname.iloc[:, 0] = noname.iloc[:, 1]
noname.iloc[:, 1] = noname.iloc[:, 2]
noname.iloc[:, 2] = noname.iloc[:, 3]
noname.iloc[:, 3] = noname.iloc[:, 4]
noname.iloc[:, 4] = noname.iloc[:, 5]
noname.iloc[:, 5] = noname.iloc[:, 6]
noname.iloc[:, 6] = noname.iloc[:, 7]
noname.iloc[:, 7] = noname.iloc[:, 8]

# Merge the adjusted dataframe "noname" back to original dataframe

df = pd.concat([df[df.owner.notnull()], noname], ignore_index=True)

# Drop last column, since they are already copied over to the previous column index
df.drop(columns = noname.columns[8], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noname.iloc[:, 0] = noname.iloc[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noname.iloc[:, 1] = noname.iloc[:, 2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noname.iloc[:, 2] = noname.iloc[:, 3]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [868]:
# Strip all \n from cell values

df = df.replace(r'\n','', regex=True) 

In [869]:
# Replace '“', '\"' values with proper cell values.

for column in df.columns:
    missing_index = df[(df[column] == '“') | (df[column] == '\"')].index.values.astype(int)
    for index in missing_index:
        df[column][index] = df[column][index-1]

df = df[~df.isnull().all(axis=1)]
df.reset_index(drop=True, inplace=True)

In [870]:
# Some "owner" cells include company names. 
# If owner cell has two words and the company cell is empty,
# transfer the second word from the owner cell to the company cell.

company_in_owner_col_filt = (df['owner'].str.split().str.len() > 1) & df['company'].isna()

companies_and_owners = df.loc[company_in_owner_col_filt, 'owner'].str.split()

owners = companies_and_owners.apply(lambda l: l[0])
companies = companies_and_owners.apply(lambda l: l[1])

df.loc[company_in_owner_col_filt, 'owner'] = owners
df.loc[company_in_owner_col_filt, 'company'] = companies

In [871]:
# Some "address" cells include company names. 

# If the company cell is empty and address cell is not, transfer the company name from the address cell to the company cell.

company_in_address_col_filt = df['approval_date'].notnull() & df['address'].notnull() & df['owner'].notnull() & df['company'].isna()

companies_and_adddress = df.loc[company_in_address_col_filt, 'address'].str.split()

companies = companies_and_adddress.apply(lambda l: l[0])
addresses = companies_and_adddress.apply(lambda l: l[1:])

df.loc[company_in_address_col_filt, 'company'] = companies
df.loc[company_in_address_col_filt, 'address'] = addresses



In [872]:
df

Unnamed: 0,owner,company,address,type,capacity_MW,approval_date,prep_time,note
0,이건우,강원풍력발전(주),강원도 평창군 도암면횡계리,풍력,"98,000kw",`02.6.11,2005.10.31,
1,후란쓰이스링거,코리아카본블랙(주)여수발전소,전남 여수시 월내동 350,부생가스,"13,900×22,500×1기",‘02.6,2002.8.31,
2,후란쓰이스링거,코리아카본블랙(주)부평발전소,인천시 부평구 갈산1동94,부생가스,"12,500Kw",‘02.6,2002.8.31,
3,이호인,(주)상원이엔씨매립가스,인천시 서구 백석동 58,내연기관(매립가스),"13,500×5기1,000×3기380×1기",‘02.9,2002.12.31,
4,김영철,한국중부발전(주)보령발전소,충남 보령시 오천면오포리 산 212,석탄,총580만KW,‘02.9,7호기‘08.6.308호기‘08.12.31,송전용건설시한전및 거래와 협의
...,...,...,...,...,...,...,...,...
3041,김선웅,창원클린에너지,경남 창원시 성산구 응남동 46-1,연료전지(LNG),105.6,2022-10-31,2023-10-31,양수인가
3042,최재서,양양풍력발전,"강원도 양양군 현남면 상월천리 산1-1,현북면 어성전리 산2번지 일원",풍력,4.2,2022-10-31,2023-12-31,사업준비기간연장
3043,최재서,양양풍력발전,"강원도 양양군 현남면 하월천리 산1-1번지, 상월천리 산1-1번지, 현...",풍력,40,2022-10-31,2023-12-31,사업준비기간연장
3044,김근안,태백귀네미풍력발전,강원도 태백시 하사미동 산220-3 일원,풍력,19.8,2022-11-01,2020-06-30,상업운전 용량 일치화


In [873]:
# Combine multiple rows of same project.

df[df.owner.isnull()]

Unnamed: 0,owner,company,address,type,capacity_MW,approval_date,prep_time,note
1354,,,료),,,,,
1355,,(대구혁신도시 내),(LNG),,,,변경,
1356,,"188-11, 1-134",,,,,,
1357,,산49-1,,3,,,,
1358,,,S/T,,,,"강릉화력1,2호기→강릉안인화력1,2호기용 량 변 경 : 2,120 → ...",
...,...,...,...,...,...,...,...,...
3001,,,도:34°50′51.25″/경도:125°56′29.84″④위도:34°49′22.18...,,,,,
3007,,,조리 산159번지 일원,,,,,
3011,,,"40""E34°10'37.45""N127°42'55.59""E34°10'33.36""N12...",,,,,
3032,,,"리 산78-1, 산청읍 범학리 산31, 척지리 산134",,,,,


### Scratchwork

In [874]:
%%time

# Combine rows that are divided into 2 due to page breaks.

# for num in range(0, df.shape[0]):
#     df = df.fillna('')
#     for column in df.columns:
#         missing_index = df[df[column].isnull()].index.values.astype(int)
#         for column in df.columns:
#             for index in missing_index:
#                 df[column][index-1] = str(df[column][index-1]) + str(df[column][index])
#                 df[column][index] = ""
#         df = df.replace("", np.nan)
#         df = df[~df.isnull().all(axis=1)]

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.72 µs


In [875]:
# Remove all rows that are newly empty after being combined with the previous rows.

df = df.replace("", np.nan)
df = df[~df.isnull().all(axis=1)]

In [876]:
# Replace all capacity(MW) columns into float

In [877]:
# Blank values in note column means the project is newly approved.
# Add "newly approved" to all blank cells in "note" column.

# df.note = df.note.replace(np.nan, "newly approved")