In [212]:
import pandas as pd
import camelot
import time
import numpy as np
import re
import glob 
import re

In [213]:
# Read in original pdf file using camelot and transform it into dataframes.

# tables = camelot.read_pdf("raw.pdf", flavor='lattice', pages='1-end')

In [214]:
# Save parsed dataframes into csv.

# for i, table in enumerate(tables):
#     table.df.to_csv(f"data/{i}.csv", index=False, header=False)

# Read in saved CSV files

In [215]:
# Read in all csv files and save as list of dataframes.

filenames = glob.glob("data/*.csv")
onlynumbers = [filename[5:-4] for filename in filenames]
numbers = sorted([int(num) for num in onlynumbers])

dataframes = []
for num in numbers:
    dataframes.append(pd.read_csv(f"data/{num}.csv"))

# Remove the first three tables, which are not real data, but an example tables from the document.
dataframes = dataframes[3:]

default_columns = dataframes[0].columns.str.replace(r'\n','', regex=True)
default_columns = [col for col in default_columns if col != 'NO']

# Strip all \n from cell values
# Handle Error Cases
# Standardize Column format
for idx, dataframe in enumerate(dataframes):
    dataframe.replace(r'\n','', regex=True, inplace=True)
    dataframe.columns = dataframe.columns.str.replace(r'\n','', regex=True)
    
    dataframe.rename(columns={
        "*허가일*변경일" : "허가및변경일"
    }, inplace=True)
    
    dataframe.rename(columns={
        "NO 대표자" : "대표자"
    }, inplace=True)
    
    try:
        dataframes[idx] = dataframe[default_columns]
    except Exception as e:
        error_msg = e
        
        if str(error_msg) == "\"['용량(MW)', '허가및변경일'] not in index\"":
            dataframe.rename(columns={
                "Unnamed: 6" : "허가및변경일",
                "용량허가및변경(MW)일" : "용량(MW)"
            }, inplace=True)
            dataframes[idx] = dataframe[default_columns]
        else:
            print(f"Problem with Columns in Dataframe {idx} : {error_msg}")

In [216]:
# Concatenate all tables into a single master df

df = pd.concat(dataframes, ignore_index=True)

# Create column names

df.columns = ['owner', 'company', 'address', 'type', 'capacity_MW', 'approval_date', 'prep_time', 'note']

# Remove all rows with no cell values

df = df[~df.isnull().all(axis=1)]

In [217]:
quote_missing = df.approval_date.isna() & df.prep_time.notnull()
df.loc[quote_missing, "approval_date"] = df.loc[quote_missing, "approval_date"].fillna("“")

# Replace '“', '\"' values with proper cell values.

for column in df.columns:
    missing_index = df[(df[column] == '“') | (df[column] == '\"') | (df[column] == '‘“')].index.values.astype(int)
    for index in missing_index:
        df[column][index] = df[column][index-1]

df = df[~df.isnull().all(axis=1)]
df.reset_index(drop=True, inplace=True)

In [218]:
# Some "owner" cells include company names. 
# If owner cell has two words and the company cell is empty,
# transfer the second word from the owner cell to the company cell.

company_in_owner_col_filt = (df['owner'].str.split().str.len() > 1) & df['company'].isna()

companies_and_owners = df.loc[company_in_owner_col_filt, 'owner'].str.split()

owners = companies_and_owners.apply(lambda l: l[0])
companies = companies_and_owners.apply(lambda l: l[1])

df.loc[company_in_owner_col_filt, 'owner'] = owners
df.loc[company_in_owner_col_filt, 'company'] = companies


# Some "address" cells include company names. 

# If the company cell is empty and address cell is not, transfer the company name from the address cell to the company cell.

company_in_address_col_filt = df['approval_date'].notnull() & df['address'].notnull() & df['owner'].notnull() & df['company'].isna()

companies_and_adddress = df.loc[company_in_address_col_filt, 'address'].str.split()

companies = companies_and_adddress.apply(lambda l: l[0])
addresses = companies_and_adddress.apply(lambda l: " ".join(l[1:]))

df.loc[company_in_address_col_filt, 'company'] = companies
df.loc[company_in_address_col_filt, 'address'] = addresses



# Combine overflowing rows into a single row.

In [219]:
def concat(v1, v2):
    result = ""
    if v1 is not np.nan:
        result += str(v1)
    if result:
        result += " "
    if v2 is not np.nan:
        result += str(v2)
    if not result:
        result = np.nan
    return result

In [220]:
def roll_up(df):
    for idx in reversed(df[df["approval_date"].isna()].index):
        df.loc[idx-1] = df.loc[idx-1].combine(df.loc[idx], concat)
        df.drop(index=idx, inplace=True)
    df.reset_index(drop=True, inplace=True)

In [221]:
roll_up(df)

In [222]:
df[df.address.str.contains("포지리  1249", na=False)]

Unnamed: 0,owner,company,address,type,capacity_MW,approval_date,prep_time,note
1843,김철웅,이원신재생에너지복지마을,"충남 태안군 이원면 포지리 1249, 관리 1570",태양광,30.0,2020-06-30,2023-03-31,
2077,김병숙,한국서부발전,충남 태안군 이원명 포지리 1249번지 일원,태양광,45.0,2021-01-27,2023-01-31,사업준비기간연장
2653,신경철,이원신재생에너지복지마을,"충남 태안군 이원면 포지리 1249, 관리 1570",태양광,30.0,2022-06-28,2023-03-31 대표자 변경,


# Replace NaN in note column to "신규" (newly approved)

### Check if note section is in "prep_time" column and move them into "note" column.

In [223]:
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

df.prep_time.fillna("", inplace=True)


note_in_prep = ~df.prep_time.str.contains(r"^(\d|-|\.|\s|‘|호기|년이?내?|개?월|'|""|\(|\)|[A-Z])+$|_", na=False) & df.note.isna()
df_note_in_prep = df[note_in_prep]
df_note_in_prep.drop([12], axis=0, inplace=True)

df_note_in_prep

  note_in_prep = ~df.prep_time.str.contains(r"^(\d|-|\.|\s|‘|호기|년이?내?|개?월|'|""|\(|\)|[A-Z])+$|_", na=False) & df.note.isna()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_note_in_prep.drop([12], axis=0, inplace=True)


Unnamed: 0,owner,company,address,type,capacity_MW,approval_date,prep_time,note
199,김홍권,수완에너지(주),광주광역시 광산구수완로 130,열병합(LNG),109,‘12.12.5,2013.3.1발전사업전환,
470,미쯔오카히데노리,고원풍력발전,강원 태백시 창죽동 산 63번지 일원,풍력,20.0,2015-01-14,2015-05-31 대표자변경,
471,윤병석,당진에코파워,충남 당진시 석문면 교로3리 1026번지 외,석 탄 화 력(S/T),1160.0,2015-01-14,2016-12-31 상호및대표자변경,
479,조성철,내포그린에너지,"충남 홍성군 홍북면,예산군 삽교읍 일원",심의이유는 srf로 통일되어서 임,97.0,2015-03-03,2020-12-31 사업준비기간 연장 등,
489,정의섭,전주파워,전북 전주시 덕진구 팔복로 59,바이오매스(목질계),32.36,2015-03-13,2017-12-31 대표자변경,
505,김성회,한 국 지 역 난 방공사,전남 나주시 산포면 세남로,열병합,20.0,2015-04-03,2017-12-31 사업준비기간 연장,
506,기찬수,거 창 풍 력 발 전주식회사,경남 거창군 신원면 덕산리 산57번지 일원,풍력,14.0,2015-04-07,2017-03-31 대표자변경,
508,최종하,세현에너지,"충남 당진시 석문면 난지도리 183, 221-1, 223-2, 266-2일원",태양광,3.915,2015-04-22,2015-12-31 사업준비기간연장,
509,오재창,세종그린파워,세종시 장군면 봉안리 160-13,바 이 오 매스 소각열,5.0,2015-04-28,2016-03-31 양수인가,
510,구자숭,엘 아 이 케 이 파워,강원 강릉시 옥계면 산계리 8번지,"기 력 발 전(화석연료, 폐 기 물 에 너지, 바이오매스 등)",7.0 nan,2015-04-28,2015-12-31 용량변경 9.1-->7,


In [224]:
def retrieve_note(prep):
    note = re.findall(r"^\d{4}\.?-?\d{1,2}\.?-?\d{1,2}(.*)", prep)[0].strip()
    return note

df_note_in_prep.note = df_note_in_prep.prep_time.apply(lambda x: retrieve_note(x))

df.loc[df_note_in_prep.index, "note"] = df_note_in_prep.note

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_note_in_prep.note = df_note_in_prep.prep_time.apply(lambda x: retrieve_note(x))


In [225]:
df.note.fillna('신규', inplace=True)
df.note = df.note.str.replace(" ", "") \
    .str.replace("nan", "신규") \
    .str.replace("nannan", "신규") \
    .str.replace("신규신규", "신규")


# Check non-Nan values in note to see if they are also onewly approved.

not_new = ['연장', '변경', '취득', '용량', '용량변경', '정정', '재발급', '인가', '양수', '추가', '합병', '증설', '양도', '개시', '반납', '취소', '축소', '준비기간', '조정', '법인', 'SRF', "→", '지번', '재교부', '전환', '수정', '발전', '종류', '공급', '폐기물', '배방읍', '구분']
# new = ['신규', '이내', '허가', '일부', '동의']

df.loc[~df.note.str.contains('|'.join(not_new)), "note"] = "신규"


In [226]:
pd.options.display.max_colwidth=20

df[~df.note.str.contains('|'.join(not_new))].note.unique()

array(['신규'], dtype=object)

# Convert "approval_date" from df_new to datetime

In [227]:
# Only select plants that are newly approved

df_new = df[df.note=='신규']

In [228]:
# Clean up approval_date

df_new.approval_date = df_new.approval_date.str.replace("`", "20", regex=False).str.replace("‘", "20", regex=False).str.replace("'", "20", regex=False).str.replace(".", "-", regex=False)
df_new.approval_date = df_new.approval_date \
                        .str.replace(r"^12-5$", "2011-12-05", regex=True) \
                        .str.replace(r"^3-5$", "2012-03-05", regex=True) \
                        .str.replace(r"^3-30$", "2012-03-30", regex=True) \
                        .str.replace(r"^4-26$", "2012-04-26", regex=True) \
                        .str.replace(r"^12-6-4$", "2012-06-04", regex=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.approval_date = df_new.approval_date.str.replace("`", "20", regex=False).str.replace("‘", "20", regex=False).str.replace("'", "20", regex=False).str.replace(".", "-", regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.approval_date = df_new.approval_date \


In [229]:
df_new['year'] = df_new.approval_date.str[:4]

month_regex = r"\d{4}-(\d{1,2})"

def string_to_month(string):
    month = re.findall(month_regex, string)[0]
    return month

df_new['month'] = df_new.approval_date.apply(lambda x: string_to_month(x))
df_new['month'] = df_new['month'].str.zfill(2)

df_new['cleaned_approval_time'] = df_new['year'] + "-" + df_new['month']
df_new['cleaned_approval_time'] = pd.to_datetime(df_new['cleaned_approval_time'], format="%Y-%m")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['year'] = df_new.approval_date.str[:4]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['month'] = df_new.approval_date.apply(lambda x: string_to_month(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['month'] = df_new['month'].str.zfill(2)
A value is trying to be set on a copy

# Clean up "용량(MW)" (capacity) column into a consistent format

### 1. If the cell is in KW, change the number and unit to MW.

In [230]:
df_in_kw = df_new[df_new['capacity_MW'].str.lower().str.contains("kw", na=False)]

# Identify which cells have multiple plants, so I can multiply the capacity by two later on.
has_multiple_plants = df_in_kw['capacity_MW'].str.contains("기")

# df.capacity_MW["kw" in df.capacity_MW.str.lower()]
has_10k = df_in_kw['capacity_MW'].str.contains("만")
has_1k = df_in_kw['capacity_MW'].str.contains("천")

df_in_kw.loc[has_10k, 'cleaned_capacity'] = df_in_kw['capacity_MW'].str.replace("만", "0000")
df_in_kw.loc[has_10k & has_1k, 'cleaned_capacity'] = df_in_kw['capacity_MW'].str.replace("만", "").str.replace("천", "000")

# Fill in empty 'cleaned_capacity' column with 'capacity_MW' values.
df_in_kw.cleaned_capacity = df_in_kw.cleaned_capacity.fillna(df_in_kw.capacity_MW)

# Convert capacity value into int and divide capacity by 1,000
df_in_kw.cleaned_capacity = df_in_kw.cleaned_capacity \
                                .str.lower() \
                                .str.replace("총", "") \
                                .str.replace("kw", "") \
                                .str.replace(",", "") \
                                .str.replace("7-8호기", "") \
                                .str.replace("2기", "")

df_in_kw.cleaned_capacity = df_in_kw.cleaned_capacity.astype(float)
df_in_kw.cleaned_capacity = df_in_kw.cleaned_capacity / 1000
df_in_kw[has_multiple_plants].cleaned_capacity = df_in_kw[has_multiple_plants].cleaned_capacity * 2


df_in_kw['capacity_MW'] = df_in_kw['cleaned_capacity']
df_in_kw.drop(columns=['cleaned_capacity'], inplace=True)

df_new.loc[df_in_kw.index, 'capacity_MW'] = df_in_kw['capacity_MW']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_in_kw.loc[has_10k, 'cleaned_capacity'] = df_in_kw['capacity_MW'].str.replace("만", "0000")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_in_kw.cleaned_capacity = df_in_kw.cleaned_capacity.fillna(df_in_kw.capacity_MW)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_in_kw.cleaned_capacity = d

### 2. Check if there are other problems with the capacity_MW columns

In [231]:
# Remove all commas and "MW" from values.
df_new.capacity_MW = df_new.capacity_MW.astype(str)\
                        .str.lower() \
                        .str.replace(",", "") \
                        .str.replace("mw", "") \
                        .str.replace("nan", "") \
                        .str.replace("!", "1") \
                        .str.replace(r"×?1기", "")

# Remove additional information from capacity_MW cells.
has_additional_info = df_new.capacity_MW.str.contains("\(") & df_new.capacity_MW.str.contains("\)")
df_new["capacity_MW"] = df_new["capacity_MW"].str.replace(r"[(].*[)]", "", regex=True)

# Additional manual cleaning for misread values by cross-referncig them with original pdf file.
df_new.capacity_MW = df_new.capacity_MW.str.replace("13900×22500", "36400") \
                                        .str.replace("834.3\(gt2기 st", "834.3") \
                                        .str.replace("13500×5기1000×3기380", "70880") \
                                        .str.replace("여수250군장122.9", "372.9") \
                                        .str.replace("126313.67.3", "1283.9") \
                                        .str.replace("555.17.2", "762.3") \
                                        .str.replace("19.99 3.0", "19.993") \
                                        .str.replace(" ", "") \
                                        .str.strip() \
                                        .str.replace("82219.2", "841.2") \

# There are three entries with empty capacity, where its 9.9 capacity is placed within the energy type. Fix these entires.
df_new.loc[df_new.capacity_MW == "", 'capacity_MW'] = df_new[df_new.capacity_MW == ""]["capacity_MW"].str.replace("", "9.9")


# Some capacity numbers are unreasonbly high, and may actually be in kw. Identify these entries.
df_new.capacity_MW = df_new.capacity_MW.astype(float)
df_new[df_new.capacity_MW>1000]
df_new.loc[df_new.capacity_MW >= 35000, 'capacity_MW'] = df_new[df_new.capacity_MW >= 35000]["capacity_MW"]/1000



  .str.replace(r"×?1기", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.capacity_MW = df_new.capacity_MW.astype(str)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new["capacity_MW"] = df_new["capacity_MW"].str.replace(r"[(].*[)]", "", regex=True)
  .str.replace("834.3\(gt2기 st", "834.3") \
  .str.replace("여수250군장122.9", "372.9") \
  .str.replace("126313.67.3", "1283.9") \
  .str.replace("555.17.2", "762.3") \
  .str.replace("19.99 3.0", "19.993") \
  .str.replace("82219.2", "841.2") \
A value is trying to be set on a copy of a slice from a Dat

# Clean "Type" from df_new

In [232]:
df_new.type = df_new.type.str.replace(" ", "")

# Manual cleaning misread values by cross-referencing with original pdf.
df_new.company = df_new.company.str.replace("영암태 전", "영암태양광발전")
df_new.address = df_new.address.str.replace("양광발 전", "전")
df_new.loc[956, "type"] = "태양광"
df_new.loc[1667, "type"] = "태양광"
df_new.loc[2572, "type"] = "풍력"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.type = df_new.type.str.replace(" ", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.company = df_new.company.str.replace("영암태 전", "영암태양광발전")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.address = df_new.address.str.replace("양광발 전", "전")


### Simplify type into "cleaned_type" column.

In [233]:
df_new['cleaned_type'] = np.nan

# df_new.loc[df_new.type.str.contains("열병합", na=False), 'cleaned_type'] = "coal"
# df_new.loc[df_new.type.str.contains("복합", na=False), 'cleaned_type'] = "coal"
df_new.loc[df_new.type.str.contains("가스", na=False), 'cleaned_type'] = "gas"
df_new.loc[df_new.type.str.contains("폐열", na=False), 'cleaned_type'] = "coal"
df_new.loc[df_new.type.str.contains("바이오", na=False), 'cleaned_type'] = "bioenergy"
df_new.loc[df_new.type.str.contains("목질계", na=False), 'cleaned_type'] = "bioenergy"
df_new.loc[df_new.type.str.contains("우드칩", na=False), 'cleaned_type'] = "bioenergy"
df_new.loc[df_new.type.str.contains("원자력", na=False), 'cleaned_type'] = "nuclear"
df_new.loc[df_new.type.str.contains("수력", na=False), 'cleaned_type'] = "hydro"
df_new.loc[df_new.type.str.contains("양수", na=False), 'cleaned_type'] = "hydro"
df_new.loc[df_new.type.str.contains("풍력", na=False), 'cleaned_type'] = "wind"
df_new.loc[df_new.type.str.contains("태양광", na=False), 'cleaned_type'] = "solar"
df_new.loc[df_new.type.str.contains("석탄", na=False), 'cleaned_type'] = "coal"
df_new.loc[df_new.type.str.contains("유연탄", na=False), 'cleaned_type'] = "coal"
df_new.loc[df_new.type.str.lower().str.contains("ng", na=False), 'cleaned_type'] = "gas"
# df_new.loc[df_new.type.str.lower().str.contains("srf", na=False), 'cleaned_type'] = "srf"
# df_new.loc[df_new.type.str.lower().str.contains("생활", na=False), 'cleaned_type'] = "srf"
# df_new.loc[df_new.type.str.lower().str.contains("폐기물", na=False), 'cleaned_type'] = "srf"
# df_new.loc[df_new.type.str.lower().str.contains("페기물", na=False), 'cleaned_type'] = "srf"
# df_new.loc[df_new.type.str.lower().str.contains("고체연료", na=False), 'cleaned_type'] = "srf"
# df_new.loc[df_new.type.str.lower().str.contains("고형연료", na=False), 'cleaned_type'] = "srf"
# df_new.loc[df_new.type.str.lower().str.contains("연료전지", na=False), 'cleaned_type'] = "hydrogen fuel cells"
# df_new.loc[df_new.type.str.lower().str.contains("조력", na=False), 'cleaned_type'] = "ocean"
# df_new.loc[df_new.type.str.lower().str.contains("해양", na=False), 'cleaned_type'] = "ocean"
# df_new.loc[df_new.type.str.lower().str.contains("스팀", na=False), 'cleaned_type'] = "steam"
# df_new.loc[df_new.type.str.lower().str.contains("증기", na=False), 'cleaned_type'] = "steam"
df_new.loc[df_new.cleaned_type.isna(), 'cleaned_type'] = "others"




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['cleaned_type'] = np.nan


In [234]:
df_cleaned = df_new[['cleaned_type', 'capacity_MW', 'address', 'year', 'company', 'owner']]

# Clean up "address" column

### Check repeating addresses to make sure they are not redundancies.

Following addresses are checked with original pdf to prove they are separate plants:
* 강원도  영월군  영월읍  정양리  540,  541-1,  541-2, 566,  567,  568,  569
* 전남 나주시 영산동664-1 등지 (Multiple parks by same company in the same location (likely expansion)
* 인천시  남구  학익동  723번지
* 강원도  영월군  상동읍  천평리  249번지  일원
* 강원  정선군  고한읍  고한리 산216-1  등지
* 전남  광양시  광양읍  세풍리 2200
* 충남  보령시  성주면  개화리 산30 
* 울산시 북구 효암로 84-13 
* 경북 김천 어모면
* 경북  의성군  의성읍  철파리  531,  532-2,  533-2, 530-1,  530-2  번지 
* 경기 포천 창수면 
* 경북  안동시  임하면  신덕리  산93,  96,  97,  100-1, 100-2 
* 인천시  서구  백범로  934번길 23
* 부산 강서구 신호동
* 경남  함안군  군북면  장지리 1407
* 강원도  태백시  동점동  동 점산업단지  블록  3-3
* 경기도  양주시  남면  삼일로485번길  67-22
* 강원도  태백시  동점동  동점산업단지  블록  3-6(동점산업단지내) 
* 충남  서산시  대산읍  죽엽로 397일원(대죽리)
* 부산시 기장군 정관면 산단1로 83
* 전남 곡성군 오곡면 침곡리 산39, 송정리 산96 일원
* 충남 아산시 둔포면 염작리 92-10, 92-28
* 충남 당진군 석문면교로리 산 974	
* 인천시 서구 백석동 58  
* 경기도  남양주  왕숙  공공주택지구
* 경남 양산시 원동면대리 산 93-4
* 경남 양산시 원동면대리 산 93-4	

Following addresses are pending investigations.
* 충남  아산시  둔포면  염작리  92-28 -> I have reached out to KOREC to clarify the situation. The data has not be treated and all addresses remain in df_new.
* 제주시  구좌읍  덕천리  산82 (Are these three separate solar farms? Or just duplication by mistake?)
* 전북  익산시  석암로  3길 80(익산제2일반산업단지 내) (Are these three separate solar farms? Or just duplication by mistake?)
* 경기  양주시  남면  상수리  산112-11
* 경북  영덕군  영덕읍  매정리  1162번지  일원
* 광주시  북구  운정동  627번지외  5필지
* 전남  장흥군  장흥읍  삼산리  749-1번지  일원
* 경북 군위군 삼국유사면 화수리 산45-1, 산44

Except for the pending investigations, all addresses should be considered as unique from here below.

In [235]:
# Some company column includes addresses. Fix these rows.

no_address = df_cleaned.address.isna()

# df_cleaned[no_address].address = df_cleaned[no_address].company.apply(lambda x: company_to_address(x))
df_cleaned[no_address].company.str.split(" ")
# df_cleaned.loc[no_address, 'address'] = df_

companies_and_adddress = df_cleaned.loc[no_address, 'company'].str.split()

companies = companies_and_adddress.apply(lambda l: l[0])
addresses = companies_and_adddress.apply(lambda l: " ".join(l[1:]))

df_cleaned.loc[no_address, 'company'] = companies
df_cleaned.loc[no_address, 'address'] = addresses

### Identify coordinate location entries.

In [236]:
has_coord = df_cleaned.address.str.contains("°|˚")

address_and_coord = df_cleaned.loc[has_coord, 'address'].str.split('\(')
addresses = address_and_coord.apply(lambda x: x[0])
df_cleaned.loc[has_coord, 'address'] = addresses
df_cleaned.loc[1424, 'address'] = "전남 신안군 도초면 우이도리 남동측 해상"

In [260]:
pd.options.display.max_colwidth = None

# Clean other rows with coordinates in the cell.

df_cleaned[df_cleaned.address.str.contains(r"\d{8}")]
address_and_coord = df_cleaned.loc[df_cleaned.address.str.contains(r"\d{8}"), 'address'].str.split('\(')
addresses = address_and_coord.apply(lambda x: x[0])
df_cleaned.loc[df_cleaned.address.str.contains(r"\d{8}"), 'address'] = addresses

df_cleaned[df_cleaned.address.str.contains("N")]
address_and_coord = df_cleaned.loc[df_cleaned.address.str.contains("N"), 'address'].str.split('\(')
addresses = address_and_coord.apply(lambda x: x[0])
df_cleaned.loc[df_cleaned.address.str.contains("N"), 'address'] = addresses

has_coord = df_cleaned.address.str.contains("위도") & df_cleaned.address.str.contains("경도")
df_cleaned[has_coord]
address_and_coord = df_cleaned.loc[has_coord, 'address'].str.split('\(')
addresses = address_and_coord.apply(lambda x: x[0])
df_cleaned.loc[has_coord, 'address'] = addresses

df_cleaned.address = df_cleaned.address.str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.address = df_cleaned.address.str.strip()


In [267]:
df_cleaned.capacity_MW = df_cleaned.capacity_MW.round(1)
df_cleaned.to_csv("df_cleaned.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.capacity_MW = df_cleaned.capacity_MW.round(1)
