In [14]:
import pandas as pd
import datetime
from datetime import date
import matplotlib.pyplot as plt

In [15]:
# Study data files
fire_data_path = 'Resources/California_Fire_Incidents_original.csv'
ca_county_path = 'Resources/ca_county_list.csv'

# Read in csv file
fire_df = pd.read_csv(fire_data_path)
ca_county = pd.read_csv(ca_county_path)

# Only 'NAME' column 
ca_county_df = ca_county.loc[:, 'NAME']

# Combine the data into a single dataset
merged_fire_df = pd.merge(ca_county_df, fire_df, how='left', left_on = 'NAME', right_on = 'Counties')
merged_fire_df.head()

Unnamed: 0,NAME,AcresBurned,Active,AdminUnit,AirTankers,ArchiveYear,CalFireIncident,CanonicalUrl,ConditionStatement,ControlStatement,...,SearchKeywords,Started,Status,StructuresDamaged,StructuresDestroyed,StructuresEvacuated,StructuresThreatened,UniqueId,Updated,WaterTenders
0,Imperial,,,,,,,,,,...,,,,,,,,,,
1,San Diego,7055.0,False,CAL FIRE San Diego Unit / Cleveland National F...,,2013.0,True,/incidents/2013/7/6/chariot-fire/,,,...,Chariot Fire; July 2013; Sunrise Hwy; Julian; ...,2013-07-06T12:55:00Z,Finalized,9.0,149.0,,,ee19b2ec-a96a-4738-994e-fb3ea016e053,2013-07-15T06:15:00Z,24.0
2,San Diego,2781.0,False,CAL FIRE San Diego Unit,25.0,2013.0,True,/incidents/2013/5/23/san-felipe-fire/,,,...,San Felipe Fire; May 2013; Highway 78; Julian;...,2013-05-23T12:20:00Z,Finalized,,,,,859479e3-918c-42c6-bb74-5fdc17930a16,2013-05-26T17:45:00Z,26.0
3,San Diego,2236.0,False,Camp Pendleton Marine Corps Base,,2013.0,False,/incidents/2013/10/5/deluz-fire/,,,...,Camp Pendleton Marine Corps Base; Fallbrook; S...,2013-10-05T12:45:00Z,Finalized,,,,,9fb847ce-44c7-4a11-a733-a6cc82838671,2013-10-09T19:00:00Z,
4,San Diego,1271.0,False,CAL FIRE San Diego Unit,27.0,2013.0,True,/incidents/2013/5/26/general-fire/,The fire is burning East towards the desert an...,,...,General Fire; May 2013; San Diego County; Bann...,2013-05-26T12:04:00Z,Finalized,,,,,07a5397c-a665-4f84-9a82-5f689cb2c8f3,2013-05-31T06:15:00Z,27.0


In [16]:
# Clean data
original_ca_fire_df = merged_fire_df.drop(columns=[
    "Active", 
    "CanonicalUrl", 
    "ConditionStatement", 
    "FuelType", 
    "Location", 
    "ControlStatement", 
    "CalFireIncident", 
    "AdminUnit", 
    "PercentContained", 
    "Status", 
    "SearchDescription", 
    "SearchKeywords", 
    "Public", 
    "Updated"], inplace=False)


# Converting dtypes
original_ca_fire_df['ArchiveYear'] = original_ca_fire_df['ArchiveYear'].map('{:.0f}'.format)

# Rename columns
mapping = {original_ca_fire_df.columns[0]:'Counties', original_ca_fire_df.columns[4]: 'CountiesName'}
original_ca_fire_df = original_ca_fire_df.rename(columns=mapping) 

original_ca_fire_df.head()

Unnamed: 0,Counties,AcresBurned,AirTankers,ArchiveYear,CountiesName,CountyIds,CrewsInvolved,Dozers,Engines,Extinguished,...,MajorIncident,Name,PersonnelInvolved,Started,StructuresDamaged,StructuresDestroyed,StructuresEvacuated,StructuresThreatened,UniqueId,WaterTenders
0,Imperial,,,,,,,,,,...,,,,,,,,,,
1,San Diego,7055.0,,2013.0,San Diego,37.0,56.0,24.0,183.0,2013-07-15T06:15:00Z,...,True,Chariot Fire,2147.0,2013-07-06T12:55:00Z,9.0,149.0,,,ee19b2ec-a96a-4738-994e-fb3ea016e053,24.0
2,San Diego,2781.0,25.0,2013.0,San Diego,37.0,33.0,25.0,73.0,2013-05-26T17:45:00Z,...,True,San Felipe Fire,911.0,2013-05-23T12:20:00Z,,,,,859479e3-918c-42c6-bb74-5fdc17930a16,26.0
3,San Diego,2236.0,,2013.0,San Diego,37.0,,,,2013-10-09T19:00:00Z,...,False,DeLuz Fire,,2013-10-05T12:45:00Z,,,,,9fb847ce-44c7-4a11-a733-a6cc82838671,
4,San Diego,1271.0,27.0,2013.0,San Diego,37.0,54.0,27.0,42.0,2013-05-31T06:15:00Z,...,True,General Fire,1217.0,2013-05-26T12:04:00Z,,,,,07a5397c-a665-4f84-9a82-5f689cb2c8f3,27.0


In [17]:
# Get start and end date and month data in each row
start = list(original_ca_fire_df['Started'])
end = list(original_ca_fire_df['Extinguished'])
start_date = []
end_date = []
new_format = "%Y-%m-%d"

for sd in start:
    try:
        s_date = datetime.datetime.strptime(sd[0:10], new_format).strftime(new_format)
        start_date.append(s_date)
    except:
        start_date.append(None)
        
for ed in end:
    try:
        e_date = datetime.datetime.strptime(ed[0:10], new_format).strftime(new_format)
        end_date.append(e_date)
    except:
        end_date.append(None)

# No. of day that fire incident happens
days = []

for d in range(len(end_date)):
    try:
        e_date = end_date[d]
        date_e = date(int(e_date[0:4]), int(e_date[5:7]), int(e_date[8:10]))
        s_date = start_date[d]
        date_s = date(int(s_date[0:4]), int(s_date[5:7]), int(s_date[8:10]))
        day = date_e - date_s
        days.append(day.days)
    except:
        days.append(None)

# Started month
months = []

for m in range(len(start_date)):
    try:
        s_month = start_date[m]
        month = int(s_month[5:7])
        months.append(str(month))
    except:
        months.append(None)
        
# Add to dataframe
original_ca_fire_df['Started Month'] = months
original_ca_fire_df['Days Burned'] = days

original_ca_fire_df.head()

Unnamed: 0,Counties,AcresBurned,AirTankers,ArchiveYear,CountiesName,CountyIds,CrewsInvolved,Dozers,Engines,Extinguished,...,PersonnelInvolved,Started,StructuresDamaged,StructuresDestroyed,StructuresEvacuated,StructuresThreatened,UniqueId,WaterTenders,Started Month,Days Burned
0,Imperial,,,,,,,,,,...,,,,,,,,,,
1,San Diego,7055.0,,2013.0,San Diego,37.0,56.0,24.0,183.0,2013-07-15T06:15:00Z,...,2147.0,2013-07-06T12:55:00Z,9.0,149.0,,,ee19b2ec-a96a-4738-994e-fb3ea016e053,24.0,7.0,9.0
2,San Diego,2781.0,25.0,2013.0,San Diego,37.0,33.0,25.0,73.0,2013-05-26T17:45:00Z,...,911.0,2013-05-23T12:20:00Z,,,,,859479e3-918c-42c6-bb74-5fdc17930a16,26.0,5.0,3.0
3,San Diego,2236.0,,2013.0,San Diego,37.0,,,,2013-10-09T19:00:00Z,...,,2013-10-05T12:45:00Z,,,,,9fb847ce-44c7-4a11-a733-a6cc82838671,,10.0,4.0
4,San Diego,1271.0,27.0,2013.0,San Diego,37.0,54.0,27.0,42.0,2013-05-31T06:15:00Z,...,1217.0,2013-05-26T12:04:00Z,,,,,07a5397c-a665-4f84-9a82-5f689cb2c8f3,27.0,5.0,5.0


In [18]:
column = original_ca_fire_df.columns

df_column = pd.DataFrame(column)
df_column

Unnamed: 0,0
0,Counties
1,AcresBurned
2,AirTankers
3,ArchiveYear
4,CountiesName
5,CountyIds
6,CrewsInvolved
7,Dozers
8,Engines
9,Extinguished


In [31]:
# set up additional columns to hold information
# original_ca_fire_df = original_ca_fire_df.insert(2, 'AvgAcresBurned', '')
# original_ca_fire_df(4 ,'AvgAirTankers', '')
# original_ca_fire_df(9, 'AvgCrewsInvolved', '')
# original_ca_fire_df['AvgDozers'] = ''
# original_ca_fire_df['AvgEngines'] = ''
# original_ca_fire_df['AvgFatalities'] = ''
# original_ca_fire_df['AvgHelicopters'] = ''
# original_ca_fire_df['AvgInjuries'] = ''
# original_ca_fire_df['AvgPersonnelInvolved']
# original_ca_fire_df['AvgStructuresDamaged']
# original_ca_fire_df['AvgStructuresDestroyed']
# original_ca_fire_df['AvgStructuresEvacuated']
# original_ca_fire_df['AvgStructuresThreatened']
# original_ca_fire_df['WaterTenders']

unique_df = original_ca_fire_df.loc[original_ca_fire_df['UniqueId']!=None]
id_count = unique_df['UniqueId'].value_counts().to_frame().reset_index()
uid_count = id_count.loc[id_count['UniqueId'] > 1]
unique_id = uid_count['index']
index_list = []
df_index_list = original_ca_fire_df.index
avg_acres = []

for i in unique_id:
    index_df = original_ca_fire_df.loc[original_ca_fire_df['UniqueId']==i]
    index = index_df.index
    for n in index:
        index_list.append(n)
        for x in index_list:
            try:
                a_acres =  original_ca_fire_df.loc[x, ['AcresBurned']].div(len(index))
                avg_acres.append(a_acres)
            except:
                avg_acres.append(None)
            
            
            
            
#             for y in df_index_list:
#                 if x==y:
#                 a_acres =  original_ca_fire_df.loc[x, ['AcresBurned']].div(len(index))
#                 avg_acres.append(a_acres)    
#             elif x!=y:
#                 a_acres =  original_ca_fire_df.loc[y, ['AcresBurned']]
#                 avg_acres.append()                                                       
#         ca_fire_df.loc[x, ['AcresBurned', 
#                            'Dozers',
#                            'Engines', 
#                            'PersonnelInvolved', 
#                            'WaterTenders']] = ca_fire_df.loc[x, ['AcresBurned',
#                                                                  'Dozers',
#                                                                  'Engines', 
#                                                                  'PersonnelInvolved', 
#                                                                  'WaterTenders']].div(len(index))

avg_acres

[AcresBurned    7
 Name: 1210, dtype: object,
 AcresBurned    7
 Name: 1210, dtype: object,
 AcresBurned    7
 Name: 1301, dtype: object,
 AcresBurned    7
 Name: 1210, dtype: object,
 AcresBurned    7
 Name: 1301, dtype: object,
 AcresBurned    7
 Name: 1437, dtype: object,
 AcresBurned    7
 Name: 1210, dtype: object,
 AcresBurned    7
 Name: 1301, dtype: object,
 AcresBurned    7
 Name: 1437, dtype: object,
 AcresBurned    7
 Name: 1483, dtype: object,
 AcresBurned    7
 Name: 1210, dtype: object,
 AcresBurned    7
 Name: 1301, dtype: object,
 AcresBurned    7
 Name: 1437, dtype: object,
 AcresBurned    7
 Name: 1483, dtype: object,
 AcresBurned    102551
 Name: 1303, dtype: object,
 AcresBurned    7
 Name: 1210, dtype: object,
 AcresBurned    7
 Name: 1301, dtype: object,
 AcresBurned    7
 Name: 1437, dtype: object,
 AcresBurned    7
 Name: 1483, dtype: object,
 AcresBurned    102551
 Name: 1303, dtype: object,
 AcresBurned    102551
 Name: 1310, dtype: object,
 AcresBurned    7
 

In [73]:
index

Int64Index([348, 451], dtype='int64')

In [71]:
original_ca_fire_df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1624, 1625, 1626, 1627, 1628, 1629, 1630, 1631, 1632, 1633],
           dtype='int64', length=1634)

In [None]:
id_count = ca_fire_df['UniqueId'].value_counts().to_frame().reset_index()
uid_count = id_count.loc[id_count['UniqueId'] > 1]
df_index = []

for i in uid_count['index']:
    index = ca_fire_df[ca_fire_df['UniqueId']==i]
    df_index.append(index)
    for x in index:
#         if x == 
        ca_fire_df.loc[x, ['AcresBurned', 
                           'Dozers',
                           'Engines', 
                           'PersonnelInvolved', 
                           'WaterTenders']] = ca_fire_df.loc[x, ['AcresBurned',
                                                                 'Dozers',
                                                                 'Engines', 
                                                                 'PersonnelInvolved', 
                                                                 'WaterTenders']].div(len(index))

df_index

In [None]:
# Store clean data in csv file
original_ca_fire_df.to_csv('output_data/california_fire_cleaned.csv', index=False, header=True)