In [1]:
import pandas as pd
import numpy as np
from time import strptime
from datetime import datetime, timedelta
from time import mktime
import time

In [2]:
flights = pd.read_csv("~/Desktop/flight_training_2019_firstweek.csv")
flights.shape

(146485, 42)

In [3]:
WeatherEvents = pd.read_csv("~/Desktop/WeatherEvents_Jan2016-Dec2020.csv")

In [4]:
WeatherEvents.shape

(6274206, 13)

In [5]:
flights_test = pd.read_csv("~/Desktop/flight_testing.csv")

In [6]:
WeatherEvents.head()

Unnamed: 0,EventId,Type,Severity,StartTime(UTC),EndTime(UTC),TimeZone,AirportCode,LocationLat,LocationLng,City,County,State,ZipCode
0,W-1,Snow,Light,2016-01-06 23:14:00,2016-01-07 00:34:00,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
1,W-2,Snow,Light,2016-01-07 04:14:00,2016-01-07 04:54:00,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
2,W-3,Snow,Light,2016-01-07 05:54:00,2016-01-07 15:34:00,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
3,W-4,Snow,Light,2016-01-08 05:34:00,2016-01-08 05:54:00,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
4,W-5,Snow,Light,2016-01-08 13:54:00,2016-01-08 15:54:00,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0


In [7]:
flights.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name'],
      dtype='object')

In [8]:
mask = list(flights_test.columns) + ["arr_delay"]

In [9]:
flights_df = flights[mask] #only getting columns we will have in the test data + arr_delay
flights_df.shape

(146485, 21)

In [10]:
WeatherEvents["AirportCode"].value_counts()

K3TH    11142
KMLP    10883
KHYW     9268
K0CO     8242
KSMP     7996
        ...  
KCZZ       43
K4HV       27
KCQV       25
KSPL        8
KI39        1
Name: AirportCode, Length: 2071, dtype: int64

In [11]:
flights_df.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance',
       'arr_delay'],
      dtype='object')

In [12]:
WeatherEvents["Type"].value_counts()

Rain             3752341
Fog              1385264
Snow              827555
Cold              169182
Precipitation      96684
Storm              40553
Hail                2627
Name: Type, dtype: int64

In [13]:
WeatherEvents["Severity"].value_counts()

Light       3820970
Severe      1167203
Moderate    1020399
Heavy        166323
UNK           96684
Other          2627
Name: Severity, dtype: int64

### Steps to merge the data
* Filter weather to the min/max interval of the flights data
    * Add columns with local time instead of (UTC) - US daylight saving time is Mar-Nov so no need to do check that
* Create columns for Rain, Fog, Snow, Cold, Precipitation, Storm, and Hail for Arrival and Departure
* For each row in Flights:
    * Get all rows in Weather which contain departure and arrival time and locations
    * Populate the weather columns with weather information
    

### Filtering Weather

In [14]:
origin = [a.split(',', 1)[0] for a in flights_df["origin_city_name"]]
flights_df["origin_city"] = [a.split('/', 1)[0] for a in origin]

destination = [a.split(',', 1)[0] for a in flights_df["dest_city_name"]]
flights_df["dest_city"] = [a.split('/', 1)[0] for a in destination]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flights_df["origin_city"] = [a.split('/', 1)[0] for a in origin]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flights_df["dest_city"] = [a.split('/', 1)[0] for a in destination]


In [15]:
all_locations  = set(list(flights_df["origin_city"]) + list(flights_df["dest_city"]))
WeatherEvents = WeatherEvents[WeatherEvents["City"].isin(all_locations)]

In [16]:
all_weather = set(WeatherEvents["City"])
not_in_weather = all_locations - all_weather
len(not_in_weather)# 78 locations missing in Weather

78

In [17]:
not_in_weather

{'Adak Island',
 'Aguadilla',
 'Anchorage',
 'Arcata',
 'Asheville',
 'Atlantic City',
 'Barrow',
 'Binghamton',
 'Boston',
 'Bozeman',
 'Bristol',
 'Cape Girardeau',
 'Champaign',
 'Charlotte Amalie',
 'Charlottesville',
 'Christiansted',
 'Clarksburg',
 'Cordova',
 'Deadhorse',
 'Des Moines',
 'Durango',
 'Eagle',
 'Elmira',
 'Fairbanks',
 'Flint',
 'Green Bay',
 'Guam',
 'Hancock',
 'Harrisburg',
 'Hilo',
 'Hilton Head',
 'Honolulu',
 'Hoolehua',
 'Iron Mountain',
 'Islip',
 'Joplin',
 'Kahului',
 'Kalamazoo',
 'Kapalua',
 'Ketchikan',
 'Kodiak',
 'Kona',
 'Kotzebue',
 'Lanai',
 'Lihue',
 'Marquette',
 'Mission',
 'Moab',
 'Moline',
 'Muskegon',
 'New Haven',
 'Newburgh',
 'Nome',
 'Ogden',
 'Paducah',
 'Pago Pago',
 'Ponce',
 'Providence',
 'Saipan',
 'San Juan',
 'Santa Ana',
 'Santa Barbara',
 'Santa Rosa',
 'Sault Ste. Marie',
 'Scranton',
 'Sitka',
 'St. Cloud',
 'St. George',
 'St. Louis',
 'St. Petersburg',
 'Staunton',
 'Sun Valley',
 'Unalaska',
 'Wenatchee',
 'White Plains

In [18]:
len(all_locations)

342

In [19]:
flights_df.shape

(146485, 23)

In [20]:
#Only getting cities that have weather data
flights_df = flights_df[flights_df["origin_city"].isin(all_weather) | flights_df["dest_city"].isin(all_weather)]

In [21]:
flights_df.shape

(144458, 23)

In [21]:
146485 - 144458 # lost 2067, not a huge number

2027

### Add local time columns

![](TimeZones.png)

In [22]:
WeatherEvents["TimeZone"].value_counts()

US/Eastern     544172
US/Central     508104
US/Mountain    135529
US/Pacific     114029
Name: TimeZone, dtype: int64

In [23]:
n = WeatherEvents.shape[0]

In [24]:
def local_time(UTC_string, time_zone, format_in = "%Y-%m-%d %H:%M:%S"):
    """
    Returns local time given string containing UTC datetime, timezone and date format from string
    Time zones must be one of the following
    US/Central
    US/Eastern
    US/Mountain
    US/Pacific
    """
    time_temp = strptime(UTC_string, format_in)
    time_unix = round(time.mktime(time_temp))
    if time_zone == "US/Central":
        unix_out = time_unix - 3600*6
    elif time_zone == "US/Eastern":
        unix_out = time_unix - 3600*5
    elif time_zone == "US/Mountain":
        unix_out = time_unix - 3600*7
    elif time_zone == "US/Pacific":
        unix_out = time_unix - 3600*8
    else:
        print("Time zone not one of the specified types (see function description)")
    time_out = datetime.fromtimestamp(unix_out).strftime('%Y-%m-%d %H:%M:%S')
    return(time_out)
    

In [25]:
LocalTimeZoneStart = []
LocalTimeZoneEnd = []
for i in WeatherEvents.index:
    TimeZone_temp = WeatherEvents["TimeZone"][i]
    start_temp = WeatherEvents['StartTime(UTC)'][i]
    end_temp = WeatherEvents['EndTime(UTC)'][i]
    LocalTimeZoneStart.append(local_time(start_temp, TimeZone_temp))
    LocalTimeZoneEnd.append(local_time(end_temp, TimeZone_temp))


In [26]:
WeatherEvents["LocalTimeStart"] = LocalTimeZoneStart
WeatherEvents["LocalTimeEnd"] = LocalTimeZoneEnd

In [27]:
WeatherEvents.head()

Unnamed: 0,EventId,Type,Severity,StartTime(UTC),EndTime(UTC),TimeZone,AirportCode,LocationLat,LocationLng,City,County,State,ZipCode,LocalTimeStart,LocalTimeEnd
9117,W-9120,Rain,Light,2016-01-01 07:53:00,2016-01-01 10:53:00,US/Central,KBTR,30.5378,-91.1468,Baton Rouge,East Baton Rouge,LA,70807.0,2016-01-01 01:53:00,2016-01-01 04:53:00
9118,W-9121,Rain,Light,2016-01-01 11:53:00,2016-01-01 12:53:00,US/Central,KBTR,30.5378,-91.1468,Baton Rouge,East Baton Rouge,LA,70807.0,2016-01-01 05:53:00,2016-01-01 06:53:00
9119,W-9122,Rain,Light,2016-01-01 19:53:00,2016-01-01 20:53:00,US/Central,KBTR,30.5378,-91.1468,Baton Rouge,East Baton Rouge,LA,70807.0,2016-01-01 13:53:00,2016-01-01 14:53:00
9120,W-9123,Rain,Light,2016-01-01 21:53:00,2016-01-01 22:53:00,US/Central,KBTR,30.5378,-91.1468,Baton Rouge,East Baton Rouge,LA,70807.0,2016-01-01 15:53:00,2016-01-01 16:53:00
9121,W-9124,Rain,Light,2016-01-03 14:53:00,2016-01-03 16:53:00,US/Central,KBTR,30.5378,-91.1468,Baton Rouge,East Baton Rouge,LA,70807.0,2016-01-03 08:53:00,2016-01-03 10:53:00


In [28]:
min_date = "2019-01-01 00:00:00"
max_date = "2019-01-07 23:59:59"

In [29]:
after_start_date = WeatherEvents["LocalTimeEnd"] >= min_date
before_end_date = WeatherEvents["LocalTimeStart"] <= max_date
between_two_dates = after_start_date & before_end_date
WeatherEvents = WeatherEvents.loc[between_two_dates]

I have dropped 2 rows as outlier. They were 2 events going on for months

In [30]:
try:
    WeatherEvents.drop(6207906, inplace=True)
except:
    pass
try:
    WeatherEvents.drop(551423, inplace=True)
except:
    pass

In [31]:
WeatherEvents.shape #5318 events for cities of interest within the first week of 2019

(5318, 15)

In [32]:
WeatherEvents.to_csv("~/Desktop/LHL---midterm/mid-term-project-I/data/Weather_FirstWeek2019.csv")

### Mergind both data

In [33]:
WeatherEvents = pd.read_csv("~/Desktop/LHL---midterm/mid-term-project-I/data/Weather_FirstWeek2019.csv", index_col = "Unnamed: 0")
WeatherEvents.reset_index(inplace=True,drop = True)
WeatherEvents["Type"] = WeatherEvents.Type.astype("string")
WeatherEvents["Severity"] = WeatherEvents.Severity.astype("string")
WeatherEvents["LocalTimeStart"] = [datetime.strptime(x,"%Y-%m-%d %H:%M:%S") for x in WeatherEvents["LocalTimeStart"]]
WeatherEvents["LocalTimeEnd"] = [datetime.strptime(x,"%Y-%m-%d %H:%M:%S") for x in WeatherEvents["LocalTimeEnd"]]
flights_df.reset_index(inplace = True, drop = True)

In [34]:
flights_df.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance',
       'arr_delay', 'origin_city', 'dest_city'],
      dtype='object')

In [35]:
WeatherEvents.columns

Index(['EventId', 'Type', 'Severity', 'StartTime(UTC)', 'EndTime(UTC)',
       'TimeZone', 'AirportCode', 'LocationLat', 'LocationLng', 'City',
       'County', 'State', 'ZipCode', 'LocalTimeStart', 'LocalTimeEnd'],
      dtype='object')

#### Adding departure and arrival date-time column

In [36]:
weather_df = WeatherEvents

In [37]:
n = flights_df.shape[0]

In [38]:
crs_dep_time = []
crs_arr_time = []
for i in range(n):
    dep_time = str(flights_df["crs_dep_time"][i])
    arr_time = str(flights_df["crs_arr_time"][i])
    while len(dep_time)<4:
        dep_time = "0" + dep_time
    while len(arr_time)<4:
        arr_time = "0" + arr_time
    if dep_time == "2400":
        dep_time = "0000"
    if arr_time == "2400":
        arr_time = "0000"
    crs_dep_time.append(dep_time)
    crs_arr_time.append(arr_time)

In [39]:
date_time_dep = [flights_df["fl_date"][x] + " " + crs_dep_time[x] for x in range(n)]
flights_df["dep_datetime"] = [datetime.strptime(x,'%Y-%m-%d %H%M') for x in date_time_dep]

In [40]:
date_time_arr = [flights_df["fl_date"][x] + " " + crs_arr_time[x] for x in range(n)]
flights_df["arr_datetime"] = [datetime.strptime(x,'%Y-%m-%d %H%M') for x in date_time_arr]

In [41]:
for i in range(n):
    if int(crs_dep_time[i])>int(crs_arr_time[i]) and (flights_df.at[i,"arr_datetime"].date() == flights_df.at[i,"dep_datetime"].date()):
        #print(flights_df.at[i,"dep_datetime"])
        #print(flights_df.at[i,"arr_datetime"])
        #print("===============================")
        flights_df.at[i,"arr_datetime"] = flights_df["arr_datetime"][i] + timedelta(days=1)


In [42]:
WeatherEvents["Type"].value_counts()

Rain             3159
Fog              1002
Snow              902
Cold              190
Storm              39
Hail               15
Precipitation      11
Name: Type, dtype: Int64

In [43]:
def merge_flights_weather(flights_df, weather_df):
    """
    Function designed to merge specific weather and flights data of this project. 
    It will return the flights data with the merged weather info.
    """
    flights_df["dep_Rain"] = "NW"
    flights_df["dep_Fog"] = "NW"
    flights_df["dep_Snow"] = "NW"
    flights_df["dep_Cold"] = "NW"
    flights_df["dep_Storm"] = "NW"
    flights_df["dep_Hail"] = "NW"
    flights_df["dep_Precipitation"] = "NW"
    
    flights_df["arr_Rain"] = "NW"
    flights_df["arr_Fog"] = "NW"
    flights_df["arr_Snow"] = "NW"
    flights_df["arr_Cold"] = "NW"
    flights_df["arr_Storm"] = "NW"
    flights_df["arr_Hail"] = "NW"
    flights_df["arr_Precipitation"] = "NW"
    
    n = flights_df.shape[0]
    
    for i in range(n):
        temp_city = flights_df["origin_city"][i]
        departure_time = flights_df.at[i,"dep_datetime"]
        mask = (weather_df["LocalTimeStart"]<= departure_time) & (weather_df["LocalTimeEnd"] >= departure_time)
        weather_filtered_dep = weather_df[mask]
        weather_filtered_dep = weather_filtered_dep[weather_filtered_dep["City"] == temp_city]
        n_weather_temp = weather_filtered_dep.shape[0]
        if n_weather_temp != 0:
            for j in weather_filtered_dep.index:
                weather = weather_filtered_dep.at[j,"Type"]
                flights_df.at[i,"dep_" + weather] = weather_filtered_dep.at[j,"Severity"]
                
        temp_city = flights_df["dest_city"][i]
        arrival_time = flights_df.at[i,"arr_datetime"]
        mask = (weather_df["LocalTimeStart"]<= arrival_time) & (weather_df["LocalTimeEnd"] >= arrival_time)
        weather_filtered_dep = weather_df[mask]
        weather_filtered_dep = weather_filtered_dep[weather_filtered_dep["City"] == temp_city]
        n_weather_temp = weather_filtered_dep.shape[0]
        if n_weather_temp != 0:
            for j in weather_filtered_dep.index:
                weather = weather_filtered_dep.at[j,"Type"]
                flights_df.at[i,"arr_" + weather] = weather_filtered_dep.at[j,"Severity"]
    return(flights_df)

In [44]:
df_out = merge_flights_weather(flights_df, WeatherEvents)

In [45]:
df_out.iloc[0:10,-14:]

Unnamed: 0,dep_Rain,dep_Fog,dep_Snow,dep_Cold,dep_Storm,dep_Hail,dep_Precipitation,arr_Rain,arr_Fog,arr_Snow,arr_Cold,arr_Storm,arr_Hail,arr_Precipitation
0,NW,NW,NW,NW,NW,NW,NW,Light,NW,NW,NW,NW,NW,NW
1,NW,NW,NW,NW,NW,NW,NW,Light,NW,NW,NW,NW,NW,NW
2,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW
3,Light,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW
4,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW
5,Moderate,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW
6,Light,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW
7,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW
8,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW
9,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW,NW


In [46]:
df_out["dep_Rain"].value_counts()

NW          129013
Light        14203
Moderate      1032
Heavy          210
Name: dep_Rain, dtype: int64

In [47]:
df_out["dep_Fog"].value_counts()

NW          140124
Moderate      2230
Severe        2104
Name: dep_Fog, dtype: int64

In [48]:
df_out.shape

(144458, 39)

In [50]:
df_out.to_csv("~/Desktop/DataDump/Flights_Weather_TRAIN.csv")