In [1]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import csv



In [8]:
%%file flight_analysis.py

from mrjob.job import MRJob
from mrjob.step import MRStep
import csv

class MRFlightDelays(MRJob):
    """MapReduce job that averages departure and arrival delays per month.

    Input lines are comma-separated flight records. Only lines that split into
    exactly 31 columns and whose month/delay columns are numeric contribute;
    every other line (for example a header row) is skipped silently.
    """

    def mapper(self, _, line):
        """Emit ``month -> (departure_delay, arrival_delay)`` for one record.

        Args:
            _: Input key, ignored.
            line: One raw comma-separated line of the input file.

        Yields:
            ``(month, (dep_delay, arr_delay))`` where ``month`` is an int and
            each delay is a float, or ``0`` when the column is empty.
        """
        columns = line.split(',')

        # A record must have exactly 31 columns; anything else is dropped.
        if len(columns) != 31:
            return

        # Column positions: 1 = month, 11 = departure delay, 22 = arrival delay.
        raw_month, raw_departure, raw_arrival = columns[1], columns[11], columns[22]

        try:
            month_number = int(raw_month)
            departure = float(raw_departure) if raw_departure else 0
            arrival = float(raw_arrival) if raw_arrival else 0
        except ValueError:
            # Non-numeric content (such as header text): skip the line.
            return

        yield month_number, (departure, arrival)

    def reducer(self, key, values):
        """Average all delay pairs collected for one month.

        Args:
            key: The month number emitted by the mapper.
            values: Iterable of ``(dep_delay, arr_delay)`` pairs.

        Yields:
            ``(key, (avg_dep_delay, avg_arr_delay))``.
        """
        departure_sum = 0
        arrival_sum = 0
        record_count = 0

        for departure, arrival in values:
            departure_sum += departure
            arrival_sum += arrival
            record_count += 1

        yield key, (departure_sum / record_count, arrival_sum / record_count)


# Run the job when the generated file is executed as a script.
if __name__ == '__main__':
    MRFlightDelays.run()

Overwriting flight_analysis.py


In [4]:
%%file flight_analysis.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import csv

class MRFlightDelays(MRJob):
    """Two-step MapReduce job: average delays per month, then format them.

    Step 1 maps every flight record to ``month -> (dep_delay, arr_delay)`` and
    reduces each month to its average pair. Step 2 rewrites each pair as a
    single ``"dep,arr"`` string.
    """

    def mapper(self, _, line):
        """Emit ``month -> (departure_delay, arrival_delay)`` for one record.

        Args:
            _: Input key, ignored.
            line: One raw comma-separated line of the input file.

        Yields:
            ``(month, (dep_delay, arr_delay))``; an empty delay column counts
            as ``0``. Lines that are too short or hold non-numeric values
            (for example a header row) are skipped.
        """
        try:
            fields = line.split(',')
            month = int(fields[1])
            dep_delay = float(fields[11]) if fields[11] else 0
            # Arrival delay is column 22. Column 23 is the "diverted" flag, so
            # reading it here would average a 0/1 indicator instead of minutes.
            arr_delay = float(fields[22]) if fields[22] else 0

            yield month, (dep_delay, arr_delay)
        except (IndexError, ValueError):
            pass

    def reducer(self, key, values):
        """Average all delay pairs collected for one month.

        Args:
            key: The month number emitted by the mapper.
            values: Iterable of ``(dep_delay, arr_delay)`` pairs.

        Yields:
            ``(key, (avg_dep_delay, avg_arr_delay))``.
        """
        total_dep_delay = 0
        total_arr_delay = 0
        num_records = 0

        for value in values:
            total_dep_delay += value[0]
            total_arr_delay += value[1]
            num_records += 1

        avg_dep_delay = total_dep_delay / num_records
        avg_arr_delay = total_arr_delay / num_records

        yield key, (avg_dep_delay, avg_arr_delay)

    def steps(self):
        """Return the job's two steps: aggregation, then output formatting."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer),
            MRStep(reducer=self.output_reducer)
        ]

    def output_reducer(self, key, values):
        """Format one month's averages as a ``"dep,arr"`` string.

        Only the first value is read; the preceding reducer yields a single
        pair per month.
        """
        avg_dep_delay, avg_arr_delay = next(values)
        yield key, f"{avg_dep_delay},{avg_arr_delay}"


# Run the job when the generated file is executed as a script.
if __name__ == '__main__':
    MRFlightDelays.run()

Overwriting flight_analysis.py


In [9]:
# Run the MapReduce script on flights.csv and redirect its standard output
# into delays_by_month.csv, which the next cell reads.
!python flight_analysis.py flights.csv > delays_by_month.csv

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\Karez\AppData\Local\Temp\flight_analysis.Karez.20240506.190657.131896
Running step 1 of 1...
job output is in C:\Users\Karez\AppData\Local\Temp\flight_analysis.Karez.20240506.190657.131896\output
Streaming final output from C:\Users\Karez\AppData\Local\Temp\flight_analysis.Karez.20240506.190657.131896\output...
Removing temp directory C:\Users\Karez\AppData\Local\Temp\flight_analysis.Karez.20240506.190657.131896...


In [10]:
import pandas as pd

# Path to the job output captured from flight_analysis.py
file_path = 'delays_by_month.csv'

# The file is tab-separated: the month key, then the pair of averages.
data = pd.read_csv(file_path, delimiter='\t', header=None)
data.columns = ['Month', 'Values']

# The pair's exact spelling depends on how the job serialised it: a bracketed
# list with or without a space after the comma ("[9.5,5.6]" / "[9.5, 5.6]") or
# a plain "9.5,5.6" string. Accept all of them, including exponent notation,
# instead of requiring one exact layout and silently producing NaN otherwise.
pair_pattern = r'^\s*"?\[?\s*([-+\d.eE]+)\s*,\s*([-+\d.eE]+)\s*\]?"?\s*$'
parsed = data['Values'].str.extract(pair_pattern)

# Fail loudly rather than writing and plotting NaN averages.
unparsed = parsed.isna().any(axis=1)
if unparsed.any():
    raise ValueError(
        f"Could not parse {int(unparsed.sum())} row(s) of {file_path!r}; "
        f"first offending value: {data.loc[unparsed, 'Values'].iloc[0]!r}"
    )

# Value1 = average departure delay, Value2 = average arrival delay
data['Value1'] = parsed[0].astype(float)
data['Value2'] = parsed[1].astype(float)

# The raw text column is no longer needed once both numbers are extracted.
data = data.drop(columns='Values')

# Order the rows by month number
data_sorted = data.sort_values(by='Month')

# Preview the sorted result
print(data_sorted.head())

# Persist the cleaned table for the plotting cell
data_sorted.to_csv('delays_new.csv', index=False)


   Month     Value1    Value2
0      1   9.517399  5.653327
4      2  11.329804  7.903148
5      3   9.457096  4.801889
6      4   7.654192  3.124722
7      5   9.352639  4.418672


In [13]:
import pandas as pd
import plotly.graph_objects as go

# Load the per-month averages from the CSV file
df = pd.read_csv('delays_new.csv')

months = df['Month']
departure_avg = df['Value1']
arrival_avg = df['Value2']

# a. Bar chart - month vs. average departure delay
fig_dep = go.Figure()
fig_dep.add_trace(go.Bar(x=months, y=departure_avg))
fig_dep.update_layout(
    title='Average Departure Delay by Month',
    xaxis_title='Month',
    yaxis_title='Average Departure Delay (minutes)',
)
fig_dep.show()

# b. Bar chart - month vs. average arrival delay
fig_arr = go.Figure()
fig_arr.add_trace(go.Bar(x=months, y=arrival_avg))
fig_arr.update_layout(
    title='Average Arrival Delay by Month',
    xaxis_title='Month',
    yaxis_title='Average Arrival Delay (minutes)',
)
fig_arr.show()

# c. Single grouped bar chart - departure and arrival delays side by side
fig_both = go.Figure()
fig_both.add_trace(go.Bar(name='Departure Delay', x=months, y=departure_avg))
fig_both.add_trace(go.Bar(name='Arrival Delay', x=months, y=arrival_avg))
fig_both.update_layout(
    title='Average Departure and Arrival Delays by Month',
    xaxis_title='Month',
    yaxis_title='Average Delay (minutes)',
    barmode='group',
)
fig_both.show()

# d. Line chart - departure and arrival delays as two separate traces
fig_line = go.Figure(data=[
    go.Scatter(x=months, y=departure_avg, mode='lines+markers', name='Departure Delay'),
    go.Scatter(x=months, y=arrival_avg, mode='lines+markers', name='Arrival Delay'),
])
fig_line.update_layout(
    title='Average Departure and Arrival Delays by Month',
    xaxis_title='Month',
    yaxis_title='Average Delay (minutes)',
)
fig_line.show()

In [12]:
pip install plotly

Collecting plotly
  Downloading plotly-5.22.0-py3-none-any.whl (16.4 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.22.0 tenacity-8.2.3
Note: you may need to restart the kernel to use updated packages.
