In [34]:
import dask.dataframe as dd
import pandas as pd
from io import StringIO

In [35]:
data = 'Airline Code;DelayTimes;FlightCodes;To_From\nAir Canada (!);[21, 40];20015.0;WAterLoo_NEWYork\n<Air France> (12);[];;Montreal_TORONTO\n(Porter Airways. );[60, 22, 87];20035.0;CALgary_Ottawa\n12. Air France;[78, 66];;Ottawa_VANcouvER\n""".\\.Lufthansa.\\.""";[12, 33];20055.0;london_MONTreal\n'

In [36]:
# Read the string into a pandas DataFrame (Dask can't read from StringIO directly)
pdf = pd.read_csv(StringIO(data), sep=';')

# Convert to Dask DataFrame
dask_df = dd.from_pandas(pdf, npartitions=1)

In [37]:
# --- 1. Interpolate FlightCodes using map_partitions ---
dask_df['FlightCodes'] = dask_df['FlightCodes'].astype(float)
dask_df['FlightCodes'] = dask_df['FlightCodes'].map_partitions(lambda s: s.interpolate())
dask_df['FlightCodes'] = dask_df['FlightCodes'].astype(int)
print(df.compute())

     Airline Code    DelayTimes  FlightCodes        To       From
0      Air Canada      [21, 40]        20015  WATERLOO    NEWYORK
1      Air France            []        20025  MONTREAL    TORONTO
2  Porter Airways  [60, 22, 87]        20035   CALGARY     OTTAWA
3      Air France      [78, 66]        20045    OTTAWA  VANCOUVER
4       Lufthansa      [12, 33]        20055    LONDON   MONTREAL


In [38]:
# --- 2. Split To_From into To and From ---
dask_df[['To', 'From']] = dask_df['To_From'].str.split('_', n=1, expand=True)
dask_df['To'] = dask_df['To'].str.upper()
dask_df['From'] = dask_df['From'].str.upper()
dask_df = dask_df.drop(columns='To_From')
print(dask_df.compute())

         Airline Code    DelayTimes  FlightCodes        To       From
0      Air Canada (!)      [21, 40]        20015  WATERLOO    NEWYORK
1   <Air France> (12)            []        20025  MONTREAL    TORONTO
2  (Porter Airways. )  [60, 22, 87]        20035   CALGARY     OTTAWA
3      12. Air France      [78, 66]        20045    OTTAWA  VANCOUVER
4   ".\.Lufthansa.\."      [12, 33]        20055    LONDON   MONTREAL


In [39]:
# --- 3. Clean Airline Code ---
dask_df['Airline Code'] = dask_df['Airline Code'].str.replace(r'[^a-zA-Z\s]+', ' ', regex=True)
dask_df['Airline Code'] = dask_df['Airline Code'].str.strip()
dask_df['Airline Code'] = dask_df['Airline Code'].str.replace(r'\s+', ' ', regex=True)
print(df.compute())

     Airline Code    DelayTimes  FlightCodes        To       From
0      Air Canada      [21, 40]        20015  WATERLOO    NEWYORK
1      Air France            []        20025  MONTREAL    TORONTO
2  Porter Airways  [60, 22, 87]        20035   CALGARY     OTTAWA
3      Air France      [78, 66]        20045    OTTAWA  VANCOUVER
4       Lufthansa      [12, 33]        20055    LONDON   MONTREAL
