# Chatbot

This notebook contains the logic for creating and training the chatbot model. It the cleaned data obtained from the flights dataset.

In [2]:
import random
from datetime import datetime, timedelta
import json
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm





## Data preprocessing
Here we load the datasets and preprocess them for training. This is the airlines information and the airport information. Airline information is called *airlines.csv* and airport information is called *airline_prices.csv*

In [3]:
df_airlines = pd.read_csv('airlines.csv')
df_airports = pd.read_csv('airline_prices.csv')

df_airports.drop(["2023 Passenger Rank", "Average_Fare", "Inflation_Average_Fare(2023)"], axis=1, inplace=True)
df_airlines.to_pickle('airlines_dataframe')
df_airports.to_pickle('airports_dataframe')

In [3]:
# Here we convert the dataframe into records
airports = df_airports.to_dict('records')
airlines = df_airlines.to_dict('records')

In [4]:
airlines

[{'CODE': 'UA', 'AIRLINE': 'United Air Lines Inc.'},
 {'CODE': 'AA', 'AIRLINE': 'American Airlines Inc.'},
 {'CODE': 'US', 'AIRLINE': 'US Airways Inc.'},
 {'CODE': 'F9', 'AIRLINE': 'Frontier Airlines Inc.'},
 {'CODE': 'B6', 'AIRLINE': 'JetBlue Airways'},
 {'CODE': 'OO', 'AIRLINE': 'Skywest Airlines Inc.'},
 {'CODE': 'AS', 'AIRLINE': 'Alaska Airlines Inc.'},
 {'CODE': 'NK', 'AIRLINE': 'Spirit Air Lines'},
 {'CODE': 'WN', 'AIRLINE': 'Southwest Airlines Co.'},
 {'CODE': 'DL', 'AIRLINE': 'Delta Air Lines Inc.'},
 {'CODE': 'EV', 'AIRLINE': 'Atlantic Southeast Airlines'},
 {'CODE': 'HA', 'AIRLINE': 'Hawaiian Airlines Inc.'},
 {'CODE': 'MQ', 'AIRLINE': 'American Eagle Airlines Inc.'},
 {'CODE': 'VX', 'AIRLINE': 'Virgin America'}]

In [5]:
airports

[{'Airport_Code': 'LAX',
  'Airport_Name': 'Los Angeles International',
  'City_Name': 'Los Angeles',
  'State_Name': 'CA'},
 {'Airport_Code': 'ORD',
  'Airport_Name': "Chicago O'Hare International",
  'City_Name': "Chicago-O'Hare",
  'State_Name': 'IL'},
 {'Airport_Code': 'DEN',
  'Airport_Name': 'Denver International',
  'City_Name': 'Denver',
  'State_Name': 'CO'},
 {'Airport_Code': 'ATL',
  'Airport_Name': 'Hartsfield-Jackson Atlanta International',
  'City_Name': 'Atlanta',
  'State_Name': 'GA'},
 {'Airport_Code': 'EWR',
  'Airport_Name': 'Newark Liberty International',
  'City_Name': 'Newark',
  'State_Name': 'NJ'},
 {'Airport_Code': 'BOS',
  'Airport_Name': 'Logan International',
  'City_Name': 'Boston',
  'State_Name': 'MA'},
 {'Airport_Code': 'SEA',
  'Airport_Name': 'Seattle/Tacoma International',
  'City_Name': 'Seattle',
  'State_Name': 'WA'},
 {'Airport_Code': 'DFW',
  'Airport_Name': 'Dallas/Fort Worth International',
  'City_Name': 'Dallas-DFW',
  'State_Name': 'TX'},
 {

In [6]:
# This is a function to generate the flight data that will be used for training the chatbot. (3000 lines)
def generate_flight_training_data(airlines, airports):
    flights_data = []
    for _ in range(3000):
        airline = random.choice(airlines)
        origin = random.choice(airports)
        destination = random.choice(airports)
        while origin == destination:
            destination = random.choice(airports)
        departure_date = datetime.now() + timedelta(days=random.randint(1, 365))
        departure_time = datetime.now() + timedelta(hours=random.randint(1, 24))

        flight = {
            "compagnie": airline['AIRLINE'],
            "code_compagnie": airline['CODE'],
            "date_depart": departure_date.strftime("%Y-%m-%d"),
            "heure_depart": departure_time.strftime("%H:%M"),
            "aeroport_origine": origin["Airport_Name"],
            "code_aeroport_origine": origin["Airport_Code"],
            "ville_origine": origin["City_Name"],
            "aeroport_destination": destination["Airport_Name"],
            "code_aeroport_destination": destination["Airport_Code"],
            "ville_destination": destination["City_Name"],
        }
        flights_data.append(flight)
    return flights_data

In [7]:
# Build the chatbot training data and save to a json file
flights_data = generate_flight_training_data(airlines, airports)
flights_data_json = json.dumps(flights_data)
with open('chatbot_training_data.json', 'w') as f:
    f.write(flights_data_json)

## Model Training
Here we will build and train the model using the data obtained above.

In [9]:
df_flights_data = pd.DataFrame(flights_data)
df_flights_data.to_pickle('flights_dataframe')
df_flights_data

Unnamed: 0,compagnie,code_compagnie,date_depart,heure_depart,aeroport_origine,code_aeroport_origine,ville_origine,aeroport_destination,code_aeroport_destination,ville_destination
0,Spirit Air Lines,NK,2025-02-02,22:10,Dickinson - Theodore Roosevelt Regional,DIK,Dickinson,Valdez Pioneer Field,VDZ,Valdez
1,Delta Air Lines Inc.,DL,2025-05-19,05:10,Joslin Field - Magic Valley Regional,TWF,Twin Falls,Rick Husband Amarillo International,AMA,Amarillo
2,Skywest Airlines Inc.,OO,2025-02-13,15:10,Elmira/Corning Regional,ELM,Elmira/Corning,Glacier Park International,FCA,Kalispell
3,JetBlue Airways,B6,2024-12-14,01:10,Ted Stevens Anchorage International,ANC,Anchorage,Rapid City Regional,RAP,Rapid City
4,Skywest Airlines Inc.,OO,2025-05-01,05:10,Ogden-Hinckley,OGD,Ogden,Alpena County Regional,APN,Alpena
...,...,...,...,...,...,...,...,...,...,...
2995,JetBlue Airways,B6,2024-11-27,08:10,Scott AFB MidAmerica St Louis,BLV,Belleville,Wrangell Airport,WRG,Wrangell
2996,Spirit Air Lines,NK,2024-07-26,12:10,Provincetown Municipal,PVC,Provincetown,Corpus Christi International,CRP,Corpus Christi
2997,Skywest Airlines Inc.,OO,2024-08-26,17:10,Rochester International,RST,Rochester,Central Nebraska Regional,GRI,Grand Island
2998,Atlantic Southeast Airlines,EV,2024-09-22,04:10,Burlington International,BTV,Burlington,Melbourne Orlando International,MLB,Melbourne


In [10]:
def chat_conversation(df):
    training_data = []
    for index, row in df.iterrows():
        training_data.append({
            "question": "Pouvez-vous me donner le code de la compagnie aérienne ?",
            "answer": row["code_compagnie"]
        })
        training_data.append({
            "question": "Quel est le nom de la compagnie ?",
            "answer": row["compagnie"]
        })
        training_data.append({
            "question": "Quelle est la date de départ (YYYY-MM-DD) ?",
            "answer": row["date_depart"]
        })
        training_data.append({
            "question": "Quelle est l'heure de départ (HH:MM) ?",
            "answer": row["heure_depart"]
        })
        training_data.append({
            "question": "Quel est le code de l'aéroport d'origine ?",
            "answer": row["code_aeroport_origine"]
        })
        training_data.append({
            "question": "Quel est le code de l'aéroport de destination ?",
            "answer": row["code_aeroport_destination"]
        })
        training_data.append({
            "question": "Quel est le nom de l'aéroport d'origine ?",
            "answer": row["aeroport_origine"]
        })
        training_data.append({
            "question": "Quel est le nom de l'aéroport de destination ?",
            "answer": row["aeroport_destination"]
        })
        training_data.append({
            "question": "Quel est la ville d'origine ?",
            "answer": row["ville_origine"]
        })
        training_data.append({
            "question": "Quel est la ville de destination ?",
            "answer": row["ville_destination"]
        })
        
    return training_data

In [11]:
chat_data = chat_conversation(df_flights_data)

In [12]:
questions = [item['question'] for item in chat_data]
answers = [item['answer'] for item in chat_data]

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encoder les questions et les réponses séparément
encoded_questions = tokenizer(questions, padding=True, truncation=True, return_tensors='tf')
encoded_answers = tokenizer(answers, padding=True, truncation=True, return_tensors='tf')

# Nous allons utiliser les tokens des questions et des réponses ensemble comme features
features = {
    "input_ids": encoded_questions['input_ids'],
    "attention_mask": encoded_questions['attention_mask']
}
labels = tf.ones(len(encoded_questions['input_ids']))  

In [13]:
# Vérifier la cohérence des tailles des inputs et labels
assert features['input_ids'].shape[0] == labels.shape[0], "Le nombre d'inputs et de labels doit être égal"

In [14]:
# Split des données
train_features, val_features, train_labels, val_labels = train_test_split(
    features["input_ids"].numpy(), labels.numpy(), test_size=0.2)

# Convertir les données en format TensorFlow
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels)).shuffle(len(train_labels)).batch(8)
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels)).batch(8)

In [15]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Compiler le modèle
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=['accuracy'])

# Entraîner le modèle
model.fit(train_dataset, validation_data=val_dataset, epochs=1)




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Cause: for/else statement not yet supported
Cause: for/else statement not yet supported





<tf_keras.src.callbacks.History at 0x21f51c1b110>

In [None]:
model.save_pretrained("tmodels/")
tokenizer.save_pretrained("tmodels/")