In [1]:
import pandas as pd
import numpy as np

## **Read data**

In [4]:
import os

# Root directory that holds one sub-folder per data snapshot. Inside each
# sub-folder the loader looks for 'ticket.csv' and 'flight.csv'.
main_directory = '/content/drive/MyDrive/old_data'

# Sub-folders of the main directory. os.listdir returns entries in arbitrary
# order, so sort them to make the load order (and therefore the insertion order
# of the dictionaries below) reproducible across runs and machines.
folders = sorted(
    folder for folder in os.listdir(main_directory)
    if os.path.isdir(os.path.join(main_directory, folder))
)

# DataFrames keyed by folder name: one dictionary per kind of CSV file.
tickets, flights = {}, {}

# Both kinds of file are loaded the same way, so pair each file name with the
# dictionary that collects it instead of repeating the logic.
for folder in folders:
    for filename, frames in (('ticket.csv', tickets), ('flight.csv', flights)):
        csv_path = os.path.join(main_directory, folder, filename)

        if os.path.exists(csv_path):
            frames[folder] = pd.read_csv(csv_path)
        else:
            # Name the missing file so a gap in tickets can be told apart from
            # a gap in flights.
            print(f'{filename} not found in {folder}')



## **Data Processing**

### **Join Tickets data and Flights data**

### **Merge all data to form time series data**

## **Model**

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load your flight ticket price data (replace 'your_data.csv' with your actual file).
# The file must provide a 'Date' column and a 'Price' column.
data = pd.read_csv('your_data.csv')
data['Date'] = pd.to_datetime(data['Date'])
# Index by date and sort chronologically: the train/test split below is
# positional, so unsorted rows would leak future observations into training.
# mergesort is stable, so rows sharing a timestamp keep their file order.
data = data.set_index('Date').sort_index(kind='mergesort')

# Visualize the time series
plt.figure(figsize=(12, 6))
plt.plot(data['Price'])
plt.title('Flight Ticket Prices Over Time')
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

# Train-test split: the first 80% of the (chronological) rows train the model,
# the remaining rows are held out for evaluation.
train_size = int(len(data) * 0.8)
train, test = data.iloc[:train_size], data.iloc[train_size:]

# Build and train the ARIMA model
order = (1, 1, 1)  # Order parameters (p, d, q)
model = ARIMA(train['Price'], order=order)
model_fit = model.fit()

# Forecasting: one out-of-sample step per held-out row, starting right after the
# end of the training sample. The legacy `typ='levels'` argument is dropped; it
# is not a documented parameter of this model's predict/forecast methods.
predictions = model_fit.forecast(steps=len(test))

# Evaluate the model (values are compared positionally, not by index label)
mse = mean_squared_error(test['Price'], predictions)
print(f'Mean Squared Error: {mse}')

# Visualize the results
plt.figure(figsize=(12, 6))
plt.plot(train.index, train['Price'], label='Train')
plt.plot(test.index, test['Price'], label='Test')
plt.plot(test.index, predictions, label='Predictions', color='red')
plt.title('ARIMA Model for Flight Ticket Prices')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()


## **Model Tuning and Evaluation**