In [1]:
from data_structuring import structure_data
from data_preprocessing import preprocess_data
from data_wrangling import wrangle_data
from model_training import train_and_evaluate_models
from visualization_and_analysis import visualize_and_analyze
from data_collecting import collect_data
import utils
import time

In [2]:
def main():
    """Show the pipeline menu in a loop until the user chooses to exit.

    Reads an option number from standard input and dispatches to
    ``run_all_steps`` (1), ``run_steps_interactively`` (2) or
    ``run_single_step`` (3). Option 4 leaves the loop. Any other entry,
    including input that is not a whole number, prints an error message
    and shows the menu again.
    """
    while True:
        print("Choose an option:")
        print("1. Run all steps.")
        print("2. Run steps interactively.")
        print("3. Run only one step.")
        print("4. Exit.")
        raw_choice = input("Enter the option number: ")

        # Only the conversion is guarded, so a ValueError raised by one of
        # the pipeline steps below is never mistaken for bad menu input.
        try:
            choice = int(raw_choice)
        except ValueError:
            print("Invalid input. Please try again.")
            continue

        if choice == 1:
            run_all_steps()
        elif choice == 2:
            run_steps_interactively()
        elif choice == 3:
            run_single_step()
        elif choice == 4:
            break
        else:
            print("Invalid input. Please try again.")

In [3]:
def run_all_steps():
    """Run every pipeline step once, in the listed order, without prompting."""
    pipeline = (
        collect_data,
        structure_data_step,
        preprocess_data_step,
        wrangle_data_step,
        train_and_evaluate_models_step,
        visualize_and_analyze_step,
    )

    for run_step in pipeline:
        run_step()

In [4]:
def run_steps_interactively():
    """Walk through the pipeline steps in order, asking before each one.

    A step is executed only when the reply, compared case-insensitively,
    is exactly ``y``; any other reply skips that step.
    """
    pipeline = (
        collect_data,
        structure_data_step,
        preprocess_data_step,
        wrangle_data_step,
        train_and_evaluate_models_step,
        visualize_and_analyze_step,
    )

    for run_step in pipeline:
        reply = input(f"Do you want to run the step '{run_step.__name__}'? (y/n): ")
        if reply.lower() != 'y':
            continue
        run_step()

In [5]:
def run_single_step():
    """Let the user pick one pipeline step by number and run it.

    Prints the numbered list of steps, reads a step number from standard
    input and calls the matching step. If the entry is not a whole number
    or does not match a listed step, an error message is printed and the
    function returns without running anything.
    """
    step_mapping = {
        1: collect_data,
        2: structure_data_step,
        3: preprocess_data_step,
        4: wrangle_data_step,
        5: train_and_evaluate_models_step,
        6: visualize_and_analyze_step
    }

    print("Choose a step to run:")
    for idx, step in step_mapping.items():
        print(f"{idx}. {step.__name__}")

    raw_choice = input("Enter the step number: ")
    try:
        # ValueError: not an integer; KeyError: integer outside the menu.
        selected_step = step_mapping[int(raw_choice)]
    except (ValueError, KeyError):
        print("Invalid input. Please try again.")
        return

    # Called outside the try block so errors raised by the step itself
    # propagate instead of being reported as bad menu input.
    selected_step()

In [6]:
def structure_data_step():
    """Run ``structure_data`` on the file name ``transfer_news_data.csv``."""
    source_csv = "transfer_news_data.csv"
    structure_data(source_csv)

In [7]:
def preprocess_data_step():
    """Load the four input CSV files and hand them to ``preprocess_data``.

    The files are read with ``utils.pandas_load_csv`` in the order listed
    below and passed positionally in that same order.
    """
    csv_names = (
        "structured_data.csv",
        "transfer_news_data.csv",
        "football_api_players.csv",
        "transfermarkt_data.csv",
    )
    loaded_inputs = [utils.pandas_load_csv(csv_name) for csv_name in csv_names]

    preprocess_data(*loaded_inputs)

In [8]:
def wrangle_data_step():
    """Load ``preprocessed_data.csv`` and ``transfermarkt_data.csv`` and pass both to ``wrangle_data``."""
    csv_names = ("preprocessed_data.csv", "transfermarkt_data.csv")
    loaded_inputs = [utils.pandas_load_csv(csv_name) for csv_name in csv_names]

    wrangle_data(*loaded_inputs)

In [9]:
def train_and_evaluate_models_step():
    """Load ``output_data.csv`` and pass it to ``train_and_evaluate_models``."""
    train_and_evaluate_models(utils.pandas_load_csv("output_data.csv"))

In [10]:
def visualize_and_analyze_step():
    """Load ``output_data.csv`` and run ``visualize_and_analyze`` on it.

    Passes a list of continuous feature names, a list of categorical
    feature names and the string ``'veracity'`` as the fourth argument
    (presumably the column being analysed; confirm against
    ``visualize_and_analyze``).
    """
    output_data = utils.pandas_load_csv("output_data.csv")

    continuous_columns = ['age', 'time_to_transfer_window', 'market_value']
    categorical_columns = ['nationality', 'position', 'source']

    visualize_and_analyze(
        output_data,
        continuous_columns,
        categorical_columns,
        'veracity',
    )

In [None]:
if __name__ == "__main__":
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments that happen while the menu is running.
    start_time = time.perf_counter()

    main()

    elapsed_time = time.perf_counter() - start_time
    # Break the elapsed seconds down into hours / minutes / seconds.
    hours, remainder = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f"Elapsed time: {int(hours)} hours, {int(minutes)} minutes, and {seconds:.2f} seconds")

Choose an option:
1. Run all steps.
2. Run steps interactively.
3. Run only one step.
4. Exit.
Enter the option number: 1
Checking GOOGLE_API_KEY...
Checking CX_ID...
Creating transfer_rumours_articles.csv...
transfer_rumours_articles.csv already exists
1. Use last date in transfer_rumours_articles.csv 2. Enter a start date in the format YYYY-MM-DD
> 2
> 2022-01-01
Enter the end date in the format YYYY-MM-DD
> 2022-01-02
Searching for articles published on 1 January 2022...
Article found: https://www.bbc.com/sport/59840889
Sleeping for 22 seconds...
