In [None]:
# Attach Google Drive to the Colab runtime so files stored on Drive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import glob

# Load qualifying data.
# Every CSV matched by the pattern below is read (its own header row is skipped)
# and tagged with the year and location parsed from its path:
#   .../<year>/Qualifying Results/<file>.csv
# where the location is the file-name text before the first underscore, with
# hyphens turned into spaces.
quali_column_names = ["Position","Driver Number","Driver","Driver Abbreviation","Car","Q1","Q2","Q3","Laps","Year","Location"]

# NOTE: relative pattern - it is resolved against the current working directory.
quali_directory_path = "drive/MyDrive/f1_data/*/Qualifying Results/*.csv"
# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined frame stable from one run to the next.
quali_files = sorted(glob.glob(quali_directory_path))
if not quali_files:
  # Without this guard pd.concat fails later with an opaque
  # "No objects to concatenate" error.
  raise FileNotFoundError(
      f"No qualifying CSV files matched {quali_directory_path!r}; "
      "check that Google Drive is mounted and the working directory is correct."
  )

dfs = []
for file in quali_files:
  # Index from the end of the path so the parsing does not depend on how many
  # directories precede the <year> folder.
  path_parts = file.split("/")
  year = path_parts[-3]
  location = path_parts[-1].split("_")[0].replace("-", " ")
  df = pd.read_csv(file, header = None, skiprows=1)
  df['Year'] = year
  df['Location'] = location
  dfs.append(df)

quali_dataframe = pd.concat(dfs, ignore_index=True)
# Columns are positional up to this point (header=None), so name them here.
quali_dataframe.columns = quali_column_names

In [None]:
# Number of qualifying rows loaded across all files.
print(quali_dataframe.shape[0])

16459


In [None]:
# Show every qualifying row except the final 20 (pandas abbreviates the middle when printing).
print(quali_dataframe.iloc[:-20])

      Position  Driver Number             Driver Driver Abbreviation  \
0            1             22     Lewis Hamilton                 HAM   
1            2              2       Felipe Massa                 MAS   
2            3             23  Heikki Kovalainen                 KOV   
3            4             11       Jarno Trulli                 TRU   
4            5              5    Fernando Alonso                 ALO   
...        ...            ...                ...                 ...   
16434        1              6          Jim Clark                 CLA   
16435        1              5          Jim Clark                 CLA   
16436        1              8       Jack Brabham                 BRA   
16437        1              6        Graham Hill                 HIL   
16438        1              9         Chris Amon                 AMO   

                    Car        Q1        Q2        Q3  Laps  Year  \
0      McLaren Mercedes  1:15.218  1:14.603  1:15.666  13.0  2008 

In [None]:
from sqlalchemy import create_engine, MetaData, Table
# SQLite database file that receives the tables written by this notebook.
engine = create_engine("sqlite:///database.sqlite")
# Create a metadata object
metadata = MetaData()

# Reflect the database schema
metadata.reflect(bind=engine)

# Use "replace" rather than "append": appending would insert the full data set
# again on every re-run of this cell, duplicating rows and inflating any
# counts computed from the table afterwards.
quali_dataframe.to_sql("qualifying", engine, if_exists="replace")


16459

In [None]:

# Table handle for "qualifying", loaded through the engine.
qualifyingTable = Table("qualifying", metadata, autoload_with=engine)

# Preview: select the first 10 rows.
query = qualifyingTable.select().limit(10)

# Fetch everything while the connection is open, then print one row per line.
with engine.connect() as connection:
    result = connection.execute(query)
    rows = result.fetchall()

for row in rows:
    print(row)

(0, '1', 22, 'Lewis Hamilton', 'HAM', 'McLaren Mercedes', '1:15.218', '1:14.603', '1:15.666', 13.0, '2008', 'germany')
(1, '2', 2, 'Felipe Massa', 'MAS', 'Ferrari', '1:14.921', '1:14.747', '1:15.859', 16.0, '2008', 'germany')
(2, '3', 23, 'Heikki Kovalainen', 'KOV', 'McLaren Mercedes', '1:15.476', '1:14.855', '1:16.143', 17.0, '2008', 'germany')
(3, '4', 11, 'Jarno Trulli', 'TRU', 'Toyota', '1:15.560', '1:15.122', '1:16.191', 21.0, '2008', 'germany')
(4, '5', 5, 'Fernando Alonso', 'ALO', 'Renault', '1:15.917', '1:14.943', '1:16.385', 19.0, '2008', 'germany')
(5, '6', 1, 'Kimi Räikkönen', 'RAI', 'Ferrari', '1:15.201', '1:14.949', '1:16.389', 19.0, '2008', 'germany')
(6, '7', 4, 'Robert Kubica', 'KUB', 'Sauber BMW', '1:15.985', '1:15.109', '1:16.521', 20.0, '2008', 'germany')
(7, '8', 10, 'Mark Webber', 'WEB', 'Red Bull Renault', '1:15.900', '1:15.481', '1:17.014', 20.0, '2008', 'germany')
(8, '9', 15, 'Sebastian Vettel', 'VET', 'STR Ferrari', '1:15.532', '1:15.420', '1:17.244', 22.0, '2

In [None]:
# Load race data.
# Every CSV matched by the pattern below is read (its own header row is skipped)
# and tagged with the year and location parsed from its path:
#   .../<year>/Race Results/<file>.csv
# where the location is the file-name text before the first underscore, with
# hyphens turned into spaces.
race_column_names = ["Position","Driver Number","Driver","Driver Abbreviation","Car","Laps","Time/Retired","Points","Year","Location"]

# NOTE: relative pattern - it is resolved against the current working directory.
race_directory_path = "drive/MyDrive/f1_data/*/Race Results/*.csv"
# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined frame stable from one run to the next.
race_files = sorted(glob.glob(race_directory_path))
if not race_files:
  # Without this guard pd.concat fails later with an opaque
  # "No objects to concatenate" error.
  raise FileNotFoundError(
      f"No race CSV files matched {race_directory_path!r}; "
      "check that Google Drive is mounted and the working directory is correct."
  )

dfs = []
for file in race_files:
  # Index from the end of the path so the parsing does not depend on how many
  # directories precede the <year> folder.
  path_parts = file.split("/")
  year = path_parts[-3]
  location = path_parts[-1].split("_")[0].replace("-", " ")
  df = pd.read_csv(file, header = None, skiprows=1)
  df['Year'] = year
  df['Location'] = location
  dfs.append(df)

race_dataframe = pd.concat(dfs, ignore_index=True)
# Columns are positional up to this point (header=None), so name them here.
race_dataframe.columns = race_column_names

In [None]:
# Number of race-result rows loaded across all files.
print(race_dataframe.shape[0])

24655


In [None]:
# Show every race row except the final 20 (pandas abbreviates the middle when printing).
print(race_dataframe.iloc[:-20])

      Position Driver Number             Driver Driver Abbreviation  \
0            1             2       Felipe Massa                 MAS   
1            2             1     Kimi Räikkönen                 RAI   
2            3             4      Robert Kubica                 KUB   
3            4             3      Nick Heidfeld                 HEI   
4            5            23  Heikki Kovalainen                 KOV   
...        ...           ...                ...                 ...   
24630        3             8       Jochen Rindt                 RIN   
24631        4             5        Graham Hill                 HIL   
24632        5             6     Jackie Stewart                 STE   
24633        6             9    Lorenzo Bandini                 BAN   
24634        7            12         Dan Gurney                 GUR   

                    Car  Laps Time/Retired  Points  Year Location  
0               Ferrari  57.0  1:31:06.970    10.0  2008  bahrain  
1          

In [None]:
# Use "replace" rather than "append": appending would insert every race result
# again on each re-run of this cell, so per-driver win counts computed from the
# table would be multiplied by the number of runs.
race_dataframe.to_sql("race", engine, if_exists="replace")

24655

In [None]:
# Table handle for "race", loaded through the engine.
raceTable = Table("race", metadata, autoload_with=engine)

# Preview: select the first 10 rows.
query = raceTable.select().limit(10)

# Fetch everything while the connection is open, then print one row per line.
with engine.connect() as connection:
    result = connection.execute(query)
    rows = result.fetchall()

for row in rows:
    print(row)

(0, '1', '2', 'Felipe Massa', 'MAS', 'Ferrari', 57.0, '1:31:06.970', 10.0, '2008', 'bahrain')
(1, '2', '1', 'Kimi Räikkönen', 'RAI', 'Ferrari', 57.0, '+3.339s', 8.0, '2008', 'bahrain')
(2, '3', '4', 'Robert Kubica', 'KUB', 'Sauber BMW', 57.0, '+4.998s', 6.0, '2008', 'bahrain')
(3, '4', '3', 'Nick Heidfeld', 'HEI', 'Sauber BMW', 57.0, '+8.409s', 5.0, '2008', 'bahrain')
(4, '5', '23', 'Heikki Kovalainen', 'KOV', 'McLaren Mercedes', 57.0, '+26.789s', 4.0, '2008', 'bahrain')
(5, '6', '11', 'Jarno Trulli', 'TRU', 'Toyota', 57.0, '+41.314s', 3.0, '2008', 'bahrain')
(6, '7', '10', 'Mark Webber', 'WEB', 'Red Bull Renault', 57.0, '+45.473s', 2.0, '2008', 'bahrain')
(7, '8', '7', 'Nico Rosberg', 'ROS', 'Williams Toyota', 57.0, '+55.889s', 1.0, '2008', 'bahrain')
(8, '9', '12', 'Timo Glock', 'GLO', 'Toyota', 57.0, '+69.500s', 0.0, '2008', 'bahrain')
(9, '10', '5', 'Fernando Alonso', 'ALO', 'Renault', 57.0, '+77.181s', 0.0, '2008', 'bahrain')


In [None]:
!pip install langchain langchain-experimental pymysql transformers accelerate


Collecting langchain
  Downloading langchain-0.1.17-py3-none-any.whl (867 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m867.6/867.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-experimental
  Downloading langchain_experimental-0.0.57-py3-none-any.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.4/193.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymysql
  Downloading PyMySQL-1.1.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.5-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)

In [None]:
from langchain.utilities import SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import HumanMessagePromptTemplate
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import transformers
import torch
from langchain.schema import HumanMessage, SystemMessage

# Hugging Face model id used for SQL generation.
model = "codellama/CodeLlama-7b-hf"

# Upper bound on tokens generated per call. The chain only needs a single SQL
# statement back; the previous budget of 50000 tokens lets generation run far
# longer than that when the model does not stop by itself.
# NOTE(review): 256 is a judgement call - raise it if queries come back truncated.
MAX_NEW_TOKENS = 256

tokenizer = AutoTokenizer.from_pretrained(model)
# NOTE: the assignment below rebinds the name `pipeline` that was imported from
# transformers; the name is kept because `hf` is built from it right after.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    # Reuse the tokenizer loaded above instead of letting the pipeline load
    # a second copy from the model id.
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    max_new_tokens=MAX_NEW_TOKENS,
)
# LangChain wrapper around the transformers pipeline.
hf = HuggingFacePipeline(pipeline=pipeline)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Same SQLite file the "race" and "qualifying" tables were written to.
sqllite_url = "sqlite:///database.sqlite"
# The sample rows of each table are embedded in the prompt sent to the model.
# 100 rows per table makes every prompt very large; a few rows are enough to
# show the column formats.
db = SQLDatabase.from_uri(sqllite_url, include_tables=["race","qualifying"],sample_rows_in_table_info=3)

# Chain that asks the LLM for a SQL query, runs it against `db`, and reports the answer.
chain = SQLDatabaseChain.from_llm(hf, db, verbose = True)

# NOTE(review): `prompt` and `messages` are not consumed by the chain call
# below; they are kept only so the names stay defined.
prompt = PromptTemplate(
    input_variables=["query"],
    template="Answer the question based on the data in the database: {query}",
)

messages = [
    SystemMessage(content="You have access to a database containing Formula 1 race and qualifying results."),
    HumanMessage(content="Which driver has won the most races in the history of Formula 1?"),
]

# The question is passed under the "query" key. Generation limits are not
# passed here: the `max_new_tokens`/`max_length` arguments previously given to
# invoke() had no effect (the recorded warning still reported the pipeline's
# own value), so the limit has to be set where the pipeline is built.
inputs = {"query": "Who has won the most races in Formula 1?"}
print(chain.invoke(inputs))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=50000) and `max_length`(=50000) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)




[1m> Entering new SQLDatabaseChain chain...[0m
Who has won the most races in Formula 1?
SQLQuery: