In [None]:
import glob
import os
import re
import shutil
import sqlite3

import pandas as pd


In [None]:
# Directory that is searched for the ride CSV files to import.
csv_dir = "mobi_csv"
# Name of the SQLite table the rides are loaded into.
table_name = "rides"

In [None]:
def standardize_column_string(name: str) -> str:
    """Convert a raw column header into a snake_case identifier.

    Text inside parentheses is dropped, surrounding whitespace is trimmed,
    the result is lower-cased, and every run of whitespace becomes a single
    underscore.

    Args:
        name: Column header as it appears in the source file.

    Returns:
        The normalized column name, e.g. ``"Covered distance (m)"`` becomes
        ``"covered_distance"``.
    """
    # Remove any text within parentheses. The pattern does not span nested
    # pairs, so only the innermost group of a nested expression is removed.
    without_parens = re.sub(r'\([^()]*\)', '', name)
    # Collapse whitespace runs so that a parenthetical removed from the middle
    # of a header ("Duration (sec.) total") does not leave a double underscore.
    return re.sub(r'\s+', '_', without_parens.strip().lower())

def standardize_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Rename every column of ``df`` using ``standardize_column_string``.

    The column labels are replaced on the frame that was passed in (no copy
    is made), and that same frame is returned.
    """
    df.columns = list(map(standardize_column_string, df.columns))
    return df

In [None]:
# Read a few rows of one export only to discover the column headers. The path
# is built from csv_dir so it follows the configured data directory instead of
# repeating the directory name.
sample_csv = os.path.join(csv_dir, "September_2025.csv")
sample_df = pd.read_csv(sample_csv, nrows=5)
sample_df = standardize_column_names(sample_df)
# Normalized column names of the sample file, as a plain list.
standardized_columns = sample_df.columns.tolist()
# standardized_columns
# ['departure',
#  'return',
#  'bike',
#  'electric_bike',
#  'departure_station',
#  'return_station',
#  'membership_type',
#  'covered_distance',
#  'duration',
#  'departure_temperature',
#  'return_temperature',
#  'stopover_duration',
#  'number_of_stopovers']




In [None]:
# Declared SQL column types, paired positionally with standardized_columns.
standardized_columns_datatypes = [
    'DATETIME',     # departure
    'DATETIME',     # return
    'INTEGER',      # bike
    'BOOLEAN',      # electric_bike
    'TEXT',         # departure_station
    'TEXT',         # return_station
    'TEXT',         # membership_type
    'INTEGER',      # covered_distance
    'INTEGER',      # duration
    'INTEGER',      # departure_temperature
    'INTEGER',      # return_temperature
    'INTEGER',      # stopover_duration
    'INTEGER'       # number_of_stopovers
]
# zip() silently stops at the shorter input, which would drop columns from the
# mapping if the sample file's header has a different number of columns.
# Fail loudly instead of building a partial schema.
if len(standardized_columns) != len(standardized_columns_datatypes):
    raise ValueError(
        f"Expected {len(standardized_columns_datatypes)} columns but the sample "
        f"file has {len(standardized_columns)}: {standardized_columns}"
    )
# Mapping of column name -> declared SQL type.
standardized_columns_dict = dict(zip(standardized_columns, standardized_columns_datatypes))

In [None]:
# Open a connection to the SQLite database file "mobi.db".
conn = sqlite3.connect("mobi.db")
# Cursor used to execute raw SQL statements on that connection.
cursor = conn.cursor()

In [None]:
table_name = 'rides'


def _quote_identifier(identifier: str) -> str:
    """Return ``identifier`` as a double-quoted SQL identifier.

    Embedded double quotes are escaped by doubling them.
    """
    return '"' + identifier.replace('"', '""') + '"'


# Column names originate from CSV headers, so quote them: an unquoted name that
# contains punctuation or collides with an SQL keyword would break the DDL.
column_sql_strings = [
    f"{_quote_identifier(col_name)} {col_type}"
    for col_name, col_type in standardized_columns_dict.items()
]
# IF NOT EXISTS keeps the statement safe to re-run against an existing table.
create_table_sql = (
    f"CREATE TABLE IF NOT EXISTS {_quote_identifier(table_name)} "
    f"({', '.join(column_sql_strings)})"
)

In [None]:
# Display the generated column definitions for inspection.
column_sql_strings

In [None]:
# Run the CREATE TABLE statement built above and commit it.
cursor.execute(create_table_sql)
conn.commit()

In [None]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the import order (and with it the insertion order of the rows) is the same
# on every run.
csv_files = sorted(glob.glob(os.path.join(csv_dir, "*.csv")))

In [None]:
# The header line ends with a colon, so follow it with the files it announces.
print(f"Found {len(csv_files)} CSV files in directory '{csv_dir}':")
for csv_path in csv_files:
    print(f"  {csv_path}")

In [None]:
# Binds `workspace` to the dict returned by locals().
# NOTE(review): nothing in the visible cells reads `workspace`; this looks like
# leftover debugging. Confirm it is unused before removing it.
workspace = locals()

In [None]:
# Paths of the CSV files that have already been imported; the import loop
# skips any file found in this set.
# NOTE(review): the set lives only in kernel memory, while the loop appends to
# a table in the database file. After a kernel restart every file would be
# appended again; confirm whether duplicate rows are acceptable.
finished_files = set()


In [None]:
def read_mobi_file(file):
    """Load one ride CSV and keep only the recognized columns.

    Column headers are normalized with ``standardize_column_names``; any
    column whose normalized name is not in ``standardized_columns`` is
    dropped. The remaining columns keep the order they have in the file.
    """
    frame = standardize_column_names(pd.read_csv(file))
    wanted = [column for column in frame.columns if column in standardized_columns]
    return frame[wanted]



In [None]:
# Display the configured CSV directory.
csv_dir

In [None]:
# Files that fail to import are moved into this sub-directory of csv_dir.
bad_files_dir = os.path.join(csv_dir, "bad_files")
os.makedirs(bad_files_dir, exist_ok=True)

for file in csv_files:
    if file in finished_files:
        print(f"Skipping already imported file: {file}")
        continue

    print(f"Importing file: {file}")
    try:
        df = read_mobi_file(file)
        df.to_sql(table_name, conn, if_exists='append', index=False)
    except Exception as exc:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) still stops the loop, and report the cause
        # instead of hiding it.
        print(f"Failed to import file: {file} ({exc!r})")
        try:
            # The os module has no move(); calling it raised AttributeError
            # inside the handler and aborted the whole loop.
            shutil.move(file, bad_files_dir)
        except OSError as move_exc:
            # For example, a file of the same name is already in bad_files_dir.
            # Report it and carry on with the remaining files.
            print(f"Could not move {file} to {bad_files_dir}: {move_exc!r}")
    else:
        # Only mark the file as done once the append has succeeded.
        finished_files.add(file)

In [None]:
# Spot-check the import: fetch up to ten rows of the table and display them.
query = f"SELECT * FROM {table_name} LIMIT 10"
test = pd.read_sql_query(query, conn)
test