In [1]:
# must occur first: every later cell reads from or writes to paths under /content/drive
# NOTE(review): force_remount=True presumably re-mounts even when Drive is already mounted — confirm in the Colab docs
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import os
import pandas as pd

# step one: figure out how to merge all the text files into a single csv for each subcategory of the world stock data

# here is an adapted function based on Google Gemini output when prompted with a more detailed version of the above comment
# this function has more robust error handling than I maybe need, but hey, it seems to work
# this is pretty much the same thing that Isaac did for our initial merge as well, but I intend to run it for all of the
# folders underneath world to get a bunch of other stock datasets that we could look for correlations between

def combine_txts_from_folder(in_folder_path, out_file_path):
  """Merge every ``.txt`` file in a folder into one CSV file.

  Each ``.txt`` file is read as comma-separated data and tagged with a
  ``market`` column holding the file name without its extension. The tables
  are concatenated and written to ``out_file_path`` without the index column.

  Problems are reported with ``print`` rather than raised: an unreadable file
  is skipped, and a missing input folder (or any other unexpected failure)
  ends the run without writing anything.

  Args:
    in_folder_path: folder whose ``.txt`` files are combined (sub-folders are
      not searched).
    out_file_path: destination CSV path. Missing parent folders are created,
      so nested names such as ``.../currencies/major.csv`` work.

  Returns:
    None.
  """
  combined_data = []

  try:
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # of the combined file reproducible between runs
    for filename in sorted(os.listdir(in_folder_path)):
      if not filename.endswith(".txt"):
        continue

      market_name = os.path.splitext(filename)[0]  # file name without ".txt"
      file_path = os.path.join(in_folder_path, filename)

      try:
        # assumes comma separated values; change sep for another delimiter
        df = pd.read_csv(file_path, sep=",")
        df['market'] = market_name
        combined_data.append(df)
      except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
      except pd.errors.EmptyDataError:
        print(f"Error: Empty file - {file_path}")
      except pd.errors.ParserError:
        print(f"Error: Parsing error in file - {file_path}")
      except Exception as e:
        print(f"An unexpected error occurred processing {file_path}: {e}")

    if combined_data:  # at least one file was loaded successfully
      # to_csv does not create folders, so make sure the destination exists
      out_folder = os.path.dirname(out_file_path)
      if out_folder:
        os.makedirs(out_folder, exist_ok=True)

      combined_df = pd.concat(combined_data, ignore_index=True)
      combined_df.to_csv(out_file_path, index=False)
      print(f"Combined data saved to {out_file_path}")
    else:
      print("No .txt files found or successfully processed in the specified folder.")

  except FileNotFoundError:
    print(f"Error: Folder not found - {in_folder_path}")
  except Exception as e:
    print(f"An unexpected error occurred: {e}")


In [3]:
# test the function on a single folder before running it for every category
drive_folder = "/content/drive/MyDrive/CSC442 Team Project/Datasets/world/stooq stocks indices"  # Your Google Drive folder path
output_csv = "/content/drive/MyDrive/CSC442 Team Project/Output/world_combined/stooq stocks indices.csv" # Your desired output path.
# the function reports success or failure by printing; it does not return anything
combine_txts_from_folder(drive_folder, output_csv)

Combined data saved to /content/drive/MyDrive/CSC442 Team Project/Output/world_combined/stooq stocks indices.csv


In [4]:
# read the combined file back from Drive to confirm it was written as expected
test_df = pd.read_csv(
    "/content/drive/MyDrive/CSC442 Team Project/Output/world_combined/stooq stocks indices.csv"
)

# (rows, columns), followed by a preview of the first rows
print(test_df.shape)
test_df.head()

(34570, 11)


Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>,<OPENINT>,market
0,^_USNM,D,20160216,0,833.46,835.37,833.46,835.37,69634617.0,0,^_usnm
1,^_USNM,D,20160217,0,838.96,848.24,838.96,848.24,68263582.0,0,^_usnm
2,^_USNM,D,20160218,0,848.99,848.99,848.14,848.14,64118433.0,0,^_usnm
3,^_USNM,D,20160219,0,849.0,849.0,845.8,845.8,65392448.0,0,^_usnm
4,^_USNM,D,20160222,0,851.78,853.42,851.78,853.42,65156979.0,0,^_usnm


In [5]:
# market categories (sub-folders of "world") to combine; later cells index dfs in this order
folder_names = ["cryptocurrencies", "bonds", "currencies/major", "currencies/other", "indices", "money market", "stooq stocks indices" ]

# where the raw folders live, where the combined files go, and the combined file extension
IN_FOLDER_HEAD = "/content/drive/MyDrive/CSC442 Team Project/Datasets/world/"
OUT_FOLDER_HEAD = "/content/drive/MyDrive/CSC442 Team Project/Output/world_combined/"
FILE_EXT = ".csv"

# one combined dataframe per entry of folder_names
dfs = []

for name in folder_names:
  # input folder and output file for this category
  drive_folder = f"{IN_FOLDER_HEAD}{name}"
  output_csv = f"{OUT_FOLDER_HEAD}{name}{FILE_EXT}"

  # the combined file acts as a cache: only rebuild it when it is missing
  if not os.path.exists(output_csv):
    combine_txts_from_folder(drive_folder, output_csv)

  # load the combined csv into our list of dataframes
  dfs.append(pd.read_csv(output_csv))

In [6]:
# how many dataframes were loaded, then the (rows, columns) of each in folder_names order
print(f"Total dataframes: {len(dfs)}")
for frame in dfs:
  print(frame.shape)

Total dataframes: 7
(383018, 11)
(1226698, 11)
(828905, 11)
(14643669, 11)
(677827, 11)
(175807, 11)
(34570, 11)


In [7]:
# standardize the dates
import datetime

# parse the raw YYYYMMDD values in place; values that cannot be parsed become NaT
for df in dfs:
  df["<DATE>"] = pd.to_datetime(df['<DATE>'], format='%Y%m%d', errors='coerce')

# first and last date covered by each data set, in the same order as dfs
earliest = [frame["<DATE>"].min() for frame in dfs]
latest = [frame["<DATE>"].max() for frame in dfs]

print("Earliest:")
for first_day in earliest:
  print(first_day)

print("Latest:")
for last_day in latest:
  print(last_day)

# the window shared by every data set starts at the latest of the "earliest" dates
# and ends at the earliest of the "latest" dates
overall_earliest = max(earliest)
overall_latest = min(latest)

print(f"\nOverall: earliest = {overall_earliest}, latest = {overall_latest}")

Earliest:
2010-07-17 00:00:00
1871-01-01 00:00:00
1792-03-01 00:00:00
1793-03-01 00:00:00
1789-05-01 00:00:00
1995-01-02 00:00:00
1991-04-16 00:00:00
Latest:
2025-02-24 00:00:00
2025-02-24 00:00:00
2025-02-24 00:00:00
2025-02-24 00:00:00
2025-02-24 00:00:00
2025-02-24 00:00:00
2025-02-24 00:00:00

Overall: earliest = 2010-07-17 00:00:00, latest = 2025-02-24 00:00:00


From here I'm just printing a sample from each dataframe for verification

In [8]:
dfs[0].head() # cryptocurrencies (folder_names[0]): first rows, for a visual check

Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>,<OPENINT>,market
0,ASTR.V,D,2022-05-14,0,0.071378,0.071378,0.071272,0.071279,20234.76,0,astr.v
1,ASTR.V,D,2022-05-15,0,0.068804,0.073412,0.065701,0.073105,23436700.0,0,astr.v
2,ASTR.V,D,2022-05-16,0,0.073412,0.075017,0.064503,0.067312,25383770.0,0,astr.v
3,ASTR.V,D,2022-05-17,0,0.067412,0.071122,0.065415,0.069916,28364640.0,0,astr.v
4,ASTR.V,D,2022-05-18,0,0.070109,0.073512,0.060022,0.06152,38051090.0,0,astr.v


In [9]:
dfs[1].head() # bonds (folder_names[1]): first rows, for a visual check

Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>,<OPENINT>,market
0,10YCHY.B,D,2005-11-28,0,2.182,2.221,2.164,2.176,0,0,10ychy.b
1,10YCHY.B,D,2005-11-29,0,2.163,2.178,2.142,2.172,0,0,10ychy.b
2,10YCHY.B,D,2005-11-30,0,2.185,2.185,2.151,2.164,0,0,10ychy.b
3,10YCHY.B,D,2005-12-01,0,2.175,2.183,2.152,2.166,0,0,10ychy.b
4,10YCHY.B,D,2005-12-02,0,2.174,2.195,2.147,2.147,0,0,10ychy.b


In [10]:
dfs[2].head() # currencies/major (folder_names[2]): first rows, for a visual check

Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>,<OPENINT>,market
0,CADCHF,D,1971-01-04,0,4.2714,4.2714,4.2714,4.2714,0,0,cadchf
1,CADCHF,D,1971-01-05,0,4.2682,4.2682,4.2682,4.2682,0,0,cadchf
2,CADCHF,D,1971-01-06,0,4.2662,4.2662,4.2662,4.2662,0,0,cadchf
3,CADCHF,D,1971-01-07,0,4.2474,4.2474,4.2474,4.2474,0,0,cadchf
4,CADCHF,D,1971-01-08,0,4.2456,4.2456,4.2456,4.2456,0,0,cadchf


In [11]:
dfs[3].head() # currencies/other (folder_names[3]): first rows, for a visual check

Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>,<OPENINT>,market
0,JPYXPT,D,1984-01-09,0,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0,0,jpyxpt
1,JPYXPT,D,1984-01-16,0,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0,0,jpyxpt
2,JPYXPT,D,1984-01-23,0,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0,0,jpyxpt
3,JPYXPT,D,1984-01-30,0,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0,0,jpyxpt
4,JPYXPT,D,1984-02-06,0,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0,0,jpyxpt


In [12]:
dfs[4].head() # indices (folder_names[4]): first rows, for a visual check

Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>,<OPENINT>,market
0,^BUX,D,1991-01-02,0,1000.0,1000.0,1000.0,1000.0,0.0,0,^bux
1,^BUX,D,1991-01-03,0,1001.91,1001.91,1001.91,1001.91,0.0,0,^bux
2,^BUX,D,1991-01-04,0,996.34,996.34,996.34,996.34,0.0,0,^bux
3,^BUX,D,1991-01-07,0,997.61,997.61,997.61,997.61,0.0,0,^bux
4,^BUX,D,1991-01-08,0,1002.78,1002.78,1002.78,1002.78,0.0,0,^bux


In [13]:
# money market
# the sample rows carry one value per day (open, high, low and close all match),
# so growth is taken as the change in close from the previous row of the same ticker
money_market_df = dfs[5]
money_market_df['<GROWTH>'] = money_market_df["<CLOSE>"].groupby(money_market_df["<TICKER>"]).diff()
# constant zero column, so that "<GROWTH>" minus "<ZERO>" can be fed to a two-column difference
money_market_df["<ZERO>"] = 0

money_market_df.head()

Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>,<OPENINT>,market,<GROWTH>,<ZERO>
0,PLBPLN6M,D,1997-02-03,0,21.7,21.7,21.7,21.7,0,0,plbpln6m,,0
1,PLBPLN6M,D,1997-02-04,0,21.75,21.75,21.75,21.75,0,0,plbpln6m,0.05,0
2,PLBPLN6M,D,1997-02-05,0,21.77,21.77,21.77,21.77,0,0,plbpln6m,0.02,0
3,PLBPLN6M,D,1997-02-06,0,21.75,21.75,21.75,21.75,0,0,plbpln6m,-0.02,0
4,PLBPLN6M,D,1997-02-07,0,21.75,21.75,21.75,21.75,0,0,plbpln6m,0.0,0


In [15]:
dfs[6].head() # stooq stocks indices (folder_names[6]): first rows, for a visual check

Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>,<OPENINT>,market
0,^_USNM,D,2016-02-16,0,833.46,835.37,833.46,835.37,69634617.0,0,^_usnm
1,^_USNM,D,2016-02-17,0,838.96,848.24,838.96,848.24,68263582.0,0,^_usnm
2,^_USNM,D,2016-02-18,0,848.99,848.99,848.14,848.14,64118433.0,0,^_usnm
3,^_USNM,D,2016-02-19,0,849.0,849.0,845.8,845.8,65392448.0,0,^_usnm
4,^_USNM,D,2016-02-22,0,851.78,853.42,851.78,853.42,65156979.0,0,^_usnm


In [16]:
# Now I will attempt to merge these into a single dataset
# The result will be entries (rows) corresponding to dates
# Each row will have features (columns) corresponding to the average growth and volatility of the markets in each of the datasets above

# for reference, these are the categories of markets:
#    cryptocurrencies, bonds, currencies/major, currencies/other, indices, money market, stooq stocks indices

# here is a function to extract the average of the differences between two columns in a dataframe for all entries on a given date
# subtracts the second column from the first
# throws an exception if the date has no entries
def get_avg_diff(df, date, col1, col2):
  """Return the mean of ``col1 - col2`` over the rows of ``df`` dated ``date``.

  Args:
    df: dataframe with a '<DATE>' column and the two columns to compare.
    date: value compared for equality against the '<DATE>' column.
    col1: name of the column to subtract from.
    col2: name of the column that is subtracted.

  Returns:
    The mean of the row-wise differences for the matching rows.

  Raises:
    ValueError: if no row of ``df`` has the requested date.
  """
  date_df = df[df['<DATE>'] == date]

  if date_df.empty:
    # name the date in the message so a skipped date can be traced
    raise ValueError(f"no entries found for date {date}")

  return (date_df[col1] - date_df[col2]).mean()

# test: take the date of the crypto row labelled 10 and average the bonds (dfs[1])
# high-low spread on that same date
test_date = dfs[0]['<DATE>'][10]
# string form of the same date; note the call below passes test_date, not this string
test_date_str = test_date.strftime('%Y-%m-%d')

print(get_avg_diff(dfs[1], test_date, '<HIGH>', '<LOW>'))


0.06999054054054052


In [17]:
earliest_date = pd.to_datetime(overall_earliest).date()
latest_date = pd.to_datetime(overall_latest).date()

# every calendar day in the window shared by all categories
date_range = pd.date_range(earliest_date, latest_date).date

# (position in dfs, output column prefix, growth columns, volatility columns or None)
# growth is first column minus second column; so is volatility.
# money market has no volatility feature: its growth is the precomputed '<GROWTH>'
# column, paired with the all-zero '<ZERO>' column to keep the same two-column form
CATEGORY_FEATURES = [
    (0, "Crypto", ("<CLOSE>", "<OPEN>"), ("<HIGH>", "<LOW>")),
    (1, "Bonds", ("<CLOSE>", "<OPEN>"), ("<HIGH>", "<LOW>")),
    (2, "Major Currencies", ("<CLOSE>", "<OPEN>"), ("<HIGH>", "<LOW>")),
    (3, "Other Currencies", ("<CLOSE>", "<OPEN>"), ("<HIGH>", "<LOW>")),
    (4, "Indices", ("<CLOSE>", "<OPEN>"), ("<HIGH>", "<LOW>")),
    (5, "Money Market", ("<GROWTH>", "<ZERO>"), None),
    (6, "Stooqs Stocks", ("<CLOSE>", "<OPEN>"), ("<HIGH>", "<LOW>")),
]


def _daily_avg_diff(df, col1, col2):
  """Return the mean of ``col1 - col2`` for every date in ``df``, indexed by date."""
  return (df[col1] - df[col2]).groupby(df["<DATE>"]).mean()


# one groupby per feature: each dataframe is scanned once per feature instead of
# once per feature for every date in the range
daily_features = []
for position, label, growth_cols, volatility_cols in CATEGORY_FEATURES:
  category_df = dfs[position]
  if volatility_cols is not None:
    daily_features.append(
        _daily_avg_diff(category_df, *volatility_cols).rename(f"{label} Volatility"))
  daily_features.append(
      _daily_avg_diff(category_df, *growth_cols).rename(f"{label} Growth"))

# the inner join keeps only the dates that have at least one entry in every category
daily_df = pd.concat(daily_features, axis=1, join="inner").sort_index()

# restrict to the shared window (dates present in every category already fall inside it)
in_window = (daily_df.index >= pd.Timestamp(earliest_date)) & (daily_df.index <= pd.Timestamp(latest_date))
daily_df = daily_df[in_window]

# dates are stored as 'YYYY-MM-DD' strings in a leading 'Date' column
daily_df.index = daily_df.index.strftime('%Y-%m-%d')
final_df = daily_df.rename_axis("Date").reset_index()

# same rows as a list of dicts, kept under the name this cell has always defined
merged_data = final_df.to_dict("records")

# calendar days in the window for which at least one category had no data
dates_with_data = set(final_df["Date"])
skipped_dates = [
    day_str
    for day_str in (day.strftime('%Y-%m-%d') for day in date_range)
    if day_str not in dates_with_data
]

final_df.to_csv("/content/drive/MyDrive/CSC442 Team Project/Output/world_combined/final_data.csv", index=False)

print(f"Skipped {len(skipped_dates)} dates due to missing data")
# show only a preview; the full list stays available in skipped_dates
print(f"First skipped dates: {skipped_dates[:10]}")
print(final_df.shape)
final_df.head()


Skipped 1627 dates due to missing data
['2010-07-17', '2010-07-18', '2010-07-24', '2010-07-25', '2010-07-31', '2010-08-01', '2010-08-07', '2010-08-08', '2010-08-14', '2010-08-15', '2010-08-21', '2010-08-22', '2010-08-28', '2010-08-29', '2010-09-04', '2010-09-05', '2010-09-11', '2010-09-12', '2010-09-18', '2010-09-19', '2010-09-25', '2010-09-26', '2010-10-02', '2010-10-03', '2010-10-09', '2010-10-10', '2010-10-16', '2010-10-17', '2010-10-23', '2010-10-24', '2010-10-30', '2010-10-31', '2010-11-01', '2010-11-06', '2010-11-07', '2010-11-11', '2010-11-13', '2010-11-14', '2010-11-20', '2010-11-21', '2010-11-27', '2010-11-28', '2010-12-04', '2010-12-05', '2010-12-11', '2010-12-12', '2010-12-18', '2010-12-19', '2010-12-24', '2010-12-25', '2010-12-26', '2011-01-01', '2011-01-02', '2011-01-06', '2011-01-08', '2011-01-09', '2011-01-15', '2011-01-16', '2011-01-22', '2011-01-23', '2011-01-29', '2011-01-30', '2011-02-05', '2011-02-06', '2011-02-12', '2011-02-13', '2011-02-19', '2011-02-20', '2011-02

Unnamed: 0,Date,Crypto Volatility,Crypto Growth,Bonds Volatility,Bonds Growth,Major Currencies Volatility,Major Currencies Growth,Other Currencies Volatility,Other Currencies Growth,Indices Volatility,Indices Growth,Money Market Growth,Stooqs Stocks Volatility,Stooqs Stocks Growth
0,2010-07-19,0.01584,-0.00504,0.04385,-0.001233,2.232157,-1.335385,0.168446,-0.043343,88.733051,23.229492,-0.1648,1.04,1.04
1,2010-07-20,0.00755,-0.00606,0.036574,-0.016573,2.213713,1.139422,0.166088,0.033777,121.568833,20.939833,-0.0288,1.92,-1.92
2,2010-07-21,0.01287,0.00447,0.037135,-0.025235,1.920781,0.361047,0.168475,0.063285,87.734,11.914667,-0.02,8.19,8.19
3,2010-07-22,0.03131,-0.02871,0.042159,-0.018359,1.66674,0.054496,0.130519,0.009895,132.509831,97.244746,0.0408,7.23,7.23
4,2010-07-23,0.01717,0.01212,0.04113,-0.013021,2.040011,-0.325834,0.163199,0.074019,79.866271,24.910169,0.0508,6.32,6.32


In [18]:
# display the merged per-date table (the rendered output elides the middle rows)
final_df

Unnamed: 0,Date,Crypto Volatility,Crypto Growth,Bonds Volatility,Bonds Growth,Major Currencies Volatility,Major Currencies Growth,Other Currencies Volatility,Other Currencies Growth,Indices Volatility,Indices Growth,Money Market Growth,Stooqs Stocks Volatility,Stooqs Stocks Growth
0,2010-07-19,0.015840,-0.005040,0.043850,-0.001233,2.232157,-1.335385,0.168446,-0.043343,88.733051,23.229492,-0.164800,1.040000,1.040000
1,2010-07-20,0.007550,-0.006060,0.036574,-0.016573,2.213713,1.139422,0.166088,0.033777,121.568833,20.939833,-0.028800,1.920000,-1.920000
2,2010-07-21,0.012870,0.004470,0.037135,-0.025235,1.920781,0.361047,0.168475,0.063285,87.734000,11.914667,-0.020000,8.190000,8.190000
3,2010-07-22,0.031310,-0.028710,0.042159,-0.018359,1.666740,0.054496,0.130519,0.009895,132.509831,97.244746,0.040800,7.230000,7.230000
4,2010-07-23,0.017170,0.012120,0.041130,-0.013021,2.040011,-0.325834,0.163199,0.074019,79.866271,24.910169,0.050800,6.320000,6.320000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3705,2025-02-18,31.386240,-7.274415,0.028454,-0.004294,135.387594,115.190675,4513.601961,-1841.628133,2510.178229,2159.370505,0.018394,3.731538,1.863846
3706,2025-02-19,17.226509,11.627288,0.037913,0.009154,77.136296,-33.208451,3136.558475,700.938474,1183.628211,82.133842,-0.006024,5.590000,-2.042308
3707,2025-02-20,23.208994,15.214756,0.033726,-0.010058,92.761039,-66.627199,3060.917499,1093.155275,1033.096662,110.179022,0.006424,4.326154,-2.506154
3708,2025-02-21,43.473538,-20.760155,0.067727,0.000753,99.507868,-28.058723,5625.323003,-2829.508183,1723.560463,-788.075584,0.009758,5.776667,-0.973333


Final data: look at, for a given date, average volatility (swing between high and low) across all the markets in each category on that date. Look at the distribution in each column for these values, and use these to create a boolean column for whether that category was particularly volatile on that day. We can then use clustering to identify particularly volatile days and see what events impacted that. Potentially use decision trees to determine volatility cutoffs.

We could also examine the average growth/loss on each date in each category and look for correlations between categories. We could also do some time-series analysis to see whether volatility or growth/loss in one category tends to lead to volatility or growth/loss in another (i.e. how sensitive each category is to the others, and how long it takes for the effects to spill over).