<a href="https://colab.research.google.com/github/kimkynningsrud/MasterThesis2024/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transfer Learning using DTW - Master Thesis 2024

## Importing Data

In [144]:
import os

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

# Base location of the raw CSV files in the thesis repository.
DATA_BASE_URL = "https://raw.githubusercontent.com/kimkynningsrud/MasterThesis2024/main/data"

# Credentials must not be committed with the notebook. If the repository is
# private, export a personal access token as GITHUB_TOKEN before starting the
# kernel; for a public repository no token is needed.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")


def load_dataset(url, name):
    """
    Download one CSV file and label the resulting DataFrame.

    :param url: HTTP(S) URL of the CSV file.
    :param name: Human-readable dataset label, stored as the ``name`` attribute
        of the returned DataFrame (read later when printing summaries).
    :return: DataFrame with the file contents.
    """
    # For HTTP(S) URLs pandas forwards storage_options as request headers, so
    # the token travels in the Authorization header instead of the URL.
    headers = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else None
    dataset = pd.read_csv(url, storage_options=headers)
    dataset.name = name
    return dataset


stock_prices_url = f"{DATA_BASE_URL}/indexData.csv"
stock_prices = load_dataset(stock_prices_url, "Stock Prices")

gold_prices_url = f"{DATA_BASE_URL}/Gold%20Price.csv"
gold_prices = load_dataset(gold_prices_url, "Gold Prices")

petrolium_prices_url = f"{DATA_BASE_URL}/PET_PRI_GND_DCUS_NUS_W.csv"
petrolium_prices = load_dataset(petrolium_prices_url, "Petrolium Prices")

In [145]:
# Preview the first rows of the stock index data as loaded from the CSV.
stock_prices.head()

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume
0,NYA,1965-12-31,528.690002,528.690002,528.690002,528.690002,528.690002,0.0
1,NYA,1966-01-03,527.210022,527.210022,527.210022,527.210022,527.210022,0.0
2,NYA,1966-01-04,527.840027,527.840027,527.840027,527.840027,527.840027,0.0
3,NYA,1966-01-05,531.119995,531.119995,531.119995,531.119995,531.119995,0.0
4,NYA,1966-01-06,532.070007,532.070007,532.070007,532.070007,532.070007,0.0


In [146]:
# Preview the first rows of the gold price data as loaded from the CSV.
gold_prices.head()

Unnamed: 0,Date,Price,Open,High,Low,Volume,Chg%
0,2014-01-01,29542,29435,29598,29340,2930,0.25
1,2014-01-02,29975,29678,30050,29678,3140,1.47
2,2014-01-03,29727,30031,30125,29539,3050,-0.83
3,2014-01-04,29279,29279,29279,29279,0,-1.51
4,2014-01-06,29119,29300,29395,29051,24380,-0.55


In [147]:
# Preview the first rows of the petroleum price data as loaded from the CSV.
petrolium_prices.head()

Unnamed: 0,Date,A1,A2,A3,R1,R2,R3,M1,M2,M3,P1,P2,P3,D1
0,01/02/1995,1.127,1.104,1.231,1.079,1.063,1.167,1.17,1.159,1.298,1.272,1.25,1.386,1.104
1,01/09/1995,1.134,1.111,1.232,1.086,1.07,1.169,1.177,1.164,1.3,1.279,1.256,1.387,1.102
2,01/16/1995,1.126,1.102,1.231,1.078,1.062,1.169,1.168,1.155,1.299,1.271,1.249,1.385,1.1
3,01/23/1995,1.132,1.11,1.226,1.083,1.068,1.165,1.177,1.165,1.296,1.277,1.256,1.378,1.095
4,01/30/1995,1.131,1.109,1.221,1.083,1.068,1.162,1.176,1.163,1.291,1.275,1.255,1.37,1.09


## Pre-processing

In [148]:
def preprocess_data(df, date_column='Date', date_format=None):
    """
    Preprocesses financial data by parsing dates, dropping incomplete and
    duplicate rows, and standardizing the numerical columns.

    Note: the DataFrame is modified in place and the same object is returned,
    so the caller's variable also reflects the dropped rows and scaled values.

    :param df: Pandas DataFrame containing the financial data. An optional
        ``name`` attribute on the frame is used to label the printed summary.
    :param date_column: Name of the column containing date information.
    :param date_format: The format of the dates in the date column. If None, pandas will infer the format.
    :return: Preprocessed DataFrame (the same object as ``df``).
    :raises KeyError: If ``date_column`` is not a column of ``df``.
    """
    # Resolve the label before touching the data so that a frame without the
    # ad-hoc ``name`` attribute cannot fail halfway through, after it has
    # already been mutated. A column literally called "name" would be returned
    # as a Series by attribute access, which is not a usable label either.
    dataset_name = getattr(df, 'name', 'unnamed')
    if isinstance(dataset_name, pd.Series):
        dataset_name = 'unnamed'

    # Convert the date column to datetime; values that cannot be parsed become
    # NaT and are therefore removed together with the other missing values.
    df[date_column] = pd.to_datetime(df[date_column], format=date_format, errors='coerce')

    # Count missing values, then drop every row containing one (no imputation)
    NA_values_before = df.isna().sum().sum()

    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    NA_values_after = df.isna().sum().sum()
    print(f'There are {NA_values_before} NA values in the {dataset_name} dataset. {NA_values_after} after removing them.')

    # Scale numerical columns except the date column. StandardScaler raises on
    # zero rows or zero columns, so skip scaling when there is nothing to fit.
    numerical_columns = df.select_dtypes(include=['number']).columns
    if len(numerical_columns) > 0 and not df.empty:
        scaler = StandardScaler()
        df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df

In [149]:
# Each entry pairs a loaded DataFrame with the date format passed to
# preprocess_data for its 'Date' column. The order matters because results
# are appended in the same order and addressed by position afterwards.
datasets = [
    (gold_prices, None),  # No explicit format: pandas infers it
    (stock_prices, None),   # No explicit format: pandas infers it
    (petrolium_prices, '%m/%d/%Y')  # Month/day/year strings such as 01/02/1995
]

# Preprocessing all datasets
# NOTE: preprocess_data drops rows and rescales columns on the frame it is
# given, so gold_prices, stock_prices and petrolium_prices themselves are
# changed by this loop; each list entry is the same object as its input.
preprocessed_datasets = []
for df, date_format in datasets:
    preprocessed_df = preprocess_data(df, date_format=date_format)
    preprocessed_datasets.append(preprocessed_df)

There are 0 NA values in the Gold Prices dataset. 0 after removing them.
There are 13224 NA values in the Stock Prices dataset. 0 after removing them.
There are 0 NA values in the Petrolium Prices dataset. 0 after removing them.


In [153]:
# Position 1 is the stock price frame (second entry of `datasets`); preview it
# after preprocessing to check the scaled numerical columns.
preprocessed_datasets[1].head()

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume
0,NYA,1965-12-31,-0.791197,-0.791442,-0.79059,-0.791087,-0.791057,-0.295191
1,NYA,1966-01-03,-0.791362,-0.791605,-0.790755,-0.791251,-0.791221,-0.295191
2,NYA,1966-01-04,-0.791292,-0.791535,-0.790685,-0.791181,-0.791151,-0.295191
3,NYA,1966-01-05,-0.790928,-0.791174,-0.790318,-0.790817,-0.790787,-0.295191
4,NYA,1966-01-06,-0.790822,-0.791069,-0.790212,-0.790712,-0.790682,-0.295191


In [150]:
# Row count of stock_prices. When run after the preprocessing cell this is the
# count after NA/duplicate removal, because preprocess_data modifies the frame
# in place.
print(len(stock_prices))

110253
