<a href="https://colab.research.google.com/github/jada-ke/CodeJam14/blob/main/DataCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mount Google Drive**

In [None]:
# Mount Google Drive at /content/drive; every CSV path used in this notebook
# lives under /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Forex Data Cleaning

Clean DataFrame

In [None]:
import pandas as pd

# Clean the raw forex CSV: parse dates, index by date, forward-fill gaps,
# keep only the OHLC price columns, and write the result to the Clean folder.

# Location of the raw forex data on the mounted Drive (update if the file moves)
file_path = '/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Collection/forex_data.csv'
forex_df = pd.read_csv(file_path)

# Display the first few rows to check the structure
print(forex_df.head())

# Parse 'Date' and use it as the index for time-series work.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
forex_df['Date'] = pd.to_datetime(forex_df['Date'])
forex_df = forex_df.set_index('Date')

# Report how many values are missing in each column
print(forex_df.isnull().sum())

# Forward-fill missing values with the previous row's value.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill'), which
# emits a FutureWarning on current pandas releases.
forex_df = forex_df.ffill()

# Keep only the OHLC price columns; any other column in the raw file is dropped
forex_df = forex_df[['Open', 'High', 'Low', 'Close']]

# Display the cleaned DataFrame
print(forex_df.head())

# Destination for the cleaned data (update if the folder layout changes)
save_path = '/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Clean/cleaned_forex_data.csv'

# Save with the 'Date' index written as the first CSV column
forex_df.to_csv(save_path)

print(f"Preprocessed data saved to: {save_path}")


         Date    Open    High     Low   Close
0  2016-01-01  0.7223  0.7226  0.7214  0.7226
1  2016-01-04  0.7205  0.7222  0.7147  0.7178
2  2016-01-05  0.7175  0.7193  0.7129  0.7150
3  2016-01-06  0.7147  0.7154  0.7084  0.7101
4  2016-01-07  0.7098  0.7115  0.7053  0.7080
Open     0
High     0
Low      0
Close    0
dtype: int64
              Open    High     Low   Close
Date                                      
2016-01-01  0.7223  0.7226  0.7214  0.7226
2016-01-04  0.7205  0.7222  0.7147  0.7178
2016-01-05  0.7175  0.7193  0.7129  0.7150
2016-01-06  0.7147  0.7154  0.7084  0.7101
2016-01-07  0.7098  0.7115  0.7053  0.7080
Preprocessed data saved to: /content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Clean/cleaned_forex_data.csv


  forex_df.fillna(method='ffill', inplace=True)


# Combined Forex and GDP

In [None]:
import pandas as pd

# Attach yearly GDP figures (one column per country) to every daily forex row
# and write the combined table to Master.csv.

# Input locations on the mounted Drive
forex_file_path = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Clean/cleaned_forex_data.csv"
gdp_file_path = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Collection/combined_gdp.csv"

# --- Forex: load and validate -------------------------------------------
try:
    forex_df = pd.read_csv(forex_file_path)
except FileNotFoundError:
    # Name the missing file, then let the original error propagate
    print(f"Error: File not found at {forex_file_path}")
    raise

required_forex_columns = {'Date', 'Open', 'High', 'Low', 'Close'}
missing_forex_columns = required_forex_columns.difference(forex_df.columns)
if missing_forex_columns:
    raise KeyError(f"Missing columns in Forex data: {missing_forex_columns}")

# Parse 'Date' and derive the calendar 'Year' used as the join key below
forex_df = forex_df.assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Year=lambda frame: frame['Date'].dt.year,
)

# --- GDP: load and validate ---------------------------------------------
try:
    gdp_df = pd.read_csv(gdp_file_path)
except FileNotFoundError:
    print(f"Error: File not found at {gdp_file_path}")
    raise

required_gdp_columns = {'Country', 'Year', 'GDP'}
missing_gdp_columns = required_gdp_columns.difference(gdp_df.columns)
if missing_gdp_columns:
    raise KeyError(f"Missing columns in GDP data: {missing_gdp_columns}")

# Reshape long (Country, Year, GDP) rows into one row per year with one
# column per country, then label the CAN/USA columns as GDP values
gdp_pivot = (
    gdp_df
    .pivot(index='Year', columns='Country', values='GDP')
    .reset_index()
    .rename_axis(None, axis='columns')  # drop the leftover 'Country' columns name
    .rename(columns={'CAN': 'CAN_GDP', 'USA': 'USA_GDP'})
)

# --- Combine: every forex row receives the GDP values of its year ---------
merged_df = forex_df.merge(gdp_pivot, on='Year', how='left')

# Persist the combined table without the positional index
output_file_path = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Clean/Master.csv"
merged_df.to_csv(output_file_path, index=False)

# Preview the combined dataset
print(merged_df.head())


        Date    Open    High     Low   Close  Year  CAN_GDP  USA_GDP
0 2016-01-01  0.7223  0.7226  0.7214  0.7226  2016  46545.8  57930.9
1 2016-01-04  0.7205  0.7222  0.7147  0.7178  2016  46545.8  57930.9
2 2016-01-05  0.7175  0.7193  0.7129  0.7150  2016  46545.8  57930.9
3 2016-01-06  0.7147  0.7154  0.7084  0.7101  2016  46545.8  57930.9
4 2016-01-07  0.7098  0.7115  0.7053  0.7080  2016  46545.8  57930.9


# Added Interest Rates to Master File

In [None]:
import pandas as pd

# Extend Master.csv with the USA and Canadian interest-rate series (joined on
# the 'Date' column) and write the result to Master2.csv.

# Inputs and output on the mounted Drive
master_file_path = '/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Clean/Master.csv'
usa_interest_file_path = '/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Clean/cleaned_usa_interest_rates.csv'
can_interest_file_path = '/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Clean/cleaned_can_interest_rates.csv'
output_file_path = '/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Clean/Master2.csv'

# Read the three CSVs in order; report a missing or unparsable file and re-raise
try:
    master_df, usa_interest_df, can_interest_df = [
        pd.read_csv(csv_path)
        for csv_path in (master_file_path, usa_interest_file_path, can_interest_file_path)
    ]
except FileNotFoundError as e:
    print(f"Error: {e}")
    raise
except pd.errors.ParserError as e:
    print(f"Error: There was an issue parsing one of the files: {e}")
    raise

# The helper 'Year' column from the previous step is not needed any more;
# errors='ignore' makes this a no-op when the column is absent
master_df = master_df.drop(columns=['Year'], errors='ignore')

# Left-join both interest-rate tables on 'Date', USA first and then Canada,
# so every row of the master table is kept
merged_df = (
    master_df
    .merge(usa_interest_df, on='Date', how='left')
    .merge(can_interest_df, on='Date', how='left')
)

# Preview the result
print("Final merged DataFrame:")
print(merged_df.head())

# Write the merged table without the positional index
merged_df.to_csv(output_file_path, index=False)

print(f"Merged data saved to {output_file_path}")


Final merged DataFrame:
         Date    Open    High     Low   Close  CAN_GDP  USA_GDP  \
0  2016-01-01  0.7223  0.7226  0.7214  0.7226  46545.8  57930.9   
1  2016-01-04  0.7205  0.7222  0.7147  0.7178  46545.8  57930.9   
2  2016-01-05  0.7175  0.7193  0.7129  0.7150  46545.8  57930.9   
3  2016-01-06  0.7147  0.7154  0.7084  0.7101  46545.8  57930.9   
4  2016-01-07  0.7098  0.7115  0.7053  0.7080  46545.8  57930.9   

   USA_Interest_Rate CAN_Interest_Rate  
0               0.34              0.50  
1               0.34              0.50  
2               0.34              0.50  
3               0.34              0.50  
4               0.34              0.50  
Merged data saved to /content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Clean/Master2.csv


# COMBINING DATA

Forex + GDP

In [None]:
import pandas as pd

# Attach to every daily forex row the most recent GDP observation dated on or
# before that day, and save the combined table.

# Input locations on the mounted Drive
forex_file = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Retrieved/forex_data.csv"
gdp_file = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Retrieved/gdp_data.csv"

# Read the data
forex_data = pd.read_csv(forex_file)
gdp_data = pd.read_csv(gdp_file)

# Ensure "Date" columns are datetime
forex_data['Date'] = pd.to_datetime(forex_data['Date'])
gdp_data['Date'] = pd.to_datetime(gdp_data['Date'])

# merge_asof requires non-null keys sorted in ascending order; rows without a
# date cannot be aligned in time, so they are dropped before sorting
forex_data = forex_data.dropna(subset=['Date']).sort_values(by='Date')
gdp_data = gdp_data.dropna(subset=['Date']).sort_values(by='Date')

# As-of join: each forex row gets the latest GDP row with Date <= forex Date.
# The previous reindex(method='ffill') + merge(on='Date') approach multiplied
# rows whenever a forex date appeared more than once and failed outright on
# repeated GDP dates; merge_asof keeps exactly one output row per forex row.
merged_data = pd.merge_asof(forex_data, gdp_data, on='Date', direction='backward')

# Save the merged data
output_file = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Main/forexPLUSgdp.csv"
merged_data.to_csv(output_file, index=False)

print(f"Merged data saved to {output_file}")


Merged data saved to /content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Main/forexPLUSgdp.csv


In [None]:
import pandas as pd

# Attach to every daily forex row the most recent GDP observation dated on or
# before that day, and save the combined table.

# Input locations on the mounted Drive
forex_file = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Retrieved/forex_data.csv"
gdp_file = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Retrieved/gdp_data.csv"

# Read the CSV files
forex_data = pd.read_csv(forex_file)
gdp_data = pd.read_csv(gdp_file)

# Ensure 'Date' columns are in datetime format
forex_data['Date'] = pd.to_datetime(forex_data['Date'])
gdp_data['Date'] = pd.to_datetime(gdp_data['Date'])

# merge_asof requires non-null keys sorted in ascending order; rows without a
# date cannot be aligned in time, so they are dropped before sorting
forex_data = forex_data.dropna(subset=['Date']).sort_values(by='Date')
gdp_data = gdp_data.dropna(subset=['Date']).sort_values(by='Date')

# As-of join: each forex row gets the latest GDP row with Date <= forex Date.
# An exact-date left join followed by a forward fill silently discards every
# GDP observation whose date is not also a forex trading date (for example a
# release dated on a weekend or holiday), leaving stale values in its place.
# The trailing .ffill() keeps the earlier behaviour of carrying the previous
# row's value into any remaining gaps, and replaces the deprecated
# fillna(method='ffill') spelling.
merged_data = pd.merge_asof(
    forex_data, gdp_data, on='Date', direction='backward'
).ffill()

# Save the merged data to a new CSV ('Date' is already a regular column)
output_file = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Main/forexPLUSgdp.csv"
merged_data.to_csv(output_file, index=False)

print(f"Data successfully merged and saved to {output_file}")


Data successfully merged and saved to /content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Main/forexPLUSgdp.csv


  merged_data = forex_data.join(gdp_data, how='left').fillna(method='ffill')


In [None]:
import pandas as pd

# Attach to every daily forex row the most recent GDP observation dated on or
# before that day, and save the combined table.
# NOTE(review): this cell repeats the cell directly above it and writes the
# same output file; consider keeping only one of the two.

# Input locations on the mounted Drive
forex_file = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Retrieved/forex_data.csv"
gdp_file = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Retrieved/gdp_data.csv"

# Read the CSV files
forex_data = pd.read_csv(forex_file)
gdp_data = pd.read_csv(gdp_file)

# Ensure 'Date' columns are in datetime format
forex_data['Date'] = pd.to_datetime(forex_data['Date'])
gdp_data['Date'] = pd.to_datetime(gdp_data['Date'])

# merge_asof requires non-null keys sorted in ascending order; rows without a
# date cannot be aligned in time, so they are dropped before sorting
forex_data = forex_data.dropna(subset=['Date']).sort_values(by='Date')
gdp_data = gdp_data.dropna(subset=['Date']).sort_values(by='Date')

# As-of join: each forex row gets the latest GDP row with Date <= forex Date.
# An exact-date left join followed by a forward fill silently discards every
# GDP observation whose date is not also a forex trading date (for example a
# release dated on a weekend or holiday), leaving stale values in its place.
# The trailing .ffill() keeps the earlier behaviour of carrying the previous
# row's value into any remaining gaps, and replaces the deprecated
# fillna(method='ffill') spelling.
merged_data = pd.merge_asof(
    forex_data, gdp_data, on='Date', direction='backward'
).ffill()

# Save the merged data to a new CSV ('Date' is already a regular column)
output_file = "/content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Main/forexPLUSgdp.csv"
merged_data.to_csv(output_file, index=False)

print(f"Data successfully merged and saved to {output_file}")


Data successfully merged and saved to /content/drive/MyDrive/Ai4Ducks/hack/CodeJam/Data/Correction/Main/forexPLUSgdp.csv


  merged_data = forex_data.join(gdp_data, how='left').fillna(method='ffill')
