IMPORT LIBRARIES

In [6]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib
# NOTE(review): 'Agg' is a non-interactive, file-only backend, so the later
# plt.show() calls presumably display nothing inline and only the savefig()
# PNG files are produced. Confirm this is intended for a notebook.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' theme to every figure created below.
# set_theme() is the preferred call; sns.set() is only a legacy alias for it.
sns.set_theme(style='whitegrid')

DATA LOADING

In [7]:
# Location of the raw Toyota stock price CSV.
# NOTE(review): this is an absolute path on one machine; adjust it (or make it
# relative to the notebook) before running anywhere else.
file_path = r'C:\Qin\Dummy Project\Toyota-Stock-Data\TOYOTA Stock Data 2025.csv'

try:
    df = pd.read_csv(file_path)
except Exception as e:
    # Report the problem, then re-raise so execution stops at this cell with a
    # full traceback. Calling exit() here would shut down the Jupyter kernel
    # and, when run as a script, finish with a success status (0).
    print('Error reading file:', e)
    raise
else:
    # Quick look at what was loaded: dimensions and the first rows.
    print('Data shape:', df.shape)
    print(df.head())

Data shape: (11414, 7)
         date                open                high                 low  \
0         NaN                  TM                  TM                  TM   
1  1980-03-17                 0.0   3.344743013381958   3.291227102279663   
2  1980-03-18                 0.0  3.3581221103668213  3.3046059608459473   
3  1980-03-19  3.3046059608459473  3.3046059608459473  3.3046059608459473   
4  1980-03-20                 0.0  3.3581221103668213  3.3046059608459473   

                close           adj_close volume  
0                  TM                  TM     TM  
1   3.291227102279663  1.8489787578582764  41109  
2  3.3046059608459473  1.8564950227737427   9343  
3  3.3046059608459473  1.8564950227737427      0  
4  3.3046059608459473  1.8564950227737427  10277  


DATA CLEANING

In [8]:
# Columns that must hold numbers. They arrive as text because the first data
# row of the file carries the ticker symbol instead of values.
numeric_cols = ['open', 'high', 'low', 'close', 'adj_close', 'volume']

# Parse the date and numeric columns in one step; anything that cannot be
# parsed becomes NaT / NaN instead of raising.
df = df.assign(
    date=pd.to_datetime(df['date'], errors='coerce'),
    **{col: pd.to_numeric(df[col], errors='coerce') for col in numeric_cols},
)

# Report how many values per column are missing after the conversion
missing_per_column = df.isna().sum()
print('\nMissing values:')
print(missing_per_column)

# Discard every row that still has at least one missing value
df.dropna(inplace=True)
print('\nData shape after cleaning:', df.shape)


Missing values:
date         1
open         1
high         1
low          1
close        1
adj_close    1
volume       1
dtype: int64

Data shape after cleaning: (11413, 7)


VISUALIZATION

In [9]:
# Distribution of the adjusted close price: 30-bin histogram with a KDE curve,
# written to 'hist_adj_close.png'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='adj_close', bins=30, kde=True, ax=ax)
ax.set(
    title='Distribution of Adjusted Close Prices',
    xlabel='Adjusted Close Price',
    ylabel='Frequency',
)
fig.tight_layout()
fig.savefig('hist_adj_close.png')
plt.show()

# Bucket trading volume into three labelled ranges. Passing an integer bin
# count to pd.cut splits the observed volume range into equal-width intervals.
volume_labels = ['Low', 'Medium', 'High']
df['volume_category'] = pd.cut(
    df['volume'],
    bins=len(volume_labels),
    labels=volume_labels,
)

# Bar chart of how many rows fall into each bucket, saved as a PNG.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='volume_category', ax=ax)
ax.set(title='Volume Categories', xlabel='Volume Category', ylabel='Count')
fig.tight_layout()
fig.savefig('volume_category.png')
plt.show()

# Pairwise relationships between the five price columns.
price_cols = ['open', 'high', 'low', 'close', 'adj_close']
sns.pairplot(df.loc[:, price_cols])
# pairplot builds its own figure, which is the current figure at this point,
# so the pyplot-level savefig writes that grid to disk.
plt.savefig('pairplot.png')
plt.show()

# Correlation heatmap over the numeric columns; skipped with a message when
# fewer than four numeric columns are available.
numeric_df = df.select_dtypes(include=[np.number])
if len(numeric_df.columns) < 4:
    print('Not enough numeric columns for correlation heatmap.')
else:
    corr_matrix = numeric_df.corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='coolwarm',
        linewidths=0.5,
        ax=ax,
    )
    ax.set_title('Correlation Heatmap')
    fig.tight_layout()
    fig.savefig('correlation_heatmap.png')
    plt.show()