In [83]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_ta as ta
import seaborn as sns
import glob
import os

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

In [84]:
overview_data = pd.read_csv('./data-vn-20230228/ticker-overview.csv',index_col=0)
overview_data_clean = overview_data.copy()[['exchange', 'shortName', 'industryID', 'industryIDv2', 'industry',\
       'industryEn', 'establishedYear', 'noEmployees', 'noShareholders',\
       'foreignPercent', 'website', 'stockRating', 'deltaInWeek','deltaInMonth', 'deltaInYear', 'outstandingShare', 'issueShare','companyType', 'ticker']]

overview_data_clean.dropna(how='all',inplace=True)
overview_data_clean.head()

Unnamed: 0,exchange,shortName,industryID,industryIDv2,industry,industryEn,establishedYear,noEmployees,noShareholders,foreignPercent,website,stockRating,deltaInWeek,deltaInMonth,deltaInYear,outstandingShare,issueShare,companyType,ticker
0,UPCOM,Đầu tư Phát triển Máy Việt Nam,211.0,3353.0,Ô tô và phụ tùng,Automobiles & Parts,,352.0,105.0,0.0,https://vimid.vn,3.3,-0.09,-0.098,,20.5,20.5,CT,VVS
1,UPCOM,Xây dựng Công trình Tân Cảng,181.0,2357.0,Xây dựng và Vật liệu,Construction & Materials,,110.0,0.0,0.0,https://saigonnewport.com.vn,3.7,-0.133,-0.082,,9.0,0.0,CT,XDC
2,UPCOM,Gang Thép Hà Nội,159.0,1757.0,Tài nguyên Cơ bản,Basic Resources,2013.0,0.0,109.0,0.003,https://gangthephanoi.com,3.7,0.013,0.064,-0.37,15.7,15.7,CT,HSV
3,UPCOM,Than Cao Sơn - TKV,160.0,1771.0,Tài nguyên Cơ bản,Basic Resources,2020.0,3473.0,5157.0,0.039,http://thancaoson.vn,4.0,-0.087,0.093,0.457,42.8,42.8,CT,CST
4,UPCOM,BV Land,339.0,8637.0,Bất động sản,Real Estate,2008.0,46.0,175.0,0.0,http://bvland.vn,,-0.204,-0.236,-0.433,57.3,57.3,CT,BVL


In [85]:
overview_data_all_groups = overview_data_clean['industryEn'].unique()
overview_data_all_groups

array(['Automobiles & Parts', 'Construction & Materials',
       'Basic Resources', 'Real Estate', 'Personal & Household Goods',
       'Oil & Gas', 'Chemicals', 'Financial Services', 'Food & Beverage',
       'Media', 'Travel & Leisure', 'Industrial Goods & Services',
       'Utilities', 'Health Care', 'Banks', 'Insurance', 'Retail',
       'Technology', 'Telecommunications'], dtype=object)

In [86]:
# Group data by industryEn
overview_data_clean['industryEn'] = overview_data_clean['industryEn'].astype(str)
overview_data_grouped = overview_data_clean.groupby(overview_data_clean['industryEn'])
overview_data_grouped.get_group('Travel & Leisure').head()

Unnamed: 0,exchange,shortName,industryID,industryIDv2,industry,industryEn,establishedYear,noEmployees,noShareholders,foreignPercent,website,stockRating,deltaInWeek,deltaInMonth,deltaInYear,outstandingShare,issueShare,companyType,ticker
20,UPCOM,DHC Suối Đôi,260.0,5755.0,Du lịch và Giải trí,Travel & Leisure,2014.0,0.0,307.0,0.0,https://dhcsuoidoi.vn,3.0,0.159,0.582,,31.1,31.1,CT,DSD
90,HNX,"ATESCO.,JSC",261.0,5757.0,Du lịch và Giải trí,Travel & Leisure,1998.0,103.0,109.0,0.0,http://www.atesco.vn,3.6,0.013,0.064,-0.202,3.5,3.5,CT,ATS
96,UPCOM,Du lịch và Thương mại Bằng Giang Cao Bằng - Vi...,259.0,5753.0,Du lịch và Giải trí,Travel & Leisure,2008.0,26.0,104.0,0.0,http://www.ksbanggiangcaobang.com.vn,3.3,0.013,0.064,0.302,1.8,1.8,CT,BCV
158,UPCOM,Du lịch tỉnh BR-VT,262.0,5759.0,Du lịch và Giải trí,Travel & Leisure,2007.0,272.0,183.0,0.0,http://www.vungtautourist.com.vn,3.6,0.013,0.064,0.487,18.6,18.6,CT,VTG
172,UPCOM,DV Du lịch Bến Thành,262.0,5759.0,Du lịch và Giải trí,Travel & Leisure,2005.0,250.0,188.0,0.0,https://benthanhtourist.com,3.5,-0.205,-0.154,-0.137,25.0,25.0,CT,BTV


In [87]:
# Comprehensive Analysis Functions
def enrich_portfolio_data(company_metadata, existing_portfolio_data):
    # 1. Industry Classification Enhancement
    industry_classification = company_metadata.groupby('industry').agg({
        'ticker': 'count',
        'foreignPercent': 'mean',
        'noShareholders': 'mean'
    }).rename(columns={
        'ticker': 'industry_company_count',
        'foreignPercent': 'avg_foreign_investment',
        'noShareholders': 'avg_shareholders'
    })

    # 2. Company Maturity Analysis
    current_year = pd.Timestamp.now().year
    company_metadata['company_age'] = current_year - company_metadata['establishedYear']
    company_metadata['maturity_category'] = pd.cut(
        company_metadata['company_age'], 
        bins=[0, 5, 10, 20, 100], 
        labels=['Startup', 'Young', 'Established', 'Mature']
    )

    # 3. Foreign Investment Potential
    company_metadata['foreign_investment_score'] = (
        company_metadata['foreignPercent'] * 
        np.log1p(company_metadata['noShareholders'])
    )

    # 4. Stock Performance Metrics
    performance_metrics = company_metadata[['ticker', 'deltaInWeek', 'deltaInMonth', 'deltaInYear']]

    # 5. Shareholder Concentration
    company_metadata['shareholder_concentration'] = (
        company_metadata['noShareholders'] / company_metadata['outstandingShare']
    )

    return {
        'industry_insights': industry_classification,
        'company_maturity': company_metadata[['ticker', 'company_age', 'maturity_category']],
        'foreign_investment_potential': company_metadata[['ticker', 'foreign_investment_score']],
        'stock_performance': performance_metrics,
        'shareholder_analysis': company_metadata[['ticker', 'shareholder_concentration']]
    }

# Execute Enrichment
enriched_data = enrich_portfolio_data(company_metadata, existing_portfolio_data)

# Visualization Functions
def visualize_industry_insights(industry_classification):
    plt.figure(figsize=(12, 6))
    industry_classification['industry_company_count'].plot(kind='bar')
    plt.title('Number of Companies per Industry')
    plt.xlabel('Industry')
    plt.ylabel('Company Count')
    plt.xticks(rotation=45)
    plt.tight_layout()

def analyze_foreign_investment(company_metadata):
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        x='foreignPercent', 
        y='noShareholders', 
        hue='industry', 
        data=company_metadata
    )
    plt.title('Foreign Investment vs Shareholders by Industry')
    plt.xlabel('Foreign Investment Percentage')
    plt.ylabel('Number of Shareholders')

# Portfolio Enhancement Function
def enhance_portfolio_selection(existing_portfolio, enriched_data):
    # Combine existing portfolio with enriched data
    enhanced_portfolio = existing_portfolio.merge(
        enriched_data['company_maturity'], 
        on='ticker', 
        how='left'
    )
    
    # Additional filters
    enhanced_portfolio['investment_score'] = (
        enhanced_portfolio['profit_score'] * 0.5 +
        enhanced_portfolio['foreign_investment_score'] * 0.3 +
        (enhanced_portfolio['company_age'] / 100) * 0.2
    )
    
    # Select top companies with enhanced scoring
    top_enhanced_portfolio = enhanced_portfolio.nlargest(15, 'investment_score')
    
    return top_enhanced_portfolio

# Risk Profile Classification
def classify_investment_profile(company_metadata):
    company_metadata['investment_profile'] = np.where(
        company_metadata['foreignPercent'] > 50, 'International Friendly',
        np.where(
            company_metadata['noShareholders'] > company_metadata['noShareholders'].median(), 
            'Stable', 
            'Emerging'
        )
    )
    
    profile_distribution = company_metadata['investment_profile'].value_counts()
    
    plt.figure(figsize=(8, 6))
    profile_distribution.plot(kind='pie', autopct='%1.1f%%')
    plt.title('Investment Profile Distribution')

# Advanced Company Network Analysis
def create_industry_network(company_metadata):
    import networkx as nx
    
    G = nx.Graph()
    
    # Add nodes and edges based on industry relationships
    for industry in company_metadata['industry'].unique():
        industry_companies = company_metadata[company_metadata['industry'] == industry]
        
        # Add nodes
        for ticker in industry_companies['ticker']:
            G.add_node(ticker, industry=industry)
        
        # Connect companies within same industry
        for i in range(len(industry_companies)):
            for j in range(i+1, len(industry_companies)):
                G.add_edge(
                    industry_companies.iloc[i]['ticker'], 
                    industry_companies.iloc[j]['ticker']
                )
    
    return G

# Execution
visualize_industry_insights(enriched_data['industry_insights'])
analyze_foreign_investment(company_metadata)
classify_investment_profile(company_metadata)
industry_network = create_industry_network(company_metadata)

NameError: name 'company_metadata' is not defined

In [None]:
# Function to load data from multiple files
def load_data_from_directory(directory_path, file_type):
    file_paths = glob.glob(os.path.join(directory_path, f'*{file_type}'))
    all_data = (pd.read_csv(file_path, index_col=0).assign(ticker=os.path.basename(file_path)[:3]) for file_path in file_paths)
    combined_data = pd.concat(all_data, ignore_index=True)
    return combined_data

Unnamed: 0,Open,High,Low,Close,Volume,TradingDate,ticker
0,15326.0,15326.0,15326.0,15300.0,0.0,2018-10-23,A32
1,15326.0,15326.0,15326.0,15300.0,0.0,2018-10-24,A32
2,15326.0,15326.0,15326.0,15300.0,0.0,2018-10-25,A32
3,15326.0,15326.0,15326.0,15300.0,0.0,2018-10-26,A32
4,15326.0,15326.0,15326.0,15300.0,0.0,2018-10-29,A32


In [None]:
# Load stock historical data, financial ratios, industry analysis, and dividend history
stock_historical_data = load_data_from_directory('data-vn-20230228/stock-historical-data', '.csv')
stock_historical_data.shape

(3719441, 7)

In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from typing import List, Dict, Any

class PortfolioAnalyzer:
    def __init__(self, data_directories: Dict[str, str]):
        """
        Initialize Portfolio Analyzer with data directories
        
        Args:
            data_directories (dict): Dictionary of data type and directory paths
        """
        self.data_directories = data_directories
        self.loaded_data = {}
        self.processed_data = []
    
    def load_data(self) -> None:
        """
        Load data from multiple CSV files in specified directories
        """
        for data_type, directory in self.data_directories.items():
            try:
                files = [f for f in os.listdir(directory) if f.endswith('.csv')]
                data_frames = []
                
                for file in files:
                    file_path = os.path.join(directory, file)
                    df = pd.read_csv(file_path,index_col=0)
                    
                    # Add filename or ticker as identifier
                    df['source_file'] = file.replace('.csv', '')
                    data_frames.append(df)
                
                # Concatenate all dataframes for this data type
                self.loaded_data[data_type] = pd.concat(data_frames, ignore_index=True)
                print(f"Loaded {len(self.loaded_data[data_type])} records for {data_type}")
            
            except Exception as e:
                print(f"Error loading {data_type} data: {e}")
    # Standardize column names
    def standardize_columns(self,df: pd.DataFrame) -> pd.DataFrame:
        df.columns = [col.lower().replace(' ', '_') for col in df.columns]
        return df
    def preprocess_data(self) -> None:
        """
        Comprehensive data preprocessing and merging
        """
        try:
            # Extract common identifiers
            stock_historical = self.loaded_data.get('stock_historical', pd.DataFrame())
            financial_ratios = self.loaded_data.get('financial_ratios', pd.DataFrame())
            industry_data = self.loaded_data.get('industry_data', pd.DataFrame())
            dividend_history = self.loaded_data.get('dividend_history', pd.DataFrame())
            
            
            
            stock_historical = self.standardize_columns(stock_historical)
            financial_ratios = self.standardize_columns(financial_ratios)
            industry_data = self.standardize_columns(industry_data)
            dividend_history = self.standardize_columns(dividend_history)
            print(stock_historical.head())
            # Merge datasets
            merged_data = stock_historical.merge(
                financial_ratios, 
                on=['ticker', 'date'], 
                how='inner'
            )
            
            merged_data = merged_data.merge(
                industry_data, 
                on='industry', 
                how='left'
            )
            
            merged_data = merged_data.merge(
                dividend_history, 
                on='ticker', 
                how='left'
            )
            # Data cleaning
            merged_data.dropna(subset=['close_price', 'volume'], inplace=True)
            
            # Feature engineering
            merged_data['return'] = merged_data.groupby('ticker')['close_price'].pct_change()
            merged_data['cumulative_return'] = merged_data.groupby('ticker')['return'].cumsum()
            
            # Risk metrics
            merged_data['volatility'] = merged_data.groupby('ticker')['return'].transform('std')
            
            self.processed_data = merged_data
            print("Data preprocessing completed successfully")
        
        except Exception as e:
            print(f"Error in data preprocessing: {e}")
    
    def portfolio_composition_analysis(self) -> Dict[str, Any]:
        """
        Advanced portfolio composition analysis
        """
        if self.processed_data is None:
            raise ValueError("Data not preprocessed. Run preprocess_data() first.")
        
        # Ranking and scoring mechanism
        portfolio_metrics = self.processed_data.groupby('ticker').agg({
            'cumulative_return': 'mean',
            'volatility': 'mean',
            'volume': 'sum',
            'close_price': 'last'
        }).reset_index()
        
        # Composite scoring
        portfolio_metrics['composite_score'] = (
            portfolio_metrics['cumulative_return'] * 0.4 +
            (1 / portfolio_metrics['volatility']) * 0.3 +
            np.log(portfolio_metrics['volume']) * 0.2 +
            np.log(portfolio_metrics['close_price']) * 0.1
        )
        
        # Top portfolio candidates
        top_portfolio = portfolio_metrics.nlargest(20, 'composite_score')
        
        # Visualization
        plt.figure(figsize=(12, 6))
        sns.barplot(x='ticker', y='composite_score', data=top_portfolio)
        plt.title('Top 20 Portfolio Candidates')
        plt.xticks(rotation=45)
        plt.tight_layout()
        
        return {
            'top_portfolio': top_portfolio,
            'total_candidates': len(portfolio_metrics)
        }
    
    def risk_management_analysis(self) -> Dict[str, Any]:
        """
        Advanced risk management analysis
        """
        portfolio_metrics = self.processed_data.groupby('ticker').agg({
            'volatility': ['mean', 'max'],
            'return': ['std', 'skew']
        }).reset_index()
        
        portfolio_metrics.columns = ['ticker', 'avg_volatility', 'max_volatility', 'return_std', 'return_skew']
        
        # Risk categorization
        portfolio_metrics['risk_category'] = pd.cut(
            portfolio_metrics['avg_volatility'], 
            bins=[0, 0.1, 0.3, 1], 
            labels=['Low', 'Medium', 'High']
        )
        
        return {
            'risk_distribution': portfolio_metrics['risk_category'].value_counts(),
            'risk_details': portfolio_metrics
        }
    
    def generate_comprehensive_report(self):
        """
        Generate a comprehensive investment report
        """
        portfolio_composition = self.portfolio_composition_analysis()
        risk_analysis = self.risk_management_analysis()
        
        report = f"""
        COMPREHENSIVE INVESTMENT PORTFOLIO REPORT
        
        Portfolio Composition:
        - Total Candidates: {portfolio_composition['total_candidates']}
        - Top 20 Portfolio Candidates Selected
        
        Risk Management:
        Risk Category Distribution:
        {risk_analysis['risk_distribution']}
        
        Detailed Insights Available
        """
        
        return report

# Usage Example
def main():
    data_directories = {
        'stock_historical': 'data-vn-20230228/stock-historical-data',
        'financial_ratios': 'data-vn-20230228/financial-ratio',
        'industry_data': 'data-vn-20230228/industry-analysis',
        'dividend_history': 'data-vn-20230228/dividend-history'
    }
    
    analyzer = PortfolioAnalyzer(data_directories)
    analyzer.load_data()
    analyzer.preprocess_data()
    
    # Generate report
    report = analyzer.generate_comprehensive_report()
    print(report)

if __name__ == "__main__":
    main()

Loaded 3719441 records for stock_historical
Loaded 42578 records for financial_ratios
Loaded 26668 records for industry_data
Loaded 13828 records for dividend_history
   unnamed:_0     open     high      low    close  volume tradingdate  \
0           0  15326.0  15326.0  15326.0  15300.0     0.0  2018-10-23   
1           1  15326.0  15326.0  15326.0  15300.0     0.0  2018-10-24   
2           2  15326.0  15326.0  15326.0  15300.0     0.0  2018-10-25   
3           3  15326.0  15326.0  15326.0  15300.0     0.0  2018-10-26   
4           4  15326.0  15326.0  15326.0  15300.0     0.0  2018-10-29   

              source_file  
0  A32-UpcomIndex-History  
1  A32-UpcomIndex-History  
2  A32-UpcomIndex-History  
3  A32-UpcomIndex-History  
4  A32-UpcomIndex-History  
Error in data preprocessing: 'ticker'


AttributeError: 'list' object has no attribute 'groupby'

In [92]:
aimport pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from typing import List, Dict, Any

class PortfolioAnalyzer:
    def __init__(self, data_directories: Dict[str, str]):
        """
        Initialize Portfolio Analyzer with data directories
        
        Args:
            data_directories (dict): Dictionary of data type and directory paths
        """
        self.data_directories = data_directories
        self.loaded_data = {}
        self.processed_data = pd.DataFrame()  # Initialize as an empty DataFrame

    def load_data(self) -> None:
        """
        Load data from multiple CSV files in specified directories
        """
        for data_type, directory in self.data_directories.items():
            try:
                files = [f for f in os.listdir(directory) if f.endswith('.csv')]
                data_frames = []
                
                for file in files:
                    file_path = os.path.join(directory, file)
                    df = pd.read_csv(file_path, index_col=0)
                    
                    # Add filename or ticker as identifier based on data type
                    if data_type == 'stock_historical':
                        ticker = file[:3]  # First 3 letters of the filename
                        df['ticker'] = ticker
                        df.rename(columns={'TradingDate': 'date', 'Open': 'open_price', 
                                           'High': 'high_price', 'Low': 'low_price', 
                                           'Close': 'close_price', 'Volume': 'volume'}, inplace=True)
                        df['date'] = pd.to_datetime(df['date'])  # Ensure date is in datetime format
                        data_frames.append(df[['ticker', 'date', 'open_price', 'high_price', 'low_price', 'close_price', 'volume']])
                    
                    elif data_type == 'industry_data':
                        df['ticker'] = df['ticker'].str.strip()  # Ensure ticker is stripped of whitespace
                        data_frames.append(df)

                    elif data_type == 'dividend_history':
                        ticker = file[:3]  # First 3 letters of the filename
                        df['ticker'] = ticker
                        df.rename(columns={'exerciseDate': 'date', 'cashYear': 'cash_year', 
                                           'cashDividendPercentage': 'cash_dividend_percentage', 
                                           'issueMethod': 'issue_method'}, inplace=True)
                        df['date'] = pd.to_datetime(df['date'])  # Ensure date is in datetime format
                        data_frames.append(df[['ticker', 'date', 'cash_year', 'cash_dividend_percentage', 'issue_method']])
                
                # Concatenate all dataframes for this data type
                self.loaded_data[data_type] = pd.concat(data_frames, ignore_index=True)
                print(f"Loaded {len(self.loaded_data[data_type])} records for {data_type}")
            
            except Exception as e:
                print(f"Error loading {data_type} data: {e}")

    def standardize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Standardize column names in a DataFrame
        
        Args:
            df (pd.DataFrame): DataFrame to standardize
        
        Returns:
            pd.DataFrame: Standardized DataFrame
        """
        df.columns = [col.lower().replace(' ', '_') for col in df.columns]
        return df

    def preprocess_data(self) -> None:
        """
        Comprehensive data preprocessing and merging
        """
        try:
            # Extract common identifiers
            stock_historical = self.loaded_data.get('stock_historical', pd.DataFrame())
            financial_ratios = self.loaded_data.get('financial_ratios', pd.DataFrame())
            industry_data = self.loaded_data.get('industry_data', pd.DataFrame())
            dividend_history = self.loaded_data.get('dividend_history', pd.DataFrame())

            # Standardize columns
            stock_historical = self.standardize_columns(stock_historical)
            financial_ratios = self.standardize_columns(financial_ratios)
            industry_data = self.standardize_columns(industry_data)
            dividend_history = self.standardize_columns(dividend_history)

            # Merge datasets
            merged_data = stock_historical.merge(
                financial_ratios, 
                on=['ticker', 'date'], 
                how='inner'
            )
            
            merged_data = merged_data.merge(
                industry_data, 
                on='ticker', 
                how='left'
            )
            
            merged_data = merged_data.merge(
                dividend_history, 
                on=['ticker', 'date'], 
                how='left'
            )
            
            # Data cleaning
            merged_data.dropna(subset=['close_price', 'volume'], inplace=True)
                        # Feature engineering
            merged_data['return'] = merged_data.groupby('ticker')['close_price'].pct_change()
            merged_data['cumulative_return'] = merged_data.groupby('ticker')['return'].cumsum()
            
            # Risk metrics
            merged_data['volatility'] = merged_data.groupby('ticker')['return'].transform('std')
            
            self.processed_data = merged_data
            print("Data preprocessing completed successfully")
        
        except Exception as e:
            print(f"Error in data preprocessing: {e}")
    
    def portfolio_composition_analysis(self) -> Dict[str, Any]:
        """
        Advanced portfolio composition analysis
        
        Returns:
            dict: Analysis results including top portfolio candidates
        """
        if self.processed_data.empty:
            raise ValueError("Data not preprocessed. Run preprocess_data() first.")
        
        # Ranking and scoring mechanism
        portfolio_metrics = self.processed_data.groupby('ticker').agg({
            'cumulative_return': 'mean',
            'volatility': 'mean',
            'volume': 'sum',
            'close_price': 'last'
        }).reset_index()
        
        # Composite scoring
        portfolio_metrics['composite_score'] = (
            portfolio_metrics['cumulative_return'] * 0.4 +
            (1 / portfolio_metrics['volatility'].replace(0, np.nan)) * 0.3 +  # Avoid division by zero
            np.log(portfolio_metrics['volume'].replace(0, np.nan)) * 0.2 + 
            np.log(portfolio_metrics['close_price'].replace(0, np.nan)) * 0.1  # Avoid log(0)
        )
        
        # Top portfolio candidates
        top_portfolio = portfolio_metrics.nlargest(20, 'composite_score')
        
        # Visualization
        plt.figure(figsize=(12, 6))
        sns.barplot(x='ticker', y='composite_score', data=top_portfolio)
        plt.title('Top 20 Portfolio Candidates')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()  # Show the plot
        
        return {
            'top_portfolio': top_portfolio,
            'total_candidates': len(portfolio_metrics)
        }
    
    def risk_management_analysis(self) -> Dict[str, Any]:
        """
        Advanced risk management analysis
        
        Returns:
            dict: Risk analysis results including risk distribution and details
        """
        portfolio_metrics = self.processed_data.groupby('ticker').agg({
            'volatility': ['mean', 'max'],
            'return': ['std', 'skew']
        }).reset_index()
        
        portfolio_metrics.columns = ['ticker', 'avg_volatility', 'max_volatility', 'return_std', 'return_skew']
        
        # Risk categorization
        portfolio_metrics['risk_category'] = pd.cut(
            portfolio_metrics['avg_volatility'], 
            bins=[0, 0.1, 0.3, 1], 
            labels=['Low', 'Medium', 'High']
        )
        
        return {
            'risk_distribution': portfolio_metrics['risk_category'].value_counts(),
            'risk_details': portfolio_metrics
        }
    
    def generate_comprehensive_report(self) -> str:
        """
        Generate a comprehensive investment report
        
        Returns:
            str: The comprehensive report as a string
        """
        portfolio_composition = self.portfolio_composition_analysis()
        risk_analysis = self.risk_management_analysis()
        
        report = f"""
        COMPREHENSIVE INVESTMENT PORTFOLIO REPORT
        
        Portfolio Composition:
        - Total Candidates: {portfolio_composition['total_candidates']}
        - Top 20 Portfolio Candidates Selected:
        {portfolio_composition['top_portfolio'][['ticker', 'composite_score']].to_string(index=False)}
        
        Risk Management:
        Risk Category Distribution:
        {risk_analysis['risk_distribution']}
        
        Detailed Insights Available
        """
        
        return report

# Usage Example
def main():
    data_directories = {
        'stock_historical': 'data-vn-20230228/stock-historical-data',
        'financial_ratios': 'data-vn-20230228/financial-ratio',
        'industry_data': 'data-vn-20230228/industry-analysis',
        'dividend_history': 'data-vn-20230228/dividend-history'
    }
    
    analyzer = PortfolioAnalyzer(data_directories)
    analyzer.load_data()
    analyzer.preprocess_data()
    
    # Generate report
    report = analyzer.generate_comprehensive_report()
    print(report)

if __name__ == "__main__":
    main()

Loaded 3719441 records for stock_historical
Loaded 42578 records for financial_ratios
Loaded 26668 records for industry_data
Loaded 13828 records for dividend_history
Error in data preprocessing: 'ticker'


ValueError: Data not preprocessed. Run preprocess_data() first.