In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

## Principal Component Analysis (PCA)

### Objective
The goal of this notebook is to perform PCA on financial data from 2014 to 2018 to reduce the number of features and identify the principal components that capture the most variance in the data.

### Methodology
1. **Data Loading**: Load the processed data for each year.
2. **Standardization**: Standardize the features since PCA is affected by scale.
3. **PCA Execution**: Fit PCA on the standardized features, keeping every component, so that we can see how many components are needed to retain about 95% of the variance.
4. **Results Analysis**: Plot and examine the explained variance to determine how many features are necessary to capture the majority of the information.

### Results
The PCA results will help in understanding the underlying structure of the data and guide further analyses and modeling efforts.


In [None]:
# Load the cleaned dataset of every year (2014-2018), one DataFrame per year
data_2014, data_2015, data_2016, data_2017, data_2018 = map(
    pd.read_csv,
    (
        'processed_data/clean_df_2014.csv',
        'processed_data/clean_df_2015.csv',
        'processed_data/clean_df_2016.csv',
        'processed_data/clean_df_2017.csv',
        'processed_data/clean_df_2018.csv',
    ),
)

In [None]:
def perform_pca_analysis(data, num_top_features=10):
    """
    Perform PCA on the provided dataset and visualize the explained variance.
    Also, provide the loadings for the top features of each component.
    
    Parameters:
    - data (DataFrame): The dataset to perform PCA on. Only numeric columns are used.
      A 'Cluster' column, if present, is ignored: the clustering cells of this
      notebook add it in place, and it must not become a PCA feature on re-runs.
    - num_top_features (int): Number of top features' loadings to return for each component.
    
    Returns:
    - pca: The PCA model, fitted with all components kept.
    - explained_variance_plot: The `matplotlib.pyplot` module, with the explained
      variance figure as the current figure (call `.show()` on it to display).
    - loadings_dict: Dictionary mapping "Component k" to a Series holding the
      `num_top_features` largest *absolute* loadings of that component, sorted
      in descending order (the sign of each loading is not kept).
    """
    # Isolate numeric data for PCA, leaving out labels written by earlier clustering runs
    numeric_data = (
        data.drop(columns=['Cluster'], errors='ignore')
        .select_dtypes(include=[np.number])
    )
    
    # Standardize the data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_data)
    
    # Perform PCA
    pca = PCA()
    pca.fit(scaled_data)
    
    # Explained variance plot.
    # The x values start at 1 so that x really is the number of components used;
    # plotting against the default 0-based index would shift the curve by one.
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    component_counts = np.arange(1, len(cumulative_variance) + 1)
    plt.figure(figsize=(10, 6))
    plt.plot(component_counts, cumulative_variance)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('Explained Variance by PCA Components')
    plt.grid(True)
    explained_variance_plot = plt
    
    # Create a dictionary to store loadings for each component
    loadings_dict = {}
    for i, loadings in enumerate(pca.components_):
        loading_scores = pd.Series(loadings, index=numeric_data.columns)
        sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
        loadings_dict[f"Component {i+1}"] = sorted_loading_scores.head(num_top_features)
    
    return pca, explained_variance_plot, loadings_dict

# 2014 dataset

In [None]:
# 2014 dataset: fit PCA and show the cumulative explained-variance curve
pca_model, ev_plot, loadings_2014 = perform_pca_analysis(data_2014)
ev_plot.show()

# Top loadings of the first three components, each under its own banner line
print(' --- COMPONENT 1 --- ', loadings_2014["Component 1"], sep='\n')
print(' --- COMPONENT 2 --- ', loadings_2014["Component 2"], sep='\n')
print(' --- COMPONENT 3 --- ', loadings_2014["Component 3"], sep='\n')

# 2015 dataset

In [None]:
# 2015 dataset: fit PCA and show the cumulative explained-variance curve
pca_model, ev_plot, loadings_2015 = perform_pca_analysis(data_2015)
ev_plot.show()

# Displaying the loadings computed for 2015 (not the 2014 ones)
print(' --- COMPONENT 1 --- ')
print(loadings_2015["Component 1"])
print(' --- COMPONENT 2 --- ')
print(loadings_2015["Component 2"])
print(' --- COMPONENT 3 --- ')
print(loadings_2015["Component 3"])

# 2016 dataset

In [None]:
# 2016 dataset: fit PCA and show the cumulative explained-variance curve
pca_model, ev_plot, loadings_2016 = perform_pca_analysis(data_2016)
ev_plot.show()

# Displaying the loadings computed for 2016 (not the 2014 ones)
print(' --- COMPONENT 1 --- ')
print(loadings_2016["Component 1"])
print(' --- COMPONENT 2 --- ')
print(loadings_2016["Component 2"])
print(' --- COMPONENT 3 --- ')
print(loadings_2016["Component 3"])

# 2017 dataset

In [None]:
# 2017 dataset: fit PCA and show the cumulative explained-variance curve
pca_model, ev_plot, loadings_2017 = perform_pca_analysis(data_2017)
ev_plot.show()

# Displaying the loadings computed for 2017 (not the 2014 ones)
print(' --- COMPONENT 1 --- ')
print(loadings_2017["Component 1"])
print(' --- COMPONENT 2 --- ')
print(loadings_2017["Component 2"])
print(' --- COMPONENT 3 --- ')
print(loadings_2017["Component 3"])

# 2018 dataset

In [None]:
# 2018 dataset: fit PCA and show the cumulative explained-variance curve
pca_model, ev_plot, loadings_2018 = perform_pca_analysis(data_2018)
ev_plot.show()

# Displaying the loadings computed for 2018 (not the 2014 ones)
print(' --- COMPONENT 1 --- ')
print(loadings_2018["Component 1"])
print(' --- COMPONENT 2 --- ')
print(loadings_2018["Component 2"])
print(' --- COMPONENT 3 --- ')
print(loadings_2018["Component 3"])

## PCA Explained Variance

- The plot shows that the cumulative explained variance increases sharply with the number of components and starts to plateau around 100 components.
- This suggests that around 100 components can explain most of the variance in your data, allowing for significant dimensionality reduction from the original number of features (which appears to be around 200).

## Loadings for PCA Components (2014 and 2015)

### Component 1
The top features are related to profitability and earnings:

- **EBITDA, EBIT, Operating Income**: These are direct measures of a company's operational efficiency and profitability.
- **Gross Profit, Operating Cash Flow**: Indicate the fundamental earnings power of the company.

This component seems to capture the overall operational performance of companies.

### Component 2
The top features focus on various profitability ratios:

- **ebitperRevenue, netProfitMargin, pretaxProfitMargin**: These ratios measure the efficiency of profit generation relative to sales and pre-tax earnings.
- **Profit Margin, EBITDA Margin**: Further detail on profitability from different accounting perspectives.

This component reflects different facets of profitability margins, emphasizing how effectively companies convert sales into profits.

### Component 3
Features in this component are related to financial structure and valuation:

- **Graham Net-Net, Tangible Book Value per Share**: Indicators of potentially undervalued stocks based on asset-based valuation metrics.
- **companyEquityMultiplier**: A measure of financial leverage.
- **Net Cash/Marketcap, PTB ratio (Price to Book ratio)**: Indicators of financial stability and valuation.

This component captures aspects of company valuation and financial position, which can be crucial for assessing investment potential and risk.


# Now let's run a cluster analysis and see how it goes.

In [None]:
# Make clustering
def kmeans_clustering_analysis(data, pca_components=2, num_clusters=4):
    """
    Perform K-means clustering on the provided dataset and visualize the clusters.
    
    The numeric columns are standardized, projected onto `pca_components`
    principal components, and clustered in that reduced space. An elbow plot
    (WCSS versus number of clusters) is shown as a side effect, and the cluster
    labels are written to `data['Cluster']` in place.
    
    Parameters:
    - data (DataFrame): The dataset to perform clustering on. A 'Cluster' column
      left by a previous run is excluded from the features, so calling this
      function again on the same frame gives the same result.
    - pca_components (int): Number of PCA components to use for clustering.
      The scatter plot uses the first two, so at least 2 are needed.
    - num_clusters (int): Number of clusters to form.
    
    Returns:
    - clusters: The cluster labels for each data point.
    - cluster_plot: The `matplotlib.pyplot` module, with the cluster scatter plot
      as the current figure (call `.show()` on it to display).
    - silhouette_avg: Mean silhouette coefficient over all samples.
    - silhouette_values: Silhouette coefficient of each sample.
    """
    # Isolate numeric data for clustering, leaving out labels from earlier runs
    numeric_data = (
        data.drop(columns=['Cluster'], errors='ignore')
        .select_dtypes(include=[np.number])
    )
    
    # Standardize the data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_data)
    
    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=pca_components)
    pca_data = pca.fit_transform(scaled_data)
    
    # Determine the optimal number of clusters using the elbow method.
    # K-means cannot form more clusters than there are samples, so the scan
    # stops at 14 or at the sample count, whichever is smaller.
    candidate_ks = range(1, min(14, len(pca_data)) + 1)
    wcss = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
        kmeans.fit(pca_data)
        wcss.append(kmeans.inertia_)
    
    plt.figure(figsize=(10, 6))
    plt.plot(candidate_ks, wcss, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    plt.title('Elbow Method to Determine Optimal Number of Clusters')
    plt.grid(True)
    plt.show()

    # Apply K-means clustering with the chosen number of clusters
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
    clusters = kmeans.fit_predict(pca_data)

    # Calculate silhouette scores
    silhouette_avg = silhouette_score(pca_data, clusters)
    silhouette_values = silhouette_samples(pca_data, clusters)
    
    # Add cluster labels to the original data (in place; later cells read this column)
    data['Cluster'] = clusters
    
    # Visualize the clusters
    plt.figure(figsize=(10, 6))
    plt.scatter(pca_data[:, 0], pca_data[:, 1], c=clusters, cmap='viridis', marker='o')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('K-means Clustering')
    plt.colorbar(label='Cluster')
    plt.grid(True)
    cluster_plot = plt
    
    return clusters, cluster_plot, silhouette_avg, silhouette_values

# Run silhouette
def plot_silhouette_scores(clusters, silhouette_values, num_clusters, silhouette_avg):
    """
    Draw a silhouette plot: one horizontal band per cluster, stacked vertically.
    
    Parameters:
    - clusters: Cluster labels for each data point.
    - silhouette_values: Silhouette values for each data point.
    - num_clusters: Number of clusters; labels 0 .. num_clusters - 1 are drawn.
    - silhouette_avg: Average silhouette score, drawn as a dashed red vertical line.
    
    Returns:
    - None. The figure is displayed with `plt.show()`.
    """
    band_gap = 10  # blank rows left below the first band and between bands

    plt.figure(figsize=(10, 6))
    band_start = band_gap
    for label in range(num_clusters):
        # Silhouette values of this cluster's samples, in ascending order
        band_values = np.sort(silhouette_values[clusters == label])
        band_height = band_values.shape[0]
        band_end = band_start + band_height

        plt.fill_betweenx(np.arange(band_start, band_end),
                          0, band_values)

        # Cluster number, written at the vertical middle of its band
        plt.text(-0.05, band_start + 0.5 * band_height, str(label))

        band_start = band_end + band_gap

    plt.xlabel("Silhouette coefficient values")
    plt.ylabel("Cluster label")
    plt.title("Silhouette plot for the various clusters")
    plt.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.show()

## Elbow Method Plot

This plot helps determine the optimal number of clusters. The "elbow" point appears around 3-4 clusters, which suggests that this is a reasonable choice for the number of clusters for the K-means algorithm.

## K-means Clustering Plot

This plot shows the results of K-means clustering using the first two principal components. Each color represents a different cluster.

## Insights from the Clustering Analysis

### Optimal Number of Clusters

The elbow plot suggests that 3-4 clusters are optimal. You can choose either 3 or 4 based on further domain knowledge or additional validation techniques.

### Cluster Visualization

- The K-means clustering plot reveals how the data points are grouped in the reduced PCA space.
- The distinct separation between clusters suggests that the PCA transformation effectively captures the underlying structure of the data, and K-means clustering identifies meaningful groups.


# Year 2014

## Silhouette Plot and Clustering Analysis

### Silhouette Plot:
The silhouette plot shows the silhouette coefficient values for each sample in the clusters. The average silhouette score is approximately 0.63, indicating that the clusters are reasonably well-defined.

### Conclusions:

1. **Cluster Quality**:
   - The average silhouette score of 0.63 suggests that the clusters are well-defined and distinct.
   - The majority of samples have high silhouette scores, indicating good cohesion within clusters and separation between clusters.

2. **Cluster Sizes**:
   - Cluster 0 (blue) is the largest, followed by clusters 1 (orange), 2 (green), and 3 (red).
   - The thickness of each cluster's silhouette indicates the relative size of each cluster.


In [None]:
# Cluster the 2014 companies on the first two principal components
clusters, cluster_plot, silhouette_avg, silhouette_values = kmeans_clustering_analysis(
    data_2014,
    pca_components=2,
    num_clusters=4,
)
cluster_plot.show()

# Mean silhouette coefficient, followed by the per-sample silhouette bands
print(f"Average Silhouette Score: {silhouette_avg}")
plot_silhouette_scores(clusters, silhouette_values, 4, silhouette_avg)

# Preview the dataframe, which now carries the 'Cluster' labels
data_2014.head()

# Year 2015

## Silhouette Plot and Clustering Analysis

### Silhouette Plot:
The silhouette plot shows the silhouette coefficient values for each sample in the clusters. The average silhouette score is approximately 0.61, indicating that the clusters are reasonably well-defined.

### Conclusions:

1. **Cluster Quality**:
   - The average silhouette score of 0.61 suggests that the clusters are well-defined and distinct.
   - The majority of samples have high silhouette scores, indicating good cohesion within clusters and separation between clusters.

2. **Cluster Sizes**:
   - Cluster 0 (blue) is the largest, followed by clusters 1 (orange), 2 (green), and 3 (red).
   - The thickness of each cluster's silhouette indicates the relative size of each cluster.


In [None]:
# Cluster the 2015 companies on the first two principal components
clusters, cluster_plot, silhouette_avg, silhouette_values = kmeans_clustering_analysis(
    data_2015,
    pca_components=2,
    num_clusters=4,
)
cluster_plot.show()

# Mean silhouette coefficient, followed by the per-sample silhouette bands
print(f"Average Silhouette Score: {silhouette_avg}")
plot_silhouette_scores(clusters, silhouette_values, 4, silhouette_avg)

# Preview the dataframe, which now carries the 'Cluster' labels
data_2015.head()

# Year 2016

## Silhouette Plot and Clustering Analysis (2016)

### Silhouette Plot:
The silhouette plot shows the silhouette coefficient values for each sample in the clusters. The average silhouette score is approximately 0.57, indicating that the clusters are moderately well-defined.

### Conclusions:

1. **Cluster Quality**:
   - The average silhouette score of 0.57 suggests that the clusters are moderately well-defined.
   - While many samples have high silhouette scores, indicating good cohesion within clusters, there are also some samples with lower scores, indicating some overlap between clusters.

2. **Cluster Sizes**:
   - Cluster 2 (green) is the largest, followed by clusters 1 (orange), 3 (red), and 0 (blue).
   - The thickness of each cluster's silhouette indicates the relative size of each cluster.


In [None]:
# Cluster the 2016 companies on the first two principal components
clusters, cluster_plot, silhouette_avg, silhouette_values = kmeans_clustering_analysis(
    data_2016,
    pca_components=2,
    num_clusters=4,
)
cluster_plot.show()

# Mean silhouette coefficient, followed by the per-sample silhouette bands
print(f"Average Silhouette Score: {silhouette_avg}")
plot_silhouette_scores(clusters, silhouette_values, 4, silhouette_avg)

# Preview the dataframe, which now carries the 'Cluster' labels
data_2016.head()

# Year 2017

## Silhouette Plot and Clustering Analysis

### Silhouette Plot:
The silhouette plot shows the silhouette coefficient values for each sample in the clusters. The average silhouette score is approximately 0.60, indicating that the clusters are moderately well-defined.

### Conclusions:

1. **Cluster Quality**:
   - The average silhouette score of 0.60 suggests that the clusters are moderately well-defined.
   - While many samples have high silhouette scores, indicating good cohesion within clusters, there are also some samples with lower scores, indicating some overlap between clusters.

2. **Cluster Sizes**:
   - Cluster 3 (red) is the largest, followed by clusters 2 (green), 1 (orange), and 0 (blue).
   - The thickness of each cluster's silhouette indicates the relative size of each cluster.

In [None]:
# Cluster the 2017 companies on the first two principal components
clusters, cluster_plot, silhouette_avg, silhouette_values = kmeans_clustering_analysis(
    data_2017,
    pca_components=2,
    num_clusters=4,
)
cluster_plot.show()

# Mean silhouette coefficient, followed by the per-sample silhouette bands
print(f"Average Silhouette Score: {silhouette_avg}")
plot_silhouette_scores(clusters, silhouette_values, 4, silhouette_avg)

# Preview the dataframe, which now carries the 'Cluster' labels
data_2017.head()

# Year 2018

## Silhouette Plot and Clustering Analysis

### Silhouette Plot:
The silhouette plot shows the silhouette coefficient values for each sample in the clusters. The average silhouette score is approximately 0.62, indicating that the clusters are well-defined.

### Conclusions:

1. **Cluster Quality**:
   - The average silhouette score of 0.62 suggests that the clusters are well-defined and distinct.
   - The majority of samples have high silhouette scores, indicating good cohesion within clusters and separation between clusters.

2. **Cluster Sizes**:
   - Cluster 0 (blue) is the largest, followed by clusters 1 (orange), 2 (green), and 3 (red).
   - The thickness of each cluster's silhouette indicates the relative size of each cluster.


In [None]:
# Cluster the 2018 companies on the first two principal components
clusters, cluster_plot, silhouette_avg, silhouette_values = kmeans_clustering_analysis(
    data_2018,
    pca_components=2,
    num_clusters=4,
)
cluster_plot.show()

# Mean silhouette coefficient, followed by the per-sample silhouette bands
print(f"Average Silhouette Score: {silhouette_avg}")
plot_silhouette_scores(clusters, silhouette_values, 4, silhouette_avg)

# Preview the dataframe, which now carries the 'Cluster' labels
data_2018.head()

# Overall Conclusions

### Operational Efficiency and Profitability:

Component 1 captures the variance related to a company's operational efficiency and profitability. The high loadings on features like EBITDA, EBIT, and Operating Income suggest that these clusters represent different levels of operational performance among companies.

### Profitability Ratios:

Component 2 focuses on profitability ratios. The clusters separated along this component likely reflect differences in how efficiently companies convert their revenues into profits, considering various profitability margins.

### Cluster Interpretation:

- **Cluster 0**: This cluster might represent companies with high profitability and strong operational efficiency, as indicated by the high values along both principal components.
- **Cluster 1**: Companies in this cluster could have moderate operational performance but better profitability ratios than those in other clusters.
- **Cluster 2**: This cluster could represent companies with moderate to low operational performance and profitability ratios.
- **Cluster 3**: Companies in this cluster might have the lowest operational performance and profitability ratios among the groups.

### Investment Strategy:

- Investors can use this clustering analysis to identify groups of companies with similar financial characteristics. For example, clusters with high operational efficiency and profitability might be more attractive investment targets.
- Conversely, clusters representing companies with lower performance might be candidates for further investigation or exclusion from investment portfolios.


# Now let's see which companies are in each cluster

In [None]:
def perform_pca_analysis(data, num_top_features=10):
    """
    Standardize the numeric columns of `data` and fit a PCA on them.

    NOTE: this definition replaces the earlier `perform_pca_analysis` of this
    notebook (which returned three values) in the kernel namespace.

    Parameters:
    - data (DataFrame): The dataset to perform PCA on; non-numeric columns are ignored.
    - num_top_features (int): Not used by this version of the function.

    Returns:
    - pca: The PCA model, fitted with all components kept.
    - scaled_data: The standardized numeric data the model was fitted on.
    """
    # Keep the numeric columns only and bring them to a common scale
    numeric_columns = data.select_dtypes(include=[np.number])
    scaled_data = StandardScaler().fit_transform(numeric_columns)

    # Fit PCA without limiting the number of components
    pca = PCA()
    pca.fit(scaled_data)

    return pca, scaled_data

def kmeans_clustering_analysis(scaled_data, pca_components=2, num_clusters=4):
    """
    Project already-standardized data with PCA and cluster it with K-means.

    NOTE: this definition replaces the earlier `kmeans_clustering_analysis` of
    this notebook (which took a DataFrame and returned four values) in the
    kernel namespace.

    Parameters:
    - scaled_data: Standardized feature matrix.
    - pca_components (int): Number of PCA components to keep before clustering.
    - num_clusters (int): Number of clusters to form.

    Returns:
    - clusters: The cluster label of each row.
    - pca_data: The data projected onto the kept principal components.
    """
    # Reduce dimensionality first; clustering happens in the PCA space
    reducer = PCA(n_components=pca_components)
    pca_data = reducer.fit_transform(scaled_data)

    # K-means settings (fixed random_state for repeatable labels)
    kmeans_settings = dict(
        n_clusters=num_clusters,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    )
    clusters = KMeans(**kmeans_settings).fit_predict(pca_data)

    return clusters, pca_data

def companies_in_clusters(data, num_clusters=4, pca_components=2):
    """
    Cluster the companies in `data` and list the symbols that fall in each cluster.

    Parameters:
    - data (DataFrame): Dataset with a 'Symbol' column plus numeric feature columns.
      The cluster labels are written to `data['Cluster']` in place; a 'Cluster'
      column left by an earlier clustering run is overwritten and is NOT used
      as a feature.
    - num_clusters (int): Number of clusters to form.
    - pca_components (int): Number of PCA components to cluster on.

    Returns:
    - companies_dict: Dictionary mapping each cluster label 0 .. num_clusters - 1
      to the list of symbols in that cluster (an empty list if the cluster
      received no company).
    """
    # Earlier cells store cluster labels in data['Cluster']. Left in place, that
    # numeric column would be standardized and fed into PCA like a real feature.
    features = data.drop(columns=['Cluster'], errors='ignore')

    # Perform PCA analysis and scale the data
    pca, scaled_data = perform_pca_analysis(features)
    
    # Perform K-means clustering
    clusters, pca_data = kmeans_clustering_analysis(scaled_data, pca_components, num_clusters)
    
    # Add cluster labels to the original data (later cells read this column)
    data['Cluster'] = clusters
    
    # Collect the symbols of each cluster. A boolean mask is used instead of
    # groupby(...).get_group(i), which raises KeyError for an empty cluster.
    companies_dict = {
        i: data.loc[data['Cluster'] == i, 'Symbol'].tolist()
        for i in range(num_clusters)
    }
    
    return companies_dict

## Year 2014

In [None]:
# Example usage with the 2014 dataset (re-read from disk before clustering)
data_2014 = pd.read_csv('processed_data/clean_df_2014.csv')
companies_dict_2014 = companies_in_clusters(data_2014, 4, 2)

# Display companies in each cluster for the 2014 dataset:
# a header line, the list of symbols, then a blank separator line
for cluster, companies in companies_dict_2014.items():
    print(f"Cluster {cluster}:", companies, "", sep="\n")

### Visual Analysis of K-means Clustering Results for 2014

Based on the K-means clustering plot, we can observe the following clusters and their potential implications:

- **Cluster 0 (Purple)**: This cluster is densely packed on the left side of the plot, indicating that the companies in this cluster share similar financial characteristics that distinguish them significantly from others.
- **Cluster 1 (Yellow)**: This cluster is well-separated and occupies a central position, potentially representing companies with balanced characteristics between operational efficiency and profitability.
- **Cluster 2 (Green)**: This cluster extends vertically on the far left, indicating a strong influence of the second principal component. Companies here might have distinct financial metrics related to profitability ratios.
- **Cluster 3 (Blue)**: This cluster is spread out on the right side and appears to have high values along the first principal component, suggesting strong overall financial health and operational efficiency.

### Cluster Composition and Potential Best Cluster
Based on the given component loadings and visual observations, we can hypothesize the following about each cluster:

- **Cluster 0 (Purple)**: Contains companies like RAD, GIS, BRFS, KHC, etc., which might have unique financial characteristics setting them apart. This cluster might include companies with specific market or sectorial advantages.
  
- **Cluster 1 (Yellow)**: Includes companies such as VIPS, TAL, NWL, HRL, etc., that might represent a balanced mix of profitability and operational performance. This cluster could be indicative of companies with stable financial performance.

- **Cluster 2 (Green)**: Comprising large and well-established companies like PG, KR, PM, KO, etc., this cluster likely represents the best-performing companies in terms of overall financial health and operational efficiency. This cluster could be seen as the "best" due to the inclusion of high-performing, large-cap companies.

- **Cluster 3 (Blue)**: This cluster consists of companies such as FRPT, COE, MUX, AGFS, etc., and might represent companies with distinct profitability ratios but potentially smaller in size or emerging markets.

### Conclusion
Visually, **Cluster 2 (Green)** appears to be the best cluster due to the presence of large, well-established companies with strong financial health and operational efficiency. This conclusion is further supported by the high loadings on financial metrics such as EBITDA, EBIT, Operating Income, and various profitability margins.

For a more detailed analysis, further investigation into the specific financial metrics and performance of companies within each cluster is recommended. This could involve examining the average values of key financial indicators within each cluster and comparing them against your criteria for identifying the "best" companies.


# Let's assess the price growth potential of each cluster, year by year

In [None]:
def analyze_cluster_performance(data, companies_clusters):
    """
    Summarize and plot the following year's price variation for each cluster.

    The column analysed is '<year + 1> PRICE VAR [%]', where <year> is taken
    from the first row of `data['Year']`. A box plot of that column grouped by
    `data['Cluster']` is shown as a side effect.

    Parameters:
    - data (DataFrame): Dataset with 'Year', 'Symbol', 'Cluster' and the
      '<year + 1> PRICE VAR [%]' columns.
    - companies_clusters (dict): Maps each cluster label to the list of symbols
      in that cluster.

    Returns:
    - performance_dict: Dictionary mapping each cluster label to a dict with the
      'Mean', 'Median' and 'Standard Deviation' of the price variation of the
      cluster's companies.
    """
    # .iloc[0] reads the first row by position, so the lookup does not depend on
    # the frame having index label 0. int() keeps the label free of a trailing
    # ".0" when 'Year' is stored as a float.
    next_year = int(round(data['Year'].iloc[0] + 1))
    # Built without nesting the same quote inside the f-string, which is a
    # SyntaxError on Python versions before 3.12.
    title = f'{next_year} PRICE VAR [%]'

    # Create a dictionary to hold performance data for each cluster
    performance_dict = {}
    for cluster, companies in companies_clusters.items():
        cluster_returns = data.loc[data['Symbol'].isin(companies), title]
        performance_dict[cluster] = {
            'Mean': cluster_returns.mean(),
            'Median': cluster_returns.median(),
            'Standard Deviation': cluster_returns.std(),
        }

    plt.figure(figsize=(10, 6))
    sns.boxplot(x='Cluster', y=title, data=data)
    plt.title(f'{title} Distribution by Cluster')
    plt.xlabel('Cluster')
    plt.ylabel(title)
    plt.grid(True)
    plt.show()
    return performance_dict


## 2014

In [None]:
# Per-cluster summary of the following year's price variation (2014 data)
cluster_performance = analyze_cluster_performance(
    data=data_2014,
    companies_clusters=companies_dict_2014,
)

# Show the summary statistics as the cell output
cluster_performance

## 2015

In [None]:
# Assign the 2015 companies to 4 clusters found on 2 principal components
companies_dict_2015 = companies_in_clusters(data_2015, 4, 2)

# Per-cluster summary of the following year's price variation
cluster_performance = analyze_cluster_performance(
    data=data_2015,
    companies_clusters=companies_dict_2015,
)

# Show the summary statistics as the cell output
cluster_performance

## 2016

In [None]:
# Assign the 2016 companies to 4 clusters found on 2 principal components
companies_dict_2016 = companies_in_clusters(data_2016, 4, 2)

# Per-cluster summary of the following year's price variation
cluster_performance = analyze_cluster_performance(
    data=data_2016,
    companies_clusters=companies_dict_2016,
)

# Show the summary statistics as the cell output
cluster_performance

## 2017

In [None]:
# Assign the 2017 companies to 4 clusters found on 2 principal components
companies_dict_2017 = companies_in_clusters(data_2017, 4, 2)

# Per-cluster summary of the following year's price variation
cluster_performance = analyze_cluster_performance(
    data=data_2017,
    companies_clusters=companies_dict_2017,
)

# Show the summary statistics as the cell output
cluster_performance

## 2018

In [None]:
# Assign the 2018 companies to 4 clusters found on 2 principal components
companies_dict_2018 = companies_in_clusters(data_2018, 4, 2)

# Per-cluster summary of the following year's price variation
cluster_performance = analyze_cluster_performance(
    data=data_2018,
    companies_clusters=companies_dict_2018,
)

# Show the summary statistics as the cell output
cluster_performance