# note
- The input file of the embedding cell is act3301_processed_data.csv  
- The parameter **explained_variance_ratio** decides how many components to use

In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# embedding

In [2]:
# Input: processed letters; the text to embed is read from the 'letter' column.
input_file = r"V:\20240920\github\results\act3301_processed_data.csv"

# Outputs: the cleaned table and the embedding matrix computed from it.
csv_output_file = r"V:\20240920\theme_analysis_act3301\act3301_processed_data_clean.csv"
npy_output_file = r"V:\20240920\theme_analysis_act3301\text_embeddings_clean_lb2.npy"

print("Loading data...")

df = pd.read_csv(input_file)

# Report how many rows there are and how many have a missing 'letter'
print("\nData statistics:")
print(f"Total number of texts: {len(df)}")
print(f"Number of None values: {df['letter'].isnull().sum()}")
   
# Drop rows whose 'letter' is missing; only the remaining rows are embedded
df_clean = df.dropna(subset=['letter'])
print(f"Number of valid texts after removing None: {len(df_clean)}")

# Creating embeddings: one text per remaining row, in df_clean order
print("Creating embeddings...")
texts = df_clean['letter'].tolist()
# texts = ["This is an example sentence", "Each sentence is converted"]

# The model is loaded from a local directory (off-line); the commented line below
# is the alternative that loads it by its hub name.
model = SentenceTransformer(r"V:\huggingface\model\sentence-transformers/all-mpnet-base-v2") # off-line mode
# model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2') # on-line mode

# NOTE(review): `max_length` does not appear to be a documented parameter of
# `SentenceTransformer.encode`; the truncation length is normally controlled by
# `model.max_seq_length`. Confirm that this keyword has the intended effect with
# the installed sentence-transformers version.
embeddings = model.encode(texts, max_length=2048)

# Saving embeddings as a .npy file
print("\nSaving embeddings...")
# np_embeddings = embeddings.numpy()
np.save(npy_output_file, embeddings)

# Compare the number of embedding rows with the number of cleaned texts;
# the two numbers are printed for visual inspection only (nothing is raised).
print("\n --- compare length of embedding and data file ---")
print(f"Embeddings shape: {embeddings.shape}")
print(f"length of csv_output_file: {len(df_clean['letter'])}")

# Save the cleaned data (without the index) so its rows line up with the saved embeddings
df_clean.to_csv(csv_output_file, index=False)

Loading data...

Data statistics:
Total number of texts: 8805
Number of None values: 3219
Number of valid texts after removing None: 5586
Creating embeddings...

Saving embeddings...

 --- compare length of embedding and data file ---
Embeddings shape: (5586, 768)
length of csv_output_file: 5586


# PCA

## Explained variance percentage

In [3]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def components_for_variance(embeddings_path, n):
    """Return how many principal components are needed to explain a share ``n`` of the variance.

    The matrix stored at ``embeddings_path`` is loaded with ``np.load``,
    standardized with ``StandardScaler`` and decomposed with a full ``PCA``.
    The result is the smallest number of leading components whose cumulative
    explained-variance ratio is at least ``n``. A one-line summary is printed.

    Args:
        embeddings_path: Path to the ``.npy`` file holding the embedding matrix.
        n: Target cumulative explained-variance ratio as a fraction,
            e.g. ``0.7`` for 70%.

    Returns:
        The number of components as an ``int``.

    Raises:
        ValueError: If ``n`` is greater than 1, which no number of components
            can satisfy.
    """
    if n > 1:
        raise ValueError(
            f"n must be a fraction not greater than 1 (got {n}); "
            "use 0.7 for 70% explained variance."
        )

    embed = np.load(embeddings_path)
    scaled_embeddings = StandardScaler().fit_transform(embed)

    pca = PCA()
    pca.fit(scaled_embeddings)

    # Cumulative share of variance explained by the first k components
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # np.argmax on an all-False mask returns 0, which would wrongly report
    # "1 component" when the threshold is never reached (e.g. n == 1.0 with the
    # cumulative sum ending at 0.9999...). Fall back to all components instead.
    reached = cumulative_variance >= n
    if reached.any():
        num_components = int(np.argmax(reached)) + 1
    else:
        num_components = len(cumulative_variance)

    print(f"Need {num_components} components to explain {n * 100}% variance.")
    return num_components


# Path of the embedding matrix written by the embedding cell
embeddings_path = r"V:\20240920\theme_analysis_act3301\text_embeddings_clean_lb2.npy"

# Report the component counts needed for 70%, 80% and 90% explained variance.
# Each call prints its own summary line; the last call is a bare expression, so
# a notebook additionally echoes its return value.
components_for_variance(embeddings_path, 0.7)
components_for_variance(embeddings_path, 0.8)
components_for_variance(embeddings_path, 0.9)


Need 34 components to explain 70.0% variance.
Need 63 components to explain 80.0% variance.
Need 135 components to explain 90.0% variance.


135

## Themes analysis

In [4]:
import numpy as np
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Select the share of variance to retain. The rotation cells further down read
# this same variable as their threshold, so change it here only.
explained_variance_ratio = 0.7
# Number of leading components needed to reach that share (uses the module-level embeddings_path)
number_components = components_for_variance(embeddings_path, explained_variance_ratio)
print(f"\n --- need {number_components} components for explained variance ratio {explained_variance_ratio} ---")

# define function analyze_pca_themes
def analyze_pca_themes(embeddings_path=r"V:\20240920\theme_analysis_act3301\text_embeddings_clean_lb2.npy",
                       data_path=r"V:\20240920\theme_analysis_act3301\act3301_processed_data_clean.csv",
                       n_components=34,
                       n_top_texts=5):
    """Run PCA on standardized embeddings and collect the extreme texts per component.

    The embeddings are loaded with ``np.load`` and the texts with
    ``pd.read_csv``; the texts are read from the ``letter`` column. The
    embeddings are standardized, reduced to ``n_components`` principal
    components, and for every component the highest- and lowest-scoring texts
    are collected together with summary statistics of the scores. Progress and
    the explained-variance ratios are printed along the way.

    Args:
        embeddings_path: Path to the ``.npy`` embedding matrix (one row per text).
        data_path: Path to the CSV file whose rows correspond to the embedding rows.
        n_components: Number of principal components to keep.
        n_top_texts: How many texts to report at each end of every component.
            Zero or a negative value yields empty example lists.

    Returns:
        A tuple ``(pca_result, themes)`` where ``pca_result`` is the matrix of
        component scores and ``themes`` maps ``'Component_<k>'`` to a dict with
        the keys ``'positive'`` and ``'negative'`` (lists of ``(text, score)``
        pairs, most extreme first) and ``'score_stats'`` (mean, std, min, max).

    Raises:
        ValueError: If the number of embedding rows differs from the number of
            rows in the data file.
    """
    # Load data
    print("Loading data...")
    embeddings = np.load(embeddings_path)
    df = pd.read_csv(data_path)

    # compare length of embedding and data file
    print("\n --- compare length of embedding and data file ---")
    print(f"Embeddings shape: {embeddings.shape}")
    print(f"length of data file: {len(df['letter'])}")

    # Rows are matched by position, so the two inputs must have equal length
    if embeddings.shape[0] != len(df):
        raise ValueError("The number of embeddings does not match the number of texts in the data file.")

    # Standardize embeddings
    print("\n--- Standardizing embeddings...")
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)

    # Apply PCA
    print(f"\n--- Applying PCA with {n_components} components...")
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(scaled_embeddings)

    print(f"\n --- Total explained variance: {sum(pca.explained_variance_ratio_):.4f}")

    # Print explained variance ratio per component
    print("\n--- Explained Variance Ratio per Component ---")
    for i, ratio in enumerate(pca.explained_variance_ratio_):
        print(f"Component {i+1}: {ratio:.4f}")

    # Analyze themes
    print("\n\n --- Analyzing themes in each component...")
    letters = df['letter']
    # A slice like order[-0:] would return every row, so clamp the count and
    # handle "no examples requested" explicitly.
    n_examples = max(int(n_top_texts), 0)
    themes = {}
    for i in range(n_components):
        # Get component scores
        scores = pca_result[:, i]

        # Sort once and take both ends from the same ordering
        order = np.argsort(scores)
        if n_examples:
            top_indices = order[-n_examples:][::-1]   # highest score first
            bottom_indices = order[:n_examples]       # lowest score first
        else:
            top_indices = bottom_indices = order[:0]

        # Add examples and statistics
        themes[f'Component_{i+1}'] = {
            'positive': [(letters.iloc[idx], scores[idx]) for idx in top_indices],
            'negative': [(letters.iloc[idx], scores[idx]) for idx in bottom_indices],
            'score_stats': {
                'mean': np.mean(scores),
                'std': np.std(scores),
                'min': np.min(scores),
                'max': np.max(scores)
            }
        }

    return pca_result, themes

def save_themes_to_excel(themes, output_path):
    """
    Write the per-component theme results to an Excel workbook.

    Any existing file at ``output_path`` is removed first. For every entry in
    ``themes`` three sheets are written: ``<component>_Positive``,
    ``<component>_Negative`` (columns ``Text`` and ``Score``) and
    ``<component>_Stats`` (one row built from the ``score_stats`` dict).
    """
    # Start from a clean slate and tell the user which case applied
    if not os.path.exists(output_path):
        print(f"File '{output_path}' does not exist. Creating a new file.")
    else:
        os.remove(output_path)
        print(f"old File '{output_path}' has been deleted.")

    # (key in the theme dict, sheet-name suffix) for the two example tables
    example_sheets = (('positive', 'Positive'), ('negative', 'Negative'))

    with pd.ExcelWriter(output_path, engine='xlsxwriter') as workbook:
        for name, payload in themes.items():
            # Example texts: one sheet per direction
            for key, suffix in example_sheets:
                examples = pd.DataFrame(payload[key], columns=['Text', 'Score'])
                examples.to_excel(workbook, sheet_name=f"{name}_{suffix}", index=False)

            # Score statistics as a single-row table
            summary = pd.DataFrame([payload['score_stats']])
            summary.to_excel(workbook, sheet_name=f"{name}_Stats", index=False)

if __name__ == "__main__":
    # Destinations for the component scores and the Excel report
    scores_file = r"V:\20240920\theme_analysis_act3301\pca_results.npy"
    workbook_file = r"V:\20240920\theme_analysis_act3301\thematic_analysis_results.xlsx"

    # Run PCA with the component count derived from explained_variance_ratio
    pca_result, themes = analyze_pca_themes(n_components=number_components)

    # Persist the scores, then the per-component themes
    np.save(scores_file, pca_result)
    save_themes_to_excel(themes, workbook_file)

Need 34 components to explain 70.0% variance.

 --- need 34 components for explained variance ratio 0.7 ---
Loading data...

 --- compare length of embedding and data file ---
Embeddings shape: (5586, 768)
length of data file: 5586

--- Standardizing embeddings...

--- Applying PCA with 34 components...

 --- Total explained variance: 0.7025

--- Explained Variance Ratio per Component ---
Component 1: 0.1112
Component 2: 0.0743
Component 3: 0.0731
Component 4: 0.0517
Component 5: 0.0388
Component 6: 0.0344
Component 7: 0.0289
Component 8: 0.0241
Component 9: 0.0211
Component 10: 0.0188
Component 11: 0.0175
Component 12: 0.0157
Component 13: 0.0147
Component 14: 0.0144
Component 15: 0.0128
Component 16: 0.0121
Component 17: 0.0117
Component 18: 0.0111
Component 19: 0.0098
Component 20: 0.0097
Component 21: 0.0093
Component 22: 0.0089
Component 23: 0.0086
Component 24: 0.0081
Component 25: 0.0079
Component 26: 0.0072
Component 27: 0.0071
Component 28: 0.0065
Component 29: 0.0063
Componen

# Varimax Rotation

In [5]:
from factor_analyzer.rotator import Rotator

# Load the embedding matrix
embeddings_path=r"V:\20240920\theme_analysis_act3301\text_embeddings_clean_lb2.npy"
embeddings = np.load(embeddings_path)

# Standardize embeddings
print("Standardizing embeddings...")
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings)

# The share of variance to explain is set by the parameter explained_variance_ratio.
# A full PCA is fitted first only to find how many components reach that share;
# PCA is then refitted with exactly that many components.
pca = PCA()
pca_result = pca.fit_transform(scaled_embeddings)
cumulative_variance = pca.explained_variance_ratio_.cumsum()
threshold = explained_variance_ratio         # keep the same value as the themes analysis
# Index of the first cumulative value >= threshold, converted to a count
n_components = np.argmax(cumulative_variance >= threshold) + 1
pca = PCA(n_components = n_components)
pca.fit(scaled_embeddings)

# Transpose so that each row corresponds to a feature and each column to a component.
# NOTE(review): pca.components_ holds the principal axes; they are used here
# directly as the loadings matrix, without scaling by the explained variance —
# confirm that this is the intended definition of "loadings".
loadings = pca.components_.T  
print("\n---Loadings:\n", loadings)
print(loadings.shape)
print(loadings.T.shape)

# Varimax rotation of the loadings matrix
rotator = Rotator(method='varimax')
rotated_loadings = rotator.fit_transform(loadings)

print("\n---Rotated Loadings:\n", rotated_loadings)
print(f"Rotated Loadings's shape: {rotated_loadings.shape}")
print(pca.components_.shape)
print(pca.components_.T.shape)

Standardizing embeddings...

---Loadings:
 [[-0.0629569  -0.02778213 -0.0300207  ... -0.02277198 -0.02369534
  -0.02629992]
 [-0.03351099  0.00943519 -0.04230314 ... -0.00102278 -0.01460425
  -0.02007923]
 [-0.00152517 -0.02726444  0.01039183 ...  0.01405986  0.03401203
   0.10734208]
 ...
 [-0.07952639 -0.00083943 -0.00485312 ...  0.02925705 -0.05475898
   0.01240202]
 [-0.07967237  0.0260902  -0.01103387 ... -0.04205215 -0.06119183
  -0.00925014]
 [ 0.02355927  0.01461541  0.07652417 ... -0.02912033 -0.00964162
   0.04764943]]
(768, 34)
(34, 768)

---Rotated Loadings:
 [[-0.07914463 -0.01695378  0.00733535 ...  0.00404747 -0.05561031
   0.01636958]
 [-0.00332911  0.04855251 -0.01596029 ... -0.00571843 -0.04996122
  -0.03579001]
 [-0.00214464 -0.0271831  -0.01474453 ...  0.01297485 -0.02828156
   0.00481755]
 ...
 [-0.04120945 -0.01330149  0.0123803  ...  0.06255983 -0.01065167
  -0.00270828]
 [-0.05680602  0.02780359 -0.02851707 ...  0.01432151  0.00285793
  -0.04501653]
 [ 0.0272041

## Get the new scores after varimax rotation

In [6]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from factor_analyzer.rotator import Rotator

# Step 1: Load the Data
embeddings_path = r"V:\20240920\theme_analysis_act3301\text_embeddings_clean_lb2.npy"
data_path = r"V:\20240920\theme_analysis_act3301\act3301_processed_data_clean.csv"

embeddings = np.load(embeddings_path)
df = pd.read_csv(data_path)

# Scores and letters are joined by row position below, so verify the two inputs
# line up before any output file is written.
if embeddings.shape[0] != len(df):
    raise ValueError("The number of embeddings does not match the number of texts in the data file.")

# Step 2: Standardize the Embeddings
print("Standardizing embeddings...")
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings)

# Step 3: Perform PCA and Varimax Rotation
# Fit a full PCA to find how many components are needed to reach the share of
# variance given by explained_variance_ratio
pca = PCA()
pca_result = pca.fit_transform(scaled_embeddings)
cumulative_variance = pca.explained_variance_ratio_.cumsum()
threshold = explained_variance_ratio         # keep same as themes analysis

# np.argmax on an all-False mask returns 0, which would silently select a
# single component when the threshold is never reached; keep all components
# in that case instead.
reached = cumulative_variance >= threshold
if reached.any():
    n_components = int(np.argmax(reached)) + 1
else:
    n_components = len(cumulative_variance)

# Fit PCA with the selected number of components
pca = PCA(n_components=n_components)
pca.fit(scaled_embeddings)

# Extract PCA loadings and transpose them to (features, components)
loadings = pca.components_.T

# Apply Varimax rotation
rotator = Rotator(method='varimax')
rotated_loadings = rotator.fit_transform(loadings)

# Step 4: Project Data onto Rotated Components
# Calculate the new scores by multiplying the standardized embeddings with the rotated loadings
rotated_scores = np.dot(scaled_embeddings, rotated_loadings)

# Step 5: Save the New Scores
# Save the rotated scores to a new file
output_path = r"V:\20240920\theme_analysis_act3301\rotated_scores.npy"
np.save(output_path, rotated_scores)

# Print the shape of the rotated scores
print(f"Rotated Scores' shape: {rotated_scores.shape}")

# Optional: Save the rotated scores with corresponding text data to a CSV file
rotated_scores_df = pd.DataFrame(rotated_scores, columns=[f"Rotated_Component_{i+1}" for i in range(n_components)])
rotated_scores_df["letter"] = df["letter"].values
rotated_scores_df.to_csv(r"V:\20240920\theme_analysis_act3301\rotated_scores_with_text.csv", index=False)

Standardizing embeddings...
Rotated Scores' shape: (5586, 34)


## Get the top_n highest- and lowest-scoring letters for each rotated component

In [7]:
import pandas as pd

# How many letters to keep at each end of every rotated component
top_n = 5

# For each rotated component, gather its top_n highest and top_n lowest scoring letters
results = []
for component in rotated_scores_df.columns[:-1]:  # the last column is "letter"
    top_positive = rotated_scores_df.nlargest(top_n, component)[["letter", component]]
    top_negative = rotated_scores_df.nsmallest(top_n, component)[["letter", component]]

    # Positive rows are recorded first, then negative rows
    for kind, subset in (("Positive", top_positive), ("Negative", top_negative)):
        for letter, score in subset.values:
            results.append({
                "Component": component,
                "Type": kind,
                "Letter": letter,
                "Score": score
            })

# Turn the collected rows into a table
results_df = pd.DataFrame(results)

# Write the table to disk without the index column
output_path = r"V:\20240920\theme_analysis_act3301\top_scores_per_rotated_component.csv"
results_df.to_csv(output_path, index=False)

# Show the table
print(results_df)

                Component      Type  \
0     Rotated_Component_1  Positive   
1     Rotated_Component_1  Positive   
2     Rotated_Component_1  Positive   
3     Rotated_Component_1  Positive   
4     Rotated_Component_1  Positive   
..                    ...       ...   
335  Rotated_Component_34  Negative   
336  Rotated_Component_34  Negative   
337  Rotated_Component_34  Negative   
338  Rotated_Component_34  Negative   
339  Rotated_Component_34  Negative   

                                                Letter      Score  
0    dear senator,\n \n i am writing to encourage y...  16.019330  
1    please do not reintroduce act3301. i am oppose...  15.874529  
2    don't reintroduce act3301. we can do better. w...  15.294627  
3    please do not re-introduce act3301. we are alr...  14.988981  
4    dear senator,\n \n the act3301 is a vital act3...  14.677087  
..                                                 ...        ...  
335  women entrepreneurs also face major obstacles ... 