In [None]:
import pandas as pd
import json
import os

# Load configuration
def load_config():
    try:
        with open('config.json', 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        print("Warning: config.json not found, using default paths")
        return {}

config = load_config()

# Get file paths from config
video_stats_file = config.get("output_files", {}).get("video_statistics", "output/videos_statistics.csv")

# Load the videos statistics CSV file
df = pd.read_csv(video_stats_file)

print(f"Original videos data: {len(df)} rows")
print("Columns available:", list(df.columns))

# Keep only the first 250 records
df_250 = df.head(250)

# Save the trial file with 250 videos
df_250.to_csv('videos_statistics_250_trial.csv', index=False)

Original videos data: 28554 rows
Columns available: ['videoId', 'title', 'description', 'publishedAt', 'channelId', 'channelTitle', 'thumbnails', 'duration', 'definition', 'caption', 'licensedContent', 'dimension', 'viewCount', 'likeCount', 'dislikeCount', 'favoriteCount', 'unique_id', 'video_url']

Trial file created: videos_statistics_250_trial.csv
Trial file contains: 250 videos

Sample of the first few videos:
       videoId                                              title  viewCount  \
0  0nJG5axbhk4  S&P Global: Transforming DevOps with AWS GenAI...        338   
1  9NfWmOrSsmI  AI Agents and Tools in AWS Marketplace | Amazo...        446   
2  IBaU9Ify-bE  Amazon Bedrock AgentCore - Accelerate your AI ...       1127   
3  usFIb9aEd1U  Amazon Bedrock AgentCore: Deploy & Operate AI ...       9980   
4  YDqTaZ4dpXc  Amazon Bedrock AgentCore - Accelerate your AI ...       2165   

  duration  
0  0:03:18  
1  0:00:33  
2  0:00:31  
3  0:06:57  
4  0:01:48  


In [None]:
import pandas as pd
import os

# Use config from previous cell or reload if needed
if 'config' not in locals():
    import json
    with open('config.json', 'r') as f:
        config = json.load(f)

# Get file paths from config
basic_features_file = config.get("output_files", {}).get("basic_features", "output/basic_features.csv")
image_features_file = config.get("output_files", {}).get("image_features", "output/image_features_analysis.csv")
audio_features_file = config.get("output_files", {}).get("audio_features", "output/audio_features_analysis.csv")
text_features_file = config.get("output_files", {}).get("text_features", "output/text_features_analysis.csv")

# Load all the output CSV files
basic_df = pd.read_csv(basic_features_file)
image_df = pd.read_csv(image_features_file)
audio_df = pd.read_csv(audio_features_file)
text_df = pd.read_csv(text_features_file)

print("Data loaded successfully!")
print(f"Basic features: {len(basic_df)} rows")
print(f"Image features: {len(image_df)} rows")
print(f"Audio features: {len(audio_df)} rows")
print(f"Text features: {len(text_df)} rows")

Data loaded successfully!
Basic features: 2 rows
Image features: 2 rows
Audio features: 2 rows
Text features: 2 rows

Merged data: 2 rows, 26 columns
Columns: ['channelId', 'videoId', 'Followers', 'videoAge', 'videoLength', 'sceneNumber', 'averageSceneLength', 'humanPresence', 'faceSum', 'Gender', 'Smile', 'motionMagnitude', 'motionDirection', 'Saturation', 'Brightness', 'Arousal', 'Valence', 'Pitch', 'titleSentiment', 'titleTechnicality', 'descriptionSentiment', 'descriptionTechnicality', 'hashtagsDescription', 'URLDescription', 'scriptSentiment', 'scriptTechnicality']


In [2]:
# Add missing columns with NaN values
merged_df['Engagement'] = None
merged_df['Naration'] = None
merged_df['audioCongruency'] = None
merged_df['visualCongruency'] = None

# Define the desired column order from main.csv
desired_columns = [
    'Engagement', 'Naration', 'audioCongruency', 'visualCongruency', 'channelId', 'videoId',
    'Followers', 'videoAge', 'videoLength', 'sceneNumber', 'averageSceneLength',
    'humanPresence', 'faceSum', 'Gender', 'Smile', 'motionMagnitude', 'motionDirection',
    'Saturation', 'Brightness', 'Arousal', 'Valence', 'Pitch', 'titleSentiment',
    'titleTechnicality', 'descriptionSentiment', 'descriptionTechnicality',
    'hashtagsDescription', 'URLDescription', 'scriptSentiment', 'scriptTechnicality'
]

# Reorder columns
final_df = merged_df[desired_columns]

print("Final dataframe shape:", final_df.shape)
print("Final columns:", list(final_df.columns))
print("\nFirst few rows:")
print(final_df.head())

Final dataframe shape: (2, 30)
Final columns: ['Engagement', 'Naration', 'audioCongruency', 'visualCongruency', 'channelId', 'videoId', 'Followers', 'videoAge', 'videoLength', 'sceneNumber', 'averageSceneLength', 'humanPresence', 'faceSum', 'Gender', 'Smile', 'motionMagnitude', 'motionDirection', 'Saturation', 'Brightness', 'Arousal', 'Valence', 'Pitch', 'titleSentiment', 'titleTechnicality', 'descriptionSentiment', 'descriptionTechnicality', 'hashtagsDescription', 'URLDescription', 'scriptSentiment', 'scriptTechnicality']

First few rows:
  Engagement Naration audioCongruency visualCongruency  \
0       None     None            None             None   
1       None     None            None             None   

                  channelId      videoId  Followers  videoAge  videoLength  \
0  UCd6MoB9NC6uYN2grvUNT-Zg  0nJG5axbhk4     815000        20          198   
1  UCd6MoB9NC6uYN2grvUNT-Zg  9NfWmOrSsmI     815000        21           33   

   sceneNumber  ...  Valence  Pitch  titleSe

In [None]:
# Use config from previous cell or reload if needed
if 'config' not in locals():
    import json
    with open('config.json', 'r') as f:
        config = json.load(f)

# Get main output file path from config
main_output_file = config.get("output_files", {}).get("main_output", "main.csv")

# Save the merged data to main output file
final_df.to_csv(main_output_file, index=False)

print(f"Data successfully saved to {main_output_file}!")
print(f"Total records: {len(final_df)}")
print(f"Total columns: {len(final_df.columns)}")

# Verify the saved file
verification_df = pd.read_csv(main_output_file)
print(f"\nVerification - {main_output_file} contains {len(verification_df)} rows and {len(verification_df.columns)} columns")
print(f"Sample data from {main_output_file}:")
print(verification_df.head())

Data successfully saved to main.csv!
Total records: 2
Total columns: 30

Verification - main.csv contains 2 rows and 30 columns
Sample data from main.csv:
   Engagement  Naration  audioCongruency  visualCongruency  \
0         NaN       NaN              NaN               NaN   
1         NaN       NaN              NaN               NaN   

                  channelId      videoId  Followers  videoAge  videoLength  \
0  UCd6MoB9NC6uYN2grvUNT-Zg  0nJG5axbhk4     815000        20          198   
1  UCd6MoB9NC6uYN2grvUNT-Zg  9NfWmOrSsmI     815000        21           33   

   sceneNumber  ...  Valence  Pitch  titleSentiment  titleTechnicality  \
0           30  ...    0.721  0.916           0.649              0.555   
1            2  ...    0.701  1.000           0.496              0.251   

   descriptionSentiment  descriptionTechnicality  hashtagsDescription  \
0                 0.423                    0.383                  0.3   
1                 0.276                    0.368      