In [9]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os

# Directory holding the input review CSVs (one file per genre). Each file must
# provide the `review_year` and `review_content` columns used below.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
input_dir = '/Users/john/projects/rot-tom/sentiment_score_by_year/reviews'

# Directory that receives one `<genre>_sentiment.csv` per input file.
output_dir = '/Users/john/projects/rot-tom/sentiment_score_by_year/filtered'

# Make sure the output directory exists so `to_csv` does not fail when it is missing.
os.makedirs(output_dir, exist_ok=True)

# VADER sentiment analyzer, built once and reused for every review.
sid = SentimentIntensityAnalyzer()

# os.listdir() returns entries in arbitrary order; sort for reproducible runs.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith('.csv'):
        continue

    # Genre label = filename stem up to the first underscore.
    # NOTE(review): two inputs sharing that prefix would write to the same
    # output file, the later one overwriting the earlier — confirm that
    # filenames are unique per genre.
    genre = os.path.splitext(filename)[0].split("_")[0]

    reviews = pd.read_csv(os.path.join(input_dir, filename))

    # Keep reviews from 2000 onwards that actually have text. Missing text is
    # read by pandas as float NaN, which the analyzer cannot score, so those
    # rows are dropped instead of aborting the whole run. The explicit copy
    # avoids assigning into a filtered slice (SettingWithCopyWarning).
    recent = (
        reviews[reviews['review_year'] >= 2000]
        .dropna(subset=['review_content'])
        .copy()
    )

    # Compound score of each review. The cast keeps the column numeric even
    # when no rows survive the filter, so the mean below still works.
    recent['compound_score'] = (
        recent['review_content']
        .map(lambda text: sid.polarity_scores(text)['compound'])
        .astype(float)
    )

    # Mean score and review count per year, computed in one aggregation so the
    # two statistics are aligned by group key rather than by position.
    yearly = (
        recent.groupby('review_year')
        .agg(
            compound_score=('compound_score', 'mean'),
            num_reviews=('review_content', 'count'),
        )
        .reset_index()
    )
    yearly['genre'] = genre

    # Fixed column order for the per-genre output file.
    yearly = yearly[['review_year', 'compound_score', 'genre', 'num_reviews']]

    # Save the per-year summary for this genre.
    output_filename = os.path.join(output_dir, genre + '_sentiment.csv')
    yearly.to_csv(output_filename, index=False)


In [10]:
import pandas as pd
import os

# Directory holding the per-genre sentiment CSVs to combine.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
input_dir = '/Users/john/projects/rot-tom/sentiment_score_by_year/filtered'

# Directory where the combined file is written.
output_dir = '/Users/john/projects/rot-tom/sentiment_score_by_year'

# Read every CSV in the input directory. os.listdir() returns entries in
# arbitrary order, so sort to keep the combined row order reproducible.
dfs = [
    pd.read_csv(os.path.join(input_dir, filename))
    for filename in sorted(os.listdir(input_dir))
    if filename.endswith('.csv')
]

# pd.concat() fails with a generic message on an empty list; report which
# directory had nothing to combine instead.
if not dfs:
    raise ValueError(f"No .csv files found in {input_dir!r}; nothing to combine")

# Stack all frames into one; ignore_index gives the result a fresh 0..n-1
# index instead of repeating each file's own row labels.
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined DataFrame as a single CSV file (row index not written).
output_filename = os.path.join(output_dir, 'sentiment_score_by_year.csv')
combined_df.to_csv(output_filename, index=False)
