In [1]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os

# dir where the input files are located
input_dir = '/Users/john/projects/rot-tom/gender_analysis/reviews_with_gender'

# dir where the output files will be saved
output_dir = '/Users/john/projects/rot-tom/sentiment_analysis'

# sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Loop over each file in the input directory
for filename in os.listdir(input_dir):
    # Read in the CSV file
    df = pd.read_csv(os.path.join(input_dir, filename))
    
    # Calculate sentiment scores for each review and add them to the DataFrame
    df['sentiment_scores'] = df['review_content'].apply(lambda x: sid.polarity_scores(x))
    
    # Extract the compound score from the sentiment dictionary and add it as a new column
    df['compound_score'] = df['sentiment_scores'].apply(lambda x: x['compound'])
    
    # Sort the DataFrame by compound score
    df = df.sort_values(by='compound_score', ascending=False)
    
    # Save the output to a new CSV file in the output directory
    output_filename = os.path.join(output_dir, filename)
    df.to_csv(output_filename, index=False)


In [5]:
import pandas as pd

# Read in the sentiment scores CSV file
df = pd.read_csv('horror_reviews.csv')

# Create a new column to combine gender values
df['gender_group'] = df['gender'].replace({'mostly_female': 'female'})
df['gender_group'] = df['gender'].replace({'mostly_male': 'male'})

# Group by the combined gender column and calculate the average sentiment score
avg_sentiment = df.groupby('gender_group')['compound_score'].mean()

# Print the average sentiment scores
print(avg_sentiment)

gender_group
andy             0.123704
female           0.032684
male             0.015760
mostly_female    0.054016
unknown          0.031447
Name: compound_score, dtype: float64


In [7]:
import os
import pandas as pd

# get all the reviews from the directory
directory = '/Users/john/projects/rot-tom/sentiment_analysis/reviews_with_sentiment_scores'
file_names = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]

# calculate average sentiment scores and counts, and store them in a dictionary
results = []
for file_name in file_names:
    df = pd.read_csv(file_name)
    df['gender_group'] = df['gender'].replace({'mostly_female': 'female', 'mostly_male': 'male'})
    gender_counts = df['gender_group'].value_counts()
    male_count = gender_counts.get('male', 0)
    female_count = gender_counts.get('female', 0)
    avg_sentiment = df.groupby('gender_group')['compound_score'].mean()
    genre = os.path.splitext(os.path.basename(file_name))[0].split('_')[0]
    male_avg = avg_sentiment.loc['male']
    female_avg = avg_sentiment.loc['female']
    result = {'genre': genre, 'male_review_average': male_avg, 'female_review_average': female_avg,
              'male_review_count': male_count, 'female_review_count': female_count}
    results.append(result)

# write to new csv
df = pd.DataFrame.from_dict(results)
df.to_csv('genre_sentiment_averages.csv', index=False)

In [1]:
import pandas as pd
import os

# Set the path to the directory containing the CSV files
dir_path = "/Users/john/projects/rot-tom/sentiment_analysis/reviews_with_sentiment_scores"
output_path = "/Users/john/projects/rot-tom/sentiment_analysis/score_subsets"

# Loop through each file in the directory
for file_name in os.listdir(dir_path):
    if file_name.endswith(".csv"):
        # Read the CSV file into a pandas DataFrame
        df = pd.read_csv(os.path.join(dir_path, file_name))
        
        # Get the first and last 20 rows
        top_positive = df.head(20)
        top_negative = df.tail(20)
        
        # Concatenate the first and last 20 rows into a new DataFrame
        new_df = pd.concat([top_positive, top_negative])
        
        # Write the new DataFrame to a new CSV file
        new_file_name = os.path.splitext(file_name)[0] + ".csv"
        new_df.to_csv(os.path.join(output_path, new_file_name), index=False)
