In [None]:
import pandas as pd
import numpy as np

def redistribute_counts(file_path, output_path=None):
    """
    Redistributes counts of non-series words into their series counterparts.
    
    Args:
        file_path: Path to the input CSV file
        output_path: Optional path for the output CSV file. If not provided, returns DataFrame
    """
    # Try to read the file and determine if it has a header
    try:
        # First, try reading the file with the first row as header
        data = pd.read_csv(file_path)
        # If successful and 'count' is a column name, then we have a header
        if 'count' in data.columns:
            # We already have the right column names
            pass
        else:
            # The file has a header but not our expected column names
            # Read again with no header and assign column names
            data = pd.read_csv(file_path, header=None)
            data.columns = ['id', 'character', 'series', 'word', 'count']
    except:
        # Reading file without a header
        data = pd.read_csv(file_path, header=None)
        data.columns = ['id', 'character', 'series', 'word', 'count']
    
    # Convert the count column to integer
    data['count'] = pd.to_numeric(data['count'], errors='coerce')
    
    # Drop any rows where count couldn't be converted to numeric
    data = data.dropna(subset=['count'])
    data['count'] = data['count'].astype(int)
    
    # Identify series and non-series data
    # Non-series data has empty or NA series field
    series_data = data[data['series'].notna() & (data['series'] != '')]
    non_series_data = data[(data['series'].isna()) | (data['series'] == '')]
    
    # Process each word in non-series data
    for _, row in non_series_data.iterrows():
        word = row['word']
        non_series_count = row['count']
        
        # Find all occurrences of this word in series data
        matching_series_rows = series_data[series_data['word'] == word]
        
        if len(matching_series_rows) > 0:
            # Calculate how much to add to each series entry
            # We'll distribute the count proportionally based on existing counts
            total_series_count = matching_series_rows['count'].sum()
            
            # Redistribute non-series count proportionally
            for idx in matching_series_rows.index:
                # Calculate proportion of the non-series count to add
                proportion = series_data.loc[idx, 'count'] / total_series_count
                # Add the proportional amount to the series count
                series_data.loc[idx, 'count'] += round(proportion * non_series_count)
    
    # Convert back to integer to ensure no decimal values
    series_data['count'] = series_data['count'].astype(int)
    
    # Write the updated series data to output file or return DataFrame
    if output_path:
        series_data.to_csv(output_path, index=False, header=False)
        return f"Data saved to {output_path}"
    else:
        return series_data

# Example usage
# redistribute_counts('Amy_top_words.csv', 'Amy_top_words_redistributed.csv')

In [2]:
redistribute_counts('Amy_top_words.csv', 'Amy_top_words_redistributed.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series_data['count'] = series_data['count'].astype(int)


'Data saved to Amy_top_words_redistributed.csv'

In [3]:
input_files = ['Bernadette_top_words.csv',
         'Howard_top_words.csv', 
         'Leonard_top_words.csv', 
         'Penny_top_words.csv', 
         'Sheldon_top_words.csv',
         'Raj_top_words.csv']
output_files = ['Bernadette_top_words_redistributed.csv',
            'Howard_top_words_redistributed.csv', 
            'Leonard_top_words_redistributed.csv', 
            'Penny_top_words_redistributed.csv', 
            'Sheldon_top_words_redistributed.csv',
            'Raj_top_words_redistributed.csv']

for input_file, output_file in zip(input_files, output_files):
    redistribute_counts(input_file, output_file)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series_data['count'] = series_data['count'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series_data['count'] = series_data['count'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series_data['count'] = series_data['count'].astype(int)
A value is trying to be set on a copy