In [10]:
import pandas as pd
from collections import Counter
import string

# Deletion table for every character in string.punctuation. Built once here
# rather than on each call, because preprocess_text runs once per title.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text: object) -> str:
    """Normalize a piece of text for word counting.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted (not replaced by a space, so ``"don't"`` becomes ``"dont"``),
    and each run of whitespace is collapsed to a single space with leading
    and trailing whitespace removed.

    Args:
        text: The value to normalize. Anything that is not a ``str``
            is treated as having no text.

    Returns:
        The normalized string, or ``''`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ''
    without_punctuation = text.lower().translate(_PUNCTUATION_TABLE)
    # split() with no argument splits on any whitespace run and drops empty
    # pieces, so re-joining with one space also trims both ends.
    return ' '.join(without_punctuation.split())

# Running tally of how often each word occurs in pop-song titles.
word_count = Counter()

# Path to the input CSV; the loop below reads its 'tag' and 'title' columns.
file_path = 'ds2.csv'

# Number of rows read from the CSV per chunk.
chunk_size = 10000

# Stream the CSV chunk by chunk. Only the two columns used below are parsed
# (usecols), so any other columns in the file are never loaded.
for chunk in pd.read_csv(
    file_path,
    usecols=['tag', 'title'],
    chunksize=chunk_size,
):
    # Titles of the rows whose tag is exactly 'pop'.
    pop_titles = chunk.loc[chunk['tag'] == 'pop', 'title']

    # Normalize each title and add its words to the tally.
    for title in pop_titles:
        preprocessed_title = preprocess_text(title)
        # preprocess_text returns '' for non-string values and for titles
        # that contain nothing but punctuation/whitespace; skip those.
        if preprocessed_title:
            words = preprocessed_title.split()
            word_count.update(words)

# Tabulate the tally, order it by count (highest first) and keep the top 50.
# The rows keep their pre-sort index, which is what print() shows on the left.
word_count_df = (
    pd.DataFrame(word_count.items(), columns=['Word', 'Count'])
    .sort_values(by='Count', ascending=False)
    .head(50)
)

# Output the words and their counts
print(word_count_df)

# Save the top-50 table to a CSV file, without the index column.
word_count_df.to_csv('word_count.csv', index=False)


           Word   Count
42          the  237584
12          you  106699
79           of   93116
39            a   89859
156           i   84914
107          me   80645
41           in   76577
158          to   67863
9          love   63628
29           my   55641
186          on   39447
16        remix   37903
55           no   37101
242         and   36960
1055         de   36524
26           it   36072
47          for   35559
722          la   32513
251          is   31457
8          your   29477
252        live   26702
146     version   25717
370         all   24396
51           be   22644
268        dont   21237
171        song   20142
281        time   20042
56          one   19426
13           up   19207
222        with   18761
395          do   18033
368         mix   17437
80         this   16772
162          we   16687
95         like   16393
565          go   15768
359       night   15621
712          el   15408
433        from   15292
119        what   14481
309         out 