# Exploratory Text Data Analysis

### Load Data

In [1]:
import pandas as pd
from collections import Counter

# ==== CONFIG ====
# Relative path to the input CSV; it is resolved against the kernel's
# working directory, so run the notebook from its own folder.
CSV_PATH = "../sample_data/processed/genius-clean-with-title-artist-5000.csv"

# Read the full file into memory in one go (no row limit, no chunking).
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

### Basic Counts

In [2]:
# --- Basic counts ---
# One row of the frame is reported as one line/song; distinct values are
# counted with nunique(), which skips missing entries by default.
total_lines = df.shape[0]
unique_artists, unique_tags = (df[column].nunique() for column in ("artist", "tag"))

# Emit the three summary lines in a single call; sep="\n" keeps one per line.
print(
    f"\nTotal lines (songs): {total_lines}",
    f"Unique artists: {unique_artists}",
    f"Unique tags: {unique_tags}",
    sep="\n",
)


Total lines (songs): 5000
Unique artists: 1069
Unique tags: 6


### Top 10 Artists

In [3]:
# --- Top 10 artists ---
# value_counts() sorts by descending frequency, so the first ten rows are the
# most frequent artists. The index and the counts are named while converting
# to a two-column frame, instead of overwriting .columns afterwards.
top_artists = (
    df["artist"]
    .value_counts()
    .head(10)
    .rename_axis("Artist")
    .reset_index(name="Count")
)
print("\nTop 10 artists:", top_artists.to_string(index=False), sep="\n")


Top 10 artists:
     Artist  Count
     Eminem    161
  Lil Wayne    153
      JAY-Z    138
        Nas    100
 Kanye West     99
Lupe Fiasco     98
      Drake     78
       2Pac     63
    50 Cent     57
    J. Cole     56


### Top 10 Tags

In [4]:
# --- Top 10 tags ---
# Same recipe as for artists: frequency table, keep at most ten rows, then
# name the index and the counts while turning the Series into a frame.
# If fewer than ten distinct tags exist, all of them are listed.
top_tags = (
    df["tag"]
    .value_counts()
    .head(10)
    .rename_axis("Tag")
    .reset_index(name="Count")
)
print("\nTop 10 tags:", top_tags.to_string(index=False), sep="\n")


Top 10 tags:
    Tag  Count
    rap   4615
   rock    125
    pop    121
     rb     81
   misc     50
country      8


### Top 10 Words in Lyrics

In [6]:
# --- Top 10 words ---
print("\nComputing top 10 words...")

# Missing lyrics are replaced by empty strings, which yield no tokens.
# Tokens come from a plain whitespace split and are counted exactly as
# written (case-sensitive, punctuation kept).
word_counter = Counter(
    token
    for lyric in df["lyrics"].fillna("")
    for token in lyric.split()
)

# most_common(10) returns (word, count) pairs ordered by descending count.
top_words = pd.DataFrame(
    data=word_counter.most_common(10),
    columns=["Word", "Count"],
)

print("\nTop 10 words:", top_words.to_string(index=False), sep="\n")


Computing top 10 words...

Top 10 words:
Word  Count
 the 110617
   I  86580
   a  59293
 you  55575
  to  52954
 and  42026
  my  38186
  in  34803
 I'm  30584
  it  29746
