# We are going to analyze all the titles in the dataframe and build a separate dataframe that lists every word used together with its count, to see which words were used the most

In [None]:
import pandas as pd                           #dataframe manipulations
import re                                     #python regular expression library
from nltk.stem import WordNetLemmatizer       #import WordNetLemmatizer class to reduce words to their base form
from collections import Counter               #class for python collection module
from nltk.corpus import stopwords             #import stopwords (e.g., "and", "the", "is")
from nltk import pos_tag                      #function that assigns a part of speech tag (POS) (e.g., noun, verb, adjective)
from nltk.corpus import wordnet as wn         #wordnet is a large lexical database of English
import nltk                                   #import nltk functions



# Fetch the NLTK resources used by the analysis below. nltk.download() reports
# "already up-to-date" for packages that are present, so re-running is harmless.
nltk.download('stopwords')                        #stop word lists (e.g., "and", "the", "is")
nltk.download('punkt')                            #Punkt tokenizer model, used for sentence and word tokenization
nltk.download('punkt_tab')                        #resource name the tokenizer looks up in newer NLTK releases (3.9+)
nltk.download('averaged_perceptron_tagger')       #POS tagger model, assigns grammatical roles (noun, verb, ...) to words
nltk.download('averaged_perceptron_tagger_eng')   #resource name the English tagger looks up in newer NLTK releases (3.9+)
nltk.download('wordnet')                          #WordNet lexical database, used by WordNetLemmatizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:

# English stop words from NLTK, held in a set for the membership tests below
stop_words = {*stopwords.words('english')}

def remove_punctuation(input_string):
    """Return ``input_string`` with punctuation and symbols deleted.

    Every character that is neither a word character nor whitespace is
    replaced by the empty string, so the neighbours of a removed character
    end up adjacent (e.g. "j'ai" becomes "jai").
    """
    # Pattern notes:
    #   r'...'  raw string literal: backslashes reach the regex engine unchanged
    #   [^...]  negated character class: matches anything NOT listed inside
    #   \w      letters, digits and the underscore
    #   \s      whitespace (spaces, tabs, newlines)
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', input_string)


def is_noun(tag):
    """Return True when ``tag`` is one of the noun POS tags."""
    noun_tags = (
        'NN',    # singular noun
        'NNS',   # plural noun
        'NNP',   # proper noun, singular
        'NNPS',  # proper noun, plural
    )
    return tag in noun_tags

def is_verb(tag):
    """Return True when ``tag`` is one of the verb POS tags."""
    verb_tags = (
        'VB',   # base form
        'VBD',  # past tense
        'VBG',  # gerund / present participle
        'VBN',  # past participle
        'VBP',  # non-third-person singular present
        'VBZ',  # third-person singular present
    )
    return tag in verb_tags

def is_adverb(tag):
    """Return True when ``tag`` is one of the adverb POS tags."""
    adverb_tags = (
        'RB',   # regular adverb
        'RBR',  # comparative adverb
        'RBS',  # superlative adverb
    )
    return tag in adverb_tags

def is_adjective(tag):
    """Return True when ``tag`` is one of the adjective POS tags."""
    adjective_tags = (
        'JJ',   # regular adjective
        'JJR',  # comparative adjective
        'JJS',  # superlative adjective
    )
    return tag in adjective_tags

def penn_to_wn(tag):
    """Map a POS tag produced by ``pos_tag`` to the matching WordNet POS constant.

    Returns ``wn.ADJ``, ``wn.NOUN``, ``wn.ADV`` or ``wn.VERB`` (the values
    ``WordNetLemmatizer`` accepts), or ``None`` when the tag belongs to none
    of those four categories.
    """
    # Checked in this order; the first predicate that matches decides the result.
    category_checks = (
        (is_adjective, wn.ADJ),
        (is_noun, wn.NOUN),
        (is_adverb, wn.ADV),
        (is_verb, wn.VERB),
    )
    for matches, wordnet_pos in category_checks:
        if matches(tag):
            return wordnet_pos
    return None

lemmatizer = WordNetLemmatizer()

# Load the dataset
df = pd.read_csv('/content/video_kids_diana.csv')

# Word -> number of occurrences across all titles
word_counter = Counter()

# A missing title is read by pandas as NaN (a float), which re.sub() rejects
# with a TypeError. Drop those rows and coerce everything else to str.
titles = df['title'].dropna().astype(str)

for title in titles:
    # Step 1: remove punctuation
    clean_title = remove_punctuation(title)

    # Step 2: lowercase and tokenize
    words = nltk.word_tokenize(clean_title.lower())

    # Step 3: POS-tag, then lemmatize every non-stop-word with its POS
    for word, tag in pos_tag(words):
        if word in stop_words:
            continue
        # penn_to_wn() returns None for tags outside the four WordNet
        # categories; fall back to noun, the lemmatizer's default POS.
        wn_pos = penn_to_wn(tag) or wn.NOUN
        word_counter[lemmatizer.lemmatize(word, pos=wn_pos)] += 1

# Convert the Counter to a DataFrame, most frequent words first
word_counts_df = pd.DataFrame(word_counter.items(), columns=['word', 'count'])
word_counts_df = word_counts_df.sort_values(by='count', ascending=False)

print(word_counts_df.head(20))

# Save the result to a CSV file
word_counts_df.to_csv('diana_title.csv', index=False)


          word  count
0        diana    397
4          rom    208
8       oliver     98
15       story     85
9         play     85
35         kid     83
7    adventure     58
113  challenge     44
48        roma     44
95     pretend     43
86         toy     41
77         fun     36
43       video     32
139      funny     30
97         new     29
90         mom     24
73        baby     24
55      family     24
133      learn     22
147     school     21


In [None]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop word lists are available locally
nltk.download('stopwords')

# French stop words, held in a set
stop_words = {*stopwords.words('french')}

# Strip punctuation/symbols from a title before it is split into words
def remove_punctuation(input_string):
    """Return ``input_string`` without characters that are neither word
    characters (letters, digits, underscore) nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', input_string)

# Initialize WordNetLemmatizer
# NOTE(review): WordNet is an English lexical database, so this lemmatizer is
# unlikely to reduce French words correctly — consider a French-aware
# lemmatizer for this dataset.
lemmatizer = WordNetLemmatizer()

# Load the dataset (replace with your actual file path)
df = pd.read_csv('df_FR_2024.csv')  # Replace with the correct file name

# Word -> number of occurrences across all titles
word_counter = Counter()

# A missing title is read by pandas as NaN (a float), which re.sub() rejects
# with a TypeError. Drop those rows and coerce everything else to str.
titles = df['title'].dropna().astype(str)

for title in titles:
    # Clean the title, lowercase it and split on whitespace
    words = remove_punctuation(title).lower().split()

    # Count the noun-lemma of every word that is not a French stop word
    word_counter.update(
        lemmatizer.lemmatize(word, pos='n')
        for word in words
        if word not in stop_words
    )

# Convert the Counter to a DataFrame, most frequent words first
word_counts_df = pd.DataFrame(word_counter.items(), columns=['word', 'count'])
word_counts_df = word_counts_df.sort_values(by='count', ascending=False)

# Display the most frequent words
word_counts_df.head(20)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,word,count
5,résumé,15
51,ft,11
262,officiel,9
180,plus,9
261,clip,9
80,jai,8
248,1,7
137,of,7
37,v,7
16,fait,7


In [None]:
!pip install pymorphy2


Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl.metadata (3.6 kB)
Collecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt

In [None]:
import pandas as pd
import re
from collections import Counter
from nltk.corpus import stopwords
import nltk
import pymorphy2

# Make sure the NLTK stop word lists are available locally
nltk.download('stopwords')

# Russian stop words, held in a set
stop_words = {*stopwords.words('russian')}

# pymorphy2 analyzer; its parses provide the normal form of each Russian word
morph = pymorphy2.MorphAnalyzer()

# Strip punctuation/symbols from a title before it is split into words
def remove_punctuation(input_string):
    """Return ``input_string`` without characters that are neither word
    characters (letters, digits, underscore) nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', input_string)

# Load the dataset (replace with your actual file path)
df = pd.read_csv('df_RU_2024.csv')  # Replace with your dataset file

# Word -> number of occurrences across all titles
word_counter = Counter()

# A missing title is read by pandas as NaN (a float), which re.sub() rejects
# with a TypeError. Drop those rows and coerce everything else to str.
titles = df['title'].dropna().astype(str)

for title in titles:
    # Clean the title, lowercase it and split on whitespace
    words = remove_punctuation(title).lower().split()

    # Count the normal form (first pymorphy2 parse) of every word that is
    # not a Russian stop word
    word_counter.update(
        morph.parse(word)[0].normal_form
        for word in words
        if word not in stop_words
    )

# Convert the Counter to a DataFrame, most frequent words first
word_counts_df = pd.DataFrame(word_counter.items(), columns=['word', 'count'])
word_counts_df = word_counts_df.sort_values(by='count', ascending=False)

# Display the most frequent words
word_counts_df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,word,count
52,the,7
41,дом,5
25,shorts,5
384,сирия,5
215,год,4
...,...,...
173,de,1
172,copie,1
171,petite,1
170,une,1


In [None]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop word lists are available locally
nltk.download('stopwords')

# English stop words, held in a set
stop_words = {*stopwords.words('english')}

# Strip punctuation/symbols from a title before it is split into words
def remove_punctuation(input_string):
    """Return ``input_string`` without characters that are neither word
    characters (letters, digits, underscore) nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', input_string)

# Initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Load the dataset (replace with your actual file path)
df = pd.read_csv('mr_beast_videos.csv')  # Ensure you load the correct CSV file

# Word -> number of occurrences across all titles
word_counter = Counter()

# This dataset names its column `Title` (capital T). A missing title is read by
# pandas as NaN (a float), which re.sub() rejects with a TypeError, so drop
# those rows and coerce everything else to str.
titles = df['Title'].dropna().astype(str)

for title in titles:
    # Clean the title, lowercase it and split on whitespace
    words = remove_punctuation(title).lower().split()

    # Count the noun-lemma of every word that is not a stop word
    word_counter.update(
        lemmatizer.lemmatize(word, pos='n')
        for word in words
        if word not in stop_words
    )

# Convert the Counter to a DataFrame, most frequent words first
word_counts_df1 = pd.DataFrame(word_counter.items(), columns=['word', 'count'])
word_counts_df1 = word_counts_df1.sort_values(by='count', ascending=False)

# Display the most frequent words
print(word_counts_df1.head(20))



          word  count
74       10000     29
171    youtube     29
428        win     28
0        world     27
112          v     27
93      100000     26
127       hour     24
85           1     24
37         car     24
148      video     22
4         last     21
11         100     20
145       make     20
144  youtubers     19
320     people     18
7         keep     18
8    challenge     17
62        game     17
95         day     17
414         24     17


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load spaCy's French language model
!python -m spacy download fr_core_news_sm
nlp_fr = spacy.load('fr_core_news_sm')

In [None]:
import pandas as pd
import re
from collections import Counter
from nltk.corpus import stopwords
import spacy

# Fetch the NLTK stop word lists, then keep the French one as a set
from nltk import download
download('stopwords')
stop_words_fr = {*stopwords.words('french')}


In [None]:

# Strip punctuation/symbols from a title before it is handed to spaCy
def remove_punctuation(input_string):
    """Return ``input_string`` without characters that are neither word
    characters (letters, digits, underscore) nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', input_string)

df_french = pd.read_csv('df_FR_2024.csv')

# Lemma -> number of occurrences across all French titles
word_counter_fr = Counter()

# A missing title is read by pandas as NaN (a float), which re.sub() rejects
# with a TypeError. Drop those rows and coerce everything else to str.
titles_fr = df_french['title'].dropna().astype(str)

# Step 1 + 2: remove punctuation and lowercase each title (lazily)
cleaned_titles_fr = (remove_punctuation(title).lower() for title in titles_fr)

# Step 3: let spaCy process the titles as a stream (nlp.pipe batches the texts
# instead of running the pipeline once per call), then count the lemma of every
# alphabetic token that is not a French stop word
for doc_fr in nlp_fr.pipe(cleaned_titles_fr):
    word_counter_fr.update(
        token.lemma_
        for token in doc_fr
        if token.text not in stop_words_fr and token.is_alpha
    )

# Convert the Counter to a DataFrame, most frequent words first
word_counts_df_fr = pd.DataFrame(word_counter_fr.items(), columns=['word_fr', 'count_fr'])
word_counts_df_fr = word_counts_df_fr.sort_values(by='count_fr', ascending=False)

# Display the top 20 words in French
print(word_counts_df_fr.head(20))

word_counts_df_fr.to_csv('french_title_word_counts_2024.csv', index=False)


       word_fr  count_fr
5       résumé        15
45          ft        11
237       clip         9
238   officiel         9
165       plus         9
14       faire         8
73         jai         8
127         of         7
32           v         7
10    champion         7
25        tout         6
219       cest         6
411       feat         6
279     tester         5
64        star         5
171        jeu         5
88       video         5
39       avoir         5
36   highlight         5
9        ligue         5


In [None]:
# Download spaCy's Portuguese pipeline `pt_core_news_sm` with a shell command
# (IPython `!` syntax), then load it as `nlp_br`.
# Requires `spacy` to have been imported by an earlier cell.
# NOTE(review): the download output advises restarting the runtime so the new
# package's dependencies are picked up — confirm the load works without it.
!python -m spacy download pt_core_news_sm
nlp_br = spacy.load('pt_core_news_sm')

Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m67.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:

# Portuguese stop words from NLTK, held in a set
stop_words_br = {*stopwords.words('portuguese')}


# Strip punctuation/symbols from a title before it is handed to spaCy
def remove_punctuation(input_string):
    """Return ``input_string`` without characters that are neither word
    characters (letters, digits, underscore) nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', input_string)

# Load the Brazilian Portuguese dataset
df_br = pd.read_csv('df_BR_2024.csv')  # Updated to BR for Brazilian Portuguese data

# Lemma -> number of occurrences across all BR titles
word_counter_br = Counter()

# A missing title is read by pandas as NaN (a float), which re.sub() rejects
# with a TypeError. Drop those rows and coerce everything else to str.
titles_br = df_br['title'].dropna().astype(str)

# Remove punctuation and lowercase each title (lazily)
cleaned_titles_br = (remove_punctuation(title).lower() for title in titles_br)

# Let spaCy process the titles as a stream (nlp.pipe batches the texts instead
# of running the pipeline once per call), then count the lemma of every
# alphabetic token that is not a Portuguese stop word
for doc_br in nlp_br.pipe(cleaned_titles_br):
    word_counter_br.update(
        token.lemma_
        for token in doc_br
        if token.text not in stop_words_br and token.is_alpha
    )

# Convert the Counter to a DataFrame, most frequent words first
word_counts_df_br = pd.DataFrame(word_counter_br.items(), columns=['word_br', 'count_br'])
word_counts_df_br = word_counts_df_br.sort_values(by='count_br', ascending=False)

# Save the result to a CSV file
word_counts_df_br.to_csv('brazilian_title_word_counts_2024.csv', index=False)


         word_br  count_br
3              x        17
160      oficial        17
0            bom        11
130         vivo        10
176          pra         9
1        momento         9
108         novo         9
25     minecraft         8
333        video         8
495        clipe         8
83            mc         8
212  brasileirão         7
101      trailer         7
173           ir         7
46         ficar         6
99      official         6
319     coletivo         6
223         casa         6
13           dar         6
124   brasileiro         5


In [None]:
# Download spaCy's Spanish pipeline `es_core_news_sm` with a shell command
# (IPython `!` syntax), then load it as `nlp_mx`.
# Requires `spacy` to have been imported by an earlier cell.
# NOTE(review): the download output advises restarting the runtime so the new
# package's dependencies are picked up — confirm the load works without it.
!python -m spacy download es_core_news_sm
nlp_mx = spacy.load('es_core_news_sm')


Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:

# Spanish stop words from NLTK, held in a set
stop_words_mx = {*stopwords.words('spanish')}

# Strip punctuation/symbols from a title before it is handed to spaCy
def remove_punctuation(input_string):
    """Return ``input_string`` without characters that are neither word
    characters (letters, digits, underscore) nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', input_string)

# Load the Mexican Spanish dataset
df_mx = pd.read_csv('df_MX_2024.csv')  # Updated to MX for Mexican Spanish data

# Lemma -> number of occurrences across all MX titles
word_counter_mx = Counter()

# A missing title is read by pandas as NaN (a float), which re.sub() rejects
# with a TypeError. Drop those rows and coerce everything else to str.
titles_mx = df_mx['title'].dropna().astype(str)

# Remove punctuation and lowercase each title (lazily)
cleaned_titles_mx = (remove_punctuation(title).lower() for title in titles_mx)

# Let spaCy process the titles as a stream (nlp.pipe batches the texts instead
# of running the pipeline once per call), then count the lemma of every
# alphabetic token that is not a Spanish stop word
for doc_mx in nlp_mx.pipe(cleaned_titles_mx):
    word_counter_mx.update(
        token.lemma_
        for token in doc_mx
        if token.text not in stop_words_mx and token.is_alpha
    )

# Convert the Counter to a DataFrame, most frequent words first
word_counts_df_mx = pd.DataFrame(word_counter_mx.items(), columns=['word_mx', 'count_mx'])
word_counts_df_mx = word_counts_df_mx.sort_values(by='count_mx', ascending=False)

# Save the result to a CSV file
word_counts_df_mx.to_csv('mexican_title_word_counts_2024.csv', index=False)

# Display the top 20 words
print(word_counts_df_mx.head(20))

       word_mx  count_mx
16       video        42
17     oficial        27
66    official        20
206          x         9
0      resumen         8
98          vs         8
83        azul         8
82        cruz         8
185         ir         7
178        dia         6
86   semifinal         6
62         día         6
149      hacer         6
7         liga         6
4            i         6
2         real         5
15       remix         5
268      grupo         5
242        the         5
84     américa         5


In [None]:
!pip install spacy langdetect
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=6824e3c5b1c8ec6d395e9e86ac24254e498b0515efd82378dc6c8b81c2eecbd1
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m

In [None]:
!pip install googletrans==4.0.0-rc1


Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.12.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->goog

In [None]:

# `Translator` is used below but never imported elsewhere in the notebook; the
# googletrans package itself is installed by the earlier `!pip install` cell.
from googletrans import Translator

# Download stopwords
nltk.download('stopwords')

# Load spaCy language models
nlp_en = spacy.load('en_core_web_sm')  # English model
nlp_fr = spacy.load('fr_core_news_sm')  # French model

# Load stopwords (sets of English and French stop words)
stop_words_en = set(stopwords.words('english'))
stop_words_fr = set(stopwords.words('french'))

# Initialize Google Translator
translator = Translator()

# Strip punctuation/symbols from a title before it is handed to spaCy
def remove_punctuation(input_string):
    """Return ``input_string`` without characters that are neither word
    characters (letters, digits, underscore) nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', input_string)

# `detect` is used below but never imported elsewhere in the notebook; the
# langdetect package itself is installed by the earlier `!pip install` cell.
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect's README documents detection as non-deterministic unless a seed
# is fixed; pin it so re-running the cell gives the same EN/FR split.
DetectorFactory.seed = 0

# Load the dataset
df_ca = pd.read_csv('df_CA_2024.csv')  # Replace with your dataset file

# Detect the language of every title and split the dataset.
# Titles are collected in plain lists and turned into DataFrames once at the
# end; concatenating a one-row DataFrame per title copies the frame each time.
titles_en = []
titles_fr = []
# A missing title is read by pandas as NaN (a float); drop those rows and
# coerce everything else to str before language detection.
for title in df_ca['title'].dropna().astype(str):
    try:
        language = detect(title)
    except LangDetectException:
        # Best effort: skip titles langdetect cannot classify. Only this
        # exception is caught, so real bugs are no longer hidden.
        continue
    if language == 'en':
        titles_en.append(title)
    elif language == 'fr':
        titles_fr.append(title)

# Separate English and French data (titles in any other language are dropped)
df_en = pd.DataFrame({'title': titles_en})
df_fr = pd.DataFrame({'title': titles_fr})

# Initialize Counters for English and French words
word_counter_en = Counter()
word_counter_fr = Counter()

# Process English titles: count the lemma of every alphabetic, non-stop-word token
cleaned_en = (remove_punctuation(title).lower() for title in df_en['title'])
for doc_en in nlp_en.pipe(cleaned_en):
    word_counter_en.update(
        token.lemma_
        for token in doc_en
        if token.text not in stop_words_en and token.is_alpha
    )

# Process French titles the same way with the French model and stop words
cleaned_fr = (remove_punctuation(title).lower() for title in df_fr['title'])
for doc_fr in nlp_fr.pipe(cleaned_fr):
    word_counter_fr.update(
        token.lemma_
        for token in doc_fr
        if token.text not in stop_words_fr and token.is_alpha
    )

# Translate French words to English, carrying each word's count over to its
# translation (several French words may map onto the same English word)
translated_word_counter_fr = Counter()
untranslated_words = []
for word, count in word_counter_fr.items():
    try:
        translated_word = translator.translate(word, src='fr', dest='en').text.lower()
    except Exception:
        # Best effort: a word that cannot be translated is left out of the
        # merged counts (presumably network/API failures — TODO confirm which
        # exceptions googletrans raises and narrow this clause).
        untranslated_words.append(word)
        continue
    translated_word_counter_fr[translated_word] += count

if untranslated_words:
    print(f"Skipped {len(untranslated_words)} French word(s) that could not be translated")

# Merge English and Translated French word counts
final_word_counter = word_counter_en + translated_word_counter_fr

# Convert the Counter to a DataFrame, most frequent words first
final_word_counts_df = pd.DataFrame(final_word_counter.items(), columns=['word', 'count'])
final_word_counts_df = final_word_counts_df.sort_values(by='count', ascending=False)

# Save the final dataset to a CSV file
final_word_counts_df.to_csv('canadian_title_word_counts_2024.csv', index=False)

# Display the top 20 words
print(final_word_counts_df.head(20))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


          word  count
10    official     14
11     trailer     13
52       video     11
99        game     10
225         vs      9
38         new      8
343  highlight      7
327     season      7
51       music      6
103       full      6
184      rival      5
250   gameplay      5
35   christmas      5
90        real      5
245       play      5
181      every      5
183     marvel      5
1          buy      4
44      ariana      4
45      grande      4
