In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
# import matplotlib.pyplot as plt

In [2]:
# Load the dataset from the CSV file (path is relative to the notebook's working directory)
df = pd.read_csv('./data/cleaned_data.csv')
# Every column label is treated as a language name by the loops further down
languages = df.columns

In [3]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,German,French
0,in meiner branche sind wir damit ganz weit vorn,du pacte des loups à juste la fin du monde la ...
1,,devant le sénat vincent bolloré relativise son...
2,ich vermisse beim stundeneintragen den punkt e...,la corée du nord annonce une possible reprise ...
3,n trotzki,joe biden n avait pas anticipé une telle oppos...
4,er hat uns die freude geschenkt,aux etats unis joe biden promet un désastre à ...


In [4]:
def remove_empty_rows(df, language):
    """Return the non-empty entries of one column as a freshly indexed Series.

    An entry counts as empty when it is missing (NaN/None) or equal to the
    empty string ''.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one text column per language. It is not modified.
    language : str
        Label of the column to extract.

    Returns
    -------
    pandas.Series
        The remaining values of ``df[language]`` in their original order,
        re-indexed 0..n-1 and keeping the column's name.
    """
    series = df[language]

    # Select by boolean mask instead of dropping by index label: a label-based
    # drop also removes non-empty rows that happen to share an index label with
    # an empty one when the frame's index is not unique.
    is_empty = (series == '') | (series.isna())

    # Filtering creates a new object, so the caller's frame stays untouched
    return series[~is_empty].reset_index(drop=True)

## Create WordCloud

In [5]:
# Output image path for each language column
wordcloud_paths = {
    'French': './wordclouds/French_wordcloud.png',
    'German': './wordclouds/german_wordcloud.png'
}

for language in languages:
    # Drop the empty rows
    target_dropped = remove_empty_rows(df, language)

    # Combine all the text into a single string
    long_string = " ".join(target_dropped)

    # Initialize the wordcloud
    wc = WordCloud(background_color='white', contour_color='steelblue', height=600, width=800)

    # Add data to the wordcloud
    wc.generate(long_string)

    # Make sure the destination folder exists; writing the image fails on a
    # fresh checkout where the folder has not been created yet
    output_path = wordcloud_paths[language]
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the cloud as a png file
    wc.to_file(output_path)

In [6]:
# # Visualize wordcloud
# plt.imshow(wc, interpolation='bilinear')
# plt.axis('off')
# plt.show()

## Word Count

In [7]:
# Make sure the output folder exists so the cell also runs on a fresh checkout
Path('./data/common_words').mkdir(parents=True, exist_ok=True)

for language in languages:
    # Drop the empty rows
    target_dropped = remove_empty_rows(df, language)

    # Initialize counter vectorizer
    vec = CountVectorizer()

    # Transform to a sparse document-term matrix (one row per text, one column per term)
    X = vec.fit_transform(target_dropped)

    # Total occurrences of each term, summed directly on the sparse matrix.
    # Densifying first (X.toarray()) would allocate documents x vocabulary cells
    # only to collapse them again.
    term_totals = np.asarray(X.sum(axis=0)).ravel()

    # Label the totals with their terms and sort by word frequency
    words = pd.Series(term_totals, index=vec.get_feature_names_out()).sort_values(ascending=False)

    words.to_csv(f'./data/common_words/{language}_common.csv', header=['count'], index_label='word')