In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

In [2]:
# Read the cleaned corpus from disk; the column labels are kept in
# `languages` so the cells below can iterate over them.
df = pd.read_csv(filepath_or_buffer='./data/cleaned_data.csv')
languages = df.columns

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,German,French,Italian,Spanish,English
0,naja gerade frankreich tatsächlich schrecklich...,début accalmie front incendies france,mondo vivere storia d amore dio abbracciare au...,escritor escenario agresor subi tarima asest r...,told dwell past nostalgia exception weekly new...
1,finde stadt hält gar fahre abends immer kurz w...,etats unis plan climat santé joe biden adopté ...,crede ricco vincente sicuro fonda sé chiude di...,caso lee reafirma concepci n popular l deres e...,despite intuition wilson says actually good pr...
2,ach hitzewellen klimawandel schlimm gleich mal...,marchés europe stevenson luttes féministes rep...,vecchiaia fase vita adatta diffondere lieta no...,orden registro hogar expresidente expone trump...,results women saw profile poor predicting eith...
3,harte äh trockene zeiten,comment ginko financial plus célèbre banque se...,prezioso quel senso familiarità comunità tanto...,novela versos sat nicos salman rushdie conside...,well know speed dating study women rated much ...
4,,actrice américaine anne heche déclarée morte s...,rinascita dialogo passa parole silenzio impunt...,corea sur conglomerados gigantes dominan econo...,feel like guardian angel subconscious remind c...


In [4]:
def remove_empty_rows(df, language):
    """Return the non-empty entries of one column as a freshly indexed Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``language``.
    language : str
        Label of the column to extract.

    Returns
    -------
    pandas.Series
        The values of ``df[language]`` that are neither missing (NaN/None)
        nor the empty string ``''``, in their original order, with a fresh
        0..n-1 index. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``language`` is not a column of ``df``.
    """
    series = df[language]

    # Flag missing values and empty strings row by row. Filtering with this
    # positional mask (instead of dropping by index label) keeps non-empty
    # rows that happen to share an index label with an empty one.
    is_empty = series.isna() | (series == '')

    # Keep the remaining rows and renumber them from zero
    return series[~is_empty].reset_index(drop=True)

## Create WordCloud

In [5]:
# Output image path for each language column of `df`.
# NOTE(review): 'german' is lower-case unlike the other file names; it is kept
# unchanged here so the output file does not move.
wordcloud_paths = {
    'German' : './wordclouds/german_wordcloud.png',
    'French' : './wordclouds/French_wordcloud.png',
    'Italian': './wordclouds/Italian_wordcloud.png',
    'Spanish': './wordclouds/Spanish_wordcloud.png',
    'English': './wordclouds/English_wordcloud.png'
}

for language in languages:
    # Make sure the target folder exists so saving does not fail on a fresh checkout
    Path(wordcloud_paths[language]).parent.mkdir(parents=True, exist_ok=True)

    # Drop the empty rows
    target_dropped = remove_empty_rows(df, language)

    # Combine all the text into a single string
    long_string = " ".join(target_dropped)

    # Initialize the wordcloud
    # NOTE(review): contour_color presumably has no visible effect without a
    # mask and a positive contour_width - confirm against the wordcloud docs.
    wc = WordCloud(background_color='white', contour_color='steelblue', height=600, width=800)

    # Add data to the wordcloud
    wc.generate(long_string)

    # Save the cloud as a png file
    wc.to_file(wordcloud_paths[language])

In [6]:
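# Optional preview of the last word cloud generated above.
# To use it, uncomment the lines below and import matplotlib.pyplot as plt first.
# plt.imshow(wc, interpolation='bilinear')
# plt.axis('off')
# plt.show()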

## Word Count

In [6]:
for language in languages:
    # Drop the empty rows
    target_dropped = remove_empty_rows(df, language)

    # Initialize counter vectorizer
    vec = CountVectorizer()

    # Transform to a sparse document-term matrix (documents x vocabulary)
    X = vec.fit_transform(target_dropped)

    # Sum every vocabulary column directly on the sparse matrix. Calling
    # X.toarray() first would allocate a dense documents x vocabulary array
    # just to add it up; the resulting totals are the same.
    totals = np.asarray(X.sum(axis=0)).ravel()

    # Label the totals with their words and sort by frequency, most common first
    words = pd.Series(totals, index=vec.get_feature_names_out()).sort_values(ascending=False)

    # Make sure the output folder exists, then save as a two-column CSV (word, count)
    output_path = Path(f'./data/common_words/{language}_common.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    words.to_csv(output_path, header=['count'], index_label='word')