In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

In [2]:
# Load the cleaned text data; df.head() shows one text column per language
df = pd.read_csv('./data/cleaned_data.csv')

# read_csv labels the file's header-less first column 'Unnamed: 0'. It holds row
# numbers, not text, so it must not be treated as a language by the loops below
# (it has no wordcloud path and cannot be joined or vectorized as strings).
# errors='ignore' keeps this working if the CSV is later saved without its index.
languages = df.columns.drop('Unnamed: 0', errors='ignore')

In [3]:
# Peek at the first five rows to check the per-language column layout
df.head(n=5)

Unnamed: 0,German,French,Italian,Spanish,English
0,mist durchschaut egal mach einfach,sénat bernard arnault dépeint bienfaiteur presse,pregare insieme uni altri darci fare insieme c...,pandemia silenciosa infecciones bacterias resi...,psychologist robert coplan studies concept alo...
1,oh süße watschelente,récap politique jour,popolazioni isole tonga colpite giorni scorsi ...,creo sesgo masculino despreciar quejas femenin...,things like love trust caring simply work unde...
2,,ex otage ingrid betancourt précandidate présid...,oggi coloro carcere tenerezza dio raggiunga ca...,fiscal peruana anunci comenzar investigaci n p...,temperature kinds factors shape views decision...
3,stups doof,exil france kamal mouzawak partage saveurs liban,tenerezza questione emotiva sentimentale esper...,temores mosc est tratando dividir desestabiliz...,days warmer average people would give money co...
4,stupst,egypte capture opposant islamiste après atterr...,magi venuti oriente betlemme onorare re messia...,putin amenaz apropiadas medidas técnico milita...,although people quite aware global warming bel...


In [4]:
def remove_empty_rows(df: pd.DataFrame, language: str) -> pd.Series:
    """Return one column of ``df`` with its empty entries removed.

    An entry counts as empty when it is missing (NaN/None) or equal to the
    empty string ``''``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains the column to clean. It is not modified.
    language : str
        Name of the column to extract.

    Returns
    -------
    pd.Series
        The remaining values in their original order, re-indexed 0..n-1.

    Raises
    ------
    KeyError
        If ``language`` is not a column of ``df``.
    """
    series = df[language]

    # Filter by position with a boolean mask instead of dropping by index
    # label: a label-based drop would also remove non-empty rows that happen
    # to share an index label with an empty one.
    is_empty = series.isna() | (series == '')

    # reset_index returns a new Series, so no in-place mutation is needed
    return series[~is_empty].reset_index(drop=True)

## Create WordCloud

In [5]:
# Output image for each language column; only columns listed here get a cloud.
# NOTE(review): 'german' is lower-case while the other file names are
# capitalised -- confirm nothing depends on the current name before renaming.
wordcloud_paths = {
    'German' : './wordclouds/german_wordcloud.png',
    'French' : './wordclouds/French_wordcloud.png',
    'Italian': './wordclouds/Italian_wordcloud.png',
    'Spanish': './wordclouds/Spanish_wordcloud.png',
    'English': './wordclouds/English_wordcloud.png'
}

# Iterate over the configured languages rather than every DataFrame column, so
# a column without an output path (such as the 'Unnamed: 0' column produced by
# read_csv) can neither raise a KeyError nor feed non-string values to str.join.
for language, output_path in wordcloud_paths.items():
    # Drop the empty rows
    target_dropped = remove_empty_rows(df, language)

    # Combine all the text into a single string
    long_string = " ".join(target_dropped)

    # Initialize the wordcloud on an 800x600 white canvas
    wc = WordCloud(background_color='white', contour_color='steelblue', height=600, width=800)

    # Add data to the wordcloud
    wc.generate(long_string)

    # Make sure the target folder exists so a fresh checkout can run this cell
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the cloud as a png file
    wc.to_file(output_path)

In [6]:
# # Visualize the most recently generated wordcloud (requires `import matplotlib.pyplot as plt`)
# plt.imshow(wc, interpolation='bilinear')
# plt.axis('off')
# plt.show()

## Word Count

In [7]:
# Make sure the output folder exists so a fresh checkout can run this cell
Path('./data/common_words').mkdir(parents=True, exist_ok=True)

for language in languages:
    # Skip numeric columns (such as the 'Unnamed: 0' row-number column that
    # read_csv produces): they contain no text for CountVectorizer to tokenize.
    if pd.api.types.is_numeric_dtype(df[language]):
        continue

    # Drop the empty rows
    target_dropped = remove_empty_rows(df, language)

    # Initialize counter vectorizer
    vec = CountVectorizer()

    # Transform to a sparse document-term matrix (rows: documents, columns: words)
    X = vec.fit_transform(target_dropped)

    # Sum each word's column directly on the sparse matrix. Converting to a
    # dense array first would allocate documents x vocabulary cells only to
    # add them up again.
    counts = np.asarray(X.sum(axis=0)).ravel()

    # Label the totals with their words and sort by frequency, most common first
    words = pd.Series(counts, index=vec.get_feature_names_out()).sort_values(ascending=False)

    words.to_csv(f'./data/common_words/{language}_common.csv', header=['count'], index_label='word')