# Exploratory Data Analysis: Word Frequency & Word Clouds
Author: Johanna Garthe

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import itertools
import collections
import spacy
import re
import warnings
# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")
# Load the small German and English spaCy pipelines. `de` is also used
# for lemmatization further down; `en` only contributes its stop words.
de = spacy.load("de_core_news_sm")
en = spacy.load('en_core_web_sm')
# Default stop-word collections shipped with each language.
stopwords_de = de.Defaults.stop_words
stopwords_en = en.Defaults.stop_words

In [2]:
# Path of the input CSV (placeholder; set before running the cell).
f_name = " "
df = pd.read_csv(f_name)
# Keep only the rows whose 'predicted_label' equals 'against'.
target_class = df[df['predicted_label'] == 'against']
# pandas parses empty CSV cells as NaN. The spaCy pipeline applied to each
# tweet below only accepts strings, so drop missing texts and make sure
# every remaining value is a str.
tweets = target_class['text_cleaned'].dropna().astype(str)

##### Lemmatization & Stop Word Removal

In [4]:
def lemmatize(tweet):
    """Return the lemmatized, lower-cased form of *tweet*.

    The string is processed with the German pipeline ``de``; the lemmas of
    all tokens are joined with single spaces, every ``--`` is removed,
    and the result is lower-cased with trailing whitespace stripped.
    """
    doc = de(tweet)
    joined = ' '.join(token.lemma_ for token in doc)
    # Remove lemma punctuation symbol
    without_punct = re.sub('--', '', joined)
    return without_punct.lower().rstrip()

# Lemmatize every tweet and split the result on whitespace into tokens.
words_in_tweet = [lemmatize(text).split() for text in tweets]
# Stop word removal: a token survives only if it appears in neither the
# German nor the English stop-word collection.
tweets_nsw = [
    [token for token in tokens
     if not (token in stopwords_de or token in stopwords_en)]
    for tokens in words_in_tweet
]
# Optional: Removal of query or keywords, e.g. Ukraine/Russia, from the list
#keywords = ["russisch", "ukrainisch"]
#tweets_nsw = [[w for w in word if not w in keywords] for word in tweets_nsw]

##### Calculation of Word Frequency

In [6]:
# Flatten the per-tweet token lists into one list of words.
all_words_nsw = list(itertools.chain.from_iterable(tweets_nsw))
# Tally how often each word occurs across all tweets.
counts_nsw = collections.Counter(all_words_nsw)

# Optional: Save word counts result
#savefile = " "
#df_counts = pd.DataFrame(counts_nsw.most_common(),columns=['words', 'count'])
#df_counts.to_csv(savefile, index=False, header=True)

##### Plot of Word Frequency

In [None]:
print('Number of unique words:', len(counts_nsw))
# Table of the 30 most frequent words and their counts.
clean_tweets = pd.DataFrame(counts_nsw.most_common(30), columns=['words', 'count'])

tlt_name = "Common Words in Dataset"
# Global matplotlib defaults: font and a common dark grey for axes and text.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': 'Helvetica',
    'axes.edgecolor': '#333F4B',
    'axes.linewidth': 0.8,
    'xtick.color': '#333F4B',
    'ytick.color': '#333F4B',
    'text.color': '#333F4B',
})

# Sort ascending so the most frequent word ends up at the top of the y-axis.
df = clean_tweets.sort_values(by='count')
my_range = list(range(1, len(df['words']) + 1))
fig, ax = plt.subplots(figsize=(15, 10))

# The style is applied after the figure exists, as in the original cell.
plt.style.use('fivethirtyeight')
# One faint horizontal bar per word, with a dot marking the count.
ax.hlines(y=my_range, xmin=0, xmax=df['count'], color='#007ACC', alpha=0.2, linewidth=5)
ax.plot(df['count'], my_range, "o", markersize=5, color='#007ACC', alpha=0.6)
ax.set_xlabel('\nCount', fontsize=16, fontweight='bold', color='#333F4B')
ax.set_ylabel('Word', fontsize=16, fontweight='bold', color='#333F4B')
ax.tick_params(axis='both', which='major', labelsize=14)
plt.yticks(my_range, df['words'])
fig.text(0, 0.92, tlt_name, fontsize=20, fontweight='bold', color='#333F4B')
# Hide the top/right frame lines and limit the left one to the data rows.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.spines['left'].set_bounds((1, len(my_range)))
# NOTE(review): fixed upper x-limit; counts above 66000 would fall outside
# the visible range.
ax.set_xlim(0, 66000)
ax.spines['left'].set_position(('outward', 8))
ax.spines['bottom'].set_position(('outward', 5))
# Output path is a placeholder; set it before running the cell.
plt.savefig(' ', dpi=300, bbox_inches='tight')

##### Visualization as a Word Cloud

In [None]:
# Optional: Use word count file
#f_name = "../data/data_unlabeled/all_24feb31dec/3_predictions/SLI-favor_wordcounts250.csv"
#counter_df =  pd.read_csv(f_name)
#counter_dict = counter_df.set_index('words')['count'].to_dict()

# Minimum number of characters a word needs to appear in the cloud.
min_word_length = 3
# Creation of the WordCloud object
wordcloud = WordCloud(min_word_length=min_word_length,
                      background_color='black',  # alternative: 'whitesmoke'
                      #colormap="ocean",
                      max_words=250,
                      width=1600, height=800,
                      )
# WordCloud honours `min_word_length` only when it tokenizes raw text;
# generate_from_frequencies() takes the mapping as given. Filter the counts
# here so that words shorter than the minimum are really left out.
frequencies = {word: count for word, count in counts_nsw.items()
               if len(word) >= min_word_length}
# Generation of word cloud
wordcloud.generate_from_frequencies(frequencies)
# Plot
#plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()