# Krystian Gronek & Katarzyna Piotrowska
# Text Mining and Social Media Mining, final project - Analyzing men and women comments using NLP methods

# Loading packages

In [24]:
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import seaborn as sns
from collections import Counter


%matplotlib inline 

# some basic visualization
import matplotlib.pyplot as plt 


In [20]:
# Functions

# Function to convert list of strings represented itself as string to a normal list with strings
def destring_list(input_list):
    input_list = input_list.strip('][')
    output_list = re.compile("\]\[").sub(', ', input_list)
    output_list = re.compile("\'").sub('', output_list).split(', ') 
    return output_list


In [21]:
men = pd.read_csv('data/final_askmen.csv', sep = ';')
women = pd.read_csv('data/final_askwomen.csv', sep = ';')

In [22]:
men.info()
print("\n")
women.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16664 entries, 0 to 16663
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   username                          16664 non-null  object 
 1   com_original                      16664 non-null  object 
 2   cleaned                           16664 non-null  object 
 3   cleaned_wo_sw                     16664 non-null  object 
 4   tokenized                         16664 non-null  object 
 5   stemmed                           16664 non-null  object 
 6   tokenized_wo_sw                   16664 non-null  object 
 7   submission_title                  16664 non-null  object 
 8   submission_title_cleaned          16664 non-null  object 
 9   submission_title_cleaned_wo_sw    16664 non-null  object 
 10  submission_title_tokenized        16664 non-null  object 
 11  submission_title_stemmed          16664 non-null  object 
 12  subm

# Frequency of words

In [35]:
# Individual words from all comments in one list
# Men - with stopwords
all_men_tokenized = men.tokenized.sum()
all_men_tokenized = destring_list(all_men_tokenized)
all_men_tokenized_counts = Counter(all_men_tokenized)

# Men - without stopwords
all_men_tokenized2 = men.tokenized_wo_sw.sum()
all_men_tokenized2 = destring_list(all_men_tokenized2)
all_men_tokenized2_counts = Counter(all_men_tokenized2) 

# Women - with stopwords
all_women_tokenized = women.tokenized.sum()
all_women_tokenized = destring_list(all_women_tokenized)
all_women_tokenized_counts = Counter(all_women_tokenized)

# Women - without stopwords
all_women_tokenized2 = women.tokenized_wo_sw.sum()
all_women_tokenized2 = destring_list(all_women_tokenized2)
all_women_tokenized2_counts = Counter(all_women_tokenized2)


In [62]:
df1 = pd.DataFrame.from_records(list(dict(all_men_tokenized_counts).items()), columns = ['word', 'count'])
df2 = pd.DataFrame.from_records(list(dict(all_men_tokenized2_counts).items()), columns = ['word', 'count'])
df3 = pd.DataFrame.from_records(list(dict(all_women_tokenized_counts).items()), columns = ['word', 'count'])
df4 = pd.DataFrame.from_records(list(dict(all_women_tokenized2_counts).items()), columns = ['word', 'count'])

df1 = df1.sort_values(by = ['count'], ascending = False)
df2 = df2.sort_values(by = ['count'], ascending = False)
df3 = df3.sort_values(by = ['count'], ascending = False)
df4 = df4.sort_values(by = ['count'], ascending = False)

In [63]:
df1

Unnamed: 0,word,count
76,the,15495
4,to,15485
50,i,14965
62,and,14880
70,a,14664
...,...,...
14546,devoured,1
14547,tbol,1
6407,lieu,1
6404,irq,1


In [64]:
df2

Unnamed: 0,word,count
38,like,2658
361,dont,2512
5,im,2246
292,get,1893
114,people,1804
...,...,...
14536,royalty,1
14537,mexicans,1
14538,buds,1
14539,seltzer,1


In [58]:
df

Unnamed: 0,word,count
76,the,15495
4,to,15485
50,i,14965
62,and,14880
70,a,14664
...,...,...
21705,naprest,1
492,tryhard,1
21709,twothirds,1
21710,carriers,1


In [56]:
type(df)

pandas.core.frame.DataFrame

In [50]:
type(men)

pandas.core.frame.DataFrame

In [31]:
df = pd.DataFrame(all_men_tokenized_counts)
all_men_tokenized2_counts
all_women_tokenized_counts
all_women_tokenized2_counts

ValueError: If using all scalar values, you must pass an index

In [33]:
type(all_men_tokenized_counts)

collections.Counter

# Simple distribution visualization of the comment_score variable

Let's look at the the only 2 numerical variables in our dataset. 

Submission upvotes are number of upvotes or likes that a post containing comments has. It mainly conveys how the title of the post or the discussion is attractive to the user to participate in. Basically one can treat the number of upvotes as indicator how good online conversation topic it is, with higher values being more interesting. 

Score variable is equal to the difference between upvotes and downvotes. It usually conveys information how useful information does the comment give to the reader or weather other users agree with the comment or disagree.

Below we can see summary statistics of these variables for men and women datasets

# Simple distribution visualization of the 'comment_score' variable

Let's first look at the distribution of comment_score variable. We will look into numerical variables more in the next step of analysis.

Score (comment_score) variable is equal to the difference between upvotes and downvotes. It usually conveys information how useful information does the comment give to the reader or weather other users agree with the comment or disagree.

Below we can see summary statistics of these variables for men and women datasets

In [None]:
print("/r/AskMen summary statistics")
men.describe()
print("/r/AskWomen summary statistics")
women.describe()

In [None]:
# /r/AskMen comment score histogram and density plots
plt.hist(men['comment_score'], color = 'blue', edgecolor = 'black', bins = 100);
sns.displot(data = men, x = 'comment_score', kind = 'kde');

In [None]:
# /r/AskWomen comment score histogram and density plots
plt.hist(women['comment_score'], color = 'blue', edgecolor = 'black', bins = 100);
sns.displot(data = women, x = 'comment_score', kind = 'kde');

We could add to the whole series both for men and women dataset such values that we could log transform these datasets and try visualizing the data again.

In [None]:
men_min = np.min(men['comment_score'])
women_min = np.min(women['comment_score'])

men['comment_score2'] = men['comment_score'] + (np.abs(men_min) + 1)
women['comment_score2'] = women['comment_score'] + (np.abs(women_min) + 1)

men['log_cs'] = np.log(men['comment_score2'])
women['log_cs'] = np.log(women['comment_score2'])

In [None]:
# /r/AskMen logarithm comment score histogram and density plots
plt.hist(men['log_cs'], color = 'blue', edgecolor = 'black', bins = 50);
sns.displot(data = men, x = 'log_cs', kind = 'kde');

In [None]:
# /r/AskWomen logarithm comment score histogram and density plots
plt.hist(women['log_cs'], color = 'blue', edgecolor = 'black', bins = 10);
sns.displot(data = women, x = 'log_cs', kind = 'kde');

We can see a little better the distribution of the comment_score variable that 

# Z-scoring 'comment_score' numerical variable distribution

In [None]:
sns.displot(data = men.zscore, kind = 'kde');
sns.displot(data = men.zscore_grouped, kind = 'kde');

In [None]:
sns.displot(data = women.zscore, kind = 'kde');
sns.displot(data = women.zscore_grouped, kind = 'kde');

# MINMAX scaling of variable 'comment_score' - distribution

In [None]:
sns.displot(data = men.minmax, kind = 'kde');
sns.displot(data = men.minmax_grouped, kind = 'kde');

In [None]:
sns.displot(data = women.minmax, kind = 'kde');
sns.displot(data = women.minmax_grouped, kind = 'kde');

# Wordcloud visualisations

In [None]:
# shared stopword list
stopwords = set(STOPWORDS)

# Men word cloud - comments
cleaned_words_men = ' '.join(men['cleaned_wo_sw'])

wordcloud_men = WordCloud(stopwords=stopwords,
                      background_color='black',
                      width=3000,
                      height=2500
                     ).generate(cleaned_words_men)
plt.figure(1,figsize=(12, 12))
plt.imshow(wordcloud_men)
plt.axis('off')
print("/r/AskMen Wordcloud plot")
plt.show();

# Women word cloud - comments
cleaned_words_women = ' '.join(women['cleaned_wo_sw'])

wordcloud_women = WordCloud(stopwords=stopwords,
                      background_color='black',
                      width=3000,
                      height=2500
                     ).generate(cleaned_words_women)
plt.figure(1,figsize=(12, 12))
plt.imshow(wordcloud_women)
plt.axis('off')
print("/r/AskWomen Wordcloud plot")
plt.show();



As we can see the similarity of the most frequently used words by men and women is truly striking. The biggest diferrences here that we can see is that 'people' seem to be more common in men language than women while words like 'dont', 'feel'

We should delete the most used words for example 'time', 'im', 'dont', 'one', etc.

In [None]:
# Men word cloud - comments
cleaned_words_men = ' '.join(men['cleaned_wo_sw'])

# Women word cloud - comments
cleaned_words_women = ' '.join(women['cleaned_wo_sw'])

# Shared stopword list
stopwords = set(STOPWORDS)
stopwords.add("one")
stopwords.add("im")
stopwords.add("time")
stopwords.add("dont")
stopwords.add("people")
stopwords.add("thing")
stopwords.add("work")
stopwords.add("want")
stopwords.add("think")
stopwords.add("know")
stopwords.add("really")

# Men word cloud - comments
wordcloud_men = WordCloud(stopwords=stopwords,
                      background_color='black',
                      width=3000,
                      height=2500
                     ).generate(cleaned_words_men)
plt.figure(1,figsize=(12, 12))
plt.imshow(wordcloud_men)
plt.axis('off')
print("/r/AskMen Wordcloud plot")
plt.show();

# Women word cloud - comments
wordcloud_women = WordCloud(stopwords=stopwords,
                      background_color='black',
                      width=3000,
                      height=2500
                     ).generate(cleaned_words_women)
plt.figure(1,figsize=(12, 12))
plt.imshow(wordcloud_women)
plt.axis('off')
print("/r/AskWomen Wordcloud plot")
plt.show();

