# Loading packages

In [50]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.tokenize import word_tokenize # tokenization
nltk.download('punkt')
from nltk.corpus import stopwords # stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
from nltk.stem import PorterStemmer # stemming
ps = PorterStemmer() 
from nltk.stem import WordNetLemmatizer # lemmization
lemmatizer = WordNetLemmatizer()

nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Krystian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Krystian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Krystian\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
import os
os.getcwd()

'E:\\Projects\\TMSMM_Project'

# Loading data

In [57]:
men = pd.read_csv('data/askmen.csv', sep = ';')
women = pd.read_csv('data/askwomen.csv', sep = ';')

In [64]:
men

Unnamed: 0,username,comment_score,comment_text,submission_title,submission_ups
0,virusamongus,942,"Oh no, I was so interested in the nuanced deba...",BONK! Overly sexual questions are no longer al...,13949
1,CoffeeEnjoyerFrog,4172,"Can't wait to see : men, what gets you at 25% ...",BONK! Overly sexual questions are no longer al...,13949
2,5altyShoe,4326,Looks like someone got some post ~~nut~~ legum...,BONK! Overly sexual questions are no longer al...,13949
3,mightyjake,1876,> remember to read the Bible\n\nEzekiel 23:20\...,BONK! Overly sexual questions are no longer al...,13949
4,TallCombination6,764,How am I supposed to know that men enjoy blowj...,BONK! Overly sexual questions are no longer al...,13949
...,...,...,...,...,...
17275,randylahey2883,3,I had a bear come into my camp on more than on...,What is a fact or story you don't get to tell ...,9
17276,BalloonPilotDude,3,A story that I think about allot but I doubt t...,What is a fact or story you don't get to tell ...,9
17277,JetBrink,3,I'm awesome in bed,What is a fact or story you don't get to tell ...,9
17278,Stabbmaster,9,The first slaveowner in the United States was ...,What is a fact or story you don't get to tell ...,9


In [65]:
women

Unnamed: 0,username,comment_score,comment_text,submission_title,submission_ups
0,shockedpikachu123,28,Waiting for the cvs candy to be 75% off on Feb...,Valentines Day Mega thread! Check in here for ...,100
1,secrethedgehog5,16,"My second valentines day alone, and i am feeli...",Valentines Day Mega thread! Check in here for ...,100
2,KBXGazelle,15,February 14 was my mom's birthday so I usually...,Valentines Day Mega thread! Check in here for ...,100
3,meloaf,13,I'm sending all my friends Valentine's Day car...,Valentines Day Mega thread! Check in here for ...,100
4,OriginalOestrus,12,I have reservations at my favorite steakhouse!...,Valentines Day Mega thread! Check in here for ...,100
...,...,...,...,...,...
16849,D-Spornak,1,I'm hoping it's coming up soon.,What age do you consider to be “in your prime”?,373
16850,Non-Priority-98,1,Hmmmm I would think that it is in stable times...,What age do you consider to be “in your prime”?,373
16851,CutWeary9135,1,12,What age do you consider to be “in your prime”?,373
16852,Irinakusx,1,20 it is advisable to take care of your health...,What age do you consider to be “in your prime”?,373


In [6]:
print(men['submission_downs'].unique())
print(women['submission_downs'].unique())

[0]
[0]


We should delete 'submission_downs' column as it is only 0. Reddit doesn't let to scrape the number of downvotes on a post.

In [60]:
men = men.drop(columns = "submission_downs")
women = women.drop(columns = "submission_downs")
men.head(2)

Unnamed: 0,username,comment_score,comment_text,submission_title,submission_ups
0,virusamongus,942,"Oh no, I was so interested in the nuanced deba...",BONK! Overly sexual questions are no longer al...,13949
1,CoffeeEnjoyerFrog,4172,"Can't wait to see : men, what gets you at 25% ...",BONK! Overly sexual questions are no longer al...,13949


In [61]:
women['comment_text'].loc[women['comment_text'] == "[removed]"]
men['comment_text'].loc[men['comment_text'] == "[removed]"]

34       [removed]
47       [removed]
48       [removed]
49       [removed]
50       [removed]
           ...    
16841    [removed]
16842    [removed]
16845    [removed]
16853    [removed]
16855    [removed]
Name: comment_text, Length: 1277, dtype: object

338      [removed]
720      [removed]
4943     [removed]
6676     [removed]
10034    [removed]
12665    [removed]
12872    [removed]
12904    [removed]
13351    [removed]
13394    [removed]
14807    [removed]
15123    [removed]
15593    [removed]
16142    [removed]
Name: comment_text, dtype: object

We need to remove comments removed by moderators on reddit

In [62]:
wrong_comments = ['[removed]']

In [63]:
men = men[men['comment_text'].isin(wrong_comments) == False]
women = women[women['comment_text'].isin(wrong_comments) == False]
women['comment_text'].loc[women['comment_text'] == "[removed]"]
men['comment_text'].loc[men['comment_text'] == "[removed]"]

Series([], Name: comment_text, dtype: object)

Series([], Name: comment_text, dtype: object)

None of the non-desired comments are present in datasets

# Text preprocessing

In [11]:
REPLACE_NO_SPACE = re.compile("[.@;:!\'?,\"-/()\[\]]")
REPLACE_WITH_SPACE = re.compile("(\\n\\n)|(\\n)")
MULTIPLE_SPACES = re.compile("\s+")
NUMBERS = re.compile("[0-9]+")
EMOJIS = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese characters
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"                 # dingbats
        u"\u3030"
                      "]+", flags = re.UNICODE)

In [45]:
# Preprocessing for sentiment analysis
def preprocess_text(text):
    text = [REPLACE_WITH_SPACE.sub(' ', line) for line in text]
    text = [REPLACE_NO_SPACE.sub('', line) for line in text]
    text = [MULTIPLE_SPACES.sub(' ', line) for line in text]
    text = [NUMBERS.sub('', line) for line in text]
    text = [EMOJIS.sub('', line) for line in text]
    text = [line.lower() for line in text]
    return text

def tokenization(text):
    text_vectors = [word_tokenize(line) for line in text]
    return text_vectors

def stop_words_removal(text):
    text = [line.lower() for line in text] # makes text lower case
    tokenized = tokenization(text)
    filtered_sentences = list()
    for line in tokenized:
        filtered_line = [word for word in line if not word in stop_words]
        filtered_sentences.append(filtered_line)
    text = [' '.join(line) for line in filtered_sentences]
    return text

def custom_stop_words_removal(text, stop_words_list):
    tokenized = tokenization(text)
    filtered_sentences = list()
    for line in tokenized:
        filtered_line = [word for word in line if not word in stop_words_list]
        filtered_sentences.append(filtered_line)
    text = [' '.join(line) for line in filtered_sentences]
    return text

def stemming(text):
    tokenized = tokenization(text)
    filtered_sentences = list()
    for line in tokenized:
        filtered_line = [ps.stem(word) for word in line]
        filtered_sentences.append(filtered_line)
    text = [' '.join(line) for line in filtered_sentences]
    return text

# lemmatization uses words existing in language
# maybe will delete this later
def lemmatization(text):
    return text


## Men

In [13]:
men.head(2)

Unnamed: 0,username,comment_score,comment_text,submission_title,submission_ups
0,shockedpikachu123,28,Waiting for the cvs candy to be 75% off on Feb...,Valentines Day Mega thread! Check in here for ...,100
1,secrethedgehog5,16,"My second valentines day alone, and i am feeli...",Valentines Day Mega thread! Check in here for ...,100


In [14]:
# men comments
men_com = men.drop(columns = ['username', 'comment_score', 'submission_title', 'submission_ups'])["comment_text"]
men_com


0        Waiting for the cvs candy to be 75% off on Feb...
1        My second valentines day alone, and i am feeli...
2        February 14 was my mom's birthday so I usually...
3        I'm sending all my friends Valentine's Day car...
4        I have reservations at my favorite steakhouse!...
                               ...                        
16849                      I'm hoping it's coming up soon.
16850    Hmmmm I would think that it is in stable times...
16851                                                   12
16852    20 it is advisable to take care of your health...
16854    If i feel that this is the Part of my Life whe...
Name: comment_text, Length: 15579, dtype: object

## Women

In [15]:
women.head(2)

Unnamed: 0,username,comment_score,comment_text,submission_title,submission_ups
0,shockedpikachu123,28,Waiting for the cvs candy to be 75% off on Feb...,Valentines Day Mega thread! Check in here for ...,100
1,secrethedgehog5,16,"My second valentines day alone, and i am feeli...",Valentines Day Mega thread! Check in here for ...,100


In [71]:
# women comments
women_com = women.drop(columns = ['username', 'comment_score', 'submission_title', 'submission_ups'])["comment_text"]
women_com = preprocess_text(women_com)
women_stw = stop_words_removal(women_com)
women_tok = tokenization(women_com)
women_stm = stemming(women_stw) # with stop words (stw) removed
women_tok_stw = tokenization(stop_words_removal(women_com)) # with stop words (stw) removed

In [76]:
women2 = women[['username']].copy()
women2['com_original'] = women[['comment_text']].copy()
women2['cleaned'] = women_com
women2['tokenized'] = women_tok
women2['stemmed'] = women_stm
women2

Unnamed: 0,username,com_original,cleaned,tokenized,stemmed
0,shockedpikachu123,Waiting for the cvs candy to be 75% off on Feb...,waiting for the cvs candy to be off on februa...,"[waiting, for, the, cvs, candy, to, be, off, o...",wait cv candi februari
1,secrethedgehog5,"My second valentines day alone, and i am feeli...",my second valentines day alone and i am feelin...,"[my, second, valentines, day, alone, and, i, a...",second valentin day alon feel bit meh okay
2,KBXGazelle,February 14 was my mom's birthday so I usually...,february was my moms birthday so i usually sp...,"[february, was, my, moms, birthday, so, i, usu...",februari mom birthday usual spend day realli e...
3,meloaf,I'm sending all my friends Valentine's Day car...,im sending all my friends valentines day cards...,"[im, sending, all, my, friends, valentines, da...",im send friend valentin day card snail mail wa...
4,OriginalOestrus,I have reservations at my favorite steakhouse!...,i have reservations at my favorite steakhouse ...,"[i, have, reservations, at, my, favorite, stea...",reserv favorit steakhous last year wore nice j...
...,...,...,...,...,...
16849,D-Spornak,I'm hoping it's coming up soon.,im hoping its coming up soon,"[im, hoping, its, coming, up, soon]",im hope come soon
16850,Non-Priority-98,Hmmmm I would think that it is in stable times...,hmmmm i would think that it is in stable times...,"[hmmmm, i, would, think, that, it, is, in, sta...",hmmmm would think stabl time short substanti e...
16851,CutWeary9135,12,,[],
16852,Irinakusx,20 it is advisable to take care of your health...,it is advisable to take care of your health a...,"[it, is, advisable, to, take, care, of, your, ...",advis take care health time alway activ desir
