# Loading packages

In [17]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.tokenize import word_tokenize # tokenization
nltk.download('punkt')
from nltk.corpus import stopwords # stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer # stemming
from nltk.stem import WordNetLemmatizer # lemmization
lemmatizer = WordNetLemmatizer()

nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Krystian\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Krystian\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Krystian\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
import os
os.getcwd()

'E:\\Projects\\TMSMM_Project'

# Loading data

In [3]:
men = pd.read_csv('data/askmen.csv', sep = ';')
women = pd.read_csv('data/askwomen.csv', sep = ';')

In [4]:
men.head()

Unnamed: 0,username,comment_score,comment_text,submission_title,submission_ups,submission_downs
0,virusamongus,942,"Oh no, I was so interested in the nuanced deba...",BONK! Overly sexual questions are no longer al...,13949,0
1,CoffeeEnjoyerFrog,4172,"Can't wait to see : men, what gets you at 25% ...",BONK! Overly sexual questions are no longer al...,13949,0
2,5altyShoe,4326,Looks like someone got some post ~~nut~~ legum...,BONK! Overly sexual questions are no longer al...,13949,0
3,mightyjake,1876,> remember to read the Bible\n\nEzekiel 23:20\...,BONK! Overly sexual questions are no longer al...,13949,0
4,TallCombination6,764,How am I supposed to know that men enjoy blowj...,BONK! Overly sexual questions are no longer al...,13949,0


In [5]:
women.head()

Unnamed: 0,username,comment_score,comment_text,submission_title,submission_ups,submission_downs
0,shockedpikachu123,28,Waiting for the cvs candy to be 75% off on Feb...,Valentines Day Mega thread! Check in here for ...,100,0
1,secrethedgehog5,16,"My second valentines day alone, and i am feeli...",Valentines Day Mega thread! Check in here for ...,100,0
2,KBXGazelle,15,February 14 was my mom's birthday so I usually...,Valentines Day Mega thread! Check in here for ...,100,0
3,meloaf,13,I'm sending all my friends Valentine's Day car...,Valentines Day Mega thread! Check in here for ...,100,0
4,OriginalOestrus,12,I have reservations at my favorite steakhouse!...,Valentines Day Mega thread! Check in here for ...,100,0


In [6]:
print(men['submission_downs'].unique())
print(women['submission_downs'].unique())

[0]
[0]


We should delete 'submission_downs' column as it is only 0. Reddit doesn't let to scrape the number of downvotes on a post.

In [7]:
men = women.drop(columns = "submission_downs")
women = women.drop(columns = "submission_downs")
men.head(2)

Unnamed: 0,username,comment_score,comment_text,submission_title,submission_ups
0,shockedpikachu123,28,Waiting for the cvs candy to be 75% off on Feb...,Valentines Day Mega thread! Check in here for ...,100
1,secrethedgehog5,16,"My second valentines day alone, and i am feeli...",Valentines Day Mega thread! Check in here for ...,100


In [8]:
women['comment_text'].loc[women['comment_text'] == "[removed]"]
men['comment_text'].loc[men['comment_text'] == "[removed]"]

34       [removed]
47       [removed]
48       [removed]
49       [removed]
50       [removed]
           ...    
16841    [removed]
16842    [removed]
16845    [removed]
16853    [removed]
16855    [removed]
Name: comment_text, Length: 1277, dtype: object

34       [removed]
47       [removed]
48       [removed]
49       [removed]
50       [removed]
           ...    
16841    [removed]
16842    [removed]
16845    [removed]
16853    [removed]
16855    [removed]
Name: comment_text, Length: 1277, dtype: object

We need to remove comments removed by moderators on reddit

In [9]:
wrong_comments = ['[removed]']

In [10]:
men = men[men['comment_text'].isin(wrong_comments) == False]
women = women[women['comment_text'].isin(wrong_comments) == False]
women['comment_text'].loc[women['comment_text'] == "[removed]"]
men['comment_text'].loc[men['comment_text'] == "[removed]"]

Series([], Name: comment_text, dtype: object)

Series([], Name: comment_text, dtype: object)

None of the non-desired comments are present in datasets

# Text preprocessing

In [11]:
REPLACE_NO_SPACE = re.compile("[.@;:!\'?,\"-/()\[\]]")
REPLACE_WITH_SPACE = re.compile("(\\n\\n)|(\\n)")
MULTIPLE_SPACES = re.compile("\s+")
NUMBERS = re.compile("[0-9]+")
EMOJIS = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese characters
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"                 # dingbats
        u"\u3030"
                      "]+", flags = re.UNICODE)

In [12]:
# Preprocessing for sentiment analysis
def preprocess_text(text):
    text = [REPLACE_WITH_SPACE.sub(' ', line) for line in text]
    text = [REPLACE_NO_SPACE.sub('', line) for line in text]
    text = [MULTIPLE_SPACES.sub(' ', line) for line in text]
    text = [NUMBERS.sub('', line) for line in text]
    text = [EMOJIS.sub('', line) for line in text]
    text = [line.lower() for line in text]
    return text

def tokenization(text):
    
    return text



## Men

In [13]:
men.head(2)

Unnamed: 0,username,comment_score,comment_text,submission_title,submission_ups
0,shockedpikachu123,28,Waiting for the cvs candy to be 75% off on Feb...,Valentines Day Mega thread! Check in here for ...,100
1,secrethedgehog5,16,"My second valentines day alone, and i am feeli...",Valentines Day Mega thread! Check in here for ...,100


In [14]:
# men comments
men_com = men.drop(columns = ['username', 'comment_score', 'submission_title', 'submission_ups'])["comment_text"]
men_com


0        Waiting for the cvs candy to be 75% off on Feb...
1        My second valentines day alone, and i am feeli...
2        February 14 was my mom's birthday so I usually...
3        I'm sending all my friends Valentine's Day car...
4        I have reservations at my favorite steakhouse!...
                               ...                        
16849                      I'm hoping it's coming up soon.
16850    Hmmmm I would think that it is in stable times...
16851                                                   12
16852    20 it is advisable to take care of your health...
16854    If i feel that this is the Part of my Life whe...
Name: comment_text, Length: 15579, dtype: object

## Women

In [15]:
women.head(2)

Unnamed: 0,username,comment_score,comment_text,submission_title,submission_ups
0,shockedpikachu123,28,Waiting for the cvs candy to be 75% off on Feb...,Valentines Day Mega thread! Check in here for ...,100
1,secrethedgehog5,16,"My second valentines day alone, and i am feeli...",Valentines Day Mega thread! Check in here for ...,100


In [16]:
# women comments
women_com = women.drop(columns = ['username', 'comment_score', 'submission_title', 'submission_ups'])["comment_text"]
women_com = preprocess_text(women_com)
women_com

['waiting for the cvs candy to be  off on february  ',
 'my second valentines day alone and i am feeling a bit meh but i will be okay',
 'february  was my moms birthday so i usually spend the day really emotional and not wanting to celebrate well celebrate our anniversary at the end of february though',
 'im sending all my friends valentines day cards through snail mail i wanted them to feel special ',
 'i have reservations at my favorite steakhouse last year i wore a nice jean jacket and a sweater dress the server recognized me as a regular and gave me a flower they were handing out to all the women that night i felt really special',
 'i got my so flowers for valentines day last year and he was thrilled so im doing it again lol he always has fresh flowers in his home but said no one had ever bought him flowers before',
 'i have a new boyfriend mo and we are fully in the honeymoon phase a great phase to be in for valentines day he loves white chocolate so im getting him a box of all wh