In [102]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
import itertools
import seaborn as sn
from textblob import TextBlob
%matplotlib inline
import nltk
from nltk.stem import *
import re 
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 


In [109]:
# word_tokenize needs the 'punkt' tokenizer models, and the stop-word filter in
# preprocessor() needs the 'stopwords' corpus; fetch both so the notebook also
# runs on a machine that has never downloaded them.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/swethapola/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [140]:
# Cache for the NLTK stop-word set, so the corpus is read once per kernel
# instead of once per document.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]


def preprocessor(data):
    """Normalize one document before it is handed to the TF-IDF vectorizer.

    Steps, in order:

    1. Lowercase the text (the NLTK stop words are lowercase, so this must
       happen before filtering).
    2. Delete every character that is not an ASCII letter, a digit, a space
       or a newline. Characters are deleted, not replaced by a space, so
       "well-known" becomes "wellknown" and "don't" becomes "dont".
    3. Tokenize with ``word_tokenize`` and drop tokens that are NLTK English
       stop words.
    4. Re-join the surviving tokens with single spaces.
    5. Cut any run of more than 20 word characters down to its first 20
       (mostly what is left of hyperlinks after step 2).

    Parameters
    ----------
    data : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned, space-separated text.
    """
    text = data.lower()

    # One pass removes hyphens together with all other punctuation/symbols.
    text = re.sub(r"[^A-Za-z0-9 \n]", "", text)

    # NOTE(review): punctuation is stripped before stop-word filtering, so a
    # contraction stop word such as "don't" can never match its stripped form
    # ("dont") and stays in the output -- confirm whether that is intended.
    stop_words = _english_stop_words()
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]

    # Turn the list of tokens back into a string.
    text = " ".join(kept_tokens)

    # Truncate over-long "words" to their first 20 characters.
    text = re.sub(r"(\w{20})\w+", r"\1", text)

    return text

In [145]:
def tvect(data, min_df=2, ngram_range=(2, 4)):
    """Fit a TF-IDF vectorizer on `data` and return the matrix and its vocabulary.

    Parameters
    ----------
    data : iterable of str
        The documents to vectorize.
    min_df : int or float, default 2
        Passed to ``TfidfVectorizer``; terms below this document frequency
        are dropped.
    ngram_range : tuple of (int, int), default (2, 4)
        Passed to ``TfidfVectorizer``; the default keeps 2- to 4-word n-grams.

    Returns
    -------
    tuple
        ``(fitted, feature_names)``: the result of ``fit_transform`` and a
        list of feature names, one per column of ``fitted``.
    """
    vect = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    fitted = vect.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when the installed version has it and fall back otherwise.
    if hasattr(vect, "get_feature_names_out"):
        feature_names = list(vect.get_feature_names_out())
    else:
        feature_names = list(vect.get_feature_names())

    return fitted, feature_names

def get_polarity(text):
    """Score the sentiment polarity of `text` with TextBlob.

    Wraps `text` in a ``TextBlob`` and returns the ``polarity`` field of its
    ``sentiment`` attribute.
    """
    return TextBlob(text).sentiment.polarity

In [146]:
def featurize(csv, path):
    """Load a CSV of posts and attach sentiment and TF-IDF n-gram features.

    Parameters
    ----------
    csv : str
        File name of the CSV. Its text column is expected to be named ``'0'``;
        an ``'Unnamed: 0'`` column (a saved index) is dropped if present.
    path : str
        Directory containing the file, with or without a trailing separator.

    Returns
    -------
    tuple
        ``(df, feature_names)``. ``df`` has the columns ``text``,
        ``polarity`` (from ``get_polarity``), ``preprocess_full_text`` (from
        ``preprocessor``) and one column per TF-IDF feature returned by
        ``tvect``; ``feature_names`` is the list of those feature columns.
    """
    # os.path.join copes with `path` lacking a trailing slash, which plain
    # string concatenation does not.
    df = pd.read_csv(os.path.join(path, csv))

    # The saved-index column is not always there; do not fail when it is absent.
    df = df.drop(columns="Unnamed: 0", errors="ignore")
    df = df.rename(columns={'0': 'text'})

    # Empty cells are read as NaN (a float), which the text functions below
    # cannot handle; treat them as empty documents.
    df['text'] = df['text'].fillna('').astype(str)

    # Add polarity column.
    df['polarity'] = df['text'].apply(get_polarity)

    # Add preprocessed column.
    df['preprocess_full_text'] = df['text'].apply(preprocessor)

    # One column per TF-IDF feature, row-aligned with df through its index.
    tfidf_matrix, feature_names = tvect(df['preprocess_full_text'])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=df.index)
    df = pd.concat([df, tfidf_df], axis=1)

    return df, feature_names


In [150]:
# Featurize the AskWomen posts; featurize() hands back (frame, feature names).
w = featurize('AskWomen_Data.csv', 'data/')
w_df, w_features = w

In [151]:
# Show the AskWomen frame returned by featurize(): text, polarity, the
# preprocessed text, then one column per TF-IDF n-gram feature.
w_df

Unnamed: 0,text,polarity,preprocess_full_text,10 hours,10 minutes,10 minutes without,10 minutes without rest,10 yawned,10 yawned walked,10 yawned walked slowly,...,youve stopped,youve stopped altogether,youve stopped altogether week,youve together,youve wasted,youve wasted time,youve wasted time always,zero woman,zero woman uses,zero woman uses condoms
0,My dad had a heart attack and spent over 7 min...,0.038258,dad heart attack spent 7 minutes without oxyge...,0.317523,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,I had dangerous open heart surgery due to mult...,-0.072619,dangerous open heart surgery due multiple cong...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"In June of 2020, my Dad collapsed in our yard ...",0.054932,june 2020 dad collapsed yard helping us mow la...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"after my twin aunts were born, my grandma beca...",0.145833,twin aunts born grandma became severely ill cl...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Not me but my mother is the most severe case I...,-0.192593,mother severe case think around 25 started reg...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170,Edit: also what if this hall pass was in respo...,0.000000,edit also hall pass response needing space,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1171,My best friend passed away and when I found ou...,0.169444,best friend passed away found threw kitchen cr...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1172,I was recently informed by a member of this co...,0.125000,recently informed member community women love ...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1173,"Hello, I could really use some help. I am a 17...",0.005058,hello could really use help 17 year old girl r...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
# Featurize the AskMen posts; featurize() hands back (frame, feature names).
m = featurize('AskMen_Data.csv', 'data/')
m_df, m_features = m

In [153]:
# Show the AskMen frame returned by featurize(): text, polarity, the
# preprocessed text, then one column per TF-IDF n-gram feature.
m_df

Unnamed: 0,text,polarity,preprocess_full_text,10 miles,10 minutes,10 months,10 seconds,10 years,10 years ago,100 time,...,youve done,youve got,youve made,youve made right,youve made right decision,youve meaning,youve meaning activity,youve meaning activity occupy,yr old,yrs old
0,"I don't fetishize scars, but so long as they'r...",0.100000,dont fetishize scars long theyre face wouldnt ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Doesn't matter. I had a big crush on this gir...,0.000000,doesnt matter big crush girl one chemistry cla...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2 years ago I briefly was seeing a girl with a...,-0.165000,2 years ago briefly seeing girl scar open hear...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Oh no. Buddy. Look, you’re trying and I thin...",-0.188889,oh buddy look youre trying think see youre try...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,I have pictures in the OR from all the surgeri...,0.166667,pictures surgeries except recent open heart su...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1608,My Dad and brother don't care as long as I'm t...,-0.035714,dad brother dont care long im one lugging arou...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1609,"I'm throwing it out, so I'm not worried about ...",0.046296,im throwing im worried breaking need fold fit ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1610,I had a terribly emotionally abusive mother an...,-0.054431,terribly emotionally abusive mother father aro...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1611,"Hi,\n\nI'm back in a relationship and the sex ...",0.123669,hi im back relationship sex great kinda short ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [156]:
# N-gram features present in both the AskWomen and the AskMen vocabularies.
intersection = set(w_features) & set(m_features)

len(intersection)


1776