In [1]:
import pandas as pd
import numpy as np
import regex as re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [17]:
# Install into the running kernel's environment (%pip magic) and pin the
# release so a re-run resolves the same package version.
%pip install vaderSentiment==3.3.2

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Read the 'processed_text' CSV into the working frame.
df = pd.read_csv('processed_text')

In [7]:
# List the column labels available in the loaded frame.
df.columns

Index(['Unnamed: 0', 'id', 'target', 'comment_text', 'severe_toxicity',
       'obscene', 'identity_attack', 'insult', 'threat', 'asian', 'atheist',
       'bisexual', 'black', 'buddhist', 'christian', 'female', 'heterosexual',
       'hindu', 'homosexual_gay_or_lesbian',
       'intellectual_or_learning_disability', 'jewish', 'latino', 'male',
       'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity',
       'other_religion', 'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count', 'clean_text'],
      dtype='object')

In [8]:
# Columns carried forward: raw text, label, cleaned text, then the
# per-group columns, in the order they should appear in the frame.
columns_to_keep = [
    'comment_text',
    'target',
    'clean_text',
    'asian',
    'atheist',
    'bisexual',
    'black',
    'buddhist',
    'christian',
    'female',
    'heterosexual',
    'hindu',
    'homosexual_gay_or_lesbian',
    'intellectual_or_learning_disability',
    'jewish',
    'latino',
    'male',
    'muslim',
    'other_disability',
    'other_gender',
    'other_race_or_ethnicity',
    'other_religion',
    'other_sexual_orientation',
    'physical_disability',
    'psychiatric_or_mental_illness',
    'transgender',
    'white',
]
df = df[columns_to_keep]

In [9]:
# Drop rows with no cleaned text. The explicit .copy() gives the filtered
# frame its own data, so the column assignments that follow act on an
# independent frame rather than on a slice of the unfiltered one.
df = df[df['clean_text'].notna()].copy()

# Binarise the label: scores below 0.5 become 0, everything else becomes 1.
# Expressed as a negated "< 0.5" so that a missing score (NaN) maps to 1.
df['target'] = (~df['target'].lt(0.5)).astype('int64')

In [10]:
# Expand the acronym so each component word is visible to later text features.
# The pattern is a plain literal, so use the vectorised string replace with
# regex matching switched off instead of one regex call per row.
df['clean_text'] = df['clean_text'].str.replace(
    'lgbtq', 'lesbian gay bisexual transgender queer', regex=False
)

In [11]:
# Character count of each raw comment.
df['length'] = df['comment_text'].map(len)

In [12]:
# Number of whitespace-separated tokens in each raw comment.
df['no_of_words'] = df['comment_text'].map(str.split).map(len)

In [13]:
def _mean_word_length(text):
    """Return the mean character length of the whitespace-separated words in ``text``.

    Returns NaN when ``text`` contains no words (empty or whitespace only),
    without asking numpy to average an empty list.
    """
    word_lengths = [len(word) for word in text.split()]
    if not word_lengths:
        return np.nan
    return np.mean(word_lengths)


df['avg_word_length'] = df['comment_text'].apply(_mean_word_length)

In [14]:
# Number of ASCII upper-case letters in each raw comment.
df['caps'] = df['comment_text'].str.count(r'[A-Z]')

In [15]:
# Number of exclamation marks in each raw comment.
df['excl'] = df['comment_text'].str.count(r'\!')

In [16]:
# Number of period-delimited segments in each comment (periods + 1).
# Note: this counts segments, not true sentences -- a trailing "." adds an
# empty final segment, and text without any period still counts as 1.
df['sentence_count'] = df['comment_text'].map(str).map(
    lambda text: text.count(".") + 1
)

In [17]:
def _mean_segment_length(text):
    """Return the mean character length of the period-delimited segments of ``text``.

    Empty segments (e.g. the one after a trailing period) are included in the mean.
    """
    segments = text.split('.')
    return np.mean([len(segment) for segment in segments])


df['avg_sentence_length'] = df['comment_text'].map(_mean_segment_length)

In [18]:
# One shared VADER analyzer instance is reused for every comment.
analyzer = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    scores = analyzer.polarity_scores(text)
    return scores.get('compound')


df['sentiment'] = df['comment_text'].map(_compound_score)

In [19]:
def get_subjectivity(text):
    """Return TextBlob's subjectivity score for ``text``, or 0.0 on failure.

    Parameters
    ----------
    text : str or bytes
        The text to score. ``bytes`` input is decoded as UTF-8 first.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.subjectivity``, or ``0.0`` if decoding or
        scoring raises (best-effort: this function does not propagate
        ordinary exceptions).
    """
    try:
        # Python 3 has no ``unicode`` builtin; only real byte strings need decoding.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        return TextBlob(text).sentiment.subjectivity
    except Exception:
        # Deliberate best-effort fallback. Catching Exception (not a bare
        # except) lets KeyboardInterrupt / SystemExit still propagate.
        return 0.0

In [20]:
def _subjectivity_of(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity


df['subjectivity'] = df['comment_text'].map(_subjectivity_of)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [21]:
# POS tag codes grouped into the coarse categories counted per comment.
pos_dic = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}


def _pos_counts(x):
    """Tag ``x`` once and return ``{category: count}`` for every key of ``pos_dic``.

    Non-letter characters are replaced by spaces and whitespace runs are
    collapsed before tagging. If tagging fails, every count is 0.
    """
    x = re.sub("[^a-zA-Z]", " ", x)
    x = " ".join(x.split())
    counts = dict.fromkeys(pos_dic, 0)
    try:
        tags = TextBlob(x).tags
    except Exception:
        # Best-effort: a comment that cannot be tagged contributes zero counts.
        return counts
    for _word, tag in tags:
        for category, members in pos_dic.items():
            if tag in members:
                counts[category] += 1
    return counts


def pos_check(x, flag):
    """Return how many tokens of ``x`` carry a tag listed under ``pos_dic[flag]``.

    Parameters
    ----------
    x : str
        Raw text to analyse.
    flag : str
        A key of ``pos_dic`` ('noun', 'pron', 'verb', 'adj' or 'adv').

    Returns
    -------
    int
        The number of matching tokens; 0 if tagging fails or ``flag`` is not
        a key of ``pos_dic``.
    """
    return _pos_counts(x).get(flag, 0)


# Tagging is the expensive step, so tag each comment a single time and fan
# the per-category counts out to the five columns (instead of re-tagging the
# same text once per category).
_pos_categories = list(pos_dic)
_pos_rows = []
for _text in df['comment_text']:
    _row_counts = _pos_counts(_text)
    _pos_rows.append([_row_counts[_category] for _category in _pos_categories])
_pos_table = pd.DataFrame(_pos_rows, columns=_pos_categories)

# Assign positionally (.values) in a fixed column order: noun, verb, adj, adv, pron.
for _category in ('noun', 'verb', 'adj', 'adv', 'pron'):
    df[_category + '_count'] = _pos_table[_category].values

In [22]:
# Persist the feature table; the row index is written as the first (unnamed) column.
df.to_csv('feature_extracted', index=True)

In [5]:
# Reload the saved feature table. No column is used as the index (the
# read_csv default), so every column in the file comes back as a data column.
df = pd.read_csv('feature_extracted')