In [1]:
!pip install pandas nltk tqdm spacy




[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [3]:
# NLTK RESOURCES
# Download the NLTK data packages (tokenizer, POS tagger, stop-word list).
for package in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(package)

# Left as the cell's final expression so its return value is still displayed.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\habee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\habee\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\habee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Load the sample data set; the path is relative to the current working directory.
df = pd.read_csv('DATA_SET/sample.csv')
# NOTE(review): head(-1) returns every row except the last one, so this shows
# nearly the whole frame. If a short preview was intended, confirm and use head().
df.head(-1)

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,upper_case_count
0,I think that animals should not be used in sci...,1,1241,241,0.194198,26,15
1,Focus On The ROAS\n\nDo you think you should b...,0,2180,435,0.199541,44,89
2,Taking online or watching video conferencing c...,1,1162,192,0.165232,17,9
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,1114,242,0.217235,39,63
4,Driverless cars would be a Wood idea. It would...,0,1025,214,0.208780,27,16
...,...,...,...,...,...,...,...
1994,Limiting car usage has numerous advantages tha...,1,3304,569,0.172215,67,44
1995,Should distant learning to be offered for stud...,0,2708,511,0.188700,52,19
1996,Clearly with all tie evidence that I AVK prove...,0,1660,339,0.204217,43,78
1997,"Throughout history, different types of explora...",0,2383,446,0.187159,51,40


In [15]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,upper_case_count
0,I think that animals should not be used in sci...,1,1241,241,0.194198,26,15
1,Focus On The ROAS\n\nDo you think you should b...,0,2180,435,0.199541,44,89
2,Taking online or watching video conferencing c...,1,1162,192,0.165232,17,9
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,1114,242,0.217235,39,63
4,Driverless cars would be a Wood idea. It would...,0,1025,214,0.20878,27,16
5,Humans are like penguins. Just as penguins hud...,0,2784,548,0.196839,64,37
6,Transportation is a large necessity in most co...,0,2743,528,0.19249,63,48
7,On my experiences I think It would be impossib...,0,1174,242,0.206133,18,22
8,What would you do if you have a problem? Any p...,0,1962,394,0.200815,50,44
9,Many authors write about how we should do cert...,0,1534,301,0.196219,43,43


In [17]:
# Basic NLP Features

# Characters per document.
df['char_count'] = df['text'].map(len)

# Tokens per document, as split by NLTK's word_tokenize.
df['word_count'] = df['text'].map(lambda doc: len(word_tokenize(doc)))

# Tokens per character.
df['word_density'] = df['word_count'].div(df['char_count'])

# punctuation count
def punctuation_count(text):
    """Return how many characters of ``text`` appear in ``string.punctuation``."""
    total = 0
    for symbol in text:
        if symbol in string.punctuation:
            total += 1
    return total

# One punctuation tally per document.
df['punctuation_count'] = df['text'].map(punctuation_count)

# Upper case count
def upper_case_count(text):
    """Return the number of characters in ``text`` for which ``isupper()`` is true."""
    return len([symbol for symbol in text if symbol.isupper()])

# One upper-case tally per document.
df['upper_case_count'] = df['text'].map(upper_case_count)

# parts of speech
def parts_of_speech(text):
    """Tally part-of-speech groups in ``text``.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``; each
    tag that belongs to one of the groups below adds one to that group's count.
    Tags outside every group are ignored.

    Returns:
        A ``pd.Series`` of five integer counts indexed by ``noun_count``,
        ``adv_count``, ``verb_count``, ``adj_count`` and ``pro_count``, in that order.
    """
    # Output label -> tags counted under that label.
    tag_groups = {
        'noun_count': ('NN', 'NNS', 'NNP', 'NNPS'),
        'adv_count': ('RB', 'RBR', 'RBS'),
        'verb_count': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        'adj_count': ('JJ', 'JJR', 'JJS'),
        'pro_count': ('PRP', 'PRP$', 'WP', 'WP$'),
    }

    # Dict insertion order fixes the order of the returned index.
    tally = {label: 0 for label in tag_groups}

    for _token, tag in pos_tag(word_tokenize(text)):
        for label, members in tag_groups.items():
            if tag in members:
                tally[label] += 1
                break  # the groups are disjoint, so at most one can match

    return pd.Series(list(tally.values()), index=list(tally.keys()))

# parts_of_speech returns a five-entry Series per document; apply() expands
# those into a frame whose columns are assigned to the five targets below.
df[[
    'noun_count',
    'adv_count',
    'verb_count',
    'adj_count',
    'pro_count',
]] = df['text'].apply(parts_of_speech)

In [18]:
# Preview the first 10 rows, now including the part-of-speech count columns.
df.head(10)

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,upper_case_count,noun_count,adv_count,verb_count,adj_count,pro_count
0,I think that animals should not be used in sci...,1,1241,241,0.194198,26,15,40,15,43,22,27
1,Focus On The ROAS\n\nDo you think you should b...,0,2180,435,0.199541,44,89,103,18,82,17,46
2,Taking online or watching video conferencing c...,1,1162,192,0.165232,17,9,60,7,23,19,4
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,1114,242,0.217235,39,63,46,16,37,15,22
4,Driverless cars would be a Wood idea. It would...,0,1025,214,0.20878,27,16,48,15,33,15,12
5,Humans are like penguins. Just as penguins hud...,0,2784,548,0.196839,64,37,123,30,86,52,24
6,Transportation is a large necessity in most co...,0,2743,528,0.19249,63,48,152,21,86,40,10
7,On my experiences I think It would be impossib...,0,1174,242,0.206133,18,22,49,9,39,19,32
8,What would you do if you have a problem? Any p...,0,1962,394,0.200815,50,44,90,24,78,22,50
9,Many authors write about how we should do cert...,0,1534,301,0.196219,43,43,70,17,52,23,20


In [19]:
# Write the frame to CSV; index=False leaves the row index out of the file.
df.to_csv('DATA_SET/pre_test.csv', index=False)