In [1]:
import sys
# Put the project's scripts folder at the front of the module search path
# so the local `text_utils` import in the next cell can be resolved.
# The path is relative, so it is interpreted against the kernel's working directory.
SCRIPTS_DIR = '../../scripts/'
sys.path.insert(0, SCRIPTS_DIR)

In [2]:
# import required packages
import pandas as pd

# preprocess
from sklearn.feature_extraction.text import TfidfVectorizer

# local scripts
from text_utils import preprocess_corpus

In [3]:
# Read the training CSV into `df`; later cells read the tweet text and the
# class label from this frame.
TRAIN_CSV = '../../data/train_data.csv'
df = pd.read_csv(TRAIN_CSV)

# Show the (rows, columns) tuple first ...
display(df.shape)

# ... then leave a five-row preview as the cell's rich output.
df.head(n=5)

(28614, 2)

Unnamed: 0,tweet_text,cyberbullying_type
0,i hate ppl from high school y’all used to bull...,age
1,Kat and Andre are such assholes OMG #mkr,not_cyberbullying
2,"if she is new,she will not have access to go t...",age
3,Fuck David duke racist who thinks America belo...,ethnicity
4,I May not say it a lot but I hate apologetic A...,other_cyberbullying


In [4]:
# Clean the raw tweets with the project helper imported from text_utils.
# Per the original notes this removes special characters and stopwords and
# lemmatizes the text; the helper's implementation is not shown in this notebook.
raw_tweets = df['tweet_text']
clean_tweets = preprocess_corpus(raw_tweets)

# Preview the first few cleaned documents.
clean_tweets.head()

0    hate ppl high school used bully hot omg love m...
1                            kat andre asshole omg mkr
2    new access trading cause need high level opini...
3    fuck david duke racist think america belong du...
4    may say lot hate apologetic army hope choke ev...
Name: tweet_text, dtype: object

In [5]:
# Pipeline 1: TF-IDF over the full vocabulary, using the library defaults.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(clean_tweets)

# Pipeline 2: same corpus, but with a document-frequency floor. With an integer
# min_df, scikit-learn ignores terms that occur in fewer than that many documents.
MIN_DOC_FREQ = 30
vectorizer_df_filter = TfidfVectorizer(min_df=MIN_DOC_FREQ)
vectors_df_filter = vectorizer_df_filter.fit_transform(clean_tweets)

In [6]:
def _tfidf_frame(matrix, fitted_vectorizer):
    """Turn a TF-IDF matrix into a dense DataFrame with the class label appended.

    Parameters
    ----------
    matrix
        Result of ``fit_transform`` on the cleaned tweets; it is densified
        with ``.toarray()``.
    fitted_vectorizer
        The vectorizer that produced ``matrix``; its
        ``get_feature_names_out()`` supplies the column names.

    Returns
    -------
    pandas.DataFrame
        One column per feature name plus a trailing 'cyberbullying_type'
        column copied from the notebook-level ``df`` (pandas aligns that
        assignment on the index).
    """
    frame = pd.DataFrame(
        matrix.toarray(),
        columns=fitted_vectorizer.get_feature_names_out(),
    )
    frame['cyberbullying_type'] = df['cyberbullying_type']
    return frame


# NOTE(review): .toarray() materialises the whole matrix. At the shape shown
# below for the unfiltered frame (28614 x 36282), and assuming TfidfVectorizer's
# default float64 dtype, that is roughly 8 GB of RAM - consider
# pd.DataFrame.sparse.from_spmatrix if downstream code tolerates sparse columns.
df_tfidf = _tfidf_frame(vectors, vectorizer)
df_tfidf_df_filter = _tfidf_frame(vectors_df_filter, vectorizer_df_filter)

# Unfiltered TF-IDF frame: shape, then the first five rows.
display(df_tfidf.shape, df_tfidf.head())

# min_df-filtered TF-IDF frame: shape, then the first five rows.
display(df_tfidf_df_filter.shape, df_tfidf_df_filter.head())

(28614, 36282)

Unnamed: 0,aaa,aaaa,aaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaah,aaaaaaaaaah,aaaaaaaaaajajajajajajajahahahajahaja,aaaaah,aaaaargh,aaaag,aaaah,...,zvakaoma,zvlahos,zyampii,zyeth,zyme,zynga,zython,zzoegrimm,zzz,cyberbullying_type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,age
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,not_cyberbullying
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,age
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ethnicity
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,other_cyberbullying


(28614, 1581)

Unnamed: 0,able,abortion,absolute,absolutely,abt,abuse,abusive,accept,acceptable,accepted,...,yesterday,yesyouresexist,yet,young,younger,youre,youtube,yup,zero,cyberbullying_type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,age
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,not_cyberbullying
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,age
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ethnicity
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,other_cyberbullying
