# Load libraries

In [1]:
import pandas as pd
import numpy  as np
import string 
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Data import

In [2]:
# Never truncate long text columns when a frame is displayed, so full tweets stay readable.
pd.set_option('max_colwidth', None)

# Load only the first 10,000 rows of the dataset.
df = pd.read_csv("~/Downloads/banData/MeTooHate.csv", nrows=10000)

# Distribution of the original `category` column.
df["category"].value_counts()

0    8582
1    1418
Name: category, dtype: int64

# Sample data

## Annotation rule

- 0: neutral  content or positive sentiment
- 1: negative sentiment or negative fact but no abusive wording
- 2: Abusive/ hate language

In [3]:
# Build a small hand-labelled sample (`df_mini`) from the loaded tweets.

# Rows whose original `category` is 1; `.iloc[0:101]` keeps at most 101 of them
# and `.head(100)` trims that to the first 100.
df_mini = df[df["category"]==1].iloc[0:101].head(100)
# Replace the index with 0..n-1; the previous index is kept as a new column.
df_mini.reset_index(inplace =True)

# One label per row of `df_mini` (100 entries), using the 0/1/2 scheme of the
# annotation rule above -- presumably assigned by hand; the assignment below
# requires the list length to match the number of rows.
label=[2, 2, 1,  1, 1, 1,  1, 2,  1, 0,  2,   1,  1,  1,  1,
  1, 1, 1,0,2,1,    1,    1,    1,    1,   2,     1,    2,    0,    1,
  0,1,2, 0,1,2, 0,1,1,0, 
  1,0,1,0,1,1,1,2,1,2,
  2,2,2,0,1, 0,2,1,1,1,
  1,1,1,0,1,2,1,0,1,0,
  2,  1,1,   2, 1,2,   1, 1,0,1,1 ,
  1,   2,   2,   0,   2,   0,   1,   1,   1,   1,   1,   1,   0,
   1,2,1, 0, 1,1]
df_mini["label"] = label

# First 39 rows whose original `category` is 0; all of them receive label 0.
tmp = df[df["category"]==0].iloc[0:39]
tmp.reset_index(inplace =True)
tmp["label"] = 0
# Shift the index by 100 so it continues after the 0..99 index of `df_mini`.
tmp.index = pd.Series(np.array(tmp.index)+100)

# Stack both samples, drop the metadata columns, and renumber the rows.
df_mini = pd.concat([df_mini, tmp],  axis=0 )
df_mini.drop(columns = ["status_id",'created_at', 'location'], inplace = True)
# This second reset keeps the previous index as yet another column.
df_mini.reset_index(inplace  = True)

# NOTE(review): the index columns kept by the reset_index calls hold a distinct
# value per row, so this full-row comparison looks unable to drop anything --
# confirm whether de-duplicating on the tweet text was intended.
df_mini.drop_duplicates(inplace = True)
# Number of sampled tweets per label.
df_mini.label.value_counts()

1    58
0    58
2    23
Name: label, dtype: int64

# Literature & GitHub resources

- label propagation:  https://towardsdatascience.com/semi-supervised-learning-how-to-assign-labels-with-label-propagation-algorithm-9f1683f4d0eb
- Lexicon of abusive words: https://github.com/uds-lsv/lexicon-of-abusive-words     
- Deep learning and clustering

# Alternative modeling plans

 - Manually relabel 2%-5% of tweets in category 1 (hate speech)
 - Apply label propagation algo to label the remaining tweets
 - All tweets labeled,  proceed with classification/DL model

# Data preprocessing



## Data cleaning

In [4]:
import nltk

# Fetch the NLTK resources requested by this notebook: the stop-word list,
# the punkt tokenizer data and the WordNet data.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Kept as the last expression so the cell still displays the download status.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/mimi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mimi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mimi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mimi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Walk through the cleaning steps on a single example tweet.
# NOTE(review): `hate_text` was never defined in this notebook, so this cell
# raised a NameError on a fresh kernel. It is rebuilt here as the text of the
# category-1 tweets -- an assumption; confirm it matches the original intent.
hate_text = df.loc[df["category"] == 1, "text"]
text = hate_text.iloc[59]

# lowercase
text = text.lower()

# remove digits (iterating over a string yields characters, not words)
text = ''.join(char for char in text if not char.isdigit())

# remove punctuation in a single pass
text = text.translate(str.maketrans('', '', string.punctuation))

# English stop words, consumed by the tokenizing cell that follows
stop_words = set(stopwords.words('english'))


In [None]:
# Tokenize: split the cleaned string into a list of word tokens.
word_tokens = word_tokenize(text)

# Discard stop words; from here on `text` is a list of tokens, not a string.
text = list(filter(lambda token: token not in stop_words, word_tokens))
text

## Lemmatizing

In [None]:
# Reduce every token to its root form. Lemmatizing is used here; stemming with
# PorterStemmer (`stemmer.stem(word)`) would be the alternative.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, text))
text = lemmatized
lemmatized


## Vectorizing

In [None]:
# Turn text into a numerical representation.

## Bag of words: count the occurrences of each word (matrix with one column per word).
# NOTE(review): if `text` is still the token list produced by the lemmatizing
# cell, every token is treated as its own document here -- confirm intent.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text)
X.toarray()   
# To inspect the vocabulary / label the columns (recent scikit-learn versions
# expose this as get_feature_names_out()):
#vectorizer.get_feature_names_out()
#pd.DataFrame(X.toarray(),columns = vectorizer.get_feature_names_out())

In [None]:
## TF-IDF: Term Frequency - Inverse Document Frequency --> weights a word by how
## frequent it is in a document, discounted by how many documents contain it.
# pros: - frequencies are robust to document length
#       - measures the importance of a word
# cons: does not capture context

# Small toy corpus that can be used instead of `text` to see the effect:
# texts = ['i love football',
#          'football is a game i love',
#         'football football football']

# Fit the vectorizer on `text` and replace `X` with the TF-IDF matrix.
tf_idf_vectorizer = TfidfVectorizer()
X = tf_idf_vectorizer.fit_transform(text)
# To inspect the result (recent scikit-learn versions expose the column
# labels as get_feature_names_out()):
#X.toarray()
#pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names_out())

In [None]:
# Key parameters of the BoW and TF-IDF vectorizers:
# max_df = exclude "corpus-specific stop words", i.e. the most frequent words
# min_df = exclude words that are very infrequent in the dataset
# max_features = number of features to keep when vectorizing; useful to reduce the dimension of the data

In [None]:
### N-grams: instead of considering individual words, an N-gram representation
# considers word sequences, which captures some context.
# N is the number of consecutive words treated as a single feature.
texts = [
    'i do not love football',
    'i love football not basketball',
]

# ngram_range=(2, 2): use bigrams only.
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
X = tf_idf_vectorizer.fit_transform(texts)

# One row per sentence, one column per bigram.
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement.
pd.DataFrame(X.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

# Test label propagation on minimal data

In [None]:
! pip install plotly

In [None]:
# Visualization
import plotly.express as px # for data visualization
import plotly.graph_objects as go # for data visualization
import matplotlib.pyplot as plt # for displaying confusion matrix

# Scikit-learn
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay # for showing confusion matrix
from sklearn.preprocessing import MinMaxScaler # for feature scaling
from sklearn.semi_supervised import LabelPropagation # for assigning labels to unlabeled data


## Data preprocessing

In [None]:
# Full preprocessing pipeline for one text: clean -> tokenize -> drop stop
# words -> lemmatize -> TF-IDF.
#
# The previous version of this cell called `.lower()` on `text` although the
# lemmatizing cell leaves `text` as a list of tokens, and it lemmatized single
# characters because the tokenizing step was missing.

# Accept either a raw string or a token list left over from an earlier cell.
if not isinstance(text, str):
    text = ' '.join(text)

# lowercase
text = text.lower()

# remove digits (iterating over a string yields characters)
text = ''.join(char for char in text if not char.isdigit())

# remove punctuation in a single pass
text = text.translate(str.maketrans('', '', string.punctuation))

# tokenize, then drop English stop words
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(text)
text = [token for token in word_tokens if token not in stop_words]

# reduce every token to its lemma
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in text]
text = lemmatized

# TF-IDF representation; as in the cells above, each token is one document.
tf_idf_vectorizer = TfidfVectorizer()
X = tf_idf_vectorizer.fit_transform(text)

# External abusive words list


In [None]:
# Load the external lexicon (tab-separated) and keep its first two columns.
# NOTE(review): read_csv treats the first line as a header by default --
# confirm the file really starts with a header row.
abusive_words_df = pd.read_csv('../raw_data/abusive_words_list.txt', delimiter="\t").iloc[:, 0:2]
abusive_words_df.columns = ["word", "neg_polarity"]

# Keep only the part of each entry before the first underscore
# (presumably strips a suffix such as a part-of-speech tag -- confirm).
word2 = pd.Series([str(entry).split("_")[0] for entry in abusive_words_df.word])
abusive_words_df["word2"] = word2

# Unique words, one per row.
word3 = pd.DataFrame(abusive_words_df["word2"].unique())

# Write the word list with a "words" header line.
# - header must be a list to rename the column; a plain string is only treated
#   as a truthy flag, so the default column label was written instead.
# - mode='w' makes the cell idempotent; mode='a' appended a duplicate list
#   (and header) on every re-run.
word3.to_csv('../project_BAN/data/abusive_words.txt', header=["words"], index=False, sep=' ', mode='w')


# Useful notes

Emoji heart in a negative comment:
Because is true and powerful (thanks, ❤️), I'm unearthing so much of my past. I just remembered that time a platonic male friend came to my house with a fish bat, repeatedly brandishing it in a threatening manner, and how calm I had to remain to get him ou

## Some features are:

- Vocabulary Richness
- Number of words per tweet
- punctuation/Character ratio
- emoji/Character ratio
- Contains abusive words
- Contains Words in Capital letters

In [None]:
def vocab_richness(text, tokenizer=None):
    """Return the vocabulary richness of `text`: unique tokens / total tokens.

    Parameters
    ----------
    text : str
        The text to score.
    tokenizer : callable, optional
        Function mapping a string to a list of tokens. When omitted, NLTK's
        `word_tokenize` is used, which is the original behaviour.

    Returns
    -------
    float
        Ratio of distinct tokens to all tokens, or 0.0 when the text yields
        no tokens (previously this case raised ZeroDivisionError).
    """
    if tokenizer is None:
        tokenizer = word_tokenize
    tokens = tokenizer(text)
    if not tokens:
        # Empty or whitespace-only text: avoid dividing by zero.
        return 0.0
    return len(set(tokens)) / len(tokens)

# Score every tweet. The result gets its own name: rebinding `vocab_richness`
# to the resulting Series would shadow the function and make this cell fail
# when it is run a second time.
vocab_richness_scores = df_mini.text.apply(vocab_richness)

vocab_richness_scores

## Machine learning models
- Naive Bayes for classification
- LatentDirichletAllocation for clustering

## Deep learning model

https://kitt.lewagon.com/camps/773/lectures/content/06-DL_05-Natural-Language-Processing.html
    
X.shape = (n_sentences, max_sentence_length, embedding_dim)

steps:
    - tokenize text: tf.keras.preprocessing.text.Tokenizer
    - embedding layer (creating vector representation of each word): https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding
                - custom embedding: can be done but is computationally intensive
                - independent embedding with Word2vec (Gensim) --- transfer learning
                
Powerful embedding that is very fast and easy to train!

✅ you give it a list of sentences
✅ it automatically learns a representation - an embedding - for each word it was trained on!