Installing libraries

In [1]:
import stanza
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


Importing bad words vocab and dataset

In [3]:
bad_words=[]
with open('./data/bad-words.csv') as f:
    for line in f.readlines():
        bad_words.append(line.split('\n')[0])

bad_words_set = set(bad_words)
dataset = pd.read_csv('news_comments.csv')

Importing Stanza POS tagger

In [9]:
pos_tagger = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos',use_gpu=True)

2023-03-15 13:31:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 200kB [00:00, 66.7MB/s]                    
2023-03-15 13:31:59 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2023-03-15 13:31:59 INFO: Using device: cuda
2023-03-15 13:31:59 INFO: Loading: tokenize
2023-03-15 13:31:59 INFO: Loading: pos
2023-03-15 13:31:59 INFO: Done loading processors!


POS Tags for our dataset

In [34]:
pos_tags_dataset =[]
for sentence in dataset.iloc[:,1].values.tolist():
    doc = pos_tagger(sentence)
    pos_tags_dataset.append([word.xpos for sent in doc.sentences for word in sent.words])

Writing the tags to a txt file (Simply)

In [37]:
with open('pos_tags_dataset.txt','w') as f:
    f.writelines([str(i) for i in pos_tags_dataset])

In [5]:
pos_tags_dataset=[]
with open('pos_tags_dataset.txt','r') as f:
    pos_tags_dataset.append(f.readlines())

Labelling the dataset

In [7]:
labels= []
word_indices=[]
for i,sentence in enumerate(dataset['comments']):
    #capture bw indices for each sentence
    flag = False
    for j,word in enumerate(sentence.split()):
        if word.lower() in bad_words_set:
            flag = True
            # pos_tags_dataset[i][j] = 'BW'
            continue
    labels.append(1) if flag else labels.append(0)

Percentage of bad sentences

In [8]:
np.count_nonzero(labels)

14439

In [9]:
np.count_nonzero(labels)/len(labels)

0.27926812757480224

Separating good and bad sentences

In [14]:
dataset['labels'] = labels

In [18]:
good_sentences = dataset[dataset['labels']==0]
bad_sentences = dataset[dataset['labels']==1]

In [19]:
len(good_sentences)

37264

In [20]:
len(bad_sentences)

14439

In [21]:
sample_bad = bad_sentences.sample(10)

In [23]:
sample_bad

Unnamed: 0.1,Unnamed: 0,comments,labels
33865,33865,Basically what I'm saying is that if they clai...,1
15899,15899,The Ferenghi do have a general reservation aga...,1
24288,24288,PPP Fraud helped business owners buy all their...,1
50279,50279,These are a great alternative to abandoning yo...,1
16376,16376,No vinyl chloride or pre-product has been dete...,1
33306,33306,"What the fuck, you are regularly finding dead ...",1
24605,24605,"*If* he's being sarcastic, fascists managed to...",1
44891,44891,It'd be terrifying as fuck when you awake and ...,1
12769,12769,> Gaetz’s father is famous for fixing things f...,1
46345,46345,Union Carbide killed over 3700 people and inju...,1


Lucene

In [None]:
# import lucene
# from java.io import StringReader
# from org.apache.lucene.analysis.standard import StandardAnalyzer
# from org.apache.lucene.index import DirectoryReader
# from org.apache.lucene.search.similarities import ClassicSimilarity
# from org.apache.lucene.search import IndexSearcher
# from org.apache.lucene.queryparser.classic import QueryParser
# from org.apache.lucene.store import SimpleFSDirectory
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Initialize Lucene
# lucene.initVM()

# # Set up the analyzer and similarity algorithm
# analyzer = StandardAnalyzer()
# similarity = ClassicSimilarity()

# # Set up the index
# index_dir = SimpleFSDirectory(File("index"))
# searcher = IndexSearcher(DirectoryReader.open(index_dir))
# searcher.setSimilarity(similarity)

# # Define the query and tags
# query = "python programming"
# tags = ["python", "programming"]

# # Tokenize and vectorize the tags using TF-IDF
# tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
# tfidf_matrix = tfidf.fit_transform(tags)

# # Search the index for similar sentences
# query_parser = QueryParser("content", analyzer)
# query = query_parser.parse(query)
# top_docs = searcher.search(query, 10)
# for score_doc in top_docs.scoreDocs:
#     doc = searcher.doc(score_doc.doc)
#     sentence = doc.get("content")
#     tfidf_score = cosine_similarity(tfidf_matrix, tfidf.transform([sentence]))[0][0]
#     if tfidf_score > 0.5:
#         print(f"Similar sentence found: {sentence}")

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the sentences
sentences = [
    # "The quick brown cat jumps over the lazy dog"
]

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Compute the tf-idf matrix
tfidf_matrix = vectorizer.fit_transform(sentences)

# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Print the similarity matrix
print(cosine_sim_matrix)


[[1.         0.25861529 0.25861529]
 [0.25861529 1.         0.25861529]
 [0.25861529 0.25861529 1.        ]]
