In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from langdetect import detect, DetectorFactory
from spellchecker import SpellChecker
from tqdm import tqdm

tqdm.pandas()
import numpy as np
from matplotlib.ticker import MaxNLocator
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Fetch the NLTK stop-word corpus so it can be read below.
nltk.download("stopwords")

# Keep the English stop words in a set for fast membership tests.
english_stopword_list = stopwords.words("english")
stop_words = set(english_stopword_list)

In [None]:
# Load the CSV into the frame that the later cells extend with metric columns.
TRAIN_CSV_PATH = "data/train.csv"
data = pd.read_csv(TRAIN_CSV_PATH)

# Class imbalance

In [None]:
# Count rows per target label and plot them to expose any class imbalance.
counts = data["target"].value_counts()

fig, ax = plt.subplots()
sns.barplot(x=counts.index, y=counts.values, ax=ax)
# Label the figure so it can be read without the surrounding cells.
ax.set(title="Class distribution", xlabel="target", ylabel="Number of tweets")
plt.show()

# Get gold-standard reference data

In [None]:
url = "https://en.wikipedia.org/wiki/Natural_disaster"

# Bound the wait so an unresponsive server cannot hang the notebook.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error: later cells require `article_text`, and silently
# skipping the parse would only surface as a NameError further down.
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")

# The reference text is the newline-joined text of every <p> element on the page.
paragraphs = soup.find_all("p")
article_text = "\n".join(para.get_text() for para in paragraphs)

# Examine language of tweets

In [None]:
# Fix langdetect's seed so repeated runs assign the same language to each text.
DetectorFactory.seed = 0

# `detect` takes a single string, so it can be applied without a lambda wrapper.
data["language"] = data["text"].apply(detect)

counts = data["language"].value_counts()

fig, ax = plt.subplots()
sns.barplot(x=counts.index, y=counts.values, ax=ax)
# Label the figure so it can be read without the surrounding cells.
ax.set(title="Detected language of tweets", xlabel="Language code", ylabel="Number of tweets")
plt.show()

In [None]:
# Show the rows whose detected language is anything other than English.
data[data["language"] != "en"]

It looks like the "non-English" texts are simply misclassified by the language detector. Most likely all tweets are in English.

# Text quality metrics

## Spelling mistakes

In [None]:
# One shared checker for all spelling metrics below. English is the library's
# default language; it is spelled out here for the reader.
spell = SpellChecker(language="en")

### Wiki article

In [None]:
# Split once and reuse the tokens for both the numerator and the denominator.
article_words = article_text.split()
num_words = len(article_words)

# `spell.unknown` returns a set, so its length only counts *distinct* unknown
# words. Count every occurrence instead, so the ratio is misspelled tokens over
# total tokens. The checker is case-insensitive by default and reports
# lower-cased words, hence the `.lower()` before the lookup.
unknown_words = spell.unknown(article_words)
num_mispelled = sum(1 for word in article_words if word.lower() in unknown_words)

# Guard against an empty article, which would otherwise divide by zero.
mispelled_ratio = num_mispelled / num_words if num_words else 0.0
print(f"Mispelled ratio for article: {mispelled_ratio}")

### Disaster dataset

In [None]:
def _count_mispelled_tokens(text):
    """Return how many whitespace-separated tokens of ``text`` the spell checker does not know.

    ``spell.unknown`` returns a set of distinct words, so taking its length would
    count a repeated unknown word only once while the word count includes every
    token. Counting occurrences keeps numerator and denominator in the same unit.
    """
    words = text.split()
    unknown_words = spell.unknown(words)
    # The checker is case-insensitive by default and reports lower-cased words.
    return sum(1 for word in words if word.lower() in unknown_words)


data["num_words"] = data["text"].apply(lambda text: len(text.split()))
data["num_mispelled"] = data["text"].apply(_count_mispelled_tokens)

In [None]:
# Dataset-wide ratio: all unknown-word counts over all word counts.
data["num_mispelled"].sum() / data["num_words"].sum()

In [None]:
data["mispelled_ratio"] = data["num_mispelled"] / data["num_words"]

# Ten equal-width bins across [0, 1]; linspace avoids the floating-point drift
# that a 0.1 step accumulates with arange.
bin_edges = np.linspace(0, 1, 11)

# include_lowest=True keeps ratios of exactly 0 in the first bin. With the
# default right-closed intervals they fall outside (0, 0.1] and every text
# without a misspelling would be dropped from the plot.
data["ratio_bins"] = pd.cut(data["mispelled_ratio"], bins=bin_edges, include_lowest=True)

counts = data["ratio_bins"].value_counts().sort_index()

fig, ax = plt.subplots()
sns.barplot(x=counts.index, y=counts.values, ax=ax)
# Label the figure so it can be read without the surrounding cells.
ax.set(
    title="Distribution of misspelled-word ratio",
    xlabel="Misspelled ratio (binned)",
    ylabel="Number of tweets",
)
ax.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Display the per-text misspelling ratio column.
data.loc[:, "mispelled_ratio"]

In [None]:
# The 30 texts with the highest misspelling ratio, shown next to their ratio.
(
    data
    .sort_values(by="mispelled_ratio", ascending=False)
    [["mispelled_ratio", "text"]]
    .head(30)
)

## Grammar mistakes

TODO

# Information content

## Ratio of stopwords

In [None]:
def compute_stopwords_ratio(text, stopword_set=None):
    """Return the fraction of whitespace-separated tokens in ``text`` that are stop words.

    Args:
        text: String to analyse.
        stopword_set: Optional collection of lower-case stop words to test
            against. Defaults to the module-level ``stop_words``.

    Returns:
        A float in [0, 1]; ``0.0`` when ``text`` contains no tokens.
    """
    if stopword_set is None:
        stopword_set = stop_words

    words = text.split()
    # Guard against empty or whitespace-only text, which would otherwise raise
    # ZeroDivisionError.
    if not words:
        return 0.0

    # Lower-case each token before the lookup so that capitalised stop words
    # (e.g. "The" at the start of a sentence) match lower-case entries.
    stopwords_count = sum(1 for word in words if word.lower() in stopword_set)
    return stopwords_count / len(words)

In [None]:
# Stop-word share of the reference article, for comparison with the tweets.
wiki_stopwords_ratio = compute_stopwords_ratio(article_text)
print(f"Wiki stopwords ratio: {wiki_stopwords_ratio}")

In [None]:
# Per-text stop-word share, then its mean over the dataset.
data["stopwords_ratio"] = data["text"].map(compute_stopwords_ratio)
mean_stopwords_ratio = data["stopwords_ratio"].mean()
print(f"Average stopword ratio: {mean_stopwords_ratio}")

In [None]:
# Ten equal-width bins across [0, 1]; linspace avoids the floating-point drift
# that a 0.1 step accumulates with arange.
bin_edges = np.linspace(0, 1, 11)

# include_lowest=True keeps ratios of exactly 0 in the first bin. With the
# default right-closed intervals they fall outside (0, 0.1] and every text
# without a single stop word would be dropped from the plot.
data["ratio_bins"] = pd.cut(data["stopwords_ratio"], bins=bin_edges, include_lowest=True)

counts = data["ratio_bins"].value_counts().sort_index()

fig, ax = plt.subplots()
sns.barplot(x=counts.index, y=counts.values, ax=ax)
# Label the figure so it can be read without the surrounding cells.
ax.set(
    title="Distribution of stop-word ratio",
    xlabel="Stop-word ratio (binned)",
    ylabel="Number of tweets",
)
ax.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# The 30 texts with the lowest stop-word ratio, shown next to their ratio.
(
    data
    .sort_values(by="stopwords_ratio", ascending=True)
    [["stopwords_ratio", "text"]]
    .head(30)
)

## Average tf-idf scores

In [None]:
vectorizer = TfidfVectorizer()

# Fit on every tweet plus the article as ONE extra document. Wrapping the article
# in a list matters: `list(article_text)` would explode the string into single
# characters and feed each character to the vectorizer as its own document.
vectorizer.fit(list(data["text"]) + [article_text])

tfidf_matrix = vectorizer.transform(data["text"])

# Mean tf-idf weight per text, taken over the whole vocabulary (zeros included).
# ravel() always yields a 1-D array, even if the frame holds a single row.
avg_tfidf_scores = np.asarray(tfidf_matrix.mean(axis=1)).ravel()

# Append it to the dataframe
data["avg_tfidf"] = avg_tfidf_scores

# Calculate and print the average tf-idf for the entire dataset
dataset_avg_tfidf = np.mean(avg_tfidf_scores)
print(f"Average tf-idf score for the dataset: {dataset_avg_tfidf}")

# `transform` expects an iterable of documents, so the single article string
# is wrapped in a list.
article_vect = vectorizer.transform([article_text])

# Mean over the sparse row directly; no dense copy of the vector is needed.
article_avg_tfidf = article_vect.mean()
print(f"Average tf-idf score for the article: {article_avg_tfidf}")

In [None]:
# Let pandas derive ten equal-width bins from the observed range. The previous
# hard-coded edges stopped at 0.00009, so any larger score (and any score of
# exactly 0) fell outside every bin and was dropped from the plot.
data["ratio_bins"], bin_edges = pd.cut(data["avg_tfidf"], bins=10, retbins=True)

counts = data["ratio_bins"].value_counts().sort_index()

fig, ax = plt.subplots()
sns.barplot(x=counts.index, y=counts.values, ax=ax)
# Label the figure so it can be read without the surrounding cells.
ax.set(
    title="Distribution of average tf-idf score",
    xlabel="Average tf-idf (binned)",
    ylabel="Number of tweets",
)
ax.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# The 30 texts with the lowest average tf-idf score, shown next to their score.
(
    data
    .sort_values(by="avg_tfidf", ascending=True)
    [["avg_tfidf", "text"]]
    .head(30)
)

## Unknown BERT tokens

In [None]:
from transformers import BertTokenizer

# Name of the pretrained checkpoint whose vocabulary is used to spot unknown tokens.
BERT_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)


def count_unknowns(text):
    """Return how many ``[UNK]`` tokens the module-level tokenizer emits for ``text``."""
    return sum(1 for piece in tokenizer.tokenize(text) if piece == "[UNK]")


# Tokenize every text (with a progress bar) and store the per-text [UNK] count.
unknowns_per_text = data["text"].progress_apply(count_unknowns)
data["unknowns_count"] = unknowns_per_text

# Report the mean count across the dataset.
average_unknowns = unknowns_per_text.mean()
print(f"Average number of unknown tokens in texts: {average_unknowns}")

# Outliers

In [None]:
# Disabled experiment: the code below is wrapped in a triple-quoted string, so
# none of it executes. It sketches an outlier view: sentence embeddings of the
# texts, KMeans clustering, then a 2-D t-SNE scatter plot coloured by target.
# NOTE(review): as the last expression of the cell, the string itself is likely
# echoed as the cell output; confirm, and consider moving this to a real cell.
"""
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(data['text'].tolist(), convert_to_tensor=True)

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10, random_state=0)

embeddings_np = embeddings.numpy()

clusters = kmeans.fit_predict(embeddings_np)

from sklearn.manifold import TSNE

# Use t-SNE to reduce the dimensionality of the embeddings to 2D
embeddings_2d = TSNE(n_components=2).fit_transform(embeddings)

# `embeddings_2d` is a 2D tensor where each row is a 2D representation of a text in your dataset

import seaborn as sns
import matplotlib.pyplot as plt

# Create a DataFrame for seaborn
plot_data = pd.DataFrame(embeddings_2d, columns=["Dim1", "Dim2"])
plot_data['Target'] = data['target'].values  # This assumes your "target" column is accessible here

# Create a scatter plot
plt.figure(figsize=(10, 10))
sns.scatterplot(data=plot_data, x="Dim1", y="Dim2", hue="Target", palette="deep")

plt.show()
"""