In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import nltk

# Load data

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

# Location of the news dataset inside the mounted drive; passed to
# pd.read_csv by the loading cell.
file_path = '/content/drive/My Drive/Data_spider_news_global.csv'

Mounted at /content/drive


In [None]:
# Load the dataset with pandas. The file carries a .csv extension but is
# parsed with a tab delimiter.
import pandas as pd
data_spider_news_global = pd.read_csv(file_path, delimiter='\t')

# Keep only the rows labelled English. The explicit .copy() makes the result an
# independent DataFrame, so columns added to it later (e.g. 'Bias_Score') are
# written to this frame without pandas emitting SettingWithCopyWarning.
data_spider_news_global = data_spider_news_global[data_spider_news_global['Language'] == 'English'].copy()

# Display the first few rows to confirm loading
data_spider_news_global.head()

Unnamed: 0,ID,URL,Language,Country_search,Newspaper,Type_of_newspaper,Circulation,d,m,yr,...,Expert_doctor,Expert_others,Sensationalism,Taxonomic_error,Venom_error,Anatomy_error,Photo_error,Quality_check,Contributor,Notes
1780,NZ_001,https://www.odt.co.nz/news/dunedin/city-wide-r...,English,New Zealand,Otago Daily Times,Traditional newspaper,Regional,1.0,2.0,2010.0,...,0.0,1.0,0.0,0.0,,0.0,0.0,yes,Christina Painting & Leilani Walker,-
1781,NZ_002,http://www.stuff.co.nz/nelson-mail/editors-pic...,English,New Zealand,Nelson Mail (via Stuff),Traditional newspaper,Regional,20.0,4.0,2010.0,...,0.0,1.0,1.0,0.0,,0.0,0.0,yes,Christina Painting & Leilani Walker,-
1782,NZ_002,http://www.stuff.co.nz/nelson-mail/editors-pic...,English,New Zealand,Nelson Mail (via Stuff),Traditional newspaper,Regional,20.0,4.0,2010.0,...,0.0,1.0,1.0,0.0,,0.0,,yes,Christina Painting & Leilani Walker,-
1783,NZ_003,http://www.stuff.co.nz/national/3697571/Katipo...,English,New Zealand,Stuff,Online newspaper,National,26.0,5.0,2010.0,...,1.0,0.0,0.0,0.0,0.0,0.0,,yes,Christina Painting & Leilani Walker,Expert opinion is quotes from a medical journa...
1784,NZ_004,https://www.nzherald.co.nz/nz/news/article.cfm...,English,New Zealand,NZ Herald,Traditional newspaper,National,14.0,11.0,2010.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,yes,Christina Painting & Leilani Walker,First report in this story happened before 201...


In [None]:
# Row count of data_spider_news_global (the English-only subset built by the loading cell).
len(data_spider_news_global)

2148

# Bias Analysis

In [None]:
# Download the spaCy model "en_core_web_md", which the next cell loads by name.
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load the en_core_web_md pipeline; `nlp` is called below to obtain word and title vectors.
nlp = spacy.load("en_core_web_md")

In [None]:
def get_axis_vector(word_list):
    """Return the element-wise mean of the spaCy vectors of the given words.

    Each entry of `word_list` is passed through `nlp` on its own and its
    `.vector` is collected; the collected vectors are averaged along axis 0.
    """
    stacked = np.array([nlp(term).vector for term in word_list])
    return stacked.mean(axis=0)

# Seed vocabulary for the two poles of the bias axis: "aversion" and "fascination"
axis_words = dict(
    aversion=["fear", "danger", "scary", "disgusting", "creepy"],
    fascination=["fascinating", "cute", "lovely", "beneficial", "harmless"],
)

# One averaged vector per pole; the axis is their difference, oriented so that
# it points from "fascination" toward "aversion".
aversion_vector, fascination_vector = (
    get_axis_vector(axis_words[pole]) for pole in ("aversion", "fascination")
)
bias_axis = aversion_vector - fascination_vector

# Score a single text (e.g. one title) against the bias axis
def calculate_bias(text):
    """Return the cosine similarity between the spaCy vector of `text` and `bias_axis`.

    `bias_axis` is the aversion vector minus the fascination vector, so a
    positive score leans toward the aversion words and a negative score
    toward the fascination words.
    """
    text_vec = nlp(text).vector
    similarity = cosine_similarity([text_vec], [bias_axis])
    # cosine_similarity returns a 1x1 matrix here; unwrap the single value.
    return similarity[0][0]

In [None]:
# Score every title, store the result in a 'Bias_Score' column, then preview it
title_scores = data_spider_news_global['Title'].map(calculate_bias)
data_spider_news_global['Bias_Score'] = title_scores
print(data_spider_news_global[['Title', 'Bias_Score']].head())

                                                  Title  Bias_Score
1780                       City-wide reports of spiders   -0.146713
1781                         Big and scary but harmless    0.004076
1782                         Big and scary but harmless    0.004076
1783                Katipo bites skinny-dipping tourist   -0.160032
1784  Six years on, white-tail spider bites still sting   -0.109680


In [None]:
from scipy.stats import ttest_1samp

# One-sample t-test of the title bias scores against a population mean of 0
ttest_result = ttest_1samp(data_spider_news_global['Bias_Score'], popmean=0)
t_stat, p_value = ttest_result

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")

t-statistic: -9.270668166587813
p-value: 4.3598384006759527e-20


In [None]:
# Summary statistics of the per-title bias scores
bias_column = data_spider_news_global['Bias_Score']
mean_bias, std_bias = bias_column.mean(), bias_column.std()
print(f"Mean bias score: {mean_bias}")
print(f"Standard deviation of bias score: {std_bias}")

Mean bias score: -0.01686360500752926
Standard deviation of bias score: 0.08430560678243637


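*   "Big and scary but harmless" scores 0.004, i.e. essentially neutral. It contains both an aversion seed word ("scary") and a fascination seed word ("harmless"), which likely pull in opposite directions along the axis. Open question: should such a title count as aversion or as neutral?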

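*   The bias axis is defined as aversion minus fascination, so a negative mean (−0.017) indicates that, on average, titles lean slightly toward "fascination" rather than "aversion." However, the mean is close to zero and small relative to the standard deviation (0.084), suggesting only a mild inclination.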

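*   The very small p-value (≈ 4.4e-20) means that a sample mean this far from zero would be very unlikely if the true mean bias score were zero, so the mean is statistically significant. Two caveats: with 2,148 titles even a very small effect reaches significance, and the t-test assumes independent observations, whereas the data contain repeated titles (e.g. rows 1781 and 1782).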

# Bootstrap

In [None]:
import numpy as np
import pandas as pd

bias_scores = data_spider_news_global['Bias_Score'].dropna().values  # Drop any NaN values in 'Bias_Score'

# Set up bootstrap parameters
n_iterations = 10000  # Number of bootstrap samples
bootstrap_seed = 42  # Fixed seed so the interval is identical on every run
rng = np.random.default_rng(bootstrap_seed)  # Local generator; leaves NumPy's global RNG state untouched

# Perform bootstrap sampling: each iteration resamples the scores with
# replacement (same size as the original sample) and records the resample mean.
bootstrap_means = [
    rng.choice(bias_scores, size=len(bias_scores), replace=True).mean()
    for _ in range(n_iterations)
]

# Calculate 95% confidence interval (percentile method: 2.5th and 97.5th
# percentiles of the bootstrap means)
ci_lower, ci_upper = np.percentile(bootstrap_means, [2.5, 97.5])

# Print results
print(f"Bootstrap Mean Bias Score: {np.mean(bootstrap_means):.4f}")
print(f"95% Confidence Interval: ({ci_lower:.4f}, {ci_upper:.4f})")

Bootstrap Mean Bias Score: -0.0169
95% Confidence Interval: (-0.0205, -0.0133)


# Topic Modeling

In [None]:
# Fetch NLTK's stopword corpus and keep the English stopwords as a set,
# which `preprocess` below uses for membership tests.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Lower-case `text` and drop every whitespace-separated token found in `stop_words`.

    The surviving tokens are re-joined with single spaces.
    """
    tokens = text.lower().split()
    kept = filter(lambda token: token not in stop_words, tokens)
    return " ".join(kept)

# Clean every title, then build the document-term matrix. With these settings a
# term is kept only if it occurs in at least 10 titles (min_df) and in no more
# than 90% of them (max_df).
processed_titles = list(map(preprocess, data_spider_news_global['Title']))
vectorizer = CountVectorizer(max_df=0.9, min_df=10, stop_words="english")
dtm = vectorizer.fit_transform(processed_titles)

# Fit a 5-topic LDA model on the matrix; random_state is fixed at 42.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Display topics: for each one, list its ten highest-weighted vocabulary terms
# (printed in ascending order of weight, so the strongest term comes last).
vocabulary = vectorizer.get_feature_names_out()
for topic_number, weights in enumerate(lda.components_, start=1):
    top_term_ids = weights.argsort()[-10:]
    print(f"Topic #{topic_number}:")
    print([vocabulary[term_id] for term_id in top_term_ids])
    print("\n")

Topic #1:
['venomous', 'false', 'grapes', 'finds', 'black', 'deadly', 'home', 'widow', 'spider', 'spiders']


Topic #2:
['mum', 'flesh', 'hospital', 'eating', 'widow', 'false', 'leg', 'bitten', 'bite', 'spider']


Topic #3:
['woman', 'left', 'venomous', 'life', 'bites', 'false', 'black', 'bite', 'widow', 'spider']


Topic #4:
['massive', 'man', 'finds', 'spiders', 'tarantula', 'moment', 'huge', 'giant', 'huntsman', 'spider']


Topic #5:
['michigan', 'ear', 'bitten', 'venomous', 'woman', 'bite', 'man', 'recluse', 'brown', 'spider']


