In [63]:
# All imports
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [64]:
def preprocess_text(text):

    if not isinstance(text, str):
        text = ""
        return ""

    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    
    text = re.sub(re.compile('<.*?>'), '', text) #removing HTML tags
    text = re.sub('[^A-Za-z0-9]+', ' ', text) #taking only words

    text = text.lower()

    tokens = nltk.word_tokenize(text)

    filtered_tokens = [token for token in tokens if token not in stop_words]

    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    preprocessed_text = ' '.join(lemmatized_tokens)

    return preprocessed_text

In [65]:
processed = 't'
articles = 0

if processed == 'f':
    # Load the data
    articles = pd.read_csv("../final_project/NYT/NYT_Dataset.csv")

    articles.drop(['Unnamed: 0'], axis=1, inplace=True)

    # Preprocess the text, save it to a csv
    articles['preprocessed_abstract'] = articles['abstract'].apply(preprocess_text)
    articles.to_csv("../final_project/NYT/NYT_Dataset_Preprocessed.csv", index=False)
else:
    articles = pd.read_csv("../final_project/NYT/NYT_Dataset_Preprocessed.csv")

In [67]:
articles.head()

Unnamed: 0,ID,title,topic,abstract,Date,keywords,preprocessed_abstract
0,nyt://article/178801fe-4679-5f12-985f-8344a86e...,"In Reversal, Pakistan Welcomes Outside Help Wi...",Foreign,Pakistan’s ambassador to the U.S. said his gov...,2008-01-01 05:00:00+00:00,['Assassinations and Attempted Assassinations'...,pakistan ambassador u said government would en...
1,nyt://article/21acedcb-a7f6-5131-99cf-d3a47e33...,Fighting Intensifies After Election in Kenya,Foreign,"Kenya sank deeper into trouble, with a curfew ...",2008-01-01 05:00:00+00:00,"['Kenya', 'Demonstrations and Riots', 'Odinga,...",kenya sank deeper trouble curfew imposed kisum...
2,nyt://article/357b5429-a9f8-5d33-a5eb-c013a201...,Israel: Olmert Curbs Settlements,Foreign,Prime Minister Ehud Olmert has sent a letter t...,2008-01-01 05:00:00+00:00,['West Bank'],prime minister ehud olmert sent letter defense...
3,nyt://article/619ca4ea-50e4-59e4-97bb-f206502c...,Gay Muslims Pack a Dance Floor of Their Own,Foreign,The monthly club night known as Gayhane is an ...,2008-01-01 05:00:00+00:00,"['Homosexuality', 'Islam', 'IMMIGRATION AND RE...",monthly club night known gayhane rare opportun...
4,nyt://article/73c49a5a-bcf1-5b8f-a15a-98d29003...,Iraqi Revelers Embrace the New Year,Foreign,"But even as partygoers embraced the New Year, ...",2008-01-01 05:00:00+00:00,"['ARMAMENT, DEFENSE AND MILITARY FORCES', 'Iraq']",even partygoer embraced new year surge attack ...


In [74]:
articles = articles[articles['preprocessed_abstract'] != '']

In [75]:
articles

Unnamed: 0,ID,title,topic,abstract,Date,keywords,preprocessed_abstract
0,nyt://article/178801fe-4679-5f12-985f-8344a86e...,"In Reversal, Pakistan Welcomes Outside Help Wi...",Foreign,Pakistan’s ambassador to the U.S. said his gov...,2008-01-01 05:00:00+00:00,['Assassinations and Attempted Assassinations'...,pakistan ambassador u said government would en...
1,nyt://article/21acedcb-a7f6-5131-99cf-d3a47e33...,Fighting Intensifies After Election in Kenya,Foreign,"Kenya sank deeper into trouble, with a curfew ...",2008-01-01 05:00:00+00:00,"['Kenya', 'Demonstrations and Riots', 'Odinga,...",kenya sank deeper trouble curfew imposed kisum...
2,nyt://article/357b5429-a9f8-5d33-a5eb-c013a201...,Israel: Olmert Curbs Settlements,Foreign,Prime Minister Ehud Olmert has sent a letter t...,2008-01-01 05:00:00+00:00,['West Bank'],prime minister ehud olmert sent letter defense...
3,nyt://article/619ca4ea-50e4-59e4-97bb-f206502c...,Gay Muslims Pack a Dance Floor of Their Own,Foreign,The monthly club night known as Gayhane is an ...,2008-01-01 05:00:00+00:00,"['Homosexuality', 'Islam', 'IMMIGRATION AND RE...",monthly club night known gayhane rare opportun...
4,nyt://article/73c49a5a-bcf1-5b8f-a15a-98d29003...,Iraqi Revelers Embrace the New Year,Foreign,"But even as partygoers embraced the New Year, ...",2008-01-01 05:00:00+00:00,"['ARMAMENT, DEFENSE AND MILITARY FORCES', 'Iraq']",even partygoer embraced new year surge attack ...
...,...,...,...,...,...,...,...
106501,nyt://article/7f2445bc-7094-52f2-866e-2dd09f7e...,BAFTA Suspends Award for Actor Noel Clarke Ami...,Foreign,The British actor and director has been accuse...,2021-04-30 13:52:12+00:00,"['Sexual Harassment', 'Actors and Actresses', ...",british actor director accused sexual assault ...
106502,nyt://article/509b840b-52b0-54ca-8e0e-21c2c8fd...,The Bureaucrat From Buffalo Who Pushed Somalia...,Foreign,"His bid to stay in office an extra two years, ...",2021-04-30 16:07:49+00:00,"['Mohamed, Mohamed Abdullahi', 'Mogadishu (Som...",bid stay office extra two year without electio...
106503,nyt://interactive/92d8a9aa-9deb-5ff0-9fc6-6661...,What to Know About the Census Data,U.S.,The count reflects the slowest population grow...,2021-04-30 16:40:05+00:00,"['Census', 'Population', 'United States', 'Rac...",count reflects slowest population growth since...
106504,nyt://article/b5a25627-cf5f-528f-a69c-afcfa681...,"After 500 Years, an Ancient Bronze Hand Is Rej...",Foreign,Researchers still don’t know how a finger belo...,2021-04-30 16:46:47+00:00,"['Art', 'Roman Civilization', 'Capitoline Muse...",researcher still know finger belonging colossa...


In [76]:
import gensim
import gensim.downloader as gensim_ai
embeddings = gensim_ai.load("word2vec-google-news-300")

In [26]:
import numpy as np

docs_vectors = []  # use a list to store document vectors for efficiency
stop_words = stopwords.words('english')  # removing stop words

for abstract in articles['preprocessed_abstract'].str.lower().str.replace('[^a-z ]', ''):
    temp = []  # store word vectors for each document in a list
    
    for word in abstract.split(' '):
        if word not in stop_words:  # if word is not a stopword
            try:
                word_vec = embeddings[word]  # get the word vector
                temp.append(word_vec)  # append word vector to the list
            except KeyError:
                pass  # if the word is not in the embeddings, skip it
    
    if temp:  # if we have word vectors, calculate the mean
        doc_vector = pd.Series(pd.DataFrame(temp).mean())
    else:  # if no vectors, use a zero vector with the same dimension as the embeddings
        doc_vector = pd.Series(np.zeros(300))  # assuming 300 dimensions for Word2Vec
    
    docs_vectors.append(doc_vector)  # append document vector to the list

# Convert list of document vectors to a DataFrame in one step (much faster)
docs_vectors_df = pd.DataFrame(docs_vectors)

# Check the resulting DataFrame
docs_vectors_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.115578,-0.012159,0.108231,0.04063,-0.08494,-0.056042,0.00082,-0.133003,0.104179,0.063309,...,-0.049278,0.037382,-0.049683,0.032234,-0.087547,-0.02685,-0.049118,-0.022751,-0.069792,0.046984
1,0.047835,0.010898,0.076684,0.031901,-0.018931,-0.046814,-0.015191,-0.128165,0.057636,0.058044,...,-0.057868,-0.043294,-0.105194,-0.014062,-0.024807,0.009752,-0.124854,-0.001261,0.110128,-0.043952
2,-0.047923,0.058918,0.053791,-0.022897,-0.031725,-0.10825,0.041186,-0.045916,0.139759,0.001881,...,-0.16112,0.005708,-0.055208,-0.027987,0.0357,0.066815,-0.044142,-0.151938,0.071614,0.047729
3,0.012184,0.018106,0.029343,0.101912,-0.122892,0.001239,-0.040147,-0.015653,0.101877,0.065505,...,-0.007287,0.014789,-0.174978,0.015663,-0.113187,0.057716,0.071395,0.007958,0.071644,0.018273
4,0.062007,0.017569,0.04466,0.060863,-0.030151,-0.072897,-0.017597,-0.17264,0.157114,0.185622,...,-0.010911,-0.11611,-0.047119,0.090604,-0.099539,-0.060829,-0.023013,-0.126671,0.043495,0.027776


In [77]:
len(docs_vectors_df)

105897

In [78]:
len(articles['title'].values)

105897

In [79]:
# Separate the data for the Word2Vec splits
features_w2v = docs_vectors_df.copy()
labels_w2v = articles['title'].values

X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(features_w2v, labels_w2v, test_size=0.3, shuffle=True)

In [81]:
X_train_w2v.shape

(74127, 300)

In [80]:
model = RandomForestClassifier(n_estimators=15)

In [82]:
# Identify the indices with NaN in y_train_w2v
nan_train_indices = pd.isnull(y_train_w2v)

# Filter out the rows with NaN from both X_train_w2v and y_train_w2v
X_train_w2v = X_train_w2v[~nan_train_indices]
y_train_w2v = y_train_w2v[~nan_train_indices]

In [83]:
# Identify the indices with NaN in y_train_w2v
nan_test_indices = pd.isnull(y_test_w2v)

# Filter out the rows with NaN from both X_train_w2v and y_train_w2v
X_test_w2v = X_test_w2v[~nan_test_indices]
y_test_w2v = y_test_w2v[~nan_test_indices]

In [85]:
import random

train_indices = random.sample(range(len(X_train_w2v)), 2500)

In [86]:
print("Length of X_train_w2v:", len(X_train_w2v))
print("Subset of train_indices:", train_indices[:10])  # Just a small sample
print("Max index in train_indices:", max(train_indices))


Length of X_train_w2v: 74126
Subset of train_indices: [14904, 60638, 54124, 67851, 8967, 56784, 5667, 10564, 8768, 53608]
Max index in train_indices: 74115


In [87]:
X_train_w2v = X_train_w2v.iloc[train_indices]
y_train_w2v = y_train_w2v[train_indices]

In [88]:
X_train_w2v.shape

(10000, 300)

In [None]:
model.fit(X_train_w2v, y_train_w2v)

In [60]:
predictions = model.predict(X_test_w2v)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams
from difflib import SequenceMatcher
from nltk.corpus import stopwords
from collections import Counter
import numpy as np
import pandas as pd

# Define functions for the metrics

# 1. Cosine Similarity
def calculate_cosine_similarity(predictions, actuals):
    vectorizer = TfidfVectorizer().fit(actuals + predictions)  # Fit TF-IDF on both predicted and actual titles
    actual_vectors = vectorizer.transform(actuals)
    prediction_vectors = vectorizer.transform(predictions)
    similarities = cosine_similarity(prediction_vectors, actual_vectors).diagonal()
    return similarities.mean()

# 2. Jaccard Similarity
def calculate_jaccard_similarity(prediction, actual):
    set_pred = set(prediction.split())
    set_act = set(actual.split())
    intersection = set_pred.intersection(set_act)
    union = set_pred.union(set_act)
    return len(intersection) / len(union) if union else 0

def average_jaccard_similarity(predictions, actuals):
    return np.mean([calculate_jaccard_similarity(p, a) for p, a in zip(predictions, actuals)])

# 3. Levenshtein Distance (normalized to be comparable)
def levenshtein_distance(prediction, actual):
    return SequenceMatcher(None, prediction, actual).ratio()

def average_levenshtein_similarity(predictions, actuals):
    return np.mean([levenshtein_distance(p, a) for p, a in zip(predictions, actuals)])

# Evaluate the model's predictions
predictions = model.predict(X_test_w2v)
y_test_titles = y_test_w2v.tolist()  # Convert y_test to a list if it's not already

cosine_sim = calculate_cosine_similarity(predictions, y_test_w2v)
jaccard_sim = average_jaccard_similarity(predictions, y_test_w2v)
levenshtein_sim = average_levenshtein_similarity(predictions, y_test_w2v)

print("Cosine Similarity:", cosine_sim)
print("Average Jaccard Similarity:", jaccard_sim)
print("Average Levenshtein Similarity:", levenshtein_sim)

Cosine Similarity: 0.023945201539323214
Average Jaccard Similarity: 0.03212355732208247
Average Levenshtein Similarity: 0.2564003937931116
