<a href="https://colab.research.google.com/github/jenboland/DailyKnowledge/blob/master/Internal_Link_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


# Load the CSV file from Google Drive
# Make sure that this file exists and is in the correct location
file_path = '/content/drive/My Drive/custom_extraction_full_text.csv'
df = pd.read_csv(file_path)
print(df.shape)
# Display the first few rows of the DataFrame
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(1599, 4)


Unnamed: 0,Address,Status Code,Status,body 1
0,https://eonetwork.org/blog/,200,OK,"<div class=""hero hero--tall-bkg dark-bg rich-t..."
1,https://eonetwork.org/blog/?&page=1,200,OK,"<div class=""hero hero--tall-bkg dark-bg rich-t..."
2,https://eonetwork.org/blog/?&page=2,200,OK,"<div class=""hero hero--tall-bkg dark-bg rich-t..."
3,https://eonetwork.org/blog/why-it-s-critical-t...,200,OK,"<div class=""hero mb-0 dark-bg rich-text-area"">..."
4,https://eonetwork.org/blog/?page=1,200,OK,"<div class=""hero hero--tall-bkg dark-bg rich-t..."


In [3]:


# Drop rows where 'Address' column contains 'page='
df = df[~df['Address'].str.contains('page=', na=False)]
df.shape

(1199, 4)

In [4]:
# prompt: rename column body 1 to content

df = df.rename(columns={'body 1': 'content'})

In [5]:
df.head()

Unnamed: 0,Address,Status Code,Status,content
0,https://eonetwork.org/blog/,200,OK,"<div class=""hero hero--tall-bkg dark-bg rich-t..."
3,https://eonetwork.org/blog/why-it-s-critical-t...,200,OK,"<div class=""hero mb-0 dark-bg rich-text-area"">..."
5,https://eonetwork.org/blog/ready-fire-oops-how...,200,OK,"<div class=""hero mb-0 dark-bg rich-text-area"">..."
7,https://eonetwork.org/blog/how-to-use-your-bus...,200,OK,"<div class=""hero mb-0 dark-bg rich-text-area"">..."
8,https://eonetwork.org/blog/how-human-skills-su...,200,OK,"<div class=""hero mb-0 dark-bg rich-text-area"">..."


In [6]:
def find_internal_link_opportunities(df):
    contents = df['content'].tolist()
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(contents)
    csim = cosine_similarity(vectors)
    print("Shape of csim:", csim.shape)  # Debugging
    print("Number of rows in df:", len(df))  # Debugging
    return csim

def save_link_opportunities_to_csv(df, csim, threshold=0.5, output_file="internal_link_opportunities.csv"):
    rows = []

    for idx, row in df.iterrows():
        if idx >= csim.shape[0]:  # Prevent out-of-bounds error
            print(f"Skipping row {idx} due to mismatch with csim dimensions.")
            continue

        similar_indices = [i for i, score in enumerate(csim[idx]) if score > threshold and i != idx]
        if similar_indices:
            for i in similar_indices:
                rows.append({
                    "Page": row['Address'],
                    "Potential Internal Links": df.iloc[i]['Address'],
                    "Similarity": csim[idx][i]
                })

    output_df = pd.DataFrame(rows)
    output_df.to_csv(output_file, index=False)
    print(f"Internal link opportunities saved to {output_file}")


In [7]:
# Example usage:
# Assuming 'df' is a DataFrame with 'Address' and 'content' columns
csim = find_internal_link_opportunities(df)
save_link_opportunities_to_csv(df, csim, threshold=0.5)


Shape of csim: (1199, 1199)
Number of rows in df: 1199
Skipping row 1199 due to mismatch with csim dimensions.
Skipping row 1200 due to mismatch with csim dimensions.
Skipping row 1202 due to mismatch with csim dimensions.
Skipping row 1203 due to mismatch with csim dimensions.
Skipping row 1205 due to mismatch with csim dimensions.
Skipping row 1206 due to mismatch with csim dimensions.
Skipping row 1207 due to mismatch with csim dimensions.
Skipping row 1208 due to mismatch with csim dimensions.
Skipping row 1209 due to mismatch with csim dimensions.
Skipping row 1211 due to mismatch with csim dimensions.
Skipping row 1212 due to mismatch with csim dimensions.
Skipping row 1213 due to mismatch with csim dimensions.
Skipping row 1215 due to mismatch with csim dimensions.
Skipping row 1217 due to mismatch with csim dimensions.
Skipping row 1218 due to mismatch with csim dimensions.
Skipping row 1219 due to mismatch with csim dimensions.
Skipping row 1220 due to mismatch with csim dimen