# Analyses Report Notebook

## Installing and importing necessary packages

In [None]:
# Install the extruct package into the notebook environment.
# NOTE(review): extruct is not imported in this notebook; presumably the local
# analyse/metadata modules depend on it — confirm.
!pip install extruct

In [None]:
from analyse import HTMLTextAnalysis, WebsiteAnalysis
from metadata import get_title_from_link, replace_name
from datetime import datetime
import urllib.request
import urllib.parse
import json
import pandas as pd
import re

## Create dataframe of complete article list

In [None]:
# Build the dataframe of all articles via the project helper.
# "../../" is the path handed to the helper (presumably the website root
# relative to this notebook — confirm).
article_list_df = WebsiteAnalysis.get_articles_list_dataframe("../../")

In [None]:
# Preview the 10 rows with the largest "first" value. "first" is the timestamp
# column that is later converted into the "created" column.
article_list_df.sort_values(["first"], ascending=False).head(10)

In [None]:
# Display the complete article list.
article_list_df

### Filter out specific articles

In [None]:
# Keep only the English articles whose file path contains "teaching".
lang_df = article_list_df.loc[
    article_list_df["language"].eq("en")
    & article_list_df["filepath"].str.contains("teaching")
]
lang_df

In [None]:
# Rebind lang_df to the full, unfiltered article list. Running this cell
# discards the filter from the previous cell, so all articles are processed.
lang_df = article_list_df

In [None]:
# Add a "title" column holding what get_title_from_link returns for each
# article's file path, then show the result with the largest "first" on top.
lang_df = lang_df.assign(
    title=lambda frame: frame["filepath"].apply(get_title_from_link)
)
lang_df.sort_values(by="first", ascending=False)

## Get current titles of the articles

In [None]:
# Add a "created" column by converting the "first" timestamps to datetimes.
# NOTE: datetime.fromtimestamp yields naive datetimes in the machine's local
# timezone.
lang_df = lang_df.assign(
    created=lambda frame: frame["first"].apply(datetime.fromtimestamp)
)
lang_df

In [None]:
# Order the articles by their "first" timestamp, smallest value first.
sorted_lang_df = lang_df.sort_values(by="first")
sorted_lang_df

In [None]:
# Export file path, title and creation date to a semicolon-separated file,
# without the dataframe index.
sorted_lang_df.to_csv(
    "output.csv",
    sep=";",
    columns=["filepath", "title", "created"],
    index=False,
)

## Helper functions for querying Wikibase

In [None]:
def quote_query(query):
    """Percent-encode *query* so it can be embedded in a URL.

    No character is treated as safe, so "/" is encoded as well.
    """
    return urllib.parse.quote(query, safe="")

In [None]:
def query_wikibase(query, endpoint="https://jsamwrites.wikibase.cloud/query/sparql"):
    """Run a SPARQL query against a Wikibase endpoint and tabulate the result.

    Args:
        query: SPARQL query text. Every result binding must provide the
            variables ``item``, ``title`` and ``url``.
        endpoint: URL of the SPARQL endpoint. Defaults to the
            jsamwrites.wikibase.cloud query service.

    Returns:
        A pandas DataFrame with the columns ``item``, ``title`` and ``url``,
        one row per result binding, in the order returned by the endpoint.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        KeyError: If a binding lacks one of the expected variables.
    """
    columns = ["item", "title", "url"]
    url = f"{endpoint}?format=json&query={quote_query(query)}"

    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(url) as response:
        payload = json.loads(response.read().decode("utf-8"))

    rows = [
        [binding[name]["value"] for name in columns]
        for binding in payload["results"]["bindings"]
    ]
    return pd.DataFrame(rows, columns=columns)

## SPARQL query for getting the existing article list from Wikibase

In [None]:
# SPARQL query selecting every item that has a URL (P3), a title (P27) and a
# time (P10), ordered by that time. The `wd:` prefix is declared but not used
# in the query body.
query = """PREFIX wd: <https://jsamwrites.wikibase.cloud/entity/>
PREFIX wdt: <https://jsamwrites.wikibase.cloud/prop/direct/>

SELECT DISTINCT ?item ?title ?url{
  ?item wdt:P3 ?url;
        wdt:P27 ?title;
        wdt:P10 ?time.
}
ORDER by ?time"""

# Run the query; the resulting dataframe has the columns item, title and url.
wikibase_dataframe = query_wikibase(query)
wikibase_dataframe

## Replace relative file paths with complete URLs

In [None]:
# Turn the relative "../.." prefix into the site's base URL and percent-encode
# spaces, so the paths can be compared with the URLs stored on Wikibase.
#
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on a column selection is chained assignment and
# does not update the dataframe under pandas Copy-on-Write. The pattern is a
# raw string so that "\." is not treated as an invalid string escape.
sorted_lang_df["filepath"] = (
    sorted_lang_df["filepath"]
    .replace(r"\.\./\.\.", "https://johnsamuel.info", regex=True)
    .replace(" ", "%20", regex=True)
)
sorted_lang_df["filepath"]

## Find the missing articles that are not yet on Wikibase

In [None]:
# Articles whose URL does not appear among the URLs returned by Wikibase.
missing_articles = sorted_lang_df.loc[
    ~sorted_lang_df["filepath"].isin(wikibase_dataframe["url"])
]
missing_articles

### Generate QuickStatements for the missing articles

In [None]:
# Write one QuickStatements CREATE block per missing article.
# The encoding is set explicitly so that non-ASCII titles are written as UTF-8
# regardless of the platform's default locale encoding.
with open("quickstatements.csv", "w", encoding="utf-8") as qw:
    for _, article in missing_articles.iterrows():
        language = article["language"]
        title = article["title"]
        # Spaces are percent-encoded so the URL matches the form used on Wikibase.
        page_url = str(article["filepath"]).replace(" ", "%20")
        # Keep only the calendar date. Formatting the timestamp (rather than
        # regex-editing its string form) also drops fractional seconds, which
        # would otherwise end up inside the P10 value.
        created_day = pd.Timestamp(article["created"]).strftime("%Y-%m-%d")

        qw.writelines(
            [
                "CREATE\n",
                'LAST|Den|"web page"\n',
                'LAST|Dfr|"page web"\n',
                f'LAST|L{language}|"{title}"\n',
                f'LAST|P27|{language}:"{title}"\n',
                "LAST|P17|Q48\n",
                "LAST|P8|Q45\n",
                f'LAST|P3|"{page_url}"\n',
                "LAST|P13|Q1041\n",
                "LAST|P15|Q38\n",
                # "/11" is the QuickStatements precision suffix for "day".
                f"LAST|P10|+{created_day}T00:00:00Z/11\n\n",
            ]
        )

## Verification of the titles
Verify whether the titles stored on Wikibase still match the current titles of the articles.

In [None]:
# Re-run the article query so the comparison below uses fresh Wikibase data.
# It selects every item that has a URL (P3), a title (P27) and a time (P10).
query = """PREFIX wd: <https://jsamwrites.wikibase.cloud/entity/>
PREFIX wdt: <https://jsamwrites.wikibase.cloud/prop/direct/>

SELECT DISTINCT ?item ?title ?url{
  ?item wdt:P3 ?url;
        wdt:P27 ?title;
        wdt:P10 ?time.
}
ORDER by ?time"""

wikibase_dataframe = query_wikibase(query)
wikibase_dataframe

# Make sure the file paths are complete, percent-encoded URLs (a no-op when
# they have already been converted). The result is assigned back to the column
# because replace(..., inplace=True) on a column selection is chained
# assignment and does not update the dataframe under pandas Copy-on-Write.
# The pattern is a raw string so "\." is not an invalid string escape.
sorted_lang_df["filepath"] = (
    sorted_lang_df["filepath"]
    .replace(r"\.\./\.\.", "https://johnsamuel.info", regex=True)
    .replace(" ", "%20", regex=True)
)

In [None]:
# Compare every title stored on Wikibase with the title currently recorded in
# sorted_lang_df for the same URL. Mismatches are collected as
# (item id, current title, language) tuples.
stale_title_list = []
entity_prefix = "https://jsamwrites.wikibase.cloud/entity/"
for _, remote in wikibase_dataframe.iterrows():
    url = remote["url"]
    local_rows = sorted_lang_df[sorted_lang_df["filepath"] == url]

    # URL known to Wikibase but absent from the local article list: report it.
    if len(local_rows) == 0:
        print(url, local_rows)
        continue

    current_title = local_rows["title"].values[0]
    remote_title = str(remote["title"])
    if current_title != remote_title:
        print("Incorrect", current_title, remote_title)
        stale_title_list.append(
            (
                remote["item"].replace(entity_prefix, ""),
                str(current_title),
                str(local_rows["language"].values[0]),
            )
        )


## Generate QuickStatements for correcting titles

In [None]:
# Write one QuickStatements line per stale title, setting P27 on the existing
# item to the current title in the article's language.
# NOTE: this uses the same file name as the cell that writes the statements
# for missing articles, so running it overwrites that file.
# The encoding is set explicitly so that non-ASCII titles are written as UTF-8
# regardless of the platform's default locale encoding.
with open("quickstatements.csv", "w", encoding="utf-8") as qw:
    for item_id, title, language in stale_title_list:
        qw.write(f'{item_id}|P27|{language}:"{title}"\n')