# Analyses Report Notebook

## Installing and importing necessary packages

In [None]:
!pip install extruct

In [2]:
from analyse import HTMLTextAnalysis, WebsiteAnalysis
from metadata import get_title_from_link, replace_name
from datetime import datetime
import urllib.request
import urllib.parse
import json
import pandas as pd
import re

## Create dataframe of complete article list

In [None]:
# Build the dataframe holding the complete article list.
# NOTE(review): "../../" is presumably the website root relative to this notebook — confirm.
article_list_df = WebsiteAnalysis.get_articles_list_dataframe("../../")

In [None]:
# Preview the ten rows with the largest "first" value (the timestamp that is
# later converted into the creation date).
newest_first = article_list_df.sort_values(by="first", ascending=False)
newest_first.head(10)

In [None]:
# Display the complete article dataframe.
article_list_df

### Filter out specific articles

In [None]:
# Keep only the English articles whose file path contains "teaching".
is_english = article_list_df["language"] == "en"
is_teaching = article_list_df["filepath"].str.contains("teaching")
lang_df = article_list_df[is_english & is_teaching]
lang_df

In [None]:
# Work on the full article list from here on; this replaces the filtered
# selection that the previous cell assigned to lang_df.
lang_df = article_list_df

In [None]:
# Add a "title" column holding the value get_title_from_link returns for each
# file path, then show the newest entries first.
article_titles = lang_df["filepath"].apply(get_title_from_link)
lang_df = lang_df.assign(title=article_titles)
lang_df.sort_values(by="first", ascending=False)

## Get current titles of the articles

In [None]:
# Convert the "first" timestamps into datetimes stored in a "created" column.
# NOTE: datetime.fromtimestamp returns naive local time (no timezone attached).
creation_times = lang_df["first"].apply(datetime.fromtimestamp)
lang_df = lang_df.assign(created=creation_times)
lang_df

In [None]:
# Order the articles from the smallest to the largest "first" timestamp.
sorted_lang_df = lang_df.sort_values(by="first")
sorted_lang_df

In [None]:
# Export file path, title and creation date as a semicolon-separated file.
export_columns = ["filepath", "title", "created"]
sorted_lang_df.to_csv("output.csv", sep=";", columns=export_columns, index=False)

## Helper functions for Querying Wikibase

In [3]:
def quote_query(query):
    """Percent-encode ``query`` so it can be embedded in a URL.

    No character is treated as safe, so ``/`` is encoded as well.
    """
    return urllib.parse.quote(query, safe="")

In [4]:
def query_wikibase(query):
    """Run a SPARQL query against the jsamwrites Wikibase endpoint.

    The query must bind the variables ``item``, ``title`` and ``url``.

    NOTE: a later cell redefines ``query_wikibase`` with an additional
    ``columns`` parameter; once that cell has run, it shadows this version.

    Args:
        query: SPARQL query text (unencoded).

    Returns:
        pandas.DataFrame with the columns ``item``, ``title`` and ``url``,
        one row per result binding.
    """
    url = "https://jsamwrites.wikibase.cloud/query/sparql?format=json&query=" + quote_query(query)
    # The context manager closes the HTTP response even if reading or decoding fails.
    with urllib.request.urlopen(url) as response:
        responsedata = json.loads(response.read().decode("utf-8"))

    rows = [
        [binding["item"]["value"], binding["title"]["value"], binding["url"]["value"]]
        for binding in responsedata["results"]["bindings"]
    ]
    return pd.DataFrame(rows, columns=["item", "title", "url"])

In [5]:
def query_wikibase(query, columns=("item", "title", "url")):
    """Run a SPARQL query against the jsamwrites Wikibase endpoint.

    Args:
        query: SPARQL query text (unencoded).
        columns: names of the SPARQL variables to extract from every result
            binding. Defaults to ``item``, ``title`` and ``url`` so that calls
            passing only the query keep working.

    Returns:
        pandas.DataFrame with one column per requested variable and one row
        per result binding.

    Raises:
        KeyError: if a binding does not contain one of the requested variables.
    """
    columns = list(columns)
    url = "https://jsamwrites.wikibase.cloud/query/sparql?format=json&query=" + quote_query(query)
    # The context manager closes the HTTP response even if reading or decoding fails.
    with urllib.request.urlopen(url) as response:
        responsedata = json.loads(response.read().decode("utf-8"))

    rows = [
        [binding[column]["value"] for column in columns]
        for binding in responsedata["results"]["bindings"]
    ]
    return pd.DataFrame(rows, columns=columns)

## SPARQL query for getting the existing article list from Wikibase

In [None]:
# Fetch item, title and URL of every article already present on Wikibase,
# ordered by the value of P10.
query = """PREFIX wd: <https://jsamwrites.wikibase.cloud/entity/>
PREFIX wdt: <https://jsamwrites.wikibase.cloud/prop/direct/>

SELECT DISTINCT ?item ?title ?url{
  ?item wdt:P3 ?url;
        wdt:P27 ?title;
        wdt:P10 ?time.
}
ORDER by ?time"""

# Name the columns explicitly: the query_wikibase definition in effect (the
# later of the two helper cells) takes a ``columns`` argument, so a call with
# only the query raises TypeError.
wikibase_dataframe = query_wikibase(query, columns=["item", "title", "url"])
wikibase_dataframe

Replace the relative file paths with complete URLs

In [None]:
# Turn the relative file paths into absolute URLs with spaces percent-encoded.
# The result is assigned back explicitly: ``replace(..., inplace=True)`` on a
# selected column is a chained assignment that pandas does not guarantee to
# propagate to the dataframe (and never does under copy-on-write).
filepath_urls = (
    sorted_lang_df["filepath"]
    .replace(r"\.\.\/\.\.", "https://johnsamuel.info", regex=True)
    .replace(" ", "%20", regex=True)
)
sorted_lang_df = sorted_lang_df.assign(filepath=filepath_urls)
sorted_lang_df["filepath"]

## Find the missing articles that are not yet on Wikibase

In [None]:
# Local articles whose URL does not appear among the URLs returned by Wikibase.
already_on_wikibase = sorted_lang_df["filepath"].isin(wikibase_dataframe["url"])
missing_articles = sorted_lang_df[~already_on_wikibase]
missing_articles

### Generate QuickStatements for the missing articles

In [None]:
# Write one QuickStatements "CREATE" block per missing article.
# UTF-8 is requested explicitly so that non-ASCII titles are written the same
# way on every platform instead of depending on the locale's default encoding.
with open("quickstatements.csv", "w", encoding="utf-8") as qw:
    for _, article in missing_articles.iterrows():
        language = article["language"]
        title = article["title"]
        page_url = str(article["filepath"]).replace(" ", "%20")
        # Only the calendar day is kept (precision /11, time fixed to midnight).
        # Formatting the timestamp directly keeps fractional seconds or other
        # trailing text of its string form out of the statement.
        created_day = pd.Timestamp(article["created"]).strftime("%Y-%m-%d")

        qw.write("CREATE\n")
        qw.write('LAST|Den|"web page"\n')
        qw.write('LAST|Dfr|"page web"\n')
        qw.write(f"LAST|L{language}|\"{title}\"\n")
        qw.write(f"LAST|P27|{language}:\"{title}\"\n")
        qw.write("LAST|P17|Q48\n")
        qw.write("LAST|P8|Q45\n")
        qw.write(f"LAST|P3|\"{page_url}\"\n")
        qw.write("LAST|P13|Q1041\n")
        qw.write("LAST|P15|Q38\n")
        qw.write(f"LAST|P10|+{created_day}T00:00:00Z/11\n\n")

## Verification of the titles
Verify whether the titles stored on Wikibase still match the current titles of the articles.

In [None]:
# Fetch item, title and URL of every article present on Wikibase,
# ordered by the value of P10.
query = """PREFIX wd: <https://jsamwrites.wikibase.cloud/entity/>
PREFIX wdt: <https://jsamwrites.wikibase.cloud/prop/direct/>

SELECT DISTINCT ?item ?title ?url{
  ?item wdt:P3 ?url;
        wdt:P27 ?title;
        wdt:P10 ?time.
}
ORDER by ?time"""

# Name the columns explicitly: the query_wikibase definition in effect takes a
# ``columns`` argument, so a call with only the query raises TypeError.
wikibase_dataframe = query_wikibase(query, columns=["item", "title", "url"])
wikibase_dataframe

# Turn relative file paths into absolute, percent-encoded URLs. The column is
# assigned back explicitly because ``replace(..., inplace=True)`` on a selected
# column is a chained assignment that may not reach the dataframe.
sorted_lang_df = sorted_lang_df.assign(
    filepath=sorted_lang_df["filepath"]
    .replace(r"\.\.\/\.\.", "https://johnsamuel.info", regex=True)
    .replace(" ", "%20", regex=True)
)

In [None]:
# Compare every title recorded on Wikibase with the current title of the local
# article that has the same URL, collecting (item id, current title, language)
# for each mismatch.
stale_title_list = []
for _, wikibase_row in wikibase_dataframe.iterrows():
    article_url = wikibase_row["url"]
    local_matches = sorted_lang_df[sorted_lang_df["filepath"] == article_url]

    if len(local_matches) == 0:
        # No local article has this URL: report it and move on.
        print(article_url, local_matches)
        continue

    current_title = local_matches["title"].values[0]
    recorded_title = str(wikibase_row["title"])
    if current_title != recorded_title:
        print("Incorrect", current_title, recorded_title)
        item_id = wikibase_row["item"].replace("https://jsamwrites.wikibase.cloud/entity/", "")
        current_language = local_matches["language"].values[0]
        stale_title_list.append((item_id, str(current_title), str(current_language)))


## Generate QuickStatements for correcting titles

In [None]:
# Write one title statement (P27) per stale entry; each entry is an
# (item id, current title, language) tuple.
# UTF-8 is requested explicitly so that non-ASCII titles are written the same
# way on every platform instead of depending on the locale's default encoding.
with open("quickstatements.csv", "w", encoding="utf-8") as qw:
    for item_id, current_title, language in stale_title_list:
        qw.write(f"{item_id}|P27|{language}:\"{current_title}\"\n")

### Generate QuickStatements for missing information

#### File format

In [None]:
# Select the items that have P8 = Q45 but no P13 statement at all.
query = """PREFIX wd: <https://jsamwrites.wikibase.cloud/entity/>
PREFIX wdt: <https://jsamwrites.wikibase.cloud/prop/direct/>

SELECT DISTINCT ?item {
  ?item wdt:P8 wd:Q45.
  FILTER NOT EXISTS {?item wdt:P13 []}
}"""

# Only the item URI is needed to build the statements.
wikibase_dataframe = query_wikibase(query, columns=["item"])
wikibase_dataframe

In [None]:
# Write a P13|Q1041 statement for every item returned by the query,
# replacing any previous content of the file.
with open("quickstatements.csv", "w") as qw:
    entity_prefix = "https://jsamwrites.wikibase.cloud/entity/"
    for item_uri in wikibase_dataframe["item"]:
        item_id = item_uri.replace(entity_prefix, "")
        qw.write(f"{item_id}|P13|Q1041\n")

#### Creator

In [None]:
# Select the items that have P8 = Q45 but no P15 statement at all.
query = """PREFIX wd: <https://jsamwrites.wikibase.cloud/entity/>
PREFIX wdt: <https://jsamwrites.wikibase.cloud/prop/direct/>

SELECT DISTINCT ?item {
  ?item wdt:P8 wd:Q45.
  FILTER NOT EXISTS {?item wdt:P15 []}
}"""

# Only the item URI is needed to build the statements.
wikibase_dataframe = query_wikibase(query, columns=["item"])
wikibase_dataframe

In [98]:
# Append a P15 statement for every item returned by the preceding query, which
# selects the items that lack P15. The value Q38 is the one newly created
# articles receive for P15 in the "missing articles" cell.
# (This cell previously wrote P16|Q1760, the statement the "Main subject" query
# checks for, which does not match the items selected here.)
with open("quickstatements.csv", "a") as qw:
    entity_prefix = "https://jsamwrites.wikibase.cloud/entity/"
    for item in wikibase_dataframe["item"]:
        qw.write(f"{item.replace(entity_prefix, '')}|P15|Q38\n")

#### Other possibilities
* Language of work (P17)
* Creation time (P10)
* full work available at URL (P3)
* form of creative work (P29)

#### Form of creative work

In [80]:
# Select the items whose URL contains both "enseignement" and "questions1.html"
# and that do not have the statement P29 = Q1046.
query = """PREFIX wd: <https://jsamwrites.wikibase.cloud/entity/>
PREFIX wdt: <https://jsamwrites.wikibase.cloud/prop/direct/>

SELECT DISTINCT ?item ?title ?url{
  ?item wdt:P3 ?url;
        wdt:P27 ?title.
  
  FILTER (contains(str(?url), "enseignement") && contains(str(?url), "questions1.html")).
  
  FILTER NOT EXISTS {?item wdt:P29 wd:Q1046}

}"""

# The query also selects ?title and ?url, but only the item URI is kept.
wikibase_dataframe = query_wikibase(query, columns=["item"])
wikibase_dataframe

Unnamed: 0,item
0,https://jsamwrites.wikibase.cloud/entity/Q108
1,https://jsamwrites.wikibase.cloud/entity/Q110
2,https://jsamwrites.wikibase.cloud/entity/Q112
3,https://jsamwrites.wikibase.cloud/entity/Q114
4,https://jsamwrites.wikibase.cloud/entity/Q145
5,https://jsamwrites.wikibase.cloud/entity/Q191
6,https://jsamwrites.wikibase.cloud/entity/Q193
7,https://jsamwrites.wikibase.cloud/entity/Q195
8,https://jsamwrites.wikibase.cloud/entity/Q297
9,https://jsamwrites.wikibase.cloud/entity/Q324


#### Main subject

In [97]:
# Select the items whose URL contains "teaching", whose lower-cased title
# contains "data science", and that do not have the statement P16 = Q1760.
query = """PREFIX wd: <https://jsamwrites.wikibase.cloud/entity/>
PREFIX wdt: <https://jsamwrites.wikibase.cloud/prop/direct/>

SELECT DISTINCT ?item ?title ?url{
  ?item wdt:P3 ?url;
        wdt:P27 ?title.
  
  FILTER (contains(str(?url), "teaching") && contains(LCASE(str(?title)), "data science")).
  
  FILTER NOT EXISTS {?item wdt:P16 wd:Q1760}.

}"""

# The query also selects ?title and ?url, but only the item URI is kept.
wikibase_dataframe = query_wikibase(query, columns=["item"])
wikibase_dataframe

Unnamed: 0,item
0,https://jsamwrites.wikibase.cloud/entity/Q1472
1,https://jsamwrites.wikibase.cloud/entity/Q1574
2,https://jsamwrites.wikibase.cloud/entity/Q1575
3,https://jsamwrites.wikibase.cloud/entity/Q1576
4,https://jsamwrites.wikibase.cloud/entity/Q1577
5,https://jsamwrites.wikibase.cloud/entity/Q1578
6,https://jsamwrites.wikibase.cloud/entity/Q1579
7,https://jsamwrites.wikibase.cloud/entity/Q1660
8,https://jsamwrites.wikibase.cloud/entity/Q1661
9,https://jsamwrites.wikibase.cloud/entity/Q1381
