__Files created__

- review_api.csv

In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive/MyDrive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Extracting review (revision) history data through the Wikipedia API

In [None]:
import requests
import pandas as pd
from datetime import datetime
import re

In [None]:
# Load the list of articles to process; the `title` column is what gets sent to the API
# and `page_id` is used at the end of the notebook to cross-check the API results.
data = pd.read_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection2/dataset.csv')
data

Unnamed: 0,title,page_id,quality
0,Mayan languages,182013,FA
1,Mu'awiya I,207068,FA
2,The Fountainhead,180464,FA
3,Northern pintail,218361,FA
4,Manhattan Project,19603,FA
...,...,...,...
5195,Party of Democratic Kampuchea,265468,Stub
5196,Minawara and Multultu,95240,Stub
5197,Theophylline/ephedra/hydroxyzine,262652,Stub
5198,"Channel Lake, Illinois",111450,Stub


In [None]:
# Function to extract review history data and page ID
def get_wikipedia_article_info(titles, timeout=30):
    """Collect edit-history statistics for Wikipedia articles via the MediaWiki API.

    Parameters
    ----------
    titles : list of str (or a single str)
        Article titles to look up. They are sent to the API in batches of 50,
        the per-query title limit for regular clients, so any number of titles
        may be passed. An empty list yields an empty result.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    list of dict
        One dict per existing page with the keys ``title``, ``page_id``,
        ``article_age_days``, ``num_edits``, ``num_editors``,
        ``num_registered_editors``, ``num_anonymous_editors``,
        ``num_occasional_editors``, ``revert_count`` and ``discussion_count``.
        Titles that are missing, invalid, or have no retrievable revisions are
        skipped.

    Raises
    ------
    Exception
        If any request returns a non-200 status code or the API answers with
        an ``error`` payload.
    """
    endpoint = "https://en.wikipedia.org/w/api.php"
    titles_per_request = 50

    ipv4_pattern = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$')
    ipv6_pattern = re.compile(r'^(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}$')
    # "revert..." and "undid" are matched anywhere; "rv"/"rvv" only as standalone
    # words, so comments such as "server" or "/* Overview */" are not counted.
    revert_pattern = re.compile(r'revert|undid|\brvv?\b', re.IGNORECASE)

    def api_query(params, what):
        # Single place for the HTTP call so every request gets the same
        # timeout and the same error handling.
        response = requests.get(endpoint, params=params, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Error: Unable to fetch {what} from Wikipedia API. Status code: {response.status_code}")
        payload = response.json()
        # The API can report failures inside a 200 response.
        if 'error' in payload:
            raise Exception(f"Error: Wikipedia API rejected the {what} request: {payload['error']}")
        return payload

    def is_ip_address(user):
        # Check if a user is an IP address (IPv4 or IPv6)
        return ipv4_pattern.match(user) is not None or ipv6_pattern.match(user) is not None

    def get_revisions(page_id):
        # Fetch every revision of a page, following the API's continuation tokens.
        base_params = {
            'action': 'query',
            'format': 'json',
            'prop': 'revisions',
            'rvprop': 'timestamp|user|comment',
            'rvlimit': 'max',
            'pageids': page_id
        }
        revisions = []
        continuation = {}

        while True:
            payload = api_query({**base_params, **continuation}, "revisions")
            page = payload.get('query', {}).get('pages', {}).get(str(page_id), {})
            revisions.extend(page.get('revisions', []))

            # Send the whole 'continue' object back with the next request.
            continuation = payload.get('continue')
            if not continuation:
                return revisions

    def get_talk_page_id(title):
        # Return the page ID of "Talk:<title>", or None when there is no talk page.
        payload = api_query({
            'action': 'query',
            'format': 'json',
            'prop': 'info',
            'titles': f"Talk:{title}"
        }, "data")

        for page in payload.get('query', {}).get('pages', {}).values():
            if 'missing' not in page and 'pageid' in page:
                return page['pageid']
        return None

    # A bare string would otherwise be joined character by character.
    title_list = [titles] if isinstance(titles, str) else list(titles)

    # Prepare a structure to hold the article information
    article_info = []

    for start in range(0, len(title_list), titles_per_request):
        batch = title_list[start:start + titles_per_request]

        # Resolve the batch of titles to page IDs and canonical titles
        payload = api_query({
            'action': 'query',
            'format': 'json',
            'prop': 'info',
            'titles': '|'.join(batch)
        }, "data")
        pages = payload.get('query', {}).get('pages', {})

        for page in pages.values():
            # Missing pages carry a 'missing' key; invalid titles have no 'pageid'.
            if 'missing' in page or 'pageid' not in page:
                continue

            title = page.get('title')
            page_id = page['pageid']

            # Get revisions for the main article
            revisions = get_revisions(page_id)
            if not revisions:
                continue  # Nothing to measure without at least one revision

            # Get revisions for the talk page
            talk_page_id = get_talk_page_id(title)
            talk_revisions = get_revisions(talk_page_id) if talk_page_id else []

            # Unique editors (revisions whose user is hidden have no 'user' key)
            editors = set(rev['user'] for rev in revisions if 'user' in rev)

            # Separate registered and anonymous editors, and count edits per editor
            registered_editors = set()
            anonymous_editors = set()
            editor_contributions = {}

            for rev in revisions:
                user = rev.get('user')
                if user:
                    if is_ip_address(user):
                        anonymous_editors.add(user)
                    else:
                        registered_editors.add(user)
                else:
                    anonymous_editors.add('anonymous')  # Fallback in case user is missing completely

                # NOTE: revisions without a user are all tallied under the key None.
                editor_contributions[user] = editor_contributions.get(user, 0) + 1

            # Occasional editors are those with fewer than 3 edits
            occasional_editors = {user for user, count in editor_contributions.items() if count < 3}

            # Count reverts based on the edit summary
            revert_count = sum(1 for rev in revisions if revert_pattern.search(rev.get('comment', '')))

            # The API lists revisions newest first, so the last one is the creation.
            # Timestamps are UTC ("...Z"); parse them as timezone-aware and compare
            # against the current time in the same zone rather than local time.
            creation_date = datetime.strptime(revisions[-1]['timestamp'], "%Y-%m-%dT%H:%M:%S%z")
            article_age = datetime.now(creation_date.tzinfo) - creation_date

            # Append the information to the list
            article_info.append({
                'title': title,
                'page_id': page_id,
                'article_age_days': article_age.days,
                'num_edits': len(revisions),
                'num_editors': len(editors),
                'num_registered_editors': len(registered_editors),
                'num_anonymous_editors': len(anonymous_editors),
                'num_occasional_editors': len(occasional_editors),
                'revert_count': revert_count,
                'discussion_count': len(talk_revisions)  # edits on the talk page
            })

    return article_info

In [None]:
# Example
# Smoke-test the function on a single title and print the resulting list of per-article dicts.
titles = ["Channel Lake, Illinois"]
info = get_wikipedia_article_info(titles)
print(info)

[{'title': 'Channel Lake, Illinois', 'page_id': 111450, 'article_age_days': 7923, 'num_edits': 72, 'num_editors': 47, 'num_registered_editors': 44, 'num_anonymous_editors': 3, 'num_occasional_editors': 42, 'revert_count': 0, 'discussion_count': 6}]


In [None]:
# Fetch the review-history statistics for every title in the dataset.
# Titles are sent 50 per call (the MediaWiki API accepts at most 50 titles per
# query for regular clients); `review` ends up as a list with one entry per
# batch, each entry being the list of per-article dicts returned for it.
BATCH_SIZE = 50
all_titles = data['title'].tolist()

review = []
for start in range(0, len(all_titles), BATCH_SIZE):
    # Appending batch by batch keeps whatever was already collected in
    # `review` if a request fails part-way through this long-running loop.
    review.append(get_wikipedia_article_info(all_titles[start:start + BATCH_SIZE]))

# Number of batches collected (one per 50 titles, the last one possibly smaller)
len(review)

In [None]:
# Combine all collected data into a dataframe
# `review` is a list of batches, each a list of per-article dicts. Flatten it
# and build the frame once instead of concatenating inside a loop, which
# re-copies the growing frame on every iteration and leaves a repeated
# per-batch index (0..49, 0..49, ...).
df_review = pd.DataFrame([row for batch in review for row in batch])

In [None]:
len(df_review)

5200

In [None]:
df_review

Unnamed: 0,title,page_id,article_age_days,num_edits,num_editors,num_registered_editors,num_anonymous_editors,num_occasional_editors,revert_count,discussion_count
0,1880 United States presidential election,40525,8159,1047,461,317,144,400,118,59
1,Aliens (film),213472,7739,6614,2729,1248,1481,2332,1234,618
2,Apollo 9,1774,8340,1377,525,356,169,456,155,124
3,Balfour Declaration,4820,8230,3373,1002,586,416,847,346,1319
4,Basiliscus,144810,7903,728,350,262,88,299,82,85
...,...,...,...,...,...,...,...,...,...,...
45,W Window System,301825,7616,65,54,42,12,51,1,6
46,Wavenumber–frequency diagram,164631,7847,50,39,35,4,37,3,5
47,Weapon (biology),34129,8191,43,29,27,2,24,1,6
48,Winkworth Arboretum,311111,7603,60,43,39,4,39,2,4


In [None]:
df_review.isnull().sum()

title                     0
page_id                   0
article_age_days          0
num_edits                 0
num_editors               0
num_registered_editors    0
num_anonymous_editors     0
num_occasional_editors    0
revert_count              0
discussion_count          0
dtype: int64

In [None]:
# Give the combined frame a clean 0..n-1 index, then write it to Drive.
df_review = df_review.reset_index(drop=True)
df_review.to_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection2/review_api.csv')

- Check whether the page IDs returned by the API match the page IDs in the original dataset

In [None]:
# Inner-join the original dataset with the API results on title; the page_id
# column present in both frames comes out as page_id_x (dataset) and
# page_id_y (API).
combined = pd.merge(data, df_review, on='title')
combined

Unnamed: 0,title,page_id_x,quality,page_id_y,article_age_days,num_edits,num_editors,num_registered_editors,num_anonymous_editors,num_occasional_editors,revert_count,discussion_count
0,Mayan languages,182013,FA,182013,7807,2533,832,466,366,719,415,472
1,Mu'awiya I,207068,FA,207068,7752,3947,1377,657,720,1136,721,970
2,The Fountainhead,180464,FA,180464,7811,2318,1104,486,618,971,228,218
3,Northern pintail,218361,FA,218361,7730,763,376,282,94,334,86,54
4,Manhattan Project,19603,FA,19603,8368,6417,2647,1281,1366,2296,1112,847
...,...,...,...,...,...,...,...,...,...,...,...,...
5195,Party of Democratic Kampuchea,265468,Stub,265468,7657,89,53,41,12,46,3,12
5196,Minawara and Multultu,95240,Stub,95240,7943,34,29,27,2,29,1,3
5197,Theophylline/ephedra/hydroxyzine,262652,Stub,262652,7662,45,34,30,4,32,2,3
5198,"Channel Lake, Illinois",111450,Stub,111450,7923,72,47,44,3,42,0,6


In [None]:
# Rows whose dataset page ID differs from the API page ID (an empty frame means every ID matches).
combined.loc[combined['page_id_x'].ne(combined['page_id_y'])]

Unnamed: 0,title,page_id_x,quality,page_id_y,article_age_days,num_edits,num_editors,num_registered_editors,num_anonymous_editors,num_occasional_editors,revert_count,discussion_count
